diff --git a/.flake8 b/.flake8 new file mode 100644 index 0000000000..1e35e0c496 --- /dev/null +++ b/.flake8 @@ -0,0 +1,4 @@ +[flake8] +max-line-length = 100 +extend-ignore = E203,E501,F401,E402,E714 +per-file-ignores = __init__.py:F401 \ No newline at end of file diff --git a/.github/ISSUE_TEMPLATE/bug.md b/.github/ISSUE_TEMPLATE/bug.md new file mode 100644 index 0000000000..b639acd3c0 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug.md @@ -0,0 +1,32 @@ +--- +name: BUG +about: Report a bug that needs attention +title: "[BUG]" +labels: '' +assignees: '' + +--- + +**Describe the bug** +A clear and concise description of what the bug is. + +**To Reproduce** +Steps to reproduce the behavior. The easier it is to reproduce the faster it will get maintainer attention. + +**Expected behavior** +A clear and concise description of what you expected to happen. + +**Stack trace/logs** +If applicable, add the stack trace or logs from the time of the error. + +**Environment (please complete the following information):** + - Megatron-LM commit ID + - PyTorch version + - CUDA version + - NCCL version + +**Proposed fix** +If you have a proposal for how to fix the issue state it here or link to a PR. + +**Additional context** +Add any other context about the problem here. diff --git a/.github/ISSUE_TEMPLATE/enhancement.md b/.github/ISSUE_TEMPLATE/enhancement.md new file mode 100644 index 0000000000..076f7195ba --- /dev/null +++ b/.github/ISSUE_TEMPLATE/enhancement.md @@ -0,0 +1,23 @@ +--- +name: ENHANCEMENT +about: Suggest an idea to improve this project +title: "[ENHANCEMENT]" +labels: '' +assignees: '' + +--- + +**Is your feature request related to a problem? Please describe.** +A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] + +**Describe the solution you'd like** +A clear and concise description of what you want to happen. + +**Describe alternatives you've considered** +A clear and concise description of any alternative solutions or features you've considered. + +**Proposed implementation** +If you have a proposed implementation for the feature state it here or link to a PR. + +**Additional context** +Add any other context or screenshots about the feature request here. diff --git a/.github/ISSUE_TEMPLATE/question.md b/.github/ISSUE_TEMPLATE/question.md new file mode 100644 index 0000000000..b3d89a0ac1 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/question.md @@ -0,0 +1,12 @@ +--- +name: QUESTION +about: Ask a question about Megatron-LM that is not a bug, regression or enhancement + request +title: "[QUESTION]" +labels: '' +assignees: '' + +--- + +**Your question** +Ask a clear and concise question about Megatron-LM. diff --git a/.github/ISSUE_TEMPLATE/regression.md b/.github/ISSUE_TEMPLATE/regression.md new file mode 100644 index 0000000000..10078d23a6 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/regression.md @@ -0,0 +1,39 @@ +--- +name: REGRESSION +about: Report a regression in speed or accuracy due to a Megatron-LM update +title: "[REGRESSION]" +labels: '' +assignees: '' + +--- + +**Describe the regression** +A clear and concise description of what the regression is. + +**To Reproduce** +Steps to reproduce the behavior. The easier it is to reproduce the faster it will get maintainer attention. + +**Previous performance** +What speed or accuracy did you previously see. + +**New performance** +What speed or accuracy do you see after the update. + +**Stack trace/logs** +If applicable, add the stack trace or logs related to the regression. 
+ +**Environment (please complete the following information):** + - Previous Megatron-LM commit ID + - New Megatron-LM commit ID + - Previous PyTorch version + - New PyTorch version + - Previous CUDA version + - New CUDA version + - Previous NCCL version + - New NCCL version + +**Proposed fix** +If you have a proposal for how to fix the issue state it here or link to a PR. + +**Additional context** +Add any other context about the problem here. diff --git a/.github/workflows/stale.yml b/.github/workflows/stale.yml new file mode 100644 index 0000000000..58ba38e060 --- /dev/null +++ b/.github/workflows/stale.yml @@ -0,0 +1,31 @@ +# This workflow warns and then closes issues and PRs that have had no activity for a specified amount of time. +# +# You can adjust the behavior by modifying this file. +# For more information, see: +# https://github.com/actions/stale +name: Mark stale issues and pull requests + +on: + schedule: + - cron: '15 18 * * *' + +jobs: + stale: + + runs-on: ubuntu-latest + permissions: + issues: write + pull-requests: write + + steps: + - uses: actions/stale@v5 + with: + repo-token: ${{ secrets.GITHUB_TOKEN }} + days-before-stale: 60 + stale-issue-message: 'Marking as stale. No activity in 60 days.' + stale-pr-message: 'Marking as stale. No activity in 60 days.' + stale-issue-label: 'stale' + stale-pr-label: 'stale' + remove-stale-when-updated: true + operations-per-run: 1000 + days-before-close: -1 diff --git a/.gitignore b/.gitignore index cac3499524..7a2be414f2 100644 --- a/.gitignore +++ b/.gitignore @@ -6,3 +6,6 @@ build *~ slurm* logs +.vscode +local/ +.gitmodules \ No newline at end of file diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 3cd1c2f2e6..e72df05ac7 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -1,299 +1,109 @@ -image: gitlab-master.nvidia.com:5005/adlr/megatron-lm/ngc/pytorch:22.12-py3_pytest-cov - -stages: - - test - - cleanup - -variables: &VARS - SELENE_ADLR_CI_PATH: "/lustre/fsw/adlr/adlr-nlp/adlr_ci/megatron" - DATA_DIR: "/lustre/fsw/adlr/adlr-nlp/adlr_ci/megatron/data" - PYTORCH_IMAGE: gitlab-master.nvidia.com:5005/adlr/megatron-lm/ngc/pytorch:22.12-py3_pytest-cov - PYTHON_VIRTUAL_ENV: /lustre/fsw/adlr/adlr-nlp/adlr_ci/cienv/bin/activate - TESTS_TO_RUN_AFTER_MERGE_REQ_APPROVED: L0 # Can specify levels - TESTS_TO_RUN_AFTER_MERGING: L0 # Can specify levels - TESTS_TO_RUN_ON_THIS_COMMIT: unit_tests - TEST_REGEX_ON_THIS_COMMIT: NONE #https://github.com/google/re2/wiki/Syntax (Can define regex as in this spec) e.g /.*gpt3.*/ - DISPLAY_OUTPUT: "True" # Set to true for new tests to copy the logs for creating golden truth file - -unit_tests: - tags: - - docker_local_runner - stage: test - script: - - torchrun --nproc_per_node=8 -m pytest --cov-report=term --cov-report=html --cov=megatron/core tests/unit_tests - coverage: '/(?i)total.*? (100(?:\.0+)?\%|[1-9]?\d(?:\.\d+)?\%)$/' - artifacts: - paths: - - coverage - expire_in: 30 days - only: - - merge_requests - -.selene_test_resume_checkpoint_launcher: &selene-test-resume-checkpoint-launcher - tags: - - ssh_selene_runner - stage: test - script: &selene-test-resume-launcher-script - - echo "Running selene resume from checkpoint test. " - - pwd - - export BUILD_DIR=`pwd` - - export RUN_NAME=resume_${RUN_MODEL}_tp${TP_SIZE}_pp${PP_SIZE}_${NUM_NODES}nodes - - echo "In case of error check ${SELENE_ADLR_CI_PATH}/${CI_PIPELINE_ID}/${RUN_NAME}/results directory for result logs." 
- - export TP_SIZE PP_SIZE NUM_NODES MAX_STEPS - - export DATA_DIR=$DATA_DIR - - echo "Run name is $RUN_NAME" - - mkdir -p $SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID/$RUN_NAME/checkpoints - - mkdir -p $SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID/$RUN_NAME/logs - - mkdir -p $SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID/$RUN_NAME/results - - rm -rf $SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID/$RUN_NAME/checkpoints/* - - rm -rf $SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID/$RUN_NAME/logs/* - - rm -rf $SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID/$RUN_NAME/results/* - - export BASE_DIR=$SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID/$RUN_NAME - - export LOGS_DIR=$BASE_DIR/logs - - export RESULTS_DIR=$BASE_DIR/results - - export CHECKPOINTS_DIR=$BASE_DIR/checkpoints - - echo "Submitting job" - - sbatch_submission=`sbatch $BUILD_DIR/tests/functional_tests/test_scripts/$RUN_MODEL/sbatch_${RUN_MODEL}_distributed_resume_checkpoint_test.sh --export=BASE_DIR,BUILD_DIR,DATA_DIR,TP_SIZE,PP_SIZE,NUM_NODES` - - export SLURM_JOBID=$(echo $sbatch_submission| grep 'Submitted batch job' | awk '{ print $4 }'); - - bash $BUILD_DIR/tests/functional_tests/shell_test_utils/jobwait.sh $SLURM_JOBID - - \[ ! -z ${SLURM_JOBID} \] && echo -e " --------------------------------------------------\n" - "----------WAITING FOR SLURM JOB TO BEGIN-----------\n" - "---------------------------------------------------\n" - "$(scontrol show job=${SLURM_JOBID})\n" - "---------------------------------------------------\n" - # Gitlab logs collapsible section markers - - echo -e "\e[0Ksection_end:`date +%s`:slurm_setup\r\e[0K" - # Follow output of the job - - echo "Finished job" - - export SLURM_STATE=$(sacct -j "${SLURM_JOBID}" --format State --parsable2 --noheader |& head -n 1) - - echo "Slurm job state $SLURM_STATE" - - if [[ "$SLURM_STATE" != "COMPLETED" ]]; then echo "Slurm job did not complete. See ${SELENE_ADLR_CI_PATH}/${CI_PIPELINE_ID}/${RUN_NAME}/results directory for result logs. Skipping pytest."; exit 1; fi - - source $PYTHON_VIRTUAL_ENV - - pytest $BUILD_DIR/tests/functional_tests/python_test_utils/test_resume_checkpoint_pipeline.py || echo "Pytest failed. See ${SELENE_ADLR_CI_PATH}/${CI_PIPELINE_ID}/${RUN_NAME}/results directory for result logs." - - echo "Completed the job" +workflow: rules: - - if: $TEST_LEVEL =~ $TESTS_TO_RUN_ON_THIS_COMMIT || $CI_JOB_NAME =~ $TESTS_TO_RUN_ON_THIS_COMMIT || $CI_JOB_NAME =~ $TEST_REGEX_ON_THIS_COMMIT - when: always - - if: '$CI_COMMIT_REF_NAME == $CI_DEFAULT_BRANCH && $TEST_LEVEL =~ $TESTS_TO_RUN_AFTER_MERGING' - when: always - - if: $CI_MERGE_REQUEST_APPROVED && $TEST_LEVEL =~ $TESTS_TO_RUN_AFTER_MERGE_REQ_APPROVED - when: always - allow_failure: false - -.selene_test_launcher: &selene-test-launcher - tags: - - ssh_selene_runner - stage: test - script: &selene-test-launcher-script - - echo "Running selene test" - - echo "$CI_MERGE_REQUEST_APPROVED" - - pwd - - export BUILD_DIR=`pwd` - - RUN_NAME=${RUN_MODEL}_tp${TP_SIZE}_pp${PP_SIZE}_${NUM_NODES}nodes_${MAX_STEPS}steps - - if [[ $USE_TE == 1 ]]; then RUN_NAME=${RUN_NAME}_te_enabled; fi - - export $RUN_NAME - - echo "In case of error check ${SELENE_ADLR_CI_PATH}/${CI_PIPELINE_ID}/${RUN_NAME}/results directory for result logs." 
- - export USE_TE TP_SIZE PP_SIZE NUM_NODES MAX_STEPS VP_SIZE - - export MBS GBS - - export DATA_DIR=$DATA_DIR - - echo "Run name is $RUN_NAME" - - mkdir -p $SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID/$RUN_NAME/checkpoints - - mkdir -p $SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID/$RUN_NAME/logs - - mkdir -p $SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID/$RUN_NAME/results - - rm -rf $SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID/$RUN_NAME/checkpoints/* - - rm -rf $SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID/$RUN_NAME/logs/* - - rm -rf $SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID/$RUN_NAME/results/* - - export BASE_DIR=$SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID/$RUN_NAME - - export LOGS_DIR=$BASE_DIR/logs - - export RESULTS_DIR=$BASE_DIR/results - - export CHECKPOINTS_DIR=$BASE_DIR/checkpoints - - echo "Submitting job" - - sbatch_submission=`sbatch $BUILD_DIR/tests/functional_tests/test_scripts/$RUN_MODEL/sbatch_${RUN_MODEL}_distributed_test.sh --export=BASE_DIR,BUILD_DIR,DATA_DIR,USE_TE,TP_SIZE,PP_SIZE,NUM_NODES,MAX_STEPS,VP_SIZE,MBS,GBS` - - export SLURM_JOBID=$(echo $sbatch_submission| grep 'Submitted batch job' | awk '{ print $4 }'); - - bash $BUILD_DIR/tests/functional_tests/shell_test_utils/jobwait.sh $SLURM_JOBID - - \[ ! -z ${SLURM_JOBID} \] && echo -e " --------------------------------------------------\n" - "----------WAITING FOR SLURM JOB TO BEGIN-----------\n" - "---------------------------------------------------\n" - "$(scontrol show job=${SLURM_JOBID})\n" - "---------------------------------------------------\n" - # Gitlab logs collapsible section markers - - echo -e "\e[0Ksection_end:`date +%s`:slurm_setup\r\e[0K" - # Follow output of the job - - echo "Finished job" - - echo "Slurm log dump start ------------------------------------------------------------" - - cat $SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID/$RUN_NAME/results/* - - echo "Slurm log dump end --------------------------------------------------------------" - - python3 $BUILD_DIR/tests/functional_tests/python_test_utils/check_slurm_job_completion.py $SLURM_JOBID - - if [ $? -ne 0 ]; then echo "Slurm job did not complete. See ${SELENE_ADLR_CI_PATH}/${CI_PIPELINE_ID}/${RUN_NAME}/results directory for result logs. Skipping pytest."; exit 1; fi - - source $PYTHON_VIRTUAL_ENV - - | - if [[ "$DISPLAY_OUTPUT" == "True" ]]; then - python3 $BUILD_DIR/tests/functional_tests/python_test_utils/get_test_results_from_tensorboard_logs.py $LOGS_DIR $RUN_NAME - fi - - | - if [[ $USE_TE -ne 1 ]]; then - echo "Checking against ground truth file" - export EXPECTED_METRICS_FILE=$BUILD_DIR/tests/functional_tests/test_results/$RUN_MODEL/$RUN_NAME.json - pytest $BUILD_DIR/tests/functional_tests/python_test_utils/test_ci_pipeline.py || echo "Pytest failed. See ${SELENE_ADLR_CI_PATH}/${CI_PIPELINE_ID}/${RUN_NAME}/results directory for result logs." 
- fi - - echo "Completed the job" - rules: - - if: $TEST_LEVEL =~ $TESTS_TO_RUN_ON_THIS_COMMIT || $CI_JOB_NAME =~ $TESTS_TO_RUN_ON_THIS_COMMIT || $CI_JOB_NAME =~ $TEST_REGEX_ON_THIS_COMMIT - when: always - - if: '$CI_COMMIT_REF_NAME == $CI_DEFAULT_BRANCH && $TEST_LEVEL =~ $TESTS_TO_RUN_AFTER_MERGING' - when: always - - if: $CI_MERGE_REQUEST_APPROVED && $TEST_LEVEL =~ $TESTS_TO_RUN_AFTER_MERGE_REQ_APPROVED - when: always - allow_failure: false - -train.te_gpt3.345m_tp2_pp2_1node_50steps: - <<: *selene-test-launcher - variables: - <<: [*VARS] - RUN_MODEL: gpt3 - USE_TE: 1 - TP_SIZE: 2 - PP_SIZE: 2 - NUM_NODES: 1 - MAX_STEPS: 50 - TIME_LIMIT: "50:00" - TEST_LEVEL: L0 - -train.gpt3.345m_tp4_pp1_1node_50steps: - <<: *selene-test-launcher - variables: - <<: [*VARS] - RUN_MODEL: gpt3 - USE_TE: 0 - TP_SIZE: 4 - PP_SIZE: 1 - NUM_NODES: 1 - MAX_STEPS: 50 - TIME_LIMIT: "20:00" - TEST_LEVEL: L0 - -train.gpt3.345m_tp2_pp2_1node_50steps: - <<: *selene-test-launcher - variables: - <<: [*VARS] - RUN_MODEL: gpt3 - USE_TE: 0 - TP_SIZE: 2 - PP_SIZE: 2 - NUM_NODES: 1 - MAX_STEPS: 50 - TIME_LIMIT: "20:00" - TEST_LEVEL: L0 + - if: $CI_PROJECT_NAMESPACE != "ADLR" + when: never + - if: $CI_COMMIT_BRANCH =~ /ci-/ && $CI_PIPELINE_SOURCE != "schedule" + when: never + - if: $CI_PIPELINE_SOURCE == "schedule" + auto_cancel: + on_new_commit: none + - if: $CI_PIPELINE_SOURCE == "web" + - if: $CI_COMMIT_REF_PROTECTED == "true" + variables: + FUNCTIONAL_TEST: "no" + - if: $CI_MERGE_REQUEST_LABELS =~ /Run tests/ && $CI_MERGE_REQUEST_TARGET_BRANCH_SHA != "" + variables: + UNIT_TEST_REPEAT: 5 + UNIT_TEST_TIMEOUT: 75 + FUNCTIONAL_TEST: "yes" + FUNCTIONAL_TEST_SCOPE: mr + FUNCTIONAL_TEST_CLUSTER_A100: "" + FUNCTIONAL_TEST_CLUSTER_H100: "" + - if: $CI_MERGE_REQUEST_LABELS =~ /Run nightly/ && $CI_MERGE_REQUEST_TARGET_BRANCH_SHA != "" + variables: + UNIT_TEST_REPEAT: 5 + UNIT_TEST_TIMEOUT: 75 + FUNCTIONAL_TEST: "yes" + FUNCTIONAL_TEST_SCOPE: nightly + FUNCTIONAL_TEST_CLUSTER_A100: "" + FUNCTIONAL_TEST_CLUSTER_H100: "" + - if: $CI_MERGE_REQUEST_LABELS =~ /Run weekly/ && $CI_MERGE_REQUEST_TARGET_BRANCH_SHA != "" + variables: + UNIT_TEST_REPEAT: 5 + UNIT_TEST_TIMEOUT: 75 + FUNCTIONAL_TEST: "yes" + FUNCTIONAL_TEST_SCOPE: weekly + FUNCTIONAL_TEST_CLUSTER_A100: "" + FUNCTIONAL_TEST_CLUSTER_H100: "" + - if: $CI_PIPELINE_SOURCE == "merge_request_event" && $CI_MERGE_REQUEST_TARGET_BRANCH_SHA != "" + variables: + FUNCTIONAL_TEST: "no" + - when: never + auto_cancel: + on_new_commit: interruptible -train.gpt3.345m_tp1_pp2_1node_50steps: - <<: *selene-test-launcher - variables: - <<: [*VARS] - RUN_MODEL: gpt3 - USE_TE: 0 - TP_SIZE: 1 - PP_SIZE: 2 - NUM_NODES: 1 - MAX_STEPS: 50 - TIME_LIMIT: "20:00" - TEST_LEVEL: L0 - -train.gpt3.345m_tp1_pp4_1node_50steps: - <<: *selene-test-launcher - variables: - <<: [*VARS] - RUN_MODEL: gpt3 - USE_TE: 0 - TP_SIZE: 1 - PP_SIZE: 4 - VP_SIZE: 1 - NUM_NODES: 1 - MAX_STEPS: 50 - TIME_LIMIT: "20:00" - TEST_LEVEL: L0 - -resume.checkpoint.gpt3.345m_tp1_pp2_1node: - <<: *selene-test-resume-checkpoint-launcher - variables: - <<: [*VARS] - RUN_MODEL: gpt3 - TP_SIZE: 1 - PP_SIZE: 2 - NUM_NODES: 1 - TIME_LIMIT: "30:00" - TEST_LEVEL: L0 - -train.bert.345m_tp4_pp1_1node_50steps: - <<: *selene-test-launcher - variables: - <<: [*VARS] - RUN_MODEL: bert - TP_SIZE: 4 - PP_SIZE: 1 - NUM_NODES: 1 - MAX_STEPS: 50 - TIME_LIMIT: "20:00" - TEST_LEVEL: L0 - -train.bert.345m_tp2_pp2_1node_50steps: - <<: *selene-test-launcher - variables: - <<: [*VARS] - RUN_MODEL: bert - TP_SIZE: 2 - PP_SIZE: 2 - NUM_NODES: 1 - MAX_STEPS: 50 - 
TIME_LIMIT: "20:00" - TEST_LEVEL: L0 - -train.bert.345m_tp1_pp2_1node_50steps: - <<: *selene-test-launcher - variables: - <<: [*VARS] - RUN_MODEL: bert - TP_SIZE: 1 - PP_SIZE: 2 - NUM_NODES: 1 - MAX_STEPS: 50 - TIME_LIMIT: "20:00" - TEST_LEVEL: L0 - -train.bert.345m_tp1_pp4_1node_50steps: - <<: *selene-test-launcher - variables: - <<: [*VARS] - RUN_MODEL: bert - TP_SIZE: 1 - PP_SIZE: 4 - VP_SIZE: 2 - NUM_NODES: 1 - MAX_STEPS: 50 - TIME_LIMIT: "20:00" - TEST_LEVEL: L0 - -resume.checkpoint.bert.345m_tp1_pp2_1node: - <<: *selene-test-resume-checkpoint-launcher - variables: - <<: [*VARS] - RUN_MODEL: bert - TP_SIZE: 1 - PP_SIZE: 2 - NUM_NODES: 1 - TIME_LIMIT: "30:00" - TEST_LEVEL: L0 - -cleanup.selene: - tags: - - ssh_selene_runner - stage: cleanup - variables: - <<: [*VARS] - script: - - set +e - - NUM_CLEANUP=`find ${SELENE_ADLR_CI_PATH}/* -type d -ctime +20 | grep -v data | wc -l` - - find ${SELENE_ADLR_CI_PATH}/* -type d -ctime +20 | grep -v data | xargs rm -rf - - echo "Finished cleaning $NUM_CLEANUP directories older than 20 days everything in Selene" - allow_failure: true - rules: - - when: always +stages: + - test + - functional_tests + - convergence_tests + - publish + +default: + interruptible: true + +variables: + FUNCTIONAL_TEST: + value: "yes" + options: + - "yes" + - "no" + description: To run the funtional test suite + FUNCTIONAL_TEST_SCOPE: + value: "mr" + options: + - "mr" + - "nightly" + - "weekly" + - "pre-release" + - "release" + description: "Testsuite to run (only for FUNCTIONAL_TEST=yes)" + FUNCTIONAL_TEST_CLUSTER_A100: + value: "dgxa100_dracooci" + options: + - "dgxa100_dracooci" + - "dgxa100_dracooci-ord" + description: 'Cluster for A100 workloads' + FUNCTIONAL_TEST_CLUSTER_H100: + value: "dgxh100_eos" + options: + - "dgxh100_coreweave" + - "dgxh100_eos" + description: 'Cluster for H100 workloads' + FUNCTIONAL_TEST_NAME: + description: "Name of functional test run (only for pre-release and release)" + PUBLISH: + value: "no" + options: + - "yes" + - "no" + description: Build and publish a wheel to PyPi + PUBLISH_SCOPE: + value: "code-freeze" + options: + - "code-freeze" + - "release" + description: Type of publish (freeze or final release) + + # CI wide variables + CI_MCORE_LTS_IMAGE: ${GITLAB_ENDPOINT}:5005/adlr/megatron-lm/mcore_ci_lts + CI_MCORE_DEV_IMAGE: ${GITLAB_ENDPOINT}:5005/adlr/megatron-lm/mcore_ci_dev + CI_NEMO_IMAGE: ${GITLAB_ENDPOINT}:5005/adlr/megatron-lm/nemo_ci + LINTING_IMAGE: ${GITLAB_ENDPOINT}:5005/adlr/megatron-lm/mcore_linting + UNIT_TEST_TIMEOUT: 15 + UNIT_TEST_REPEAT: 1 + +include: + - .gitlab/stages/00.pre.yml + - .gitlab/stages/01.test.yml + - .gitlab/stages/02.functional-tests.yml + - .gitlab/stages/03.publish.yml diff --git a/.gitlab/labeler-config.yml b/.gitlab/labeler-config.yml new file mode 100644 index 0000000000..3dc4001cd7 --- /dev/null +++ b/.gitlab/labeler-config.yml @@ -0,0 +1,33 @@ +CI: +- .gitlab-ci.yml +- Dockerfile.ci.lts +- Dockerfile.ci.dev +- .github/** +- .gitlab/** + +Datasets: +- megatron/core/datasets/** + +BERT: +- megatron/core/models/bert/** + +GPT: +- megatron/core/models/gpt/** + +RETRO: +- megatron/core/models/retro/** + +Dist-Ckpt: +- megatron/core/dist_checkpointing + +Dist-Opt: +- megatron/core/optimizer/distrib_optimizer + +Inference: +- megatron/core/inference + +MoE: +- megatron/core/transformer/moe + +Tests: +- tests/** \ No newline at end of file diff --git a/.gitlab/stages/00.pre.yml b/.gitlab/stages/00.pre.yml new file mode 100644 index 0000000000..453025d4b9 --- /dev/null +++ b/.gitlab/stages/00.pre.yml @@ 
-0,0 +1,189 @@ +include: + - template: Security/Secret-Detection.gitlab-ci.yml + +.pre_rules: + rules: + - if: $CI_PIPELINE_SOURCE == 'merge_request_event' && $CI_MERGE_REQUEST_TARGET_BRANCH_PROTECTED != "true" + allow_failure: true + when: always + - if: $CI_PIPELINE_SOURCE == 'merge_request_event' + - when: never + stage: .pre + +.dind_rules: + image: docker:26.1.4-dind + variables: + DOCKER_HOST: unix:///var/run/docker.sock + before_script: + - docker system prune -a --filter "until=36h" -f || true + - echo "$NGC_API_KEY" | docker login nvcr.io -u '$oauthtoken' --password-stdin + - echo "$CI_REGISTRY_PASSWORD" | docker login $CI_REGISTRY -u $CI_REGISTRY_USER --password-stdin + +pre:mirror_to_github: + rules: + - if: '$CI_COMMIT_REF_PROTECTED == "true" && $CI_PIPELINE_SOURCE == "push"' + - when: never + tags: [mcore-docker-node-small] + stage: .pre + image: python:3.10 + variables: + GIT_STRATEGY: "clone" + script: + - git checkout $CI_COMMIT_BRANCH + - git remote add github https://ko3n1g:$GH_TOKEN@github.com/NVIDIA/Megatron-LM.git || true + - git push -u github $CI_COMMIT_BRANCH + +pre:create_ci_branches: + rules: + - if: '$CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH && $CI_PIPELINE_SOURCE == "push"' + - when: never + parallel: + matrix: + - branch: ci-unit-test-extended + - branch: ci-rebuild-mcore-nemo-image + - branch: ci-mr-a100 + - branch: ci-nightly-a100 + - branch: ci-weekly-a100 + - branch: ci-weekly-h100 + - branch: ci-pre-release + tags: [mcore-docker-node-small] + stage: .pre + image: python:3.10 + variables: + GIT_STRATEGY: "clone" + script: + - git remote set-url origin "https://gitlab-ci-token:${PROJECT_ACCESS_TOKEN_MCORE}@${GITLAB_ENDPOINT}/adlr/megatron-lm.git" + - git switch --force-create $branch + - git push --force -u origin $branch + +pre:label_merge_request: + extends: [.pre_rules] + image: golang:1.22 + tags: + - mcore-docker-node-small + before_script: + - git clone -b nv https://${GITLAB_ENDPOINT}/okoenig/gitlab-mr-labeler.git + - cd gitlab-mr-labeler + - go install . + - cd .. + - go install github.com/itchyny/gojq/cmd/gojq@latest + - | + echo LABELS=$(curl --header "PRIVATE-TOKEN: ${PROJECT_ACCESS_TOKEN_MCORE}" --url "https://${GITLAB_ENDPOINT}/api/v4/projects/${CI_PROJECT_ID}/merge_requests/${CI_MERGE_REQUEST_IID}" | gojq '.labels | join(",")') > labels + script: + - gitlab-mr-labeler -f .gitlab/labeler-config.yml -t ${PROJECT_ACCESS_TOKEN_MCORE} --debug true + after_script: + - | + source labels + curl --header "PRIVATE-TOKEN: ${PROJECT_ACCESS_TOKEN_MCORE}" --url "https://${GITLAB_ENDPOINT}/api/v4/projects/${CI_PROJECT_ID}/merge_requests/${CI_MERGE_REQUEST_IID}" --data-urlencode "add_labels=$LABELS" -X PUT + +pre:clean_docker_node: + extends: [.pre_rules, .dind_rules] + tags: + - ${node} + parallel: + matrix: + - node: mcore-docker-node-small + - node: mcore-docker-node-large + script: ':' + +pre:maybe_cherry_pick_commit: + rules: + - if: '$CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH && $CI_PIPELINE_SOURCE == "push"' + - when: never + tags: [mcore-docker-node-small] + stage: .pre + image: + name: registry.gitlab.com/gitlab-ci-utils/curl-jq + entrypoint: [""] + variables: + GIT_STRATEGY: "clone" + script: + - set -x + - set +e + - SHA=$(git rev-list --no-merges -n 1 HEAD) + - MESSAGE=$(git log -n 1 --pretty=format:%s $SHA) + - MR_ID=$(echo $MESSAGE | awk -F'!' 
'{print $2}' | awk '{print $1}' ) + - git remote set-url origin "https://gitlab-ci-token:${PROJECT_ACCESS_TOKEN_MCORE}@${GITLAB_ENDPOINT}/$CI_PROJECT_NAMESPACE/megatron-lm.git" + - git config --global user.email "mcore-bot@nvidia.com" + - git config --global user.name "Mcore Bot" + - | + MR=$(curl --header "PRIVATE-TOKEN: ${PROJECT_ACCESS_TOKEN_MCORE}" --url "https://${GITLAB_ENDPOINT}/api/v4/projects/${CI_PROJECT_ID}/merge_requests/${MR_ID}") + + LABELS=$(echo -E $MR | jq '.labels | join(",")' | tr -d '"') + AUTHOR_ID=$(echo -E $MR | jq '.author.id' | tr -d '"') + AUTHOR_NAME=$(echo -E $MR | jq '.author.username' | tr -d '"') + TITLE=$(echo -E $MR | jq '.title' | tr -d '"') + MILESTONE_ID=$(echo -E $MR | jq '.milestone.id' | tr -d '"') + TARGET_BRANCHES=$(echo "$LABELS" | grep -o 'core_[^,]*') + + if [[ $TARGET_BRANCHES == "" ]]; then + echo Nothing to cherry pick + exit 0 + fi + + echo $TARGET_BRANCHES | while read -r RELEASE_BRANCH ; do + TARGET_BRANCH_EXISTS_OK=$([[ "$(git ls-remote --heads origin refs/heads/$RELEASE_BRANCH)" != "" ]] && echo true || echo false) + + if [[ "$TARGET_BRANCH_EXISTS_OK" == "false" ]]; then + echo Release branch does not yet exist, will not cherry-pick + continue + fi + + ( + git fetch origin $RELEASE_BRANCH:$RELEASE_BRANCH + git switch --force-create cherry-pick-$MR_ID-$RELEASE_BRANCH $RELEASE_BRANCH + git cherry-pick $SHA + git push -u origin --force cherry-pick-$MR_ID-$RELEASE_BRANCH + git checkout ${CI_DEFAULT_BRANCH:-main} + ) + + CHERRYPICK_SUCCESSFUL=$? + + if [[ $CHERRYPICK_SUCCESSFUL -eq 0 ]]; then + curl \ + --header "PRIVATE-TOKEN: $PAT" \ + --url https://${GITLAB_ENDPOINT}/api/v4/projects/${CI_PROJECT_ID}/merge_requests \ + -d "source_branch=cherry-pick-$MR_ID-$RELEASE_BRANCH" \ + -d "target_branch=$RELEASE_BRANCH" \ + -d "title=Cherry pick \`$TITLE ($MR_ID)\` into \`$RELEASE_BRANCH\`" \ + -d "labels=cherry-pick" \ + -d "reviewer_ids=$AUTHOR_ID" \ + -d "milestone_id=$MILESTONE_ID" \ + -d "description=[🤖]: Hi @$AUTHOR_NAME 👋,

we've cherry-picked \`$TITLE ($MR_ID)\` into \`$RELEASE_BRANCH\` for you! 🚀

Please review and approve this cherry pick by your convenience\!" + + else + URL=https://${GITLAB_ENDPOINT}/ADLR/megatron-lm/-/merge_requests/$MR_ID + + MESSAGE='{ + "blocks": [ + { + "type": "section", + "text": { + "type": "mrkdwn", + "text": ":alert: Cherrypick bot 🤖: Cherry-pick of <'$URL'|!'$MR_ID'> failed" + } + } + ] + }' + + curl -X POST -H "Content-type: application/json" --data "$MESSAGE" ${MCORE_NOTIFICATION_HOOK} + + fi + + done + interruptible: false + +pre:check_milestone: + extends: [.pre_rules] + image: ${GITLAB_ENDPOINT}:5005/adlr/megatron-lm/mcore_ci:buildcache + tags: [mcore-docker-node-small] + script: + - env + - | + MILESTONE=$(curl --header "PRIVATE-TOKEN: ${PROJECT_ACCESS_TOKEN_MCORE}" --url "https://${GITLAB_ENDPOINT}/api/v4/projects/${CI_PROJECT_ID}/merge_requests/${CI_MERGE_REQUEST_IID}" | jq '.milestone') + - | + if [[ "$MILESTONE" == "null" ]]; then + echo Please assign a Milestone to this MR! + exit 1 + fi + \ No newline at end of file diff --git a/.gitlab/stages/01.test.yml b/.gitlab/stages/01.test.yml new file mode 100644 index 0000000000..c2d2634f35 --- /dev/null +++ b/.gitlab/stages/01.test.yml @@ -0,0 +1,229 @@ +.test_rules: + rules: + - if: $CI_PIPELINE_SOURCE == 'merge_request_event' && $CI_MERGE_REQUEST_TARGET_BRANCH_PROTECTED != "true" + allow_failure: true + when: always + - when: always + stage: test + +include: + - template: Security/Secret-Detection.gitlab-ci.yml + +test:build_image: + extends: [.test_rules, .dind_rules] + tags: + - ${TAG} + timeout: 45m + parallel: + matrix: + - IMAGE: CI_MCORE_LTS_IMAGE + FILE: Dockerfile.ci.lts + BASE_IMAGE: nvcr.io/nvidia/pytorch:24.01-py3 + TAG: mcore-docker-node-large + - IMAGE: CI_MCORE_DEV_IMAGE + FILE: Dockerfile.ci.dev + BASE_IMAGE: nvcr.io/nvidia/pytorch:24.07-py3 + TAG: mcore-docker-node-large + - IMAGE: CI_NEMO_IMAGE + FILE: Dockerfile.ci.lts + BASE_IMAGE: nvcr.io/nvidian/nemo:nightly + TAG: mcore-docker-node-large + - IMAGE: LINTING_IMAGE + FILE: Dockerfile.linting + BASE_IMAGE: python:3.10 + TAG: mcore-docker-node-small + variables: + STAGE: main + script: + - apk add bash + - | + bash -c ' + set -x + env + eval "IMAGE=\$$IMAGE" + + docker buildx create --name container --driver=docker-container + + ADDITIONAL_PARAMS=() + + if [[ "$CI_COMMIT_BRANCH" == "$CI_DEFAULT_BRANCH" ]]; then + ADDITIONAL_PARAMS+=("--pull") + ADDITIONAL_PARAMS+=("--cache-to type=registry,ref=${IMAGE}-buildcache:main") + fi + + if [[ "$CI_COMMIT_BRANCH" == "ci-nightly-a100" ]]; then + ADDITIONAL_PARAMS+=("-t ${IMAGE}:nightly") + fi + + if [[ "$CI_PIPELINE_SOURCE" == "merge_request_event" ]]; then + MCORE_REF=$(echo ${CI_MERGE_REQUEST_REF_PATH} | sed 's/head$/merge/') + else + MCORE_REF=$CI_COMMIT_SHA + fi + + DOCKER_BUILDKIT=1 docker build \ + --secret id=JET_INDEX_URLS \ + --target $STAGE \ + -f $FILE \ + -t ${IMAGE}:${CI_PIPELINE_ID} \ + --builder=container \ + --build-arg CACHEBUST=$(cat /proc/sys/kernel/random/uuid) \ + --build-arg MCORE_REPO=${CI_REPOSITORY_URL} \ + --build-arg MCORE_REF=${MCORE_REF} \ + --build-arg MCORE_BACKWARDS_REF="core_r0.9.0" \ + --cache-to type=registry,ref=${IMAGE}-buildcache:${CI_PIPELINE_ID} \ + --cache-to type=registry,ref=${IMAGE}-buildcache:${CI_MERGE_REQUEST_IID:-noop} \ + --cache-from type=registry,ref=${IMAGE}-buildcache:main \ + --cache-from type=registry,ref=${IMAGE}-buildcache:${CI_PIPELINE_ID} \ + --cache-from type=registry,ref=${IMAGE}-buildcache:${CI_MERGE_REQUEST_IID:-noop} \ + --build-arg FROM_IMAGE_NAME=$BASE_IMAGE \ + --push \ + ${ADDITIONAL_PARAMS[@]} . 
+ ' + retry: + max: 2 + +.unit_tests: + extends: [.test_rules, .dind_rules] + needs: + - test:build_image + - test:docs_build + - test:formatting + - test:copyright + - test:secret_detection + timeout: 180m + tags: [8xL40S] + variables: + GIT_STRATEGY: none + script: + - if [ $UNIT_TEST_REPEAT -eq 0 ]; then exit 0; fi; + - docker run --name mcore_ci_${CI_PIPELINE_ID} -d --rm -e TAG -e UNIT_TEST_REPEAT -e UNIT_TEST_TIMEOUT --gpus all --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 ${IMAGE}:${CI_PIPELINE_ID} bash -c "sleep $(( ${UNIT_TEST_TIMEOUT} * 60 + 60 ))" + - | + docker exec mcore_ci_${CI_PIPELINE_ID} bash -c ' + set -e + + MCORE_DIR=$([[ "$TAG" == "latest" ]] && echo "" || echo "-$TAG/") + + cd /opt/megatron-lm$MCORE_DIR; + + for i in $(seq $UNIT_TEST_REPEAT); do + SEED=$((RANDOM % 9000 + 1000)); + ARGS=() + if [[ $TAG != latest ]]; then + ARGS+=(-m "not internal and not flaky and not flaky_in_dev") + else + ARGS+=(-m "not flaky and not flaky_in_dev") + fi + timeout ${UNIT_TEST_TIMEOUT}m torchrun --nproc_per_node=8 -m pytest --random-order --random-order-seed ${SEED} -xvs --cov-report=term --cov-report=html --cov=megatron/core --no-cov-on-fail "${ARGS[@]}" tests/unit_tests + done + ' + after_script: + - docker container stop mcore_ci_${CI_PIPELINE_ID} || true + artifacts: + paths: + - coverage + rules: + - if: $CI_PIPELINE_SOURCE == 'merge_request_event' && $CI_MERGE_REQUEST_TARGET_BRANCH_PROTECTED != "true" + allow_failure: true + when: always + - when: always + +test:pyt(LTS)_mcore(latest): + extends: [.unit_tests] + variables: + TAG: latest + IMAGE: ${CI_MCORE_LTS_IMAGE} + +test:pyt(LTS)_mcore(0.9.0): + extends: [.unit_tests] + variables: + TAG: core_r0.9.0 + IMAGE: ${CI_MCORE_LTS_IMAGE} + +test:pyt(DEV)_mcore(latest): + extends: [.unit_tests] + variables: + TAG: latest + IMAGE: ${CI_MCORE_DEV_IMAGE} + +test:pyt(DEV)_mcore(0.9.0): + extends: [.unit_tests] + variables: + TAG: core_r0.9.0 + IMAGE: ${CI_MCORE_DEV_IMAGE} + +test:notify: + extends: [.test_rules] + image: ${CI_MCORE_LTS_IMAGE}:${CI_PIPELINE_ID} + needs: + - test:pyt(LTS)_mcore(latest) + - test:pyt(DEV)_mcore(latest) + - test:pyt(LTS)_mcore(0.9.0) + - test:pyt(DEV)_mcore(0.9.0) + tags: + - mcore-docker-node-small + script: + - env + - export WEBHOOK_URL=${MCORE_NOTIFICATION_HOOK} + - export RO_API_TOKEN=${PROJECT_ACCESS_TOKEN_MCORE} + - export GITLAB_ENDPOINT + - export DATE=$(date +"%Y-%m-%d") + - bash tests/functional_tests/shell_test_utils/notify_unit_tests.sh ${CI_PIPELINE_ID} + artifacts: + when: always + paths: + - scripts + rules: + - if: $CI_PIPELINE_SOURCE == "schedule" && $CI_COMMIT_BRANCH == "ci-unit-test-extended" + when: always + - when: never + +test:docs_build: + extends: [.test_rules] + image: ${CI_MCORE_LTS_IMAGE}:${CI_PIPELINE_ID} + tags: [mcore-docker-node-small] + needs: [test:build_image] + script: + - cd .. 
+ - rm -rf documentation && git clone https://gitlab-ci-token:${CI_JOB_TOKEN}@${GITLAB_ENDPOINT}/nemo-megatron-core-tme/documentation.git + - mv megatron-lm/ documentation/ + - cd documentation/ + - ./repo docs + +test:formatting: + extends: [.test_rules] + image: ${LINTING_IMAGE}:${CI_PIPELINE_ID} + tags: [mcore-docker-node-small] + needs: [test:build_image] + script: + - env + - git fetch origin main + - BASE_REF="$CI_MERGE_REQUEST_TARGET_BRANCH_NAME" CHECK_ONLY=true SKIP_DOCS=$([[ "$CI_MERGE_REQUEST_LABELS" == *"Skip docs"* ]] && echo "true" || echo "false") bash tools/autoformat.sh + +test:copyright: + extends: [.test_rules] + tags: [mcore-docker-node-small] + image: ${CI_MCORE_LTS_IMAGE}:${CI_PIPELINE_ID} + needs: [test:build_image] + script: + - git fetch origin main + - bash tools/copyright.sh + +test:secret_detection: + tags: [mcore-docker-node-small] + extends: ".secret-analyzer" + variables: + GIT_DEPTH: 0 + SECRET_DETECTION_LOG_OPTIONS: ${CI_MERGE_REQUEST_DIFF_BASE_SHA}..${CI_COMMIT_SHA} + allow_failure: true + rules: + - if: $CI_PIPELINE_SOURCE == "merge_request_event" + script: + - apk add jq + - /analyzer run + - | + if [[ $(cat gl-secret-detection-report.json | jq '.vulnerabilities | length > 0') == true ]]; then + echo "Atleast one vulnerability has been found" + cat gl-secret-detection-report.json | jq '.' + exit 1 + fi \ No newline at end of file diff --git a/.gitlab/stages/02.functional-tests.yml b/.gitlab/stages/02.functional-tests.yml new file mode 100644 index 0000000000..68d776b45d --- /dev/null +++ b/.gitlab/stages/02.functional-tests.yml @@ -0,0 +1,149 @@ +.functional_tests_rules: + stage: functional_tests + rules: + - if: $FUNCTIONAL_TEST == "yes" && ($CI_PIPELINE_SOURCE == 'merge_request_event' && $CI_MERGE_REQUEST_TARGET_BRANCH_PROTECTED != "true") + allow_failure: true + - if: $FUNCTIONAL_TEST == "yes" + - when: never + +default: + id_tokens: + VAULT_JWT_TOKEN: + aud: https://stg.vault.nvidia.com + +include: + - project: dl/jet/gitlab-templates + ref: main + file: downstreams.yml + +functional:clean_docker_node: + extends: [.functional_tests_rules, .dind_rules] + tags: [mcore-docker-node-jet] + script: ':' + +functional:build_image: + extends: [test:build_image, .functional_tests_rules] + variables: + STAGE: jet + +functional:configure: + needs: [functional:build_image] + extends: [.functional_tests_rules] + image: ${CI_MCORE_LTS_IMAGE}:${CI_PIPELINE_ID} + tags: [mcore-docker-node-small] + before_script: + - git rm -r tests/functional_tests/local_recipes || true + - git submodule add --force https://gitlab-ci-token:${CI_JOB_TOKEN}@${GITLAB_ENDPOINT}/ADLR/megatron-lm-convergence-tests.git tests/functional_tests/local_recipes + - ls tests/functional_tests/local_recipes + script: + - set -x + - | + A100_CLUSTER=$([[ "$FUNCTIONAL_TEST_CLUSTER_A100" != "" ]] && echo $FUNCTIONAL_TEST_CLUSTER_A100 || echo $DEFAULT_A100_CLUSTER) + H100_CLUSTER=$([[ "$FUNCTIONAL_TEST_CLUSTER_H100" != "" ]] && echo $FUNCTIONAL_TEST_CLUSTER_H100 || echo $DEFAULT_H100_CLUSTER) + - | + if [[ "$FUNCTIONAL_TEST_SCOPE" == "release" || "$FUNCTIONAL_TEST_SCOPE" == "pre-release" ]]; then + RELEASE_ARGS=( + "--run-name" + $FUNCTIONAL_TEST_NAME + "--wandb-experiment" + $(echo $FUNCTIONAL_TEST_NAME | tr '/' '-') + ) + else + RELEASE_ARGS=() + fi + - | + export PYTHONPATH=$(pwd) + python tests/functional_tests/python_test_utils/jet/generate_jet_trigger_job.py \ + --scope $FUNCTIONAL_TEST_SCOPE \ + --environment dev \ + --a100-cluster $A100_CLUSTER \ + --h100-cluster $H100_CLUSTER \ + 
--container-image ${CI_MCORE_LTS_IMAGE} \ + --container-tag ${CI_PIPELINE_ID} \ + --output-path "jet-trigger-job-dev.yaml" \ + ${RELEASE_ARGS[@]} + - | + export PYTHONPATH=$(pwd) + python tests/functional_tests/python_test_utils/jet/generate_jet_trigger_job.py \ + --scope $FUNCTIONAL_TEST_SCOPE \ + --environment lts \ + --a100-cluster $A100_CLUSTER \ + --h100-cluster $H100_CLUSTER \ + --container-image ${CI_MCORE_LTS_IMAGE} \ + --container-tag ${CI_PIPELINE_ID} \ + --output-path "jet-trigger-job-lts.yaml" \ + ${RELEASE_ARGS[@]} + artifacts: + paths: + - jet-trigger-job-lts.yaml + - jet-trigger-job-dev.yaml + - tests/functional_tests/local_recipes + +.run: + stage: functional_tests + needs: [functional:configure, functional:clean_docker_node] + extends: [.functional_tests_rules] + trigger: + include: + - artifact: jet-trigger-job-$ENVIRONMENT.yaml + job: functional:configure + strategy: depend + variables: + RO_API_TOKEN: $PAT + CONTAINER_TAG: $CI_PIPELINE_ID + CI_MCORE_LTS_IMAGE: $CI_MCORE_LTS_IMAGE + GITLAB_ENDPOINT: $GITLAB_ENDPOINT + PARENT_PIPELINE_ID: $CI_PIPELINE_ID + inherit: + variables: true + +functional:run_lts: + extends: [.run] + variables: + ENVIRONMENT: lts + +# functional:run_dev: +# extends: [.run] +# variables: +# ENVIRONMENT: dev + +.notify: + extends: [.functional_tests_rules] + image: ${GITLAB_ENDPOINT}:5005/dl/jet/api:latest + needs: + - functional:run_lts + # - functional:run_dev + tags: + - mcore-docker-node-small + before_script: + - jet secrets jwt-login jwt/nvidia/gitlab-master adlr-megatron-lm-ci $VAULT_JWT_TOKEN + variables: + WEBHOOK_URL: ${MCORE_NOTIFICATION_HOOK} + RO_API_TOKEN: ${PROJECT_ACCESS_TOKEN_MCORE} + CONTEXT: $FUNCTIONAL_TEST_SCOPE + script: + - env + - export WEBHOOK_URL=${MCORE_NOTIFICATION_HOOK} + - export RO_API_TOKEN=${PROJECT_ACCESS_TOKEN_MCORE} + - export GITLAB_ENDPOINT + - export CONTEXT=$FUNCTIONAL_TEST_SCOPE + - export DATE=$(date +"%Y-%m-%d") + - bash tests/functional_tests/shell_test_utils/notify.sh ${CI_PIPELINE_ID} ${ENVIRONMENT} + artifacts: + when: always + paths: + - scripts + rules: + - if: $CI_PIPELINE_SOURCE == "schedule" && $FUNCTIONAL_TEST == "yes" + when: always + - when: never + +functional:notify-lts: + extends: [.notify] + variables: + ENVIRONMENT: lts + +functional:notify-dev: + extends: [.notify] + variables: + ENVIRONMENT: dev \ No newline at end of file diff --git a/.gitlab/stages/03.publish.yml b/.gitlab/stages/03.publish.yml new file mode 100644 index 0000000000..e1ee94bd19 --- /dev/null +++ b/.gitlab/stages/03.publish.yml @@ -0,0 +1,95 @@ +.publish_common_freeze: + stage: functional_tests + rules: + - if: $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH && $PUBLISH == "yes" && $PUBLISH_SCOPE == "code-freeze" + when: manual + - when: never + +.publish_common_release: + stage: functional_tests + rules: + - if: $CI_COMMIT_BRANCH =~ /^core_r/ && $PUBLISH == "yes" && $PUBLISH_SCOPE == "release" + when: manual + - when: never + +create-release-branch: + extends: [.publish_common_freeze] + image: ${CI_MCORE_LTS_IMAGE}:${CI_PIPELINE_ID} + needs: [test:build_image] + tags: [mcore-docker-node-small] + variables: + GIT_STRATEGY: "clone" + script: + - git fetch origin $CI_DEFAULT_BRANCH + - git config --global user.email "mcore-bot@nvidia.com" + - git config --global user.name "Mcore Bot" + - git remote set-url origin "https://gitlab-ci-token:${PAT}@${GITLAB_ENDPOINT}/$CI_PROJECT_NAMESPACE/megatron-lm.git" + - sed -i "/^PRE_RELEASE/c\PRE_RELEASE = ''" megatron/core/package_info.py + - VERSION=$(python -c "from megatron import core; 
print(core.__version__)") + - git switch --force-create core_r$VERSION origin/$CI_DEFAULT_BRANCH + - git push -u origin core_r$VERSION --force + - | + MESSAGE='{ + "blocks": [ + { + "type": "section", + "text": { + "type": "mrkdwn", + "text": "Releasebot 🤖: Megatron Core has been frozen 🎉 to branch `core_r$VERSION`" + } + } + ] + }' + + curl -X POST -H "Content-type: application/json" --data "$MESSAGE" ${MCORE_NOTIFICATION_HOOK_MAIN} + +publish-wheel: + extends: [.publish_common_release] + image: quay.io/pypa/manylinux_2_28_x86_64 + tags: [mcore-docker-node-small] + script: + - export TWINE_USERNAME + - export TWINE_PASSWORT + - /opt/python/cp311-cp311/bin/pip install twine + - /opt/python/cp310-cp310/bin/python -m build + - /opt/python/cp311-cp311/bin/python -m build + - auditwheel repair dist/*.whl + - twine upload --repository pypi wheelhouse/* + +create-gh-release: + extends: [.publish_common_release] + tags: [mcore-docker-node-small] + image: + name: registry.gitlab.com/gitlab-ci-utils/curl-jq + entrypoint: [""] + script: + - | + RELEASE_NUMBER=$(python -c "from megatron import core; print(core.__version__)") + NAME="NVIDIA Megatron Core $RELEASE_NUMBER" + CHANGELOG=$(awk '/^## '$NAME'/{flag=1; next} /^## /{flag=0} flag' CHANGELOG.md) + CHANGELOG=$(echo "$CHANGELOG" | sed '/./!d') + + PAYLOAD=$(jq \ + -n \ + -c \ + --arg CI_COMMIT_BRANCH "$CI_COMMIT_BRANCH" \ + --arg NAME "$NAME" \ + --arg BODY "$CHANGELOG" \ + '{ + "tag_name": $CI_COMMIT_BRANCH, + "target_commitish": $CI_COMMIT_BRANCH, + "name": $NAME, + "body": $BODY, + "draft": false, + "prerelease": false, + "generate_release_notes": false + }' + ) + + curl -L \ + -X POST \ + -H "Accept: application/vnd.github+json" \ + -H "Authorization: Bearer $GH_TOKEN" \ + -H "X-GitHub-Api-Version: 2022-11-28" \ + https://api.github.com/repos/NVIDIA/Megatron-LM/releases \ + -d $PAYLOAD \ No newline at end of file diff --git a/.pylintrc b/.pylintrc new file mode 100644 index 0000000000..7981e5c511 --- /dev/null +++ b/.pylintrc @@ -0,0 +1,12 @@ +[MAIN] +ignore-paths=tests +max-line-length=100 + +[MESSAGES CONTROL] +disable=all + +enable=C0115,C0116,W0611,C0301 +# C0115: missing-class-docstring +# C0116: missing-function-docstring +# W0611: unused-import +# C0301: line-too-long diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000000..78db8212aa --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,104 @@ +# Changelog + +## NVIDIA Megatron Core 0.8.0 + +- Multimodal + - Added initial support for training vision language models using the LLaVA architecture + - Added initial support for inference with multimodal inputs + - End-to-end multimodal example from data collection to training to evaluation is provided in examples/multimodal +- MoE + - Context Parallel support. + - Distributed checkpoint support for grouped GEMM. +- Mamba + +## NVIDIA Megatron Core 0.7.0 + +- MoE + - Token drop support + - Several efficiency optimizations + - Improved model parallelism + - Memory optimizations +- Distributed checkpointing + - Enabled for Retro + - Asynchronous checkpoint saving +- Several minor bug fixes, speed improvements, and memory optimizations + +## NVIDIA Megatron Core 0.6.0 + +- MoE (Mixture of Experts) + - Performance optimization + - Communication optimization for multi GPU and Single GPU + - 23% improvement (323 TFLOPS/GPU) over MCore 0.5.0 on Mixtral with Hopper BF16 + - GroupedMLP enhancement for Hopper + - DP Overlapping. Support overlapping computation with gradient reduction and parameter gathering. 
+ - All-to-All based Token Dispatcher + - Layer-wise logging for load balancing loss. + - Improved expert parallel support including distributed optimizer. +- Distributed optimizer +- RETRO + - Data processing +- BERT + - Distributed checkpointing +- Dist checkpointing + - PyTorch native distributed backend + - Improved saving/loading speed +- TensorRT-LLM Export + - Integration with TensorRT Model Optimizer Post-training quantization (PTQ) + - Text generation driver to perform PTQ in Megatron-LM + - Llama2 and Nemotron3-8b examples to use TensorRT-LLM unified build API to build engine after training. +- Several minor enhancements, bug fixes, and documentation updates + +## NVIDIA Megatron Core 0.5.0 + +### Key Features and Enhancements + +Megatron core documentation is now [live!](https://docs.nvidia.com/megatron-core/developer-guide/latest/user-guide/index.html#quick-start) + +### Model Features + +- MoE (Mixture of Experts) + - Support for Z-loss, Load balancing and Sinkhorn + - Layer and communications refactor + - Richer parallelism mappings and EP can be combined with other model parallel techniques for larger MoE variants, e.g. EP + TP + DP + SP + PP + - Token dropless architecture with Top-K routing + - Performance optimization with with GroupedGEMM when number of local experts is > 1 + - Distributed checkpointing +- Interleaved rotary embedding + +### Datasets + +- Masked WordPiece datasets for BERT and T5 +- Raw and mock datasets + +### Parallelism + +### Performance + +- Activation offloading to CPU +- Rope and Swiglu fusion +- Sliding window attention (via Transformer Engine) + +### General Improvements + +- Timers + +## NVIDIA Megatron Core 0.4.0 + +### Key Features and Enhancements + +#### Models + +- BERT +- RETRO +- T5 + +#### Parallelism + +- Mixture of Experts support for GPT +- Model parallel efficient Distributed Data Parallel (DDP) +- Context Parallel (2D Tensor Parallel) support + +#### Datasets + +- GPT Dataset +- Blended Dataset diff --git a/CODEOWNERS b/CODEOWNERS new file mode 100644 index 0000000000..8a115ed7b3 --- /dev/null +++ b/CODEOWNERS @@ -0,0 +1,49 @@ +[Core-ADLR] @mcore-reviewers/core-adlr +megatron/core/ + +[Core-NeMo] @mcore-reviewers/core-nemo +megatron/core/ + +^[Core-MLPerf] @mcore-reviewers/mlperf +megatron/core/ + +[MoE-ADLR] @mcore-reviewers/moe-adlr +megatron/core/transformer/moe/ + +[MoE-Moe] @mcore-reviewers/moe-moe +megatron/core/transformer/moe/ + +[Datasets] @mcore-reviewers/datasets +megatron/core/datasets/ + +[BERT] @mcore-reviewers/bert +megatron/core/models/bert/ + +[GPT] @mcore-reviewers/gpt +megatron/core/models/gpt/ + +[Retro] @mcore-reviewers/retro +megatron/core/models/retro/ + +[Distributed Checkpointing] @mcore-reviewers/dist-checkpointing +megatron/core/dist_checkpointing/ + +[Distributed Optimizer] @mcore-reviewers/dist-optimizer +megatron/core/optimizer/distrib_optimizer/ + +[Inference] @mcore-reviewers/inference +megatron/core/inference/ + +[Quantization and Inference (QAT)] @mcore-reviewers/quantization-and-inference +megatron/core/inference/ + +; [Context Parallelism] @mcore-reviewers/context-parallelism +; + +[CI] @mcore-reviewers/ci +.gitlab/ +.github/ +.gitlab-ci.yml +Dockerfile.ci.lts +Dockerfile.ci.dev +tests/ diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000000..615227600c --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,66 @@ +# Contributing to Megatron-LM + +This document outlines the processes and policies for issues and pull requests by non-NVIDIA contributors to the Megatron-LM github 
repository.
+
+Everyone is welcome to contribute to the project, but development of Megatron-LM continues internally at NVIDIA. When contributing, it is important to ensure that changes are in line with the project direction. Small changes to fix bugs are welcomed and appreciated. If proposing large architectural changes or changes for stylistic reasons, open an issue first so we can discuss it.
+
+PRs will first be pulled into NVIDIA's internal Megatron-LM repo and then pushed back out to the open github repo with proper credit given to the committers.
+
+## Issue policy
+
+Please do file any bugs you find, keeping the following in mind:
+
+- If filing a bug, i.e. you have found something that doesn't work as expected, use the BUG template.
+- If you've found a regression in speed or accuracy, use the REGRESSION template.
+- If you are requesting a new feature or modification of an existing feature, use the ENHANCEMENT template.
+- If opening an issue to ask a question, no template is needed, but please make your question as clear and concise as possible.
+- One issue per bug. Putting multiple things in the same issue makes both discussion and completion unnecessarily complicated.
+- Your bug is most likely to get attention from the development team quickly if we can easily reproduce it.
+- Use proper spelling, grammar, and punctuation.
+- Write in an authoritative and technical tone.
+
+## Code submission policy
+
+Here are some dos & don'ts to try and stick to:
+
+### Do:
+
+- Format new code in a style that is consistent with the file being changed. Megatron-LM doesn't (yet) have a style guide or enforced formatting.
+- Split your changes into separate, atomic commits, i.e. a commit per feature or fix.
+- Make sure your commits are rebased on the master branch.
+- Write the commit message subject line in the imperative mood ("Change the default argument for X", not "Changed the default argument for X").
+- Write your commit messages in proper English, with care and punctuation.
+- Check the spelling of your code, comments, and commit messages.
+
+### Don't:
+
+- Submit code that's incompatible with the project licence.
+- Touch anything outside the stated scope of the PR. This includes formatting changes to code not relevant to the PR.
+- Iterate excessively on your design across multiple commits.
+- Include commented-out code.
+- Attempt large architectural changes without first opening an issue to discuss.
+
+## Issue and Pull Request Q&A (Updated Jul 2023)
+
+### I've submitted an issue and PR. When can I expect to get some feedback?
+
+Megatron-LM is developed and maintained by a small team of researchers. We will endeavour to read and acknowledge all new issues and PRs within a week. A few rules of thumb:
+- Reproducible bugs/regressions and bug/regression fixes are likely to get the attention of maintainers the quickest.
+- Issues requesting an enhancement may only receive acknowledgement that they've been read, and may be closed with a "wontfix" label if they're not in line with the project direction. If they are acknowledged and remain open, you can assume the maintainers agree they're a desirable feature.
+- Support requests, i.e. requests for help running the code, have the lowest priority and will be responded to as maintainer time permits.
+
+### If my issue or PR isn't getting attention, how long should I wait before pinging one of the project maintainers?
+
+One week if there is no acknowledgement of the initial request.
+
+### Who are the project maintainers I should ping?
+ +The corresponding maintainers at this time are @jaredcasper and @jon-barker. + +### Is there a policy for issues and PRs that haven't been touched in X days? Should they be closed? + +Yes, starting in July 2023 we have a bot that will mark untouched PRs as "stale" after 60 days. + +We have a long backlog of issues and PRs dating back 3.5 years. We are trying to triage these now by working backwards. Older issues we believe may still be relevant may recieve a request to re-test them with the latest code. If there's no response they may be closed. Again, if you they should be re-opened then just respond with a comment to that effect. + +Thank-you! \ No newline at end of file diff --git a/Dockerfile.ci.dev b/Dockerfile.ci.dev new file mode 100644 index 0000000000..43b64233f3 --- /dev/null +++ b/Dockerfile.ci.dev @@ -0,0 +1,83 @@ +# syntax=docker/dockerfile:1.3-labs + +ARG FROM_IMAGE_NAME +FROM $FROM_IMAGE_NAME as build_causal_conv1d +WORKDIR /opt +RUN CAUSAL_CONV1D_FORCE_BUILD=TRUE pip3 wheel -v git+https://github.com/Dao-AILab/causal-conv1d.git@v1.2.2.post1 + +FROM $FROM_IMAGE_NAME as build_grouped_gemm +WORKDIR /opt +RUN pip3 wheel -v git+https://github.com/fanshiqing/grouped_gemm@v1.1.2 + +FROM $FROM_IMAGE_NAME as build_mamba_ssm +WORKDIR /opt +RUN MAMBA_FORCE_BUILD=TRUE pip3 wheel -v git+https://github.com/state-spaces/mamba.git@v2.2.0 + +FROM $FROM_IMAGE_NAME as main +ENV DEBIAN_FRONTEND=noninteractive + +RUN apt-get update && \ + apt-get install -y --no-install-recommends gettext python3-venv && \ + apt-get clean && \ + python -m venv /opt/jet && \ + wget https://github.com/mikefarah/yq/releases/download/v4.44.1/yq_linux_amd64 -O /usr/local/bin/yq && \ + chmod a+x /usr/local/bin/yq + +COPY --from=build_causal_conv1d /opt/causal_conv1d-*.whl ./ +COPY --from=build_grouped_gemm /opt/grouped_gemm-*.whl ./ +COPY --from=build_mamba_ssm /opt/mamba_ssm-*.whl ./ + +RUN pip3 install --no-cache-dir --upgrade-strategy only-if-needed -v \ +einops \ +flask-restful \ +nltk \ +pytest \ +pytest-cov \ +pytest_mock \ +pytest-random-order \ +sentencepiece \ +tiktoken \ +wrapt \ +zarr \ +wandb \ +causal_conv1d-*.whl \ +mamba_ssm-*.whl \ +grouped_gemm-*.whl \ +tensorstore==0.1.45 && \ +rm *.whl + +# Since megatron does not have any dependencies (and isn't a dependency to any other package), we can install it separately to make everything a bit quicker +ARG MCORE_REPO +ARG MCORE_REF +ARG MCORE_BACKWARDS_REF +RUN <<"EOF" bash -exu +# Checkout latest +cd /opt +rm -rf /opt/megatron-lm; mkdir megatron-lm; cd megatron-lm +git init +git remote add origin ${MCORE_REPO} +git fetch origin ${MCORE_REF}:MCORE_LATEST +git checkout MCORE_LATEST + +# Checkout backwards-ref +cd /opt +rm -rf /opt/megatron-lm-$MCORE_BACKWARDS_REF; mkdir megatron-lm-$MCORE_BACKWARDS_REF; cd megatron-lm-$MCORE_BACKWARDS_REF +git init +git remote add origin ${MCORE_REPO} +git fetch origin ${MCORE_BACKWARDS_REF}:MCORE_BACKWARDS_REF +git checkout MCORE_BACKWARDS_REF +rm -rf megatron; cp -a /opt/megatron-lm/megatron ./ +EOF + +RUN pip install /opt/megatron-lm +ENV PYTHONPATH="/opt/megatron-lm:$PYTHONPATH" + +##### For NVIDIANS only ##### +FROM main as jet +ARG CACHEBUST=0 +RUN --mount=type=secret,id=JET_INDEX_URLS \ + JET_INDEX_URLS=$(cat /run/secrets/JET_INDEX_URLS) && \ + pip install jet-client --upgrade $JET_INDEX_URLS && \ + /opt/jet/bin/pip install jet-api --upgrade $JET_INDEX_URLS +ENV PATH="$PATH:/opt/jet/bin" +### \ No newline at end of file diff --git a/Dockerfile.ci.lts b/Dockerfile.ci.lts new file mode 100644 index 
0000000000..1d0ffd736a --- /dev/null +++ b/Dockerfile.ci.lts @@ -0,0 +1,84 @@ +# syntax=docker/dockerfile:1.3-labs + +ARG FROM_IMAGE_NAME +FROM $FROM_IMAGE_NAME as build_causal_conv1d +WORKDIR /opt +RUN CAUSAL_CONV1D_FORCE_BUILD=TRUE pip3 wheel -v git+https://github.com/Dao-AILab/causal-conv1d.git@v1.2.2.post1 + +FROM $FROM_IMAGE_NAME as build_grouped_gemm +WORKDIR /opt +RUN pip3 wheel -v git+https://github.com/fanshiqing/grouped_gemm@v1.1.2 + +FROM $FROM_IMAGE_NAME as build_mamba_ssm +WORKDIR /opt +RUN MAMBA_FORCE_BUILD=TRUE pip3 wheel -v git+https://github.com/state-spaces/mamba.git@v2.0.3 + +FROM $FROM_IMAGE_NAME as main +ENV DEBIAN_FRONTEND=noninteractive + +RUN apt-get update && \ + apt-get install -y --no-install-recommends gettext python3-venv && \ + apt-get clean && \ + python -m venv /opt/jet && \ + wget https://github.com/mikefarah/yq/releases/download/v4.44.1/yq_linux_amd64 -O /usr/local/bin/yq && \ + chmod a+x /usr/local/bin/yq + +COPY --from=build_causal_conv1d /opt/causal_conv1d-1.2.2.post1-cp310-cp310-linux_x86_64.whl ./ +COPY --from=build_grouped_gemm /opt/grouped_gemm-1.1.2-cp310-cp310-linux_x86_64.whl ./ +COPY --from=build_mamba_ssm /opt/mamba_ssm-2.0.3-cp310-cp310-linux_x86_64.whl ./ + +RUN pip3 install --no-cache-dir --upgrade-strategy only-if-needed -v \ +einops \ +flask-restful \ +nltk \ +pytest \ +pytest-cov \ +pytest_mock \ +pytest-random-order \ +sentencepiece \ +tiktoken \ +wrapt \ +zarr \ +wandb \ +triton==2.1.0 \ +causal_conv1d-1.2.2.post1-cp310-cp310-linux_x86_64.whl \ +mamba_ssm-2.0.3-cp310-cp310-linux_x86_64.whl \ +grouped_gemm-1.1.2-cp310-cp310-linux_x86_64.whl \ +tensorstore==0.1.45 && \ +rm *.whl + +# Since megatron does not have any dependencies (and isn't a dependency to any other package), we can install it separately to make everything a bit quicker +ARG MCORE_REPO +ARG MCORE_REF +ARG MCORE_BACKWARDS_REF +RUN <<"EOF" bash -exu +# Checkout latest +cd /opt +rm -rf /opt/megatron-lm; mkdir megatron-lm; cd megatron-lm +git init +git remote add origin ${MCORE_REPO} +git fetch origin ${MCORE_REF}:MCORE_LATEST +git checkout MCORE_LATEST + +# Checkout backwards-ref +cd /opt +rm -rf /opt/megatron-lm-$MCORE_BACKWARDS_REF; mkdir megatron-lm-$MCORE_BACKWARDS_REF; cd megatron-lm-$MCORE_BACKWARDS_REF +git init +git remote add origin ${MCORE_REPO} +git fetch origin ${MCORE_BACKWARDS_REF}:MCORE_BACKWARDS_REF +git checkout MCORE_BACKWARDS_REF +rm -rf megatron; cp -a /opt/megatron-lm/megatron ./ +EOF + +RUN pip install /opt/megatron-lm +ENV PYTHONPATH="/opt/megatron-lm:$PYTHONPATH" + +##### For NVIDIANS only ##### +FROM main as jet +ARG CACHEBUST=0 +RUN --mount=type=secret,id=JET_INDEX_URLS \ + JET_INDEX_URLS=$(cat /run/secrets/JET_INDEX_URLS) && \ + pip install jet-client --upgrade $JET_INDEX_URLS && \ + /opt/jet/bin/pip install jet-api --upgrade $JET_INDEX_URLS +ENV PATH="$PATH:/opt/jet/bin" +### \ No newline at end of file diff --git a/Dockerfile.linting b/Dockerfile.linting new file mode 100644 index 0000000000..b0670af9d1 --- /dev/null +++ b/Dockerfile.linting @@ -0,0 +1,21 @@ +# syntax=docker/dockerfile:experimental + +ARG FROM_IMAGE_NAME +FROM $FROM_IMAGE_NAME as main +ENV DEBIAN_FRONTEND=noninteractive + +RUN sed -i -e 's/^APT/# APT/' -e 's/^DPkg/# DPkg/' \ + /etc/apt/apt.conf.d/docker-clean + + +RUN pip3 install --no-cache-dir \ + black==24.4.2 \ + isort==5.13.2 \ + flake8==7.1.0 \ + pylint==3.2.6 + +COPY . 
/opt/megatron-lm + +WORKDIR /opt/megatron-lm + +FROM main as jet \ No newline at end of file diff --git a/LICENSE b/LICENSE index 281fde95a6..b4193aff50 100644 --- a/LICENSE +++ b/LICENSE @@ -29,13 +29,15 @@ The following applies to all files unless otherwise noted: -- This repository also contains code from Hugging Face Inc., Google Research, -Facebook (from their Fairseq and Dino projects), Microsoft(from their -Swin-Transformer project)and Philip Popien. Files from these -organizations have notices at the top of each file. Below are -licenses used in those files, as indicated. +Facebook (from their Fairseq, Dino, and ParlAI projects), Microsoft (from their +Swin-Transformer project), Philip Popien, the Mamba project (Tri Dao and +Albert Gu), and the Triton language and compiler project (Philippe Tillet and +OpenAI). Files from these organizations have notices at the top of each file. +Below are licenses used in those files, as indicated. -------------- LICENSE FOR Facebook, huggingface and Google Research code -------------- +-------------------------------------------------------------------------------- +-- LICENSE FOR Facebook, huggingface, Google Research, LLaVA, and Mamba code -- Apache License @@ -240,12 +242,16 @@ licenses used in those files, as indicated. See the License for the specific language governing permissions and limitations under the License. -------------- LICENSE FOR Facebook Fairseq code -------------- +-------------------------------------------------------------------------------- +LICENSE FOR +Facebook, Inc. and its affiliates, +Meta Platforms, Inc. and its affiliates, +Microsoft Corporation, +OpenGVLab/InternVL, and +Triton language and compiler. MIT License -Copyright (c) Facebook, Inc. and its affiliates. - Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights @@ -264,113 +270,3 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -------------- LICENSE FOR Mircrosoft Swin transformer code -------------- - -MIT License - -Copyright (c) Microsoft Corporation. - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE - - ---------------- NVIDIA Source Code License for SegFormer ----------------- -1. Definitions - -“Licensor” means any person or entity that distributes its Work. 
- -“Software” means the original work of authorship made available under this -License. - -“Work” means the Software and any additions to or derivative works of the -Software that are made available under this License. - -The terms “reproduce,” “reproduction,” “derivative works,” and -“distribution” have the meaning as provided under U.S. copyright law; -provided, however, that for the purposes of this License, derivative works -shall not include works that remain separable from, or merely link -(or bind by name) to the interfaces of, the Work. - -Works, including the Software, are “made available” under this License by -including in or with the Work either (a) a copyright notice referencing -the applicability of this License to the Work, or (b) a copy of this License. - -2. License Grant - -2.1 Copyright Grant. Subject to the terms and conditions of this License, -each Licensor grants to you a perpetual, worldwide, non-exclusive, -royalty-free, copyright license to reproduce, prepare derivative works of, -publicly display, publicly perform, sublicense and distribute its Work -and any resulting derivative works in any form. - -3. Limitations - -3.1 Redistribution. You may reproduce or distribute the Work only if -(a) you do so under this License, (b) you include a complete copy of this -License with your distribution, and (c) you retain without modification any -copyright, patent, trademark, or attribution notices that are present -in the Work. - -3.2 Derivative Works. You may specify that additional or different terms -apply to the use, reproduction, and distribution of your derivative works -of the Work (“Your Terms”) only if (a) Your Terms provide that the use -limitation in Section 3.3 applies to your derivative works, and (b) you -identify the specific derivative works that are subject to Your Terms. -Notwithstanding Your Terms, this License (including the redistribution -requirements in Section 3.1) will continue to apply to the Work itself. - -3.3 Use Limitation. The Work and any derivative works thereof only may -be used or intended for use non-commercially. Notwithstanding the -foregoing, NVIDIA and its affiliates may use the Work and any derivative -works commercially. As used herein, “non-commercially” means for research -or evaluation purposes only. - -3.4 Patent Claims. If you bring or threaten to bring a patent claim against -any Licensor (including any claim, cross-claim or counterclaim in a lawsuit) -to enforce any patents that you allege are infringed by any Work, then -your rights under this License from such Licensor (including the grant -in Section 2.1) will terminate immediately. - -3.5 Trademarks. This License does not grant any rights to use any Licensor’s -or its affiliates’ names, logos, or trademarks, except as necessary to -reproduce the notices described in this License. - -3.6 Termination. If you violate any term of this License, then your rights -under this License (including the grant in Section 2.1) will terminate -immediately. - -4. Disclaimer of Warranty. - -THE WORK IS PROVIDED “AS IS” WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, -EITHER EXPRESS OR IMPLIED, INCLUDING WARRANTIES OR CONDITIONS OF -MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE OR NON-INFRINGEMENT. -YOU BEAR THE RISK OF UNDERTAKING ANY ACTIVITIES UNDER THIS LICENSE. - -5. Limitation of Liability. 
- -EXCEPT AS PROHIBITED BY APPLICABLE LAW, IN NO EVENT AND UNDER NO LEGAL -THEORY, WHETHER IN TORT (INCLUDING NEGLIGENCE), CONTRACT, OR OTHERWISE -SHALL ANY LICENSOR BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY DIRECT, -INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT -OF OR RELATED TO THIS LICENSE, THE USE OR INABILITY TO USE THE WORK -(INCLUDING BUT NOT LIMITED TO LOSS OF GOODWILL, BUSINESS INTERRUPTION, -LOST PROFITS OR DATA, COMPUTER FAILURE OR MALFUNCTION, OR ANY OTHER -COMMERCIAL DAMAGES OR LOSSES), EVEN IF THE LICENSOR HAS BEEN -ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. - - diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000000..dbb29b0a1c --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1,2 @@ +include megatron/core/requirements.txt +include megatron/core/README.md \ No newline at end of file diff --git a/README.md b/README.md index 6bb334e8e1..138944b5cd 100644 --- a/README.md +++ b/README.md @@ -1,36 +1,28 @@ -Megatron ([1](https://arxiv.org/pdf/1909.08053.pdf), [2](https://arxiv.org/pdf/2104.04473.pdf), and [3](https://arxiv.org/pdf/2205.05198)) is a large, powerful transformer developed by the Applied Deep Learning Research team at NVIDIA. This repository is for ongoing research on training large transformer language models at scale. We developed efficient, model-parallel ([tensor](https://arxiv.org/pdf/1909.08053.pdf), [sequence](https://arxiv.org/pdf/2205.05198), and [pipeline](https://arxiv.org/pdf/2104.04473.pdf)), and multi-node pre-training of transformer based models such as [GPT](https://arxiv.org/abs/2005.14165), [BERT](https://arxiv.org/pdf/1810.04805.pdf), and [T5](https://arxiv.org/abs/1910.10683) using mixed precision. +
-Below are some of the projects where we have directly used Megatron: -* [BERT and GPT Studies Using Megatron](https://arxiv.org/pdf/1909.08053.pdf) -* [BioMegatron: Larger Biomedical Domain Language Model](https://www.aclweb.org/anthology/2020.emnlp-main.379.pdf) -* [End-to-End Training of Neural Retrievers for Open-Domain Question Answering](https://arxiv.org/abs/2101.00408) -* [Large Scale Multi-Actor Generative Dialog Modeling](https://www.aclweb.org/anthology/2020.acl-main.8.pdf) -* [Local Knowledge Powered Conversational Agents](https://arxiv.org/abs/2010.10150) -* [MEGATRON-CNTRL: Controllable Story Generation with External Knowledge Using Large-Scale Language Models](https://www.aclweb.org/anthology/2020.emnlp-main.226.pdf) -* [RACE Reading Comprehension Dataset Leaderboard](http://www.qizhexie.com/data/RACE_leaderboard.html) -* [Training Question Answering Models From Synthetic Data](https://www.aclweb.org/anthology/2020.emnlp-main.468.pdf) -* [Few-shot Instruction Prompts for Pretrained Language Models to Detect Social Biases](https://arxiv.org/abs/2112.07868) -* [Exploring the Limits of Domain-Adaptive Training for Detoxifying Large-Scale Language Models](https://arxiv.org/abs/2202.04173) -* [Using DeepSpeed and Megatron to Train Megatron-Turing NLG 530B, A Large-Scale Generative Language Model](https://arxiv.org/abs/2201.11990) -* [Multi-Stage Prompting for Knowledgeable Dialogue Generation](https://arxiv.org/abs/2203.08745) +Megatron-LM & Megatron-Core +=========================== +

GPU-optimized techniques for training transformer models at scale

+ +[![Documentation](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](https://docs.nvidia.com/megatron-core/developer-guide/latest/index.html) +[![version](https://img.shields.io/badge/release-0.5.0-green)](./setup.py) +[![license](https://img.shields.io/badge/license-OpenBSD-blue)](./LICENSE) -Megatron is also used in [NeMo Megatron](https://developer.nvidia.com/nvidia-nemo#nemo-megatron), a framework to help enterprises overcome the challenges of building and training sophisticated natural language processing models with billions and trillions of parameters. +
-Our codebase is capable of efficiently training very large (hundreds of billions of parameters) language models with both model and data parallelism. To demonstrate how the code scales with multiple GPUs and model sizes, we consider GPT models from 1 billion all the way to 1 trillion parameters. All models use a vocabulary size of 51,200 and a sequence length of 2048. We vary hidden size, number of attention heads, and number of layers to arrive at a specifc model size. As the model size increases, we also modestly increase the batch size. We leverage [NVIDIA's Selene supercomputer](https://www.top500.org/system/179842/) to perform scaling studies and use up to 3072 [A100](https://www.nvidia.com/en-us/data-center/a100/) GPUs for the largest model. Each cluster node has 8 NVIDIA 80GB A100 GPUs. The graph below shows that we scale nearly linear up to 1 trillion parameter models running on 3072 GPUs. Note that these results are from benchmark runs and these models were not trained to convergence; however, the FLOPs are measured for end-to-end training, i.e., includes all operations including data loading, optimization, and even logging. +# Latest News -![Scaling Graph](images/Achieved_petaFLOPs.png) +- **[2024/7]** Megatron-Core v0.7 improves scalability and training resiliency and adds support for multimodal training ([blog](https://developer.nvidia.com/blog/train-generative-ai-models-more-efficiently-with-new-nvidia-megatron-core-functionalities/)). +- **[2024/6]** Megatron-Core added supports for Mamba-based models. Check out our paper [An Empirical Study of Mamba-based Language Models](https://arxiv.org/pdf/2406.07887) and [code example](https://github.com/NVIDIA/Megatron-LM/tree/ssm/examples/mamba). +- **[2024/1 Announcement]** NVIDIA has released the core capabilities in **Megatron-LM** into [**Megatron-Core**](https://github.com/NVIDIA/Megatron-LM/tree/main/megatron/core) in this repository. Megatron-Core expands upon Megatron-LM's GPU-optimized techniques with more cutting-edge innovations on system-level optimizations, featuring composable and modular APIs. Explore the [Megatron-Core intro](#megatron-core) for more details. -The following table shows both model (MFU) and hardware (HFU) FLOPs utilization for select configurations up to 1T parameters (see [our paper](https://arxiv.org/pdf/2205.05198) for a description of how these are calculated). As the model size increases, we achieve better GPU utilization and for the one trillion parameter model, we reach a MFU and HFU of 56.3% and 57.0%, respectively. Note that these numbers are also measured on benchmark runs and in this case are measured using a data parallel size of one. Data parallelism introduces some overhead due to the gradient all-reduce required between the data parallel groups. However, for large transformer models, this overhead is not large and can almost entirely eliminted by overlapping the gradient all-reduce with backpropagation. 
-| Model Size | Model FLOPs Utilization | Hardware FLOPs Utilization | -| :---: | :---: | :---: | -| 22B | 41.5% | 43.7% | -| 175B | 51.4% | 52.8% | -| 530B | 56.0% | 57.0% | -| 1T | 56.3% | 57.0% | -# Contents - * [Contents](#contents) +# Table of Contents + * [Megatron Overview](#megatron-overview) + * [Megatron-LM](#megatron-lm) + * [Megatron-Core](#megatron-core) + * [Training Speed and Scalability](#training-speed-and-scalability) * [Setup](#setup) * [Downloading Checkpoints](#downloading-checkpoints) * [Usage](#usage) @@ -44,7 +36,7 @@ The following table shows both model (MFU) and hardware (HFU) FLOPs utilization * [Distributed Optimizer](#distributed-optimizer) * [FlashAttention](#flashattention) * [GPT-3 Example](#gpt-3-example) - * [Retro](#retro) + * [Retro and InstructRetro](#retro-and-instructretro) * [Evaluation and Tasks](#evaluation-and-tasks) * [GPT Text Generation](#gpt-text-generation) * [GPT Evaluation](#gpt-evaluation) @@ -53,9 +45,40 @@ The following table shows both model (MFU) and hardware (HFU) FLOPs utilization * [BERT Task Evaluation](#bert-task-evaluation) * [RACE Evaluation](#race-evaluation) * [MNLI Evaluation](#mnli-evaluation) + * [Llama-2 Inference and Finetuning](#llama-2-inference-and-finetuning) * [Datasets](#datasets) * [Collecting Wikipedia Training Data](#collecting-wikipedia-training-data) * [Collecting GPT Webtext Data](#collecting-gpt-webtext-data) + * [Reproducibility](#reproducibility) + * [Projects using Megatron](#projects-using-megatron) + +# Megatron Overview +This repository comprises two essential components: **Megatron-LM** and **Megatron-Core**. Megatron-LM serves as a ressearch-oriented framework leveraging Megatron-Core for large language model (LLM) training. Megatron-Core, on the other hand, is a library of GPU optimized training techniques that comes with formal product support including versioned APIs and regular releases. You can use Megatron-Core alongside Megatron-LM or [Nvidia NeMo Framework](https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/nlp/nemo_megatron/mcore_customization.html) for an end-to-end and cloud-native solution. Alternatively, you can integrate Megatron-Core's building blocks into your preferred training framework. + +## Megatron-LM +First introduced in 2019, Megatron ([1](https://arxiv.org/pdf/1909.08053.pdf), [2](https://arxiv.org/pdf/2104.04473.pdf), and [3](https://arxiv.org/pdf/2205.05198)) sparked a wave of innovation in the AI community, enabling researchers and developers to utilize the underpinnings of this library to further LLM advancements. Today, many of the most popular LLM developer frameworks have been inspired by and built directly leveraging the open-source Megatron-LM library, spurring a wave of foundation models and AI startups. Some of the most popular LLM frameworks built on top of Megatron-LM include [Colossal-AI](https://github.com/hpcaitech/ColossalAI), [HuggingFace Accelerate](https://github.com/huggingface/accelerate), and [NVIDIA NeMo Framework](https://www.nvidia.com/en-us/ai-data-science/generative-ai/nemo-framework/). A list of projects that have directly used Megatron can be found [here](#projects-using-megatron). + +## Megatron-Core +Megatron-Core is an open-source PyTorch-based library that contains GPU-optimized techniques and cutting-edge system-level optimizations. 
It abstracts them into composable and modular APIs, allowing full flexibility for developers and model researchers to train custom transformers at-scale on NVIDIA accelerated computing infrastructure. This library is compatible with all NVIDIA Tensor Core GPUs, including FP8 acceleration support for [NVIDIA Hopper architectures](https://www.nvidia.com/en-us/data-center/technologies/hopper-architecture/). + +Megatron-Core offers core building blocks such as attention mechanisms, transformer blocks and layers, normalization layers, and embedding techniques. Additional functionality like activation recomputation, distributed checkpointing is also natively built-in to the library. The building blocks and functionality are all GPU optimized, and can be built with advanced parallelization strategies for optimal training speed and stability on NVIDIA Accelerated Computing Infrastructure. Another key component of the Megatron-Core library includes advanced model parallelism techniques (tensor, sequence, pipeline, context, and MoE expert parallelism). + +Megatron-Core can be used with [NVIDIA NeMo](https://www.nvidia.com/en-us/ai-data-science/products/nemo/), an enterprise-grade AI platform. Alternatively, you can explore Megatron-Core with the native PyTorch training loop [here](https://github.com/NVIDIA/Megatron-LM/tree/main/examples). Visit [Megatron-Core documentation](https://docs.nvidia.com/megatron-core/developer-guide/latest/index.html) to learn more. + + +# Training Speed and Scalability +Our codebase is capable of efficiently training large language models (i.e., models with hundreds of billions of parameters) with both model and data parallelism. To demonstrate how our software scales with multiple GPUs and model sizes, we consider GPT models ranging from 2 billion parameters to 462 billion parameters. All models use a vocabulary size of 131,072 and a sequence length of 4096. We vary hidden size, number of attention heads, and number of layers to arrive at a specific model size. As the model size increases, we also modestly increase batch size. Our experiments use up to 6144 [H100](https://www.nvidia.com/en-us/data-center/h100/) GPUs. We perform fine-grained overlapping of data-parallel (`--overlap-grad-reduce --overlap-param-gather`), tensor-parallel (`--tp-comm-overlap`) and pipeline-parallel communication (enabled by default) with computation to improve scalability. The reported throughputs are measured for end-to-end training and include all operations including data loading, optimizer steps, communication, and even logging. Note that we did not train these models to convergence. + +![Model table](images/model_table.png) + +Our weak scaled results show superlinear scaling (MFU increases from 41% for the smallest model considered to 47-48% for the largest models); this is because larger GEMMs have higher arithmetic intensity and are consequently more efficient to execute. + +![Weak scaling](images/weak_scaling.png) + +We also strong scaled the standard GPT-3 model (our version has slightly more than 175 billion parameters due to larger vocabulary size) from 96 H100 GPUs to 4608 GPUs, using the same batch size of 1152 sequences throughout. Communication becomes more exposed at larger scale, leading to a reduction in MFU from 47% to 42%. + +![Strong scaling](images/strong_scaling.png) + # Setup We strongly recommend using the latest release of [NGC's PyTorch container](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch) with DGX nodes. 
If you can't use this for some reason, use the latest pytorch, cuda, nccl, and NVIDIA [APEX](https://github.com/NVIDIA/apex#quick-start) releases. Data preprocessing requires [NLTK](https://www.nltk.org/install.html), though this is not required for training, evaluation, or downstream tasks. @@ -67,7 +90,7 @@ docker run --gpus all -it --rm -v /path/to/megatron:/workspace/megatron -v /path ``` ## Downloading Checkpoints -We have provided pretrained [BERT-345M](https://ngc.nvidia.com/catalog/models/nvidia:megatron_bert_345m) and [GPT-345M](https://ngc.nvidia.com/catalog/models/nvidia:megatron_lm_345m) checkpoints for use to evaluate or finetuning downstream tasks. To access these checkpoints, first [sign up](https://ngc.nvidia.com/signup) for and [setup](https://ngc.nvidia.com/setup/installers/cli) the NVIDIA GPU Cloud (NGC) Registry CLI. Further documentation for downloading models can be found in the [NGC documentation](https://docs.nvidia.com/dgx/ngc-registry-cli-user-guide/index.html#topic_6_4_1). +We have provided pretrained [BERT-345M](https://ngc.nvidia.com/catalog/models/nvidia:megatron_bert_345m) and [GPT-345M](https://ngc.nvidia.com/catalog/models/nvidia:megatron_lm_345m) checkpoints to evaluate or for finetuning downstream tasks. To access these checkpoints, first [sign up](https://ngc.nvidia.com/signup) for and [setup](https://ngc.nvidia.com/setup/installers/cli) the NVIDIA GPU Cloud (NGC) Registry CLI. Further documentation for downloading models can be found in the [NGC documentation](https://docs.nvidia.com/dgx/ngc-registry-cli-user-guide/index.html#topic_6_4_1). Alternatively, you can directly download the checkpoints using: @@ -89,7 +112,7 @@ After installation, there are several possible workflows. The most comprehensive However, steps 1 and 2 can be replaced by using one of the pretrained models mentioned above. -We've provided several scripts for pretraining both BERT and GPT in [`examples`](./examples) directory, as well as scripts for both zero-shot and fine-tuned downstream tasks including MNLI, RACE, WikiText103, and LAMBADA evaluation. There is also a script for GPT interactive text generation. +We've provided several scripts for pretraining both BERT and GPT in the [`examples`](./examples) directory, as well as scripts for both zero-shot and fine-tuned downstream tasks including MNLI, RACE, WikiText103, and LAMBADA evaluation. There is also a script for GPT interactive text generation. # Training ## Data Preprocessing @@ -101,13 +124,12 @@ The training data requires preprocessing. First, place your training data in a l The name of the `text` field of the json can be changed by using the `--json-key` flag in [`preprocess_data.py`](./tools/preprocess_data.py) The other metadata are optional and are not used in training. -The loose json is then processed into a binary format for training. To convert the json into mmap, cached index file, or the lazy loader format use `preprocess_data.py`. Set the `--dataset-impl` flag to `mmap`, `cached`, or `lazy`, respectively (default is `mmap`). An example script to prepare data for BERT training is: +The loose json is then processed into a binary format for training. To convert the json into mmap format use `preprocess_data.py`. An example script to prepare data for BERT training is:
 python tools/preprocess_data.py \
        --input my-corpus.json \
        --output-prefix my-bert \
-       --vocab bert-vocab.txt \
-       --dataset-impl mmap \
+       --vocab-file bert-vocab.txt \
        --tokenizer-type BertWordPieceLowerCase \
        --split-sentences
 
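For reference, the loose JSON ("jsonl") input used by `preprocess_data.py` is simply one JSON object per line, with the sample text stored under the key selected by `--json-key` (which defaults to `text`). The sketch below only illustrates that format; the file name matches the `my-corpus.json` used in the command above, and the sample sentences are placeholders.

```python
import json

# Each line of the corpus file is a standalone JSON object.
# The "text" key matches the default value of --json-key.
samples = [
    {"text": "The quick brown fox jumps over the lazy dog."},
    {"text": "Megatron preprocessing expects one JSON document per line."},
]

with open("my-corpus.json", "w") as f:
    for sample in samples:
        f.write(json.dumps(sample) + "\n")
```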
@@ -124,8 +146,7 @@ Some minor modifications are required for GPT data preprocessing, namely, the ad python tools/preprocess_data.py \ --input my-corpus.json \ --output-prefix my-gpt2 \ - --vocab gpt2-vocab.json \ - --dataset-impl mmap \ + --vocab-file gpt2-vocab.json \ --tokenizer-type GPT2BPETokenizer \ --merge-file gpt2-merges.txt \ --append-eod @@ -138,27 +159,28 @@ Further command line arguments are described in the source file [`preprocess_dat ## BERT Pretraining -The [`examples/pretrain_bert.sh`](./examples/pretrain_bert.sh) script runs single GPU 345M parameter BERT pretraining. Debugging is the primary use for single GPU training, as the code base and command line arguments are optimized for highly distributed training. Most of the arguments are fairly self-explanatory. By default, the learning rate decays linearly over the training iterations starting at `--lr` to a minimum set by `--min-lr` over `--lr-decay-iters` iterations. The fraction of training iterations used for warmup is set by `--lr-warmup-fraction`. While this is single GPU training, the batch size specified by `--micro-batch-size` is a single forward-backward path batch-size and the code will perform gradient accumulation steps until it reaches `global-batch-size` which is the batch size per iteration. The data is partitioned into a 949:50:1 ratio for training/validation/test sets (default is 969:30:1). This partitioning happens on the fly, but is consistent across runs with the same random seed (1234 by default, or specified manually with `--seed`). We use `train-iters` as the training iterations requested. Alternatively, one can provide `--train-samples` which is total number of samples to train on. If this option is present, then instead of providing `--lr-decay-iters`, one will need to provide `--lr-decay-samples`. +The [`examples/bert/train_bert_340m_distributed.sh`](examples/bert/train_bert_340m_distributed.sh) script runs single GPU 345M parameter BERT pretraining. Debugging is the primary use for single GPU training, as the code base and command line arguments are optimized for highly distributed training. Most of the arguments are fairly self-explanatory. By default, the learning rate decays linearly over the training iterations starting at `--lr` to a minimum set by `--min-lr` over `--lr-decay-iters` iterations. The fraction of training iterations used for warmup is set by `--lr-warmup-fraction`. While this is single GPU training, the batch size specified by `--micro-batch-size` is a single forward-backward path batch-size and the code will perform gradient accumulation steps until it reaches `global-batch-size` which is the batch size per iteration. The data is partitioned into a 949:50:1 ratio for training/validation/test sets (default is 969:30:1). This partitioning happens on the fly, but is consistent across runs with the same random seed (1234 by default, or specified manually with `--seed`). We use `train-iters` as the training iterations requested. Alternatively, one can provide `--train-samples` which is total number of samples to train on. If this option is present, then instead of providing `--lr-decay-iters`, one will need to provide `--lr-decay-samples`. -The logging, checkpoint-saving, and evaluation intervals are specified. Checkpointing the activations facilitates the training of larger models and/or batches. Note that the `--data-path` now includes the additional `_text_sentence` suffix added in preprocessing, but does not include the file extensions. 
+The logging, checkpoint-saving, and evaluation interval options are specified. Note that the `--data-path` now includes the additional `_text_sentence` suffix added in preprocessing, but does not include the file extensions. -Further command line arguments are described in the source file [`arguments.py`](./megatron/arguments.py). +Further command line arguments are described in the source file [`arguments.py`](./megatron/training/arguments.py). -To run `examples/pretrain_bert.sh`, make any desired modifications including setting the environment variables for `CHECKPOINT_PATH`, `VOCAB_FILE`, and `DATA_PATH`. Make sure to set these variables to their paths in the container. Then launch the container with Megatron and necessary paths mounted (as explained in [Setup](#setup)) and run the example script. +To run `train_bert_340m_distributed.sh`, make any desired modifications including setting the environment variables for `CHECKPOINT_PATH`, `VOCAB_FILE`, and `DATA_PATH`. Make sure to set these variables to their paths in the container. Then launch the container with Megatron and necessary paths mounted (as explained in [Setup](#setup)) and run the example script. ## GPT Pretraining -The `examples/pretrain_gpt.sh` script runs single GPU 345M parameter GPT pretraining. As mentioned above, single GPU training is primarily intended for debugging purposes, as the code is optimized for distributed training. +The `examples/gpt3/train_gpt3_175b_distributed.sh` script runs single GPU 345M parameter GPT pretraining. As mentioned above, single GPU training is primarily intended for debugging purposes, as the code is optimized for distributed training. It follows largely the same format as the previous BERT script with a few notable differences: the tokenization scheme used is BPE (which requires a merge table and a `json` vocabulary file) instead of WordPiece, the model architecture allows for longer sequences (note that the max position embedding must be greater than or equal to the maximum sequence length), and the `--lr-decay-style` has been set to cosine decay. Note that the `--data-path` now includes the additional `_text_document` suffix added in preprocessing, but does not include the file extensions. -Further command line arguments are described in the source file [`arguments.py`](./megatron/arguments.py). +Further command line arguments are described in the source file [`arguments.py`](./megatron/training/arguments.py). -`examples/pretrain_gpt.sh` can be launched the same way as described for BERT. Set the env vars and make any other modifications, launch the container with appropriate mounts, and run the script. +`train_gpt3_175b_distributed.sh` can be launched the same way as described for BERT. Set the env vars and make any other modifications, launch the container with appropriate mounts, and run the script. +More details in [`examples/gpt3/README.md`](./examples/gpt3/README.md) ## T5 Pretraining -Very similar to BERT and GPT, the `examples/pretrain_t5.sh` script runs single GPU "base" (~220M parameter) T5 pretraining. The primary difference from BERT and GPT is the addition of the following arguments to accommodate the T5 architecture: +Very similar to BERT and GPT, the `examples/t5/train_t5_220m_distributed.sh` script runs single GPU "base" (~220M parameter) T5 pretraining. 
The primary difference from BERT and GPT is the addition of the following arguments to accommodate the T5 architecture: * `--kv-channels` sets the inner dimension of the "key" and "value" matrices of all attention mechanisms in the model. For BERT and GPT this defaults to the hidden size divided by the number of attention heads, but can be configured for T5. @@ -168,19 +190,19 @@ Very similar to BERT and GPT, the `examples/pretrain_t5.sh` script runs single G All of the other arguments remain as they were for BERT and GPT pretraining. Run this example with the same steps described above for the other scripts. +More details in [`examples/t5/README.md`](./examples/t5/README.md) + ## Distributed Pretraining -The `examples/pretrain_{bert,gpt,t5}_distributed.sh` scripts use the PyTorch distributed launcher for distributed training. As such, multi-node training can be achieved by properly setting environment variables. See the official PyTorch [documentation](https://pytorch.org/docs/stable/elastic/run.html#launcher-api) for further description of these [environment variables](https://pytorch.org/docs/stable/distributed.html#environment-variable-initialization). By default, multi-node training uses the [nccl](https://developer.nvidia.com/nccl) distributed backend. A simple set of additional arguments and the use of the PyTorch distributed module with the `torchrun` elastic launcher (equivalent to `python -m torch.distributed.run`) are the only additional requirements to adopt distributed training. See any of `examples/pretrain_{bert,gpt,t5}_distributed.sh` for more details. +The `pretrain_{bert,gpt,t5}_distributed.sh` scripts use the PyTorch distributed launcher for distributed training. As such, multi-node training can be achieved by properly setting environment variables. See the official PyTorch [documentation](https://pytorch.org/docs/stable/elastic/run.html#launcher-api) for further description of these [environment variables](https://pytorch.org/docs/stable/distributed.html#environment-variable-initialization). By default, multi-node training uses the [nccl](https://developer.nvidia.com/nccl) distributed backend. A simple set of additional arguments and the use of the PyTorch distributed module with the `torchrun` elastic launcher (equivalent to `python -m torch.distributed.run`) are the only additional requirements to adopt distributed training. See any of `pretrain_{bert,gpt,t5}_distributed.sh` for more details. -We use two types of parallelism: data and model parallelism. We facilitate two distributed data parallel implementations: a simple one of our own that performs gradient all-reduce at the end of back propagation step, and Torch's distributed data parallel wrapper that overlaps gradient reduction with back propagation computation. To switch between these two options use `--DDP-impl local` or `--DDP-impl torch`, respectively. As expected, Torch distributed data parallelism is more efficient at larger model sizes. For example, for the 8.3 billion parameters model running on 512 GPUs, the scaling increases from 60% to 76% when Torch's distributed data parallel is used. However, the overlapping method requires more memory and for some configurations (e.g., 2.5 billion parameters using 2-way model parallel and 1.2 billion parameters with no model parallel) can make the overall training slower as a result. We empirically found that using a smaller model in those cases improves the training time. +We use two types of parallelism: data and model parallelism. 
Our data parallelism implementation is in `megatron/core/distributed`, and supports overlapping of the gradient reduction with the backward pass when the `--overlap-grad-reduce` command-line option is used. -Second, we developed a simple and efficient two-dimensional model-parallel approach. To use tensor model parallelism (splitting execution of a single transformer module over multiple GPUs, see Section 3 of [our paper](https://arxiv.org/pdf/1909.08053.pdf)), add the `--tensor-model-parallel-size` flag to specify the number of GPUs among which to split the model, along with the arguments passed to the distributed launcher as mentioned above. To use sequence parallelism specify `--sequence-parallel`, which requires tensor model parallel as it split among the same GPUs (more details in Section 4.2.2 of [our paper](https://arxiv.org/pdf/2205.05198.pdf)). +Second, we developed a simple and efficient two-dimensional model-parallel approach. To use the first dimension, tensor model parallelism (splitting execution of a single transformer module over multiple GPUs, see Section 3 of [our paper](https://arxiv.org/pdf/1909.08053.pdf)), add the `--tensor-model-parallel-size` flag to specify the number of GPUs among which to split the model, along with the arguments passed to the distributed launcher as mentioned above. To use the second dimension, sequence parallelism, specify `--sequence-parallel`, which also requires tensor model parallelism to be enabled because it splits across the same GPUs (more details in Section 4.2.2 of [our paper](https://arxiv.org/pdf/2205.05198.pdf)). To use pipeline model parallelism (sharding the transformer modules into stages with an equal number of transformer modules on each stage, and then pipelining execution by breaking the batch into smaller microbatches, see Section 2.2 of [our paper](https://arxiv.org/pdf/2104.04473.pdf)), use the `--pipeline-model-parallel-size` flag to specify the number of stages to split the model into (e.g., splitting a model with 24 transformer layers across 4 stages would mean each stage gets 6 transformer layers each). - - -We have examples of how to use these two different forms of model parallelism the example scripts ending in `distributed_with_mp.sh`: +We have examples of how to use these two different forms of model parallelism the example scripts ending in `distributed_with_mp.sh`. Other than these minor changes, the distributed training is identical to the training on a single GPU. @@ -188,13 +210,15 @@ The interleaved pipelining schedule (more details in Section 2.2.2 of [our paper ## Activation Checkpointing and Recomputation -To reduce GPU memory usage so deploy a large model to a training system, we support activation checkpointing and recomputation. We support two levels of recompute granularity: `selective` and `full`. Selective recomputation is the default and recommended in almost all cases. It saves the activations that take less space and are expensive to recompute and recomputes activations that take a lot of space but are relatively cheap to recompute (see [our paper](https://arxiv.org/pdf/2205.05198) for details). To enable selective activation recompute simply use `--recompute-activations`. +To reduce GPU memory usage when training a large model, we support various forms of activation checkpointing and recomputation. 
Instead of all activations being stored in memory to be used during backprop, as was traditionally the case in deep learning models, only activations at certain "checkpoints" in the model are retained (or stored) in memory, and the other activations are recomputed on-the-fly when needed for backprop. Note that this kind of checkpointing, *activation* checkpointing, is very different from the checkpointing of model parameters and optimizer state, which is mentioned elsewhere. + +We support two levels of recompute granularity: `selective` and `full`. Selective recomputation is the default and is recommended in almost all cases. This mode retains in memory the activations that take less memory storage space and are more expensive to recompute and recomputes the activations that take more memory storage space but are relatively inexpensive to recompute. See [our paper](https://arxiv.org/pdf/2205.05198) for details. You should find that this mode maximizes performance while minimizing the memory required to store activations. To enable selective activation recompute simply use `--recompute-activations`. -For cases where memory is very tight, `full` checkpointing saves just the inputs to a transformer layer, or a block of transformer layers, and recomputes everything else. To turn on full activation recompute use `--recompute-granularity full`. When using full activation recomputation, there are two methods: `uniform` and `block`, chosen using the `--recompute-method` argument. +For cases where memory is very limited, `full` recompute saves just the inputs to a transformer layer, or a group, or block, of transformer layers, and recomputes everything else. To enable full activation recompute use `--recompute-granularity full`. When using `full` activation recompute, there are two methods: `uniform` and `block`, chosen using the `--recompute-method` argument. -* Uniform method uniformly divides the Transformer layers into groups of layers and stores the input activations of each group in the memory. The baseline group size is 1 and, in this case, the input activation of each Transformer layer is checkpointed. When the GPU memory is insufficient, increasing the number of layers per group reduces the memory usage thus enables running a bigger model. For example, when using the number of layers per group of 4, the input activation of each group of 4 Transformer layers is checkpointed. +* The `uniform` method uniformly divides the transformer layers into groups of layers (each group of size `--recompute-num-layers`) and stores the input activations of each group in memory. The baseline group size is 1 and, in this case, the input activation of each transformer layer is stored. When the GPU memory is insufficient, increasing the number of layers per group reduces the memory usage, enabling a bigger model to be trained. For example, when `--recompute-num-layers` is set to 4, only the input activation of each group of 4 transformer layers is stored. -* Block method checkpoints the input activations of a set number of individual Transformer layers per pipeline stage and do the rest of layers without any checkpointing. This method can be used to skip checkpointing some Transformer layers until the GPU memory is fully used, which is applicable only when there is unused GPU memory. Checkpointing fewer transformer layers avoids unnecessary activation recomputation in the backprop thus improves training performance. 
For example, when we specify 5 layers to checkpoint of 8 layers per pipeline stage, the input activations of only the first 5 Transformer layers are checkpointed and activation recomputation for the rest 3 layers is not needed in the backprop. +* The `block` method recomputes the input activations of a specific number (given by `--recompute-num-layers`) of individual transformer layers per pipeline stage and stores the input activations of the remaining layers in the pipeline stage. Reducing `--recompute-num-layers` results in storing the input activations to more transformer layers, which reduces the activation recomputation required in the backprop, thus improving training performance while increasing memory usage. For example, when we specify 5 layers to recompute of 8 layers per pipeline stage, the input activations of only the first 5 transformer layers are recomputed in the backprop step while the input activations for the final 3 layers are stored. `--recompute-num-layers` can be incrementally increased until the amount of memory storage space required is just small enough to fit in the available memory, thereby both maximally utilizing memory and maximizing performance. ## Distributed Optimizer @@ -211,6 +235,8 @@ Theoretical memory savings vary depending on the combination of the model's para | bf16 param, fp32 grads | 18 | 6 + 12/d | | fp32 param, fp32 grads | 16 | 8 + 8/d | +As with regular data parallelism, overlapping of the gradient reduction (in this case, a reduce-scatter) with the backward pass can be facilitated using the `--overlap-grad-reduce` flag. Additionally, overlapping of the parameter all-gather can be overlapped with the forward pass using `--overlap-param-gather`. + ## FlashAttention Usage: `--use-flash-attn`. Support attention head dimensions at most 128. @@ -226,23 +252,35 @@ pip install flash-attn ## GPT-3 Example -In `examples/pretrain_gpt3_175B.sh` we have provided an example of how to configure Megatron to run [GPT-3](https://arxiv.org/abs/2005.14165) with 175 billion parameters on 1024 GPUs. The script is designed for [slurm](https://slurm.schedmd.com/documentation.html) with [pyxis](https://github.com/NVIDIA/pyxis) plugin but can be easily adopted to any other scheduler. It uses 8-way and 16-way tensor and pipeline parallelism, respectively. With options `global-batch-size 1536` and `rampup-batch-size 16 16 5859375`, the training will start with global batch size 16 and linearly increase the global batch size to 1536 over 5,859,375 samples with incrmeental steps 16. The training dataset can be either a single set or a multiple datasets combined with a set of weights. +In `examples/gpt3/train_gpt3_175b_distributed.sh` we have provided an example of how to configure Megatron to train [GPT-3](https://arxiv.org/abs/2005.14165) with 175 billion parameters on 1024 GPUs. The script is designed for [slurm](https://slurm.schedmd.com/documentation.html) with [pyxis](https://github.com/NVIDIA/pyxis) plugin but can be easily adopted to any other scheduler. It uses 8-way tensor parallelism and 16-way pipeline parallelism. With options `global-batch-size 1536` and `rampup-batch-size 16 16 5859375`, the training will start with global batch size 16 and linearly increase the global batch size to 1536 over 5,859,375 samples with incremental steps 16. The training dataset can be either a single set or a multiple datasets combined with a set of weights. 
With full global batch size of 1536 on 1024 A100 GPUs, each iteration takes around 32 seconds resulting in 138 teraFLOPs per GPU which is 44% of the theoretical peak FLOPs. +## Retro and InstructRetro + -## Retro +Retro [(Borgeaud et al., 2022)](https://arxiv.org/abs/2112.04426) is an autoregressive decoder-only language model (LM) pretrained with retrieval-augmentation. +Retro features practical scalability to support large-scale pretraining from scratch by retrieving from trillions of tokens. +Pretraining with retrieval provides a more efficient storage mechanism of factual knowledge, when compared to storing factual knowledge implicitly within the network's parameters, thus largely reducing model parameters while achieving lower perplexity than standard GPT. +Retro also provides the flexibility to update the +knowledge stored in LMs [(Wang et al., 2023a)](https://arxiv.org/abs/2304.06762) +by updating the retrieval database without training LMs again. -See: +InstructRetro [(Wang et al., 2023b)](https://arxiv.org/abs/2310.07713) further scales up the size of Retro to 48B, featuring the largest LLM pretrained with retrieval (as of December 2023). +The obtained foundation model, Retro 48B, largely outperforms the GPT counterpart in terms of perplexity. +With instruction tuning on Retro, InstructRetro demonstrates significant improvement over the instruction tuned GPT on downstream tasks in the zero-shot setting. Specifically, the average improvement of InstructRetro is 7% over its GPT counterpart across 8 short-form QA tasks, and 10% over GPT across 4 challenging long-form QA tasks. We also find that one can ablate the encoder from InstructRetro architecture and directly use the InstructRetro decoder backbone as GPT, while achieving comparable results. -- `tools/retro/README.md` for an overview. -- `tools/retro/examples/get_preprocess_cmd.sh` for an example of common preprocessing arguments. -- `tools/retro/examples/preprocess_data.sh` for an example of how to preprocess data. -- `tools/retro/examples/pretrain_model.sh` for an example of how to pretrain a model. +In this repo, we provide an end-to-end reproduction guide to implement Retro and InstructRetro, covering +- **Retrieval database construction**, which supports billions or even trillions of tokens as a large-scale retrieval database. +- **Pretraining with retrieval**, which supports pretraining from scratch and pretraining from a pretrained GPT model (Retro-fitting). +- **Instruction tuning**, where we provide an open-source instruction tuning dataset and the training recipe for instruction tuning on Retro. +- **Downstream task evaluation**, where we provide the text generation and evaluation scripts for zero-shot question answering tasks. -Retro is a retrieval-enhanced model that is based on GPT. As described in [Improving language models by retrieving from trillions of tokens](https://arxiv.org/abs/2112.04426), Retro retrieves from a database of document chunks by performing locality search using a sample's tokens. The retrieval database can be large -- often billions or even trillions of tokens -- and provides a more efficient storage mechanism of factual knowledge, when compared to storing factual knowledge implicitly within the network's parameters. +See [tools/retro/README.md](tools/retro/README.md) for a detailed overview. -Using Retro requires two steps: 1) preprocessing the retrieval database and pretraining neighbors, and 2) pretraining a model using this data. Please see `tools/retro/README.md` for a detailed overview. 
+## Mamba-based Language Models + +See [examples/mamba](./examples/mamba) for details. (view) [num_splits, np, hn, h] - -->(tranpose) [np, num_splits, hn, h] - -->(view) [np * num_splits * hn, h] """ - - intermediate_shape = \ - (num_splits, num_attention_heads_per_partition, - hidden_size_per_attention_head) + input_shape[1:] - - t = t.view(*intermediate_shape) - t = t.transpose(0, 1).contiguous() - else: - """[np * hn * num_splits, h] - -->(view) [np, hn, num_splits, h] - -->(tranpose) [np, num_splits, hn, h] - -->(view) [np * num_splits * hn, h] """ - - intermediate_shape = \ - (num_attention_heads_per_partition, - hidden_size_per_attention_head, num_splits) +\ - input_shape[1:] - - t = t.view(*intermediate_shape) - t = t.transpose(1, 2).contiguous() - t = t.view(*input_shape) - - return t - -def fix_query_key_value_ordering(model, checkpoint_version): - """Fix up query/key/value matrix ordering if checkpoint - version is smaller than 2.0 - """ - if checkpoint_version < 2.0: - if isinstance(model, list): - assert len(model)==1 - model = model[0] - for name, param in model.named_parameters(): - if name.endswith(('.query_key_value.weight', '.query_key_value.bias')): - if checkpoint_version == 0: - fixed_param = _transpose_first_dim(param.data, 3, True, model) - elif checkpoint_version == 1.0: - fixed_param = _transpose_first_dim(param.data, 3, False, model) - else: - print_rank_0(f"Invalid checkpoint version {checkpoint_version}.") - sys.exit() - param.data.copy_(fixed_param) - if name.endswith(('.key_value.weight', '.key_value.bias')): - if checkpoint_version == 0: - fixed_param = _transpose_first_dim(param.data, 2, True, model) - elif checkpoint_version == 1.0: - fixed_param = _transpose_first_dim(param.data, 2, False, model) - else: - print_rank_0(f"Invalid checkpoint version {checkpoint_version}.") - sys.exit() - param.data.copy_(fixed_param) - print_rank_0(" succesfully fixed query-key-values ordering for" - " checkpoint version {}".format(checkpoint_version)) - - -def _load_base_checkpoint(load_dir, rank0=False): - """ Load the base state_dict from the given directory - - If rank0 is true, just loads rank 0 checkpoint, ignoring arguments. - """ - - # Read the tracker file and set the iteration. - tracker_filename = get_checkpoint_tracker_filename(load_dir) - - # If no tracker file, return nothing - if not os.path.isfile(tracker_filename): - if not rank0: - print_rank_0('WARNING: could not find the metadata file {} '.format( - tracker_filename)) - print_rank_0(' will not load any checkpoints and will start from ' - 'random') - return None, False - - # Otherwise, read the tracker file and either set the iteration or - # mark it as a release checkpoint. - iteration, release = read_metadata(tracker_filename) - - # Checkpoint. - if rank0: - checkpoint_name = find_checkpoint_rank_0(load_dir, iteration, release) - else: - checkpoint_name = get_checkpoint_name(load_dir, iteration, release) - if release: - print_rank_0(f' loading release checkpoint from {load_dir}') - else: - print_rank_0(f' loading checkpoint from {load_dir} at iteration {iteration}') - - # Load the checkpoint. - try: - state_dict = torch.load(checkpoint_name, map_location='cpu') - except ModuleNotFoundError: - from megatron.fp16_deprecated import loss_scaler - # For backward compatibility. 
- if not rank0: - print_rank_0(' > deserializing using the old code structure ...') - sys.modules['fp16.loss_scaler'] = sys.modules[ - 'megatron.fp16_deprecated.loss_scaler'] - sys.modules['megatron.fp16.loss_scaler'] = sys.modules[ - 'megatron.fp16_deprecated.loss_scaler'] - state_dict = torch.load(checkpoint_name, map_location='cpu') - sys.modules.pop('fp16.loss_scaler', None) - sys.modules.pop('megatron.fp16.loss_scaler', None) - except BaseException as e: - print_rank_0('could not load the checkpoint') - print_rank_0(e) - sys.exit() - - return state_dict, release - - -def load_args_from_checkpoint(args, load_arg='load'): - """Set required arguments from the checkpoint specified in the - arguments. - - Will overwrite arguments that have a non-None default value, but - will leave any arguments that default to None as set. - - Returns the same args NameSpace with the new values added/updated. - - If no checkpoint is specified in args, or if the checkpoint is - there but invalid, the arguments will not be modified - - """ - load_dir = getattr(args, load_arg) - - if load_dir is None: - print_rank_0('No load directory specified, using provided arguments.') - return args - - state_dict, release = _load_base_checkpoint(load_dir, rank0=True) - - # Args. - if not state_dict: - print_rank_0('Checkpoint not found to provide arguments, using provided arguments.') - return args - - if 'args' not in state_dict: - print_rank_0('Checkpoint provided does not have arguments saved, using provided arguments.') - return args - - checkpoint_args = state_dict['args'] - checkpoint_version = state_dict.get('checkpoint_version', 0) - args.iteration = state_dict['iteration'] - - # One-off conversion for foundation models - if hasattr(checkpoint_args, 'disable_bias_linear'): - setattr(checkpoint_args, 'add_bias_linear', not getattr(checkpoint_args, 'disable_bias_linear')) - - def _set_arg(arg_name, old_arg_name=None, force=False): - if not force and getattr(args, arg_name, None) is not None: - return - - if old_arg_name is not None: - checkpoint_value = getattr(checkpoint_args, old_arg_name, None) - else: - checkpoint_value = getattr(checkpoint_args, arg_name, None) - - if checkpoint_value is not None: - print_rank_0(f"Setting {arg_name} to {checkpoint_value} from checkpoint") - setattr(args, arg_name, checkpoint_value) - else: - print_rank_0(f"Checkpoint did not provide arguments {arg_name}") - - _set_arg('num_layers') - _set_arg('hidden_size') - _set_arg('ffn_hidden_size') - _set_arg('seq_length') - _set_arg('num_attention_heads') - _set_arg('kv_channels') - _set_arg('max_position_embeddings') - _set_arg('add_position_embedding', force=True) - _set_arg('use_rotary_position_embeddings', force=True) - _set_arg('rotary_percent', force=True) - _set_arg('add_bias_linear', force=True) - _set_arg('swiglu', force=True) - _set_arg('untie_embeddings_and_output_weights', force=True) - _set_arg('apply_layernorm_1p', force=True) - _set_arg('tokenizer_type') - _set_arg('padded_vocab_size') - if checkpoint_version < 3.0: - _set_arg('tensor_model_parallel_size', - 'model_parallel_size') - else: - _set_arg('tensor_model_parallel_size', force=True) - _set_arg('pipeline_model_parallel_size', force=True) - _set_arg('virtual_pipeline_model_parallel_size', force=True) - _set_arg('num_layers_per_virtual_pipeline_stage') - return args, checkpoint_args - - -def load_checkpoint(model, optimizer, opt_param_scheduler, load_arg='load', strict=True): - """Load a model checkpoint and return the iteration. 
- strict (bool): whether to strictly enforce that the keys in - :attr:`state_dict` of the checkpoint match the names of - parameters and buffers in model. - """ - args = get_args() - load_dir = getattr(args, load_arg) - - model = unwrap_model(model) - - state_dict, release = _load_base_checkpoint(load_dir, rank0=False) - - # Checkpoint not loaded. - if state_dict is None: - - # Conditionally exit at this point. - if args.exit_on_missing_checkpoint: - print_rank_0(">> '--exit-on-missing-checkpoint' set ... exiting. <<") - torch.distributed.barrier() - sys.exit() - - # Iteration defaults to 0. - return 0 - - # Set checkpoint version. - set_checkpoint_version(state_dict.get('checkpoint_version', 0)) - - # Set iteration. - if args.finetune or release: - iteration = 0 - else: - try: - iteration = state_dict['iteration'] - except KeyError: - try: # Backward compatible with older checkpoints - iteration = state_dict['total_iters'] - except KeyError: - print_rank_0('A metadata file exists but unable to load ' - 'iteration from checkpoint {}, exiting'.format( - checkpoint_name)) - sys.exit() - - # Check arguments. - assert args.consumed_train_samples == 0 - assert args.consumed_valid_samples == 0 - if 'args' in state_dict and not args.finetune: - checkpoint_args = state_dict['args'] - check_checkpoint_args(checkpoint_args) - args.consumed_train_samples = getattr(checkpoint_args, - 'consumed_train_samples', 0) - update_num_microbatches(consumed_samples=args.consumed_train_samples) - args.consumed_valid_samples = getattr(checkpoint_args, - 'consumed_valid_samples', 0) - else: - print_rank_0('could not find arguments in the checkpoint ...') - - # Model. - if len(model) == 1: - model[0].load_state_dict(state_dict['model'], strict=strict) - else: - for i in range(len(model)): - mpu.set_virtual_pipeline_model_parallel_rank(i) - model[i].load_state_dict(state_dict['model%d' % i], strict=strict) - - # Fix up query/key/value matrix ordering if needed. - checkpoint_version = get_checkpoint_version() - print_rank_0(f' checkpoint version {checkpoint_version}') - fix_query_key_value_ordering(model, checkpoint_version) - - # Optimizer. - if not release and not args.finetune and not args.no_load_optim: - try: - # Load state dict. - if optimizer is not None: - optimizer.load_state_dict(state_dict['optimizer']) - - # Load distributed optimizer's custom parameter state. - if args.use_distributed_optimizer: - tracker_filename = get_checkpoint_tracker_filename(load_dir) - iteration, release = read_metadata(tracker_filename) - model_checkpoint_name = \ - get_checkpoint_name(load_dir, iteration, release) - optim_checkpoint_name = \ - get_distributed_optimizer_checkpoint_name( - model_checkpoint_name) - optimizer.load_parameter_state(optim_checkpoint_name) - - # Load scheduler. - if opt_param_scheduler is not None: - if 'lr_scheduler' in state_dict: # backward compatbility - opt_param_scheduler.load_state_dict(state_dict['lr_scheduler']) - else: - opt_param_scheduler.load_state_dict(state_dict['opt_param_scheduler']) - except KeyError: - print_rank_0('Unable to load optimizer from checkpoint {}. ' - 'Specify --no-load-optim or --finetune to prevent ' - 'attempting to load the optimizer state, ' - 'exiting ...'.format(checkpoint_name)) - sys.exit() - else: - if (args.fp16 or args.bf16) and optimizer is not None: - optimizer.reload_model_params() - - # rng states. 
- if not release and not args.finetune and not args.no_load_rng: - try: - if 'rng_state' in state_dict: - # access rng_state for data parallel rank - if args.data_parallel_random_init: - - rng_state = state_dict['rng_state'][mpu.get_data_parallel_rank()] - else: - rng_state = state_dict['rng_state'][0] - random.setstate(rng_state['random_rng_state']) - np.random.set_state(rng_state['np_rng_state']) - torch.set_rng_state(rng_state['torch_rng_state']) - torch.cuda.set_rng_state(rng_state['cuda_rng_state']) - # Check for empty states array - if not rng_state['rng_tracker_states']: - raise KeyError - tensor_parallel.get_cuda_rng_tracker().set_states( - rng_state['rng_tracker_states']) - else: # backward compatability - random.setstate(state_dict['random_rng_state']) - np.random.set_state(state_dict['np_rng_state']) - torch.set_rng_state(state_dict['torch_rng_state']) - torch.cuda.set_rng_state(state_dict['cuda_rng_state']) - # Check for empty states array - if not state_dict['rng_tracker_states']: - raise KeyError - tensor_parallel.get_cuda_rng_tracker().set_states( - state_dict['rng_tracker_states']) - except KeyError: - print_rank_0('Unable to load rng state from checkpoint {}. ' - 'Specify --no-load-rng or --finetune to prevent ' - 'attempting to load the rng state, ' - 'exiting ...'.format(checkpoint_name)) - sys.exit() - - # Some utilities want to load a checkpoint without distributed being initialized - if torch.distributed.is_initialized(): - torch.distributed.barrier() - - print_rank_0(f' successfully loaded checkpoint from {args.load} ' - f'at iteration {iteration}') - - return iteration - - -def load_biencoder_checkpoint(model, only_query_model=False, - only_context_model=False, custom_load_path=None): - """ - selectively load retrieval models for indexing/retrieving - from saved checkpoints - """ - - args = get_args() - - model = unwrap_model(model) - - load_path = custom_load_path if custom_load_path is not None else args.load - - tracker_filename = get_checkpoint_tracker_filename(load_path) - with open(tracker_filename, 'r') as f: - iteration = int(f.read().strip()) - - checkpoint_name = get_checkpoint_name(load_path, iteration, - args.use_distributed_optimizer, - release=False) - - if mpu.get_data_parallel_rank() == 0: - print('global rank {} is loading checkpoint {}'.format( - torch.distributed.get_rank(), checkpoint_name)) - - state_dict = torch.load(model_checkpoint_name, map_location='cpu') - ret_state_dict = state_dict['model'] - - if only_query_model: - ret_state_dict.pop('context_model') - if only_context_model: - ret_state_dict.pop('query_model') - - assert len(model) == 1 - model[0].load_state_dict(ret_state_dict) - torch.distributed.barrier() - - if mpu.get_data_parallel_rank() == 0: - print(' successfully loaded {}'.format(checkpoint_name)) - - return model diff --git a/megatron/core/QuickStart.md b/megatron/core/QuickStart.md new file mode 100644 index 0000000000..6deb1a5f76 --- /dev/null +++ b/megatron/core/QuickStart.md @@ -0,0 +1,250 @@ +## Quick Start + +The following guide is a short getting started guide for Megatron Core. In it you: + +* Initialize Megatron Core on 2 GPUS. +* Build a GPT model with tensor model parallel size 2, pipeline parallel size 1 +* Train it for a five iterations using Megatron Core schedules +* Save the model using the distributed checkpointing format +* Load the model saved above. + +**NOTE:** The following sample was tested using Megatron Core version 0.8.0 and NGC PyTorch Container version 24.02. 
+ +### Environment Setup + +``` +docker run --ipc=host --shm-size=512m --gpus 2 -it nvcr.io/nvidia/pytorch:24.02-py3 + +git clone https://github.com/NVIDIA/Megatron-LM.git && cd Megatron-LM +``` +
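+The remainder of this guide assumes that exactly two GPUs are visible inside the container. A quick, optional sanity check (not part of the official example script) is:
+
+```python
+import torch
+
+# Optional sanity check: the rest of this guide assumes 2 visible CUDA devices.
+assert torch.cuda.is_available(), "CUDA is not available inside the container"
+print(f"Visible GPUs: {torch.cuda.device_count()}")
+```
+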
+
+### Writing Your First Training Loop
+
+In the following steps, you create a sample GPT model split across tensors (tensor model parallelism) on 2 GPUs and run a forward pass through it using the MockGPTDataset helper class provided in Megatron Core.
+
+
+
+**NOTE:** All of the following steps are in the [run_simple_mcore_train_loop.py](https://github.com/NVIDIA/Megatron-LM/tree/main/examples/run_simple_mcore_train_loop.py) script.
+
+To run the ``run_simple_mcore_train_loop.py`` script:
+
+```
+PYTHONPATH=$PYTHONPATH:./megatron torchrun --nproc-per-node 2 examples/run_simple_mcore_train_loop.py
+```
+
+
+ +**STEP 1 - Initialize Distributed Training and Model Parallel Setup** + +The following utility, when called, initializes your distributed setup. + +```python +import os +import torch +from megatron.core import parallel_state + +def initialize_distributed(tensor_model_parallel_size = 1, pipeline_model_parallel_size = 1): + # Torch setup for distributed training + rank = int(os.environ['LOCAL_RANK']) + world_size = torch.cuda.device_count() + torch.cuda.set_device(rank) + torch.distributed.init_process_group(world_size=world_size, rank=rank) + + # Megatron core distributed training initialization + parallel_state.initialize_model_parallel(tensor_model_parallel_size, pipeline_model_parallel_size) +``` +
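+If you want to confirm the resulting layout, `parallel_state` exposes query helpers. The following optional snippet is not part of the sample script; it assumes `initialize_distributed` has already been called on every rank.
+
+```python
+import torch
+from megatron.core import parallel_state
+
+def print_parallel_layout():
+    # Optional: report how this rank fits into the model parallel layout.
+    print(
+        f"global rank {torch.distributed.get_rank()}: "
+        f"TP size {parallel_state.get_tensor_model_parallel_world_size()}, "
+        f"TP rank {parallel_state.get_tensor_model_parallel_rank()}, "
+        f"PP size {parallel_state.get_pipeline_model_parallel_world_size()}"
+    )
+```
+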
+
+**STEP 2 - GPT Model Setup**
+
+In this step, you create a GPT model. For a list of other configurations that you can pass into the model, open and review [transformer_config.py](https://github.com/NVIDIA/Megatron-LM/tree/main/megatron/core/transformer/transformer_config.py).
+
+```python
+from megatron.core.transformer.transformer_config import TransformerConfig
+from megatron.core.models.gpt.gpt_model import GPTModel
+from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_local_spec
+
+def model_provider():
+    """Build the model."""
+
+    transformer_config = TransformerConfig(
+        num_layers=2,
+        hidden_size=12,
+        num_attention_heads=4,
+        use_cpu_initialization=True,
+        pipeline_dtype=torch.float32)
+
+    gpt_model = GPTModel(
+        config=transformer_config,
+        transformer_layer_spec=get_gpt_layer_local_spec(),
+        vocab_size=100,
+        max_sequence_length=64)
+
+    return gpt_model
+```
+
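+As an optional check (not part of the sample script), you can count the parameters held by each rank. With tensor model parallel size 2, every process stores only its shard of the tensor-parallel weights, so the per-rank count is smaller than the full model size.
+
+```python
+def count_local_parameters(model):
+    # Number of parameter elements stored on this rank (local shards only).
+    return sum(p.numel() for p in model.parameters())
+
+# Example usage after building the model:
+#   gpt_model = model_provider()
+#   print(f"Parameters on this rank: {count_local_parameters(gpt_model)}")
+```
+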
+
+**STEP 3 - GPT Mock Dataset Setup**
+
+In the following step, you explore the mock dataset utility.
+
+* To train the model using your data, use the GPTDataset class in [gpt_dataset.py](https://github.com/NVIDIA/Megatron-LM/tree/main/megatron/core/datasets/gpt_dataset.py).
+
+* To find more information about the Megatron Core data pipeline, see the [data pipeline readme.md](https://github.com/NVIDIA/Megatron-LM/tree/main/megatron/core/datasets/readme.md).
+
+```python
+import torch
+from torch.utils.data import DataLoader
+
+from megatron.core.datasets.blended_megatron_dataset_builder import BlendedMegatronDatasetBuilder
+from megatron.core.datasets.gpt_dataset import GPTDatasetConfig, MockGPTDataset
+from megatron.training.tokenizer.tokenizer import _NullTokenizer
+from megatron.core.datasets.utils import compile_helpers
+
+_SEQUENCE_LENGTH = 64
+
+def get_train_data_iterator():
+    if torch.distributed.is_available() and torch.distributed.is_initialized():
+        if torch.distributed.get_rank() == 0:
+            compile_helpers()
+        torch.distributed.barrier()
+    else:
+        compile_helpers()
+
+    config = GPTDatasetConfig(
+        random_seed=0,
+        sequence_length=_SEQUENCE_LENGTH,
+        reset_position_ids=False,
+        reset_attention_mask=False,
+        eod_mask_loss=False,
+        tokenizer=_NullTokenizer(vocab_size=_SEQUENCE_LENGTH),
+    )
+
+    datasets = BlendedMegatronDatasetBuilder(
+        MockGPTDataset, [1000, None, None], lambda: True, config
+    ).build()
+
+    train_dataloader = DataLoader(datasets[0], batch_size=8, shuffle=True)
+
+    train_iterator = iter(train_dataloader)
+
+    return train_iterator
+
+```
+
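+To see what the iterator yields, you can peek at a single mock batch. This optional snippet assumes the batch keys that `forward_step_func` consumes in STEP 4 (`tokens`, `labels`, `loss_mask`, `attention_mask`, and `position_ids`).
+
+```python
+def inspect_one_batch():
+    # Pull one batch from the mock dataset and print the shape of each field.
+    batch = next(get_train_data_iterator())
+    for key in ('tokens', 'labels', 'loss_mask', 'attention_mask', 'position_ids'):
+        print(key, tuple(batch[key].shape))
+```
+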
+
+**STEP 4 - Forward Step Function**
+
+Megatron Core uses [schedules.py](https://github.com/NVIDIA/Megatron-LM/tree/main/megatron/core/pipeline_parallel/schedules.py) to run the model. It is sufficient to define a forward step function, which takes as input the data iterator and the model and produces as output the output tensor and a loss function.
+
+```python
+from functools import partial
+
+def forward_step_func(data_iterator, model):
+
+    def loss_func(loss_mask: torch.Tensor, output_tensor: torch.Tensor):
+
+        losses = output_tensor.float()
+        loss_mask = loss_mask.view(-1).float()
+        loss = torch.sum(losses.view(-1) * loss_mask) / loss_mask.sum()
+        # If you use data parallelism, reduce the loss across data parallel groups.
+        # With pipeline parallelism, the loss is computed only in the last stage.
+
+        return loss, {'lm loss': loss}
+
+    data = next(data_iterator)
+    tokens = data['tokens'].to(device)
+    attention_mask = data['attention_mask'].to(device)
+    position_ids = data['position_ids'].to(device)
+    labels = data['labels'].to(device)
+    loss_mask = data['loss_mask'].to(device)
+
+    output_tensor = model(tokens, position_ids, attention_mask,
+                          labels=labels)
+
+    return output_tensor, partial(loss_func, loss_mask)
+```
+
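+Note that `forward_step_func` references a `device` variable that is only defined later, in the main function of STEP 6. If you lift the function into a separate module, define it first, for example:
+
+```python
+import torch
+
+# `forward_step_func` above expects this name to exist at module scope.
+device = torch.device("cuda")
+```
+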
+
+**STEP 5 - Load and Save Distributed Checkpoint**
+
+Megatron Core uses distributed checkpoints for loading and saving models. This gives you the flexibility to convert the model from one model parallel setting to another when you load it. For example, a model trained with tensor model parallel size 2 can be loaded again with tensor model parallel size 4, and so forth.
+
+```python
+from megatron.core import dist_checkpointing
+
+def save_distributed_checkpoint(checkpoint_path, gpt_model):
+    sharded_state_dict = gpt_model.sharded_state_dict(prefix='')
+    dist_checkpointing.save(sharded_state_dict=sharded_state_dict, checkpoint_dir=checkpoint_path)
+
+def load_distributed_checkpoint(checkpoint_path, gpt_model):
+    sharded_state_dict = gpt_model.sharded_state_dict(prefix='')
+    checkpoint = dist_checkpointing.load(sharded_state_dict=sharded_state_dict, checkpoint_dir=checkpoint_path)
+    gpt_model.load_state_dict(checkpoint)
+    return gpt_model
+```
+
+
+**STEP 6 - Main Function**
+
+The following code snippet is the main function that needs to go into your script. It runs the model for 5 iterations, saves the model, and then loads it back.
+
+```python
+from pathlib import Path
+from torch.optim import Adam
+from megatron.core.pipeline_parallel.schedules import get_forward_backward_func
+from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed
+
+if __name__ == "__main__":
+    initialize_distributed(tensor_model_parallel_size=2, pipeline_model_parallel_size=1)
+    model_parallel_cuda_manual_seed(123)
+
+    gpt_model = model_provider()
+    device = torch.device("cuda")
+    gpt_model.to(device)
+
+    optim = Adam(gpt_model.parameters())
+
+    train_iterator = get_train_data_iterator()
+
+    forward_backward_func = get_forward_backward_func()
+
+    # Running the model for 5 iterations
+    for _ in range(5):
+        optim.zero_grad()
+
+        losses_reduced = forward_backward_func(
+            forward_step_func=forward_step_func,
+            data_iterator=train_iterator,
+            model=gpt_model,
+            num_microbatches=1,
+            seq_length=64,
+            micro_batch_size=8,
+            decoder_seq_length=64,
+            forward_only=False)
+
+        optim.step()
+
+        print(f'Losses reduced : {losses_reduced}')
+
+    # Saving the model
+    save_distributed_checkpoint(gpt_model=gpt_model, checkpoint_path='/workspace/ckpt')
+
+    # Loading the model
+    gpt_model = load_distributed_checkpoint(gpt_model=gpt_model, checkpoint_path='/workspace/ckpt')
+    gpt_model.to(device)
+    print('Successfully loaded the model')
+```
+
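+The sample script ends right after the checkpoint is reloaded. If you extend it, a common (optional) addition is to tear down the process group explicitly before the workers exit:
+
+```python
+import torch
+
+# Optional cleanup; place at the end of the __main__ block above.
+if torch.distributed.is_initialized():
+    torch.distributed.destroy_process_group()
+```
+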
+
+
+
+### Extending Further
+
+The example you explored here is a basic training loop in Megatron Core. To review more advanced examples, explore [pretrain_gpt.py](https://github.com/NVIDIA/Megatron-LM/tree/main/pretrain_gpt.py). ``pretrain_gpt.py`` has a more complex training loop that includes the following and other Megatron Core features:
+
+* pipeline parallel
+* context parallel
+* rope embeddings
+* mixture of experts
diff --git a/megatron/core/README.md b/megatron/core/README.md
index 0c8c61738d..38970b0c47 100644
--- a/megatron/core/README.md
+++ b/megatron/core/README.md
@@ -1 +1,14 @@
-Megatron Core is a library for efficient and scalable training of transformer based models.
+# Megatron-Core
+
+Megatron-Core is an open-source PyTorch-based library that contains GPU-optimized techniques and cutting-edge system-level optimizations. It abstracts them into composable and modular APIs, allowing full flexibility for developers and model researchers to train custom transformers at scale on NVIDIA accelerated computing infrastructure. This library is compatible with all NVIDIA Tensor Core GPUs, including FP8 acceleration support for [NVIDIA Hopper architectures](https://www.nvidia.com/en-us/data-center/technologies/hopper-architecture/).
+
+Megatron-Core offers core building blocks such as attention mechanisms, transformer blocks and layers, normalization layers, and embedding techniques. Additional functionality, such as activation re-computation and distributed checkpointing, is also natively built into the library. The building blocks and functionality are all GPU optimized, and can be combined with advanced parallelization strategies for optimal training speed and stability on NVIDIA Accelerated Computing Infrastructure. Another key component of the Megatron-Core library is its set of advanced model parallelism techniques (tensor, sequence, pipeline, context, and MoE expert parallelism).
+
+Megatron-Core can be used with [NVIDIA NeMo](https://www.nvidia.com/en-us/ai-data-science/products/nemo/), an enterprise-grade AI platform. Alternatively, you can explore Megatron-Core with the native PyTorch training loop [here](https://github.com/NVIDIA/Megatron-LM/tree/main/examples). Visit [Megatron-Core documentation](https://docs.nvidia.com/megatron-core/developer-guide/latest/index.html) to learn more.
+
+## Quick links
+
+- [Benchmark using NVIDIA NeMo](https://docs.nvidia.com/nemo-framework/user-guide/latest/overview.html#performance-benchmarks)
+- [Multimodal example (LLaVA training pipeline)](https://github.com/NVIDIA/Megatron-LM/tree/main/examples/multimodal)
+- [Mixture-of-Experts](https://github.com/NVIDIA/Megatron-LM/tree/main/megatron/core/transformer/moe)
+- [Training Mamba-based Language Models](https://github.com/NVIDIA/Megatron-LM/tree/main/examples/mamba)
diff --git a/megatron/core/README_STRAGGLER.md b/megatron/core/README_STRAGGLER.md
new file mode 100644
index 0000000000..fe9062c851
--- /dev/null
+++ b/megatron/core/README_STRAGGLER.md
@@ -0,0 +1,93 @@
+## StragglerDetector for a TP Group
+
+The file `megatron/core/utils.py` has a class named `StragglerDetector` which supports Python contexts.
+It can be used to find a straggling TP group based on the RTT of the ranks in the TP Group. It also collects
+Power/Temp/Utilization for GPUs, which can additionally be used to narrow down to the exact GPU in the TP Group,
+assuming the straggling was caused by a hardware anomaly in a given GPU.
+This class supports collecting timing events for various steps of a given iteration. It
+keeps collecting such timing events on a per-rank basis, and when the reporter is invoked
+during a logging interval, it computes the min and max of certain metrics across all
+ranks and logs the observed metric and the rank as follows:
+
+```
+ 0: INFO:megatron.core.utils:[2024-03-14 23:07:56] | MnRtt/Rnk: 3453.08ms/8 | MxRtt/Rnk: 3468.20ms/0 | MnPwr/Rnk: 601796W/8 | MxPwr/Rnk: 683801W/18 | MnTmp/Rnk: 52C/0 | MxTmp/Rnk: 65C/21 | MnUtl/Rnk: 97%/8 | MxUtl/Rnk: 100%/6 | MnClk/Rnk: 1950MHz/28 | MxClk/Rnk: 1980MHz/0 | MnDRtt/Rnk: 14.27ms/23 | MxDRtt/Rnk: 34.65ms/3 | MnEtpt/Rnk: 296.02TF/0 | MxEtpt/Rnk: 297.32TF/8
+```
+
+
+### Description of the metrics
+
+Each metric is prefixed with `Mn` or `Mx` to represent `Minimum` or `Maximum`. Each metric is also suffixed with the rank where the metric was measured. The metrics are averaged over the logging interval. Between the prefix and the rank is the name of the metric, as follows:
+
+- Rtt : RoundTrip Time (time spent in all the traced ops per iteration)
+- Pwr : GPU Power
+- Tmp : GPU Temperature
+- Utl : GPU Utilization
+- Clk : GPU Clock
+- DRtt: get_batch latency
+- Etpt: Estimated throughput. This is derived from the actual computed throughput divided by Rtt. Since we do not collect timing for the backward pass, the value is further divided by three to arrive at the estimated throughput.
+
+
+### Command Line activation
+To start using the StragglerDetector, pass the `--log-straggler` argument; straggler detection is disabled by default. It also takes the following optional parameters:
+- `--disable-straggler-on-startup` - whether to keep the StragglerDetector disabled on startup and enable it later. Default enabled
+- `--straggler-ctrlr-port` - The StragglerDetector can be toggled on/off at runtime just by sending `curl Rank0Host:port`; each such request flips the current state. Default port is 65535
+- `--straggler-minmax-count` - If set to a value N > 1, it prints the top and bottom N Etpt/rank pairs, as shown below
+```
+ 0: INFO:megatron.core.utils:^^^^ Bottom 4 Ranks with lowest Etpt(TF): 296.02/0, 296.17/2, 296.23/1, 296.23/4,
+ 0: INFO:megatron.core.utils:^^^^ Top 4 Ranks with highest Etpt(TF): 297.28/15, 297.28/11, 297.32/12, 297.32/8,
+```
+
+
+### Programming the StragglerDetector
+The StragglerDetector class supports the Python context-manager protocol, and it is implemented as a singleton.
+- Initialization
+
+```
+    # initialization, where StragglerDetector will be used
+    from megatron.core.utils import StragglerDetector
+    stimer = StragglerDetector()
+```
+
+- One time for each rank
+
+```
+    # one time before the training loop starts
+    stimer.configure(world, rank, enabled=True, port=65535)
+
+    # Arguments to configure
+    # world : World Size
+    # rank : The rank of this trainer
+    # mmcnt : (Optional) Number of ranks to print for showing Min/Max Etpt
+    # amp : (Optional) Set to 3.0 if we only use timers in fwd pass
+    # port : (Optional) control port, useful only for rank-0
+    # prefill : (Optional) how many events to pre-populate
+    # enabled : (Optional) whether or not collection is enabled on startup
+```
+
+- To Capture time
+
+```
+    # wherever timing needs to be captured
+    with stimer:
+        do_operation()
+
+    # special case for get_batch
+    with stimer(bdata=True):
+        input,... = get_batch(iterator,...)
+```
+
+- Logging in main training loop
+
+```
+    # logging
+    total_flops = 0.0
+    iteration = 0
+    # inside the main training loop
+    while training:
+        iteration += 1
+        do_step()
+        total_flops += get_computed_flops()
+        if iteration % log_interval == 0:
+            stimer.report(total_flops, log_interval)
+            total_flops = 0.0
+```
diff --git a/megatron/core/__init__.py b/megatron/core/__init__.py
index cb437d5dae..0eccb1d02e 100644
--- a/megatron/core/__init__.py
+++ b/megatron/core/__init__.py
@@ -1,6 +1,24 @@
-import megatron.core.parallel_state
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 import megatron.core.tensor_parallel
 import megatron.core.utils
+from megatron.core import parallel_state
+from megatron.core.distributed import DistributedDataParallel
+from megatron.core.inference_params import InferenceParams
+from megatron.core.model_parallel_config import ModelParallelConfig
+from megatron.core.package_info import (
+    __contact_emails__,
+    __contact_names__,
+    __description__,
+    __download_url__,
+    __homepage__,
+    __keywords__,
+    __license__,
+    __package_name__,
+    __repository_url__,
+    __shortversion__,
+    __version__,
+)
+from megatron.core.timers import Timers
 
 # Alias parallel_state as mpu, its legacy name
 mpu = parallel_state
@@ -9,4 +27,8 @@
     "parallel_state",
     "tensor_parallel",
     "utils",
+    "DistributedDataParallel",
+    "InferenceParams",
+    "ModelParallelConfig",
+    "Timers",
 ]
diff --git a/megatron/core/config_logger.py b/megatron/core/config_logger.py
new file mode 100644
index 0000000000..231a0226be
--- /dev/null
+++ b/megatron/core/config_logger.py
@@ -0,0 +1,104 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+
+import dataclasses
+import json
+import os
+
+import torch
+import torch.nn as nn
+
+from megatron.core import parallel_state
+
+
+def get_config_logger_path(config):
+    return getattr(config, 'config_logger_dir', '')
+
+
+def has_config_logger_enabled(config):
+    return get_config_logger_path(config) != ''
+
+
+# For each prefix, holds a counter and increases it every time we dump with this
+# prefix.
+__config_logger_path_counts = {} + + +def get_path_count(path): + """ + keeps tracks of number of times we've seen the input `path` and return count-1 + """ + global __config_logger_path_counts + if not path in __config_logger_path_counts: + __config_logger_path_counts[path] = 0 + count = __config_logger_path_counts[path] + __config_logger_path_counts[path] += 1 + return count + + +def get_path_with_count(path): + """ + calls get_path_count and appends returned value to path + """ + return f'{path}.iter{get_path_count(path)}' + + +class JSONEncoderWithMcoreTypes(json.JSONEncoder): + def default(self, o): + if type(o).__name__ in ['function', 'ProcessGroup']: + return str(o) + if type(o).__name__ in ['dict', 'OrderedDict']: + return {k: self.default(v) for k, v in o.items()} + if type(o).__name__ in ['list', 'ModuleList']: + return [self.default(val) for val in o] + if type(o).__name__ == 'UniqueDescriptor': + return { + attr: self.default(getattr(o, attr)) + for attr in filter(lambda x: not x.startswith('__'), dir(o)) + } + if type(o) is torch.dtype: + return str(o) + # if it's a Float16Module, add "Float16Module" to the output dict + if type(o).__name__ == 'Float16Module': + return {'Float16Module': {'module': self.default(o.module)}} + # If it's a nn.Module subchild, either print its children or itself if leaf. + if issubclass(type(o), nn.Module): + if len(getattr(o, '_modules', {})) > 0: + return {key: self.default(val) for key, val in o._modules.items()} + else: + return str(o) + if type(o).__name__ in ['ABCMeta', 'type', 'AttnMaskType']: + return str(o) + if dataclasses.is_dataclass(o) or type(o).__name__ in ['ModuleSpec', 'TransformerConfig']: + return dataclasses.asdict(o) + try: + return super().default(o) + except: + return str(o) + + +def log_config_to_disk(config, dict_data, prefix=''): + """ + Encodes the input dict (dict_data) using the JSONEncoderWithMcoreTypes + and dumps to disk, as specified via path + """ + path = get_config_logger_path(config) + assert path is not None, 'Expected config_logger_dir to be non-empty in config.' + + if 'self' in dict_data: + if prefix == '': + prefix = type(dict_data['self']).__name__ + del dict_data['self'] + + if not os.path.exists(path): + os.makedirs(path, exist_ok=True) + + rank = parallel_state.get_all_ranks() + path = get_path_with_count(os.path.join(path, f'{prefix}.rank_{rank}')) + if type(dict_data).__name__ == 'OrderedDict': + torch.save(dict_data, f'{path}.pth') + else: + with open(f'{path}.json', 'w') as fp: + json.dump(dict_data, fp, cls=JSONEncoderWithMcoreTypes) + + +__all__ = ['has_config_logger_enabled', 'log_config_to_disk'] diff --git a/megatron/data/Makefile b/megatron/core/datasets/Makefile similarity index 100% rename from megatron/data/Makefile rename to megatron/core/datasets/Makefile diff --git a/megatron/fused_kernels/tests/__init__.py b/megatron/core/datasets/__init__.py similarity index 100% rename from megatron/fused_kernels/tests/__init__.py rename to megatron/core/datasets/__init__.py diff --git a/megatron/core/datasets/bert_dataset.py b/megatron/core/datasets/bert_dataset.py new file mode 100644 index 0000000000..78ae2edf62 --- /dev/null +++ b/megatron/core/datasets/bert_dataset.py @@ -0,0 +1,192 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+ +from dataclasses import dataclass +from typing import Dict, List, Optional, Union + +import numpy + +from megatron.core.datasets.indexed_dataset import IndexedDataset +from megatron.core.datasets.masked_dataset import ( + MaskedWordPieceDataset, + MaskedWordPieceDatasetConfig, +) +from megatron.core.datasets.utils import Split + + +@dataclass +class BERTMaskedWordPieceDatasetConfig(MaskedWordPieceDatasetConfig): + """Configuration object for Megatron Core BERT WordPiece datasets""" + + classification_head: bool = None + """Option to perform the next sequence prediction during sampling""" + + def __post_init__(self) -> None: + """Do asserts and set fields post init""" + super().__post_init__() + + assert self.classification_head is not None + + +class BERTMaskedWordPieceDataset(MaskedWordPieceDataset): + """The BERT dataset that assumes WordPiece tokenization + + Args: + indexed_dataset (IndexedDataset): The IndexedDataset around which to build the MegatronDataset + + dataset_path (str): The real path on disk to the dataset, for bookkeeping + + indexed_indices (numpy.ndarray): The set of the documents indices to expose + + num_samples (Optional[int]): The number of samples to draw from the indexed dataset. When None, build as many samples as correspond to one epoch. + + index_split (Split): The indexed_indices Split + + config (BERTMaskedWordPieceDatasetConfig): The config + """ + + def __init__( + self, + indexed_dataset: IndexedDataset, + dataset_path: str, + indexed_indices: numpy.ndarray, + num_samples: Optional[int], + index_split: Split, + config: BERTMaskedWordPieceDatasetConfig, + ) -> None: + super().__init__( + indexed_dataset, dataset_path, indexed_indices, num_samples, index_split, config + ) + + self.token_lookup = list(self.config.tokenizer.inv_vocab.keys()) + # Account for the single and two token ids + self.sample_index = self._build_sample_index( + self.config.sequence_length - 3, 2 if self.config.classification_head else 1 + ) + + @staticmethod + def _key_config_attributes() -> List[str]: + """Inherited method implementation + + Returns: + List[str]: The key config attributes + """ + return super( + BERTMaskedWordPieceDataset, BERTMaskedWordPieceDataset + )._key_config_attributes() + ["classification_head"] + + def __getitem__(self, idx: int) -> Dict[str, Union[int, numpy.ndarray]]: + """Abstract method implementation + + Args: + idx (int): The index into the dataset + + Returns: + Dict[str, Union[int, numpy.ndarray]]: The + """ + idx_beg, idx_end, target_sequence_length = self.sample_index[idx] + sample = [self.dataset[i] for i in range(idx_beg, idx_end)] + numpy_random_state = numpy.random.RandomState(seed=(self.config.random_seed + idx) % 2**32) + + assert target_sequence_length <= self.config.sequence_length + + # Split the sample into contiguous subsegments A and B + pivot = len(sample) + is_next_random = False + if self.config.classification_head: + assert len(sample) > 1, "the sample must contain at least two sentences" + pivot = 1 + if len(sample) >= 3: + pivot = numpy_random_state.randint(low=1, high=len(sample)) + is_next_random = numpy_random_state.random() < 0.5 + split_A = [] + for sample_a in sample[:pivot]: + split_A.extend(sample_a) + split_B = [] + for sample_b in sample[pivot:]: + split_B.extend(sample_b) + if is_next_random: + split_A, split_B = split_B, split_A + + # Trim the subsegments from either end to a desired joint length + length_A = len(split_A) + length_B = len(split_B) + if length_A + length_B <= target_sequence_length: + truncated = False 
+ else: + while length_A + length_B > target_sequence_length: + split = split_A if length_A > length_B else split_B + if numpy_random_state.random() < 0.5: + del split[0] + else: + del split[-1] + length_A = len(split_A) + length_B = len(split_B) + truncated = True + + # Merge the subsegments and create the token assignment labels + tokens = [self.config.tokenizer.cls, *split_A, self.config.tokenizer.sep] + assignments = [0 for _ in range(1 + len(split_A) + 1)] + if split_B: + tokens += [*split_B, self.config.tokenizer.sep] + assignments += [1 for _ in range(len(split_B) + 1)] + + # Masking + tokens, masked_positions, masked_labels, _, _ = self._create_masked_lm_predictions( + tokens, target_sequence_length, numpy_random_state + ) + + # Pad the sequences and convert to NumPy + length_toks = len(tokens) + length_pads = self.config.sequence_length - length_toks + assert length_pads >= 0 + + tokens = numpy.array(tokens, dtype=numpy.int64) + tokens = numpy.pad(tokens, (0, length_pads), constant_values=self.config.tokenizer.pad) + + assignments = numpy.array(assignments, dtype=numpy.int64) + assignments = numpy.pad( + assignments, (0, length_pads), constant_values=self.config.tokenizer.pad + ) + + # Get the padding mask + mask_pads = numpy.ones(length_toks, dtype=numpy.int64) + mask_pads = numpy.pad( + mask_pads, (0, length_pads), constant_values=self.config.tokenizer.pad + ) + + # Mask the labels + labels = numpy.zeros(self.config.sequence_length, dtype=numpy.int64) - 1 + labels[masked_positions] = masked_labels + + # Get the loss mask + mask_loss = numpy.zeros(self.config.sequence_length, dtype=numpy.int64) + mask_loss[masked_positions] = 1 + + return { + "text": tokens, + "types": assignments, + "labels": labels, + "is_random": int(is_next_random), + "padding_mask": mask_pads, + "loss_mask": mask_loss, + "truncated": int(truncated), + } + + def _get_token_mask(self, numpy_random_state: numpy.random.RandomState) -> Optional[int]: + """Abstract method implementation + + 80% of the time, replace the token id with mask token id. 10% of the time, replace token id + with a random token id from the vocabulary. 10% of the time, do nothing. + + Args: + numpy_random_state (RandomState): The NumPy random state + + Returns: + Optional[int]: The replacement token id or None + """ + if numpy_random_state.random() < 0.8: + return self.config.tokenizer.mask + else: + if numpy_random_state.random() >= 0.5: + return self.token_lookup[numpy_random_state.randint(0, len(self.token_lookup))] + return None diff --git a/megatron/core/datasets/blended_dataset.py b/megatron/core/datasets/blended_dataset.py new file mode 100644 index 0000000000..be0b7a4a08 --- /dev/null +++ b/megatron/core/datasets/blended_dataset.py @@ -0,0 +1,201 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
+ +import hashlib +import json +import logging +import os +import time +from collections import OrderedDict +from typing import Dict, List, Optional, Tuple, Union + +import numpy +import torch + +from megatron.core.datasets.blended_megatron_dataset_config import BlendedMegatronDatasetConfig +from megatron.core.datasets.megatron_dataset import MegatronDataset +from megatron.core.datasets.utils import normalize +from megatron.core.utils import log_single_rank + +logger = logging.getLogger(__name__) + +_VERBOSE = False + + +class BlendedDataset(torch.utils.data.Dataset): + """Conjugating class for a set of MegatronDataset instances + + Args: + datasets (List[MegatronDataset]): The MegatronDataset instances to blend + + weights (List[Union[int, float]]): The weights that determine the dataset blend ratios + + size (Optional[int]): The number of samples to draw from the blend. If None, for each dataset index idx draw exactly weights[idx] samples from datasets[idx]. + + config (BlendedMegatronDatasetConfig): The config + + Raises: + RuntimeError: When the dataset has fewer or more samples than 'size' post-initialization + """ + + def __init__( + self, + datasets: List[MegatronDataset], + weights: List[Union[int, float]], + size: Optional[int], + config: BlendedMegatronDatasetConfig, + ) -> None: + assert len(datasets) == len(weights) + assert len(datasets) < 32767 + assert all(map(lambda _: type(_) == type(datasets[0]), datasets)) + assert all(map(lambda _: _.index_split == datasets[0].index_split, datasets)) + assert all(map(lambda _: _ > 0, weights)) + assert all(map(lambda _: type(_) == type(weights[0]), weights)) + if size is None and isinstance(weights[0], float): + assert all(map(lambda _: _ == int(_), weights)) + + # Alert user to unnecessary blending + if len(datasets) == 1: + log_single_rank( + logger, logging.WARNING, f"Building a BlendedDataset for a single MegatronDataset" + ) + + if size is not None: + weights = normalize(weights) + + self.datasets = datasets + self.split = self.datasets[0].index_split + self.weights = weights + self.size = size + self.config = config + + unique_identifiers = OrderedDict() + unique_identifiers["class"] = type(self).__name__ + unique_identifiers["datasets"] = [dataset.unique_identifiers for dataset in self.datasets] + unique_identifiers["split"] = self.split.name + unique_identifiers["weights"] = self.weights + unique_identifiers["size"] = self.size + unique_identifiers["renormalize_blend_weights"] = self.config.renormalize_blend_weights + + self.unique_description = json.dumps( + unique_identifiers, indent=4, default=lambda obj: obj.unique_identifiers + ) + self.unique_description_hash = hashlib.md5( + self.unique_description.encode("utf-8") + ).hexdigest() + + self.built_anew_on_cache_miss = False + + self.dataset_index, self.dataset_sample_index = self._build_indices() + + def __len__(self) -> int: + return self.dataset_index.shape[0] + + def __getitem__(self, idx: int) -> Dict[str, Union[int, numpy.ndarray]]: + dataset_id = self.dataset_index[idx] + dataset_sample_id = self.dataset_sample_index[idx] + return {"dataset_id": dataset_id, **self.datasets[dataset_id][dataset_sample_id]} + + def _build_indices(self) -> Tuple[numpy.ndarray, numpy.ndarray]: + """Build and optionally cache the dataset index and the dataset sample index + + The dataset index is a 1-D mapping which determines the dataset to query. The dataset + sample index is a 1-D mapping which determines the sample to request from the queried + dataset. 
+ + Returns: + Tuple[numpy.ndarray, numpy.ndarray]: The dataset index and the dataset sample index + """ + path_to_cache = self.config.path_to_cache + + if path_to_cache: + get_path_to = lambda suffix: os.path.join( + path_to_cache, + f"{self.unique_description_hash}-{type(self).__name__}-{self.split.name}-{suffix}", + ) + path_to_description = get_path_to("description.txt") + path_to_dataset_index = get_path_to("dataset_index.npy") + path_to_dataset_sample_index = get_path_to("dataset_sample_index.npy") + cache_hit = all( + map( + os.path.isfile, + [path_to_description, path_to_dataset_index, path_to_dataset_sample_index], + ) + ) + else: + cache_hit = False + + if not path_to_cache or (not cache_hit and torch.distributed.get_rank() == 0): + log_single_rank( + logger, logging.INFO, f"Build and save the {type(self).__name__} indices" + ) + self.built_anew_on_cache_miss = True + + # Build the dataset and dataset sample indexes + log_single_rank( + logger, logging.INFO, f"\tBuild and save the dataset and dataset sample indexes" + ) + t_beg = time.time() + from megatron.core.datasets import helpers + + if self.size is not None: + dataset_index = numpy.zeros(self.size, dtype=numpy.int16) + dataset_sample_index = numpy.zeros(self.size, dtype=numpy.int64) + helpers.build_blending_indices( + dataset_index, + dataset_sample_index, + self.weights, + len(self.datasets), + self.size, + _VERBOSE, + ) + else: + size = sum(self.weights) + dataset_index = numpy.zeros(size, dtype=numpy.int16) + dataset_sample_index = numpy.zeros(size, dtype=numpy.int64) + helpers.build_exhaustive_blending_indices( + dataset_index, dataset_sample_index, self.weights, len(self.datasets) + ) + + if path_to_cache: + os.makedirs(path_to_cache, exist_ok=True) + # Write the description + with open(path_to_description, "wt") as writer: + writer.write(self.unique_description) + # Save the indexes + numpy.save(path_to_dataset_index, dataset_index, allow_pickle=True) + numpy.save(path_to_dataset_sample_index, dataset_sample_index, allow_pickle=True) + else: + log_single_rank( + logger, + logging.WARNING, + f"Unable to save the {type(self).__name__} indexes because path_to_cache is None", + ) + + t_end = time.time() + log_single_rank(logger, logging.DEBUG, f"\t> time elapsed: {t_end - t_beg:4f} seconds") + + return dataset_index, dataset_sample_index + + log_single_rank(logger, logging.INFO, f"Load the {type(self).__name__} indices") + + log_single_rank( + logger, logging.INFO, f"\tLoad the dataset index from {path_to_dataset_index}" + ) + t_beg = time.time() + dataset_index = numpy.load(path_to_dataset_index, allow_pickle=True, mmap_mode='r') + t_end = time.time() + log_single_rank(logger, logging.DEBUG, f"\t> time elapsed: {t_end - t_beg:4f} seconds") + + log_single_rank( + logger, + logging.INFO, + f"\tLoad the dataset sample index from {path_to_dataset_sample_index}", + ) + t_beg = time.time() + dataset_sample_index = numpy.load( + path_to_dataset_sample_index, allow_pickle=True, mmap_mode='r' + ) + t_end = time.time() + log_single_rank(logger, logging.DEBUG, f"\t> time elapsed: {t_end - t_beg:4f} seconds") + + return dataset_index, dataset_sample_index diff --git a/megatron/core/datasets/blended_megatron_dataset_builder.py b/megatron/core/datasets/blended_megatron_dataset_builder.py new file mode 100644 index 0000000000..c9cf4abf63 --- /dev/null +++ b/megatron/core/datasets/blended_megatron_dataset_builder.py @@ -0,0 +1,528 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
+ +import logging +import math +from concurrent.futures import ThreadPoolExecutor +from typing import Any, Callable, Iterable, List, Optional, Type, Union + +import numpy +import torch + +from megatron.core.datasets.blended_dataset import BlendedDataset +from megatron.core.datasets.blended_megatron_dataset_config import BlendedMegatronDatasetConfig +from megatron.core.datasets.megatron_dataset import LowLevelDataset, MegatronDataset +from megatron.core.datasets.utils import Split, normalize +from megatron.core.parallel_state import get_virtual_pipeline_model_parallel_rank +from megatron.core.utils import log_single_rank + +logger = logging.getLogger(__name__) + +MidLevelDataset = MegatronDataset + +TopLevelDataset = Union[BlendedDataset, MidLevelDataset] + +DistributedDataset = Union[ + TopLevelDataset, MidLevelDataset, LowLevelDataset, torch.utils.data.Dataset +] + + +class BlendedMegatronDatasetBuilder(object): + """Builder class for the BlendedDataset and MegatronDataset classes + + Args: + cls (Type[MegatronDataset]): The class to instantiate, must inherit from MegatronDataset + + sizes (List[Optional[int]]): The minimum total number of samples to draw, or None, per split + + is_built_on_rank (Callable): A callable which returns True if the dataset should be built on the current rank and False otherwise. It should be Megatron Core parallelism aware i.e. global rank, local group rank, and virtual rank may inform its return value. + + config (BlendedMegatronDatasetConfig): The config object which informs dataset creation + """ + + def __init__( + self, + cls: Type[MidLevelDataset], + sizes: List[int], + is_built_on_rank: Callable, + config: BlendedMegatronDatasetConfig, + ): + self.cls = cls + self.sizes = sizes + self.is_built_on_rank = is_built_on_rank + self.config = config + + log_single_rank( + logger, + logging.INFO, + f"Building dataset splits with cls={cls.__name__}, sizes={self.sizes}, and config={self.config}", + ) + + if not self.config.mock: + for split in Split: + size_is_none = self.sizes[split.value] is None + if self.config.blend_per_split is None: + weights_are_none = self.config.blend[1] is None + else: + if self.config.blend_per_split[split.value] is None: + continue + weights_are_none = self.config.blend_per_split[split.value][1] is None + if size_is_none: + assert ( + weights_are_none + ), f"size_is_none => weights_are_none fails for {split.name} split" + + if torch.distributed.is_initialized(): + gb_rank = torch.distributed.get_rank() + vp_rank = get_virtual_pipeline_model_parallel_rank() + if gb_rank == 0 and (vp_rank == 0 or vp_rank is None): + assert ( + self.is_built_on_rank() + ), "is_built_on_rank must return True when global rank = 0 and vp rank = 0" + + def build(self) -> List[Optional[TopLevelDataset]]: + """Build all dataset splits according to the provided blend(s) + + This method is distributed-aware and must be called on all ranks. + + The dataset splits returned can vary according to the config. Supply config.blend and + config.split to build BlendedDataset and/or MegatronDataset splits from the same + distribution. Supply config.blend_per_split to build BlendedDataset and/or MegatronDataset + splits from separate distributions. In either case, for each split, handle the following + cases: + + (1) The split is None + - do nothing + + (2) The split has one contributing dataset, and... 
+ + (a) 'size' is not None + - Build a mid-level dataset with low-level dataset sampling in proportion to the size + + (b) 'size' is None + - Build mid-level datasets with no excess low-level dataset sampling + + (3) The split has multiple contributing datasets, and... + + (a) 'weights' is not None and 'size' is not None + - Build mid-level datasets with low-level dataset sampling in proportion to their weights and the size + - Build a top-level dataset of length marginally greater than 'size' with mid-level dataset sampling in proportion to their weights and the size + + (b) 'weights' is not None and 'size' is None + - Error + + (c) 'weights' is None and 'size' is not None + - Build mid-level datasets with no excess low-level dataset sampling + - Build a top-level dataset of length 'size' with mid-level dataset sampling in proportion to their lengths and the size + + - The 'size' of the top-level dataset is capped at the sum of the mid-level dataset lengths + + (d) 'weights' is None and 'size' is None + - Build mid-level datasets with no excess low-level dataset sampling + - Build a top-level dataset with no excess mid-level dataset sampling + + Returns: + List[Optional[TopLevelDataset]]: A list containing a dataset instance (or None) per split + """ + datasets = self._build_blended_dataset_splits() + + for dataset in datasets: + if dataset is not None and len(dataset) > 0: + if isinstance(dataset, BlendedDataset): + if dataset.built_anew_on_cache_miss or any( + x.built_anew_on_cache_miss for x in dataset.datasets + ): + log_single_rank( + logger, + logging.INFO, + f"Verifying NumPy indices for {type(dataset).__name__} {dataset.split.name} split", + ) + else: + log_single_rank( + logger, + logging.INFO, + f"NumPy indices for {type(dataset).__name__} {dataset.split.name} split are fully cached, skipping verification", + ) + continue + # Check blend size + assert dataset.size is None or dataset.size == dataset.dataset_index.shape[0] + # Check blend access of mid-level datasets + _, sizes = numpy.unique(dataset.dataset_index, return_counts=True) + for i, dataset_and_size in enumerate(zip(dataset.datasets, sizes)): + if len(dataset_and_size[0]) < dataset_and_size[1]: + raise IndexError( + f"The {dataset.split.name} blend oversamples (N = {dataset_and_size[1]}) {type(dataset_and_size[0]).__name__} {i} (len = {len(dataset_and_size[0])}). " + f"Set renormalize_blend_weights to True and re-run. File an issue if the problem is not resolved." + ) + + return datasets + + def _build_blended_dataset_splits(self) -> List[Optional[TopLevelDataset]]: + """Build all dataset splits according to the provided blend(s) + + See the BlendedMegatronDatasetBuilder.build alias for more information. 
+ + Returns: + List[Optional[TopLevelDataset]]: A list containing a dataset instance (or None) per split + """ + ## + # Return fake "mock" datasets + ## + if self.config.mock: + split = self.config.split_matrix + try: + return self._build_megatron_dataset_splits(None, split, self.sizes) + except Exception as error: + raise Exception( + f"{self.cls.__name__} failed to build as a mock data generator" + ) from error + + ## + # All splits come from the same distribution + ## + elif self.config.blend: + prefixes, weights = self.config.blend + if weights is not None: + weights = normalize(weights) + + split = self.config.split_matrix + + # Blend consists of a single prefix + if len(prefixes) == 1 and weights is None: + return self._build_megatron_dataset_splits(prefixes[0], split, self.sizes) + + # Build the mid-level datasets + if weights is None: + sizes_per_dataset = [[None for split in Split] for prefix in prefixes] + else: + sizes_per_dataset = _get_size_per_split_per_dataset(weights, self.sizes) + + # build each dataset in parallel + megatron_datasets = self._build_megatron_datasets_parallel( + prefixes, split, sizes_per_dataset + ) + + # Build the top-level datasets + blended_datasets = [None] * len(Split) + for i in range(len(Split)): + if split[i] is not None: + weights_i = weights + if weights_i is not None and self.sizes[i] is not None: + size_per_dataset = list(zip(*sizes_per_dataset))[i] + size_i = sum(size_per_dataset) + if self.config.renormalize_blend_weights: + weights_i = list(map(lambda _size: _size / size_i, size_per_dataset)) + elif weights_i is None: + try: + weights_i = [ + len(megatron_dataset) for megatron_dataset in megatron_datasets[i] + ] + except TypeError: + weights_i = [0 for _ in prefixes] + if self.sizes[i] is not None: + size_i = min(self.sizes[i], sum(weights_i)) + else: + size_i = None # => the size will be sum(weights_i) + else: + raise RuntimeError + blended_datasets[i] = self.build_generic_dataset( + BlendedDataset, + self.is_built_on_rank, + True, # synchronize_ranks, default behavior to build on rank-0 first + megatron_datasets[i], + weights_i, + size_i, + self.config, + ) + + return blended_datasets + + ## + # Each split comes from a separate distribution + ## + else: + blended_datasets = [None] * len(Split) + for i in range(len(Split)): + split_spoof = [None] * len(Split) + split_spoof[i] = (0.0, 1.0) + sizes_spoof = [0] * len(Split) + sizes_spoof[i] = self.sizes[i] + + # Blend is provided for the split + blend = self.config.blend_per_split[i] + if blend is not None: + prefixes, weights = blend + if weights is not None: + weights = normalize(weights) + + # Blend consists of a sigle prefix + if len(prefixes) == 1: + blended_datasets[i] = self._build_megatron_dataset_splits( + prefixes[0], split_spoof, sizes_spoof + )[i] + continue + + # Build mid-level datasets + if weights is None: + sizes_per_dataset = [[None for split in Split] for prefix in prefixes] + else: + sizes_per_dataset = _get_size_per_split_per_dataset(weights, sizes_spoof) + + # build each dataset in parallel + megatron_datasets = self._build_megatron_datasets_parallel( + prefixes, split_spoof, sizes_per_dataset + )[i] + + # Build top-level dataset + if weights is not None and self.sizes[i] is not None: + size_per_dataset = list(zip(*sizes_per_dataset))[i] + size = sum(size_per_dataset) + if self.config.renormalize_blend_weights: + weights = list(map(lambda _size: _size / size, size_per_dataset)) + elif weights is None: + try: + weights = [ + len(megatron_dataset) for megatron_dataset in 
megatron_datasets + ] + except TypeError: + weights = [0 for _ in prefixes] + if self.sizes[i] is not None: + size = min(self.sizes[i], sum(weights)) + else: + size = None # => the size will be sum(weights) + else: + raise RuntimeError + blended_datasets[i] = self.build_generic_dataset( + BlendedDataset, + self.is_built_on_rank, + True, # synchronize_ranks, default behavior to build on rank-0 first + megatron_datasets, + weights, + size, + self.config, + ) + + return blended_datasets + + def _build_megatron_datasets_parallel( + self, prefixes: List[str], split: List[float], sizes_per_dataset: List[List[int]] + ) -> List[List[Optional[MegatronDataset]]]: + """Build the megatron datasets for a list of prefixes in parallel + + Args: + prefixes (List[str]): The list of prefix strings + + split (List[float]): The dataset split ratios (must sum to 1.00) + + sizes_per_dataset (List[List[int]]): The number of samples to request + per MegatronDataset per spilt + + Returns: + List[List[Optional[MegatronDataset]]]: For each split, have a list of + MegatronDataset per prefix + """ + + # Helper function to wrap the threading logic + def _threading_helper( + megatron_datasets: List[List[Optional[MegatronDataset]]], + num_workers: int, + prefixes: List[str], + split: List[float], + sizes_per_dataset: List[List[int]], + ) -> None: + with ThreadPoolExecutor(max_workers=num_workers) as executor: + all_futures = [] + for i in range(len(prefixes)): + all_futures.append( + executor.submit( + self._build_megatron_dataset_splits, + prefixes[i], + split, + sizes_per_dataset[i], + False, # synchronize_ranks, barrier is called in this function + ) + ) + for future in all_futures: + try: + megatron_datasets_split = future.result() + for j in range(len(megatron_datasets_split)): + megatron_datasets[j].append(megatron_datasets_split[j]) + except Exception as err: + raise err + + megatron_datasets = [[] for _ in range(len(Split))] + num_dataset_builder_threads = self.config.num_dataset_builder_threads + + if torch.distributed.is_initialized(): + rank = torch.distributed.get_rank() + # First, build on rank 0 + if rank == 0: + num_workers = num_dataset_builder_threads + if num_workers > 1: + # since only rank 0 is running, scale up the thread count + # but not too much to avoid overloading storage on miss path. + # if user set num_dataset_builder_threads to 1, + # i.e. meant for serial build, do not scale up. 
+ num_workers *= min(2, max(1, torch.cuda.device_count())) + _threading_helper( + megatron_datasets, num_workers, prefixes, split, sizes_per_dataset + ) + + torch.distributed.barrier() + + # Then, build on other ranks; guaranteed to be data_cache hit + if rank != 0: + _threading_helper( + megatron_datasets, + num_dataset_builder_threads, + prefixes, + split, + sizes_per_dataset, + ) + else: + _threading_helper( + megatron_datasets, num_dataset_builder_threads, prefixes, split, sizes_per_dataset + ) + + return megatron_datasets + + def _build_megatron_dataset_splits( + self, + dataset_path: Optional[str], + split: List[float], + sizes: List[int], + synchronize_ranks: bool = True, + ) -> List[Optional[MidLevelDataset]]: + """Build each MidLevelDataset split from a single LowLevelDataset + + Args: + dataset_path (Optional[str]): The path on disk which defines the underlying LowLevelDataset, or None for mock dataset classes + + split (List[Tuple[float, float]]): The dataset split matrix + + sizes (List[int]): The number of total samples to draw from each split + + synchronize_ranks (bool): Whether to call barrier for rank-0 / barrier / other-ranks behavior. Set to False when we enforce this behavior at higher level. + + Returns: + List[Optional[MidLevelDataset]]: The MidLevelDataset (or None) per split + """ + # short-cut if we are not building on this rank + if torch.distributed.is_initialized() and not self.is_built_on_rank(): + for i in range(len(Split)): + if split[i] is not None and synchronize_ranks: + torch.distributed.barrier() + return [None] * len(Split) + + # Build the low level dataset + low_level_dataset = self.cls.build_low_level_dataset(dataset_path, self.config) + + # Build the split indices for the low level dataset + num_elements = self.cls.numel_low_level_dataset(low_level_dataset) + split_indices = [] + for i, _ in enumerate(Split): + if split[i] is not None: + beg = int(round(split[i][0] * float(num_elements))) + end = int(round(split[i][1] * float(num_elements))) + split_indices.append(numpy.arange(start=beg, stop=end, step=1, dtype=numpy.int32)) + else: + split_indices.append(None) + + # Build the mid level dataset + mid_level_datasets = [] + for i, _split in enumerate(Split): + if split[i] is None: + mid_level_datasets.append(None) + else: + mid_level_datasets.append( + self.build_generic_dataset( + self.cls, + self.is_built_on_rank, + synchronize_ranks, + low_level_dataset, + dataset_path, + split_indices[i], + sizes[i], + _split, + self.config, + ) + ) + + return mid_level_datasets + + @staticmethod + def build_generic_dataset( + cls: Union[Type[DistributedDataset], Callable], + is_built_on_rank: Callable, + synchronize_ranks: bool, + *args: Any, + ) -> Optional[Union[DistributedDataset, Iterable]]: + """Build the DistributedDataset + + Return None if and only if the underlying dataset class is not built on the current rank + and torch.distributed is initialized. + + Args: + cls (Union[Type[DistributedDataset], Callable]): The DistributedDataset class to be built. In special cases, e.g. when we are building the low level dataset for a RawMegatronDataset instance, we can accept a Callable which returns an Iterable. + + synchronize_ranks (bool): Whether to call barrier for rank-0 / barrier / other-ranks behavior. Set to False when we enforce this behavior at higher level. 
+ + args (Tuple[Any]): The positional arguments used to build the provided DistributedDataset class + + Raises: + Exception: When the dataset constructor raises an OSError + + Returns: + Optional[Union[DistributedDataset, Iterable]]: The DistributedDataset instantion, the Iterable instantiation, or None + """ + if torch.distributed.is_initialized(): + rank = torch.distributed.get_rank() + + dataset = None + + # First, build on rank 0 + if rank == 0 and is_built_on_rank(): + try: + dataset = cls(*args) + except OSError as err: + log = ( + f"Failed to write dataset materials to the data cache directory. " + + f"Please supply a directory to which you have write access via " + + f"the path_to_cache attribute in BlendedMegatronDatasetConfig and " + + f"retry. Refer to the preserved traceback above for more information." + ) + raise Exception(log) from err + + if synchronize_ranks: + torch.distributed.barrier() + + # After, build on other ranks + if rank != 0 and is_built_on_rank(): + dataset = cls(*args) + + return dataset + + return cls(*args) + + +def _get_size_per_split_per_dataset( + normalized_weights: List[float], target_size_per_split: List[int] +) -> List[List[int]]: + """Determine the contribution of the MegatronDataset splits to the BlendedDataset splits + + Args: + normalized_weights (List[float]): e.g. [0.3, 0.7] + + target_size_per_split (List[int]): The number of samples to target for each BlendedDataset split + + Returns: + List[List[int]]: The number of samples to request per MegatronDataset per split + """ + assert numpy.isclose(sum(normalized_weights), 1.0) + + # Use 0.5% target margin to ensure we satiate the request + sizes_per_dataset = [ + [int(math.ceil(target_size * weight * 1.005)) for target_size in target_size_per_split] + for weight in normalized_weights + ] + + return sizes_per_dataset diff --git a/megatron/core/datasets/blended_megatron_dataset_config.py b/megatron/core/datasets/blended_megatron_dataset_config.py new file mode 100644 index 0000000000..52bc31f62e --- /dev/null +++ b/megatron/core/datasets/blended_megatron_dataset_config.py @@ -0,0 +1,177 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +import functools +import logging +import re +from dataclasses import dataclass, field +from typing import List, Optional, Tuple + +from megatron.core.datasets.megatron_tokenizer import MegatronTokenizer +from megatron.core.datasets.utils import Split, log_single_rank, normalize + +logger = logging.getLogger(__name__) + + +@dataclass +class BlendedMegatronDatasetConfig: + """Configuration object for Megatron Core datasets""" + + random_seed: int + """The seed for all RNG during dataset creation.""" + + sequence_length: int + """The sequence length.""" + + blend: Optional[Tuple[List[str], Optional[List[float]]]] = None + """The blend, consisting of a list of dataset prefixes and optionally a list of dataset + weights. For example, [["dataset-path1", "dataset-path2"], [0.3, 0.7]]. When the weights are + None, they are inferred from the lengths of the contributing datasets. Not to be used with + 'blend_per_split'. Defaults to None. + """ + + blend_per_split: Optional[List[Optional[Tuple[List[str], Optional[List[float]]]]]] = None + """A set of blends, as defined above, one for each split distribution. Not to be used with + 'blend'. Defauls to None. + """ + + renormalize_blend_weights: bool = False + """Renormalize the blend weights to account for mid-level dataset oversampling done to ensure + fulfillmenet of the of the requested number of samples. 
Defaults to False for backward + comparability in the data sample order. + """ + + split: Optional[str] = None + """The split string, a comma separated weighting for the dataset splits when drawing samples + from a single distribution. Not to be used with 'blend_per_split'. Defaults to None. + """ + + split_matrix: Optional[List[Tuple[float, float]]] = field(init=False, default=None) + """The split matrix consisting of non-overlapping book-ends of each split in order. For more + information, refer to 'convert_split_vector_to_split_matrix'. Created automatically from + 'split'. Not to be passed in to the constructor. + """ + + num_dataset_builder_threads: int = 1 + """The number of threads to use for dataset building.""" + + path_to_cache: Optional[str] = None + """Where all re-useable dataset indices are to be cached.""" + + mmap_bin_files: bool = True + """Whether to mmap the .bin files or use file pointers.""" + + mock: bool = field(init=False, default=False) + """Whether to bypass real data loading and validation in favor of mock data generation. + Created automatically from 'blend' and 'blend_per_split'. Not to be passed in to the + constructor. + """ + + tokenizer: Optional[MegatronTokenizer] = None + """The MegatronTokenizer instance or None. Required for datasets which do online tokenization.""" + + def __post_init__(self) -> None: + """Do asserts and set fields post init""" + if self.blend_per_split is not None and any(self.blend_per_split): + assert self.blend is None, "blend and blend_per_split are incompatible" + assert self.split is None, "split and blend_per_split are incompatible" + assert len(self.blend_per_split) == len( + Split + ), f"blend_per_split must contain {len(Split)} blends" + for split in Split: + if self.blend_per_split[split.value] is None: + log_single_rank( + logger, logging.INFO, f"blend not provided for {split.name} split" + ) + else: + assert self.blend_per_split[split.value][1] is None or len( + self.blend_per_split[split.value][0] + ) == len( + self.blend_per_split[split.value][1] + ), "blend per split prefixes and weights must be equal in number" + else: + if self.blend is not None: + assert self.blend[1] is None or len(self.blend[0]) == len( + self.blend[1] + ), "blend prefixes and weights must be equal in number" + assert self.split is not None, "split must be provided when blend is not None" + else: + self.mock = True + log_single_rank( + logger, + logging.INFO, + f"Let mock = True, as both blend and blend_per_split are None", + ) + self.split = "1,1,1" + log_single_rank( + logger, + logging.INFO, + f"Let split = {self.split}, an arbitrarily even split, as mock is True", + ) + split_vector = parse_and_normalize_split(self.split) + self.split_matrix = convert_split_vector_to_split_matrix(split_vector) + log_single_rank(logger, logging.INFO, f"Let split_matrix = {self.split_matrix}") + + +def parse_and_normalize_split(split: str) -> List[float]: + """Parse the dataset split ratios from a string + + Args: + split (str): The train valid test split string e.g. "99,1,0" + + Returns: + List[float]: The trian valid test split ratios e.g. 
[0.99, 0.01, 0.0] + """ + split = list(map(float, re.findall(r"[.0-9]+", split))) + split = split + [0.0 for _ in range(len(Split) - len(split))] + + assert len(split) == len(Split) + assert all(map(lambda _: _ >= 0.0, split)) + + split = normalize(split) + + return split + + +def convert_split_vector_to_split_matrix( + vector_a: List[float], vector_b: Optional[List[float]] = None +) -> List[Optional[Tuple[float, float]]]: + """Build the split matrix from one or optionally two contributing split vectors. + + Ex. a standard conversion: + + [0.99, 0.01, 0.0] -> [(0, 0.99), (0.99, 1.0), None] + + Ex. a conversion for Retro when Retro pretraining uses a [0.99, 0.01, 0.0] split and Retro + preprocessing used a [0.98, 0.02, 0.0] split: + + [0.99, 0.01, 0.0], [0.98, 0.02, 0.0] -> [(0, 0.98), (0.99, 1.0), None] + + Args: + vector_a (List[float]): The primary split vector + + vector_b (Optional[List[float]]): An optional secondary split vector which constrains the primary split vector. Defaults to None. + + Returns: + List[Tuple[float, float]]: The split matrix consisting of book-ends of each split in order + """ + if vector_b is None: + vector_b = vector_a + + # [.900, .090, .010] -> [0.00, .900, .990, 100] + expansion_a = functools.reduce(lambda a, b: a + [a[len(a) - 1] + b], [[0], *vector_a]) + expansion_b = functools.reduce(lambda a, b: a + [a[len(a) - 1] + b], [[0], *vector_b]) + + # [0.00, .900, .990, 100.0] -> [(0.00, .900), (.900, .990), (.990, 100)] + bookends_a = list(zip(expansion_a[:-1], expansion_a[1:])) + bookends_b = list(zip(expansion_b[:-1], expansion_b[1:])) + + # gather per-split overlap or None + matrix = [] + for bookend_a, bookend_b in zip(bookends_a, bookends_b): + if min(bookend_a[1], bookend_b[1]) <= max(bookend_a[0], bookend_b[0]): + overlap = None + else: + overlap = (max(bookend_a[0], bookend_b[0]), min(bookend_a[1], bookend_b[1])) + matrix.append(overlap) + + return matrix diff --git a/megatron/core/datasets/gpt_dataset.py b/megatron/core/datasets/gpt_dataset.py new file mode 100644 index 0000000000..115727de92 --- /dev/null +++ b/megatron/core/datasets/gpt_dataset.py @@ -0,0 +1,778 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +import logging +import os +import time +from dataclasses import dataclass +from typing import Dict, Optional, Tuple + +import numpy +import torch + +from megatron.core.datasets.blended_megatron_dataset_config import BlendedMegatronDatasetConfig +from megatron.core.datasets.indexed_dataset import IndexedDataset +from megatron.core.datasets.megatron_dataset import MegatronDataset +from megatron.core.datasets.megatron_tokenizer import MegatronTokenizer +from megatron.core.datasets.utils import Split +from megatron.core.datasets.utils_s3 import S3Config, is_s3_path +from megatron.core.utils import log_single_rank + +logger = logging.getLogger(__name__) + +_PAD_TOKEN_ID = -1 + + +@dataclass +class GPTDatasetConfig(BlendedMegatronDatasetConfig): + """Configuration object for Megatron Core GPT datasets""" + + reset_position_ids: bool = None + """Option to reset the position IDs in the dataset at an interval""" + + reset_attention_mask: bool = None + """Option to reset the attention mask from the dataset""" + + eod_mask_loss: bool = None + """Option to enable the EOD mask loss""" + + create_attention_mask: bool = True + """Option to enable the attention masks generation. Can be disabled if attention kernel + generates masks by itself. 
+ """ + + drop_last_partial_validation_sequence: bool = True + """Option to drop the last partial validation sequence""" + + add_extra_token_to_sequence: bool = True + """Option to draw sequences with one extra token to ensure the sample input tokens and sample + output tokens are both of the desired sequence length + """ + + s3_cache_path: str = None + """Path for caching indices for s3 dataloading.""" + + def __post_init__(self) -> None: + """Do asserts and set fields post init""" + super().__post_init__() + + assert self.tokenizer is not None + + assert self.reset_position_ids is not None + assert self.reset_attention_mask is not None + assert self.eod_mask_loss is not None + + +class GPTDataset(MegatronDataset): + """The base GPT dataset + + Args: + indexed_dataset (IndexedDataset): The IndexedDataset around which to build the GPTDataset + + dataset_path (Optional[str]): The real path on disk to the dataset, for bookkeeping + + indexed_indices (numpy.ndarray): The set of the documents indices to expose + + num_samples (Optional[int]): The number of samples to draw from the indexed dataset. When None, build as many samples as correspond to one epoch. + + index_split (Split): The indexed_indices Split + + config (GPTDatasetConfig): The config + """ + + def __init__( + self, + indexed_dataset: IndexedDataset, + dataset_path: Optional[str], + indexed_indices: numpy.ndarray, + num_samples: Optional[int], + index_split: Split, + config: GPTDatasetConfig, + ) -> None: + super().__init__( + indexed_dataset, dataset_path, indexed_indices, num_samples, index_split, config + ) + self.masks_and_position_ids_are_cacheable = not any( + [ + self.config.reset_position_ids, + self.config.reset_attention_mask, + self.config.eod_mask_loss, + ] + ) + self.masks_and_position_ids_are_cached = False + self.cached_attention_mask = None + self.cached_loss_mask = None + self.cached_position_ids = None + + try: + self._pad_token_id = self.config.tokenizer.pad + except Exception: + self._pad_token_id = _PAD_TOKEN_ID + + (self.document_index, self.sample_index, self.shuffle_index) = ( + self._build_document_sample_shuffle_indices() + ) + + @staticmethod + def numel_low_level_dataset(low_level_dataset: IndexedDataset) -> int: + """Abstract method implementation + + For GPT, the underlying IndexedDataset should be split by sequence, as opposed to, say, + BERT, which should be split by document + + Args: + low_level_dataset (IndexedDataset): The underlying IndexedDataset + + Returns: + int: The number of unique elements in the underlying IndexedDataset + """ + return low_level_dataset.sequence_lengths.shape[0] + + @staticmethod + def build_low_level_dataset(dataset_path: str, config: GPTDatasetConfig) -> IndexedDataset: + """Abstract method implementation + + Args: + dataset_path (str): The real path prefix to the IndexedDataset .bin and .idx files + + config (GPTDatasetConfig): The config + + Returns: + IndexedDataset: The underlying IndexedDataset + """ + if is_s3_path(dataset_path): + return IndexedDataset( + dataset_path, + multimodal=False, + mmap=config.mmap_bin_files, + s3_config=S3Config(path_to_idx_cache=config.s3_cache_path), + ) + return IndexedDataset(dataset_path, multimodal=False, mmap=config.mmap_bin_files) + + def __len__(self) -> int: + """Abstract method implementation + + Returns: + int: The length of the dataset + """ + return self.sample_index.shape[0] - 1 + + def __getitem__(self, idx: Optional[int]) -> Dict[str, torch.Tensor]: + """Abstract method implementation + + Args: + idx (Optioal[int]): 
The index into the dataset + + Returns: + Dict[str, torch.Tensor]: The sample information wrapped in a dictionary + """ + if idx is None: + # Batch padding sequence so the index does not matter + text, _ = self._query_document_sample_shuffle_indices(0) + else: + text, _ = self._query_document_sample_shuffle_indices(idx) + + text = torch.from_numpy(text).long() + if self.config.add_extra_token_to_sequence: + tokens = text[:-1].contiguous() + labels = text[1:].contiguous() + else: + tokens = text + labels = torch.roll(text, shifts=-1, dims=0) + labels[-1] = self._pad_token_id + + if ( + not self.masks_and_position_ids_are_cacheable + or not self.masks_and_position_ids_are_cached + ): + attention_mask, loss_mask, position_ids = _get_ltor_masks_and_position_ids( + tokens, + self.config.tokenizer.eod, + self.config.reset_position_ids, + self.config.reset_attention_mask, + self.config.eod_mask_loss, + self.config.create_attention_mask, + ) + if self.masks_and_position_ids_are_cacheable: + self.cached_attention_mask = attention_mask + self.cached_loss_mask = loss_mask + self.cached_position_ids = position_ids + self.masks_and_position_ids_are_cached = True + else: + attention_mask = self.cached_attention_mask + loss_mask = self.cached_loss_mask + position_ids = self.cached_position_ids + + # For padded sequences, mask the loss + loss_mask[labels == self._pad_token_id] = 0.0 + + # For padded sequences, ensure the embedding layer can map the token ID + tokens[tokens == self._pad_token_id] = 0 + labels[labels == self._pad_token_id] = 0 + + # Batch padding sequence so we mask the loss + if idx is None: + loss_mask = torch.zeros_like(loss_mask) + + if self.config.create_attention_mask: + return { + "tokens": tokens, + "labels": labels, + "attention_mask": attention_mask, + "loss_mask": loss_mask, + "position_ids": position_ids, + } + else: + return { + "tokens": tokens, + "labels": labels, + "loss_mask": loss_mask, + "position_ids": position_ids, + } + + def _query_document_sample_shuffle_indices( + self, idx: int + ) -> Tuple[numpy.ndarray, numpy.ndarray]: + """Get the text (token ids) and document ids for a given index + + Args: + idx (int): The index into the dataset + + Returns: + Tuple[numpy.ndarray, numpy.ndarray]: The text ids and document ids + """ + # Do the shuffle mapping + idx = self.shuffle_index[idx] + + # Get the beginning and end documents and offsets + doc_index_beg, doc_index_beg_offset = self.sample_index[idx] + doc_index_end, doc_index_end_offset = self.sample_index[idx + 1] + + document_ids = [] + sample_parts = [] + + # Sample spans a single document + if doc_index_beg == doc_index_end: + # Add the document id + document_ids.append(self.document_index[doc_index_beg]) + + # Add the entire sample + sample_parts.append( + self.dataset.get( + self.document_index[doc_index_beg], + offset=doc_index_beg_offset, + length=doc_index_end_offset + - doc_index_beg_offset + + self.config.add_extra_token_to_sequence, + ) + ) + + # Sample spans multiple documents + else: + for i in range(doc_index_beg, doc_index_end + 1): + # Add the document id + document_ids.append(self.document_index[i]) + + # Add the sample part + offset = 0 if i > doc_index_beg else doc_index_beg_offset + length = ( + None + if i < doc_index_end + else doc_index_end_offset + self.config.add_extra_token_to_sequence + ) + sample_parts.append( + self.dataset.get(self.document_index[i], offset=offset, length=length) + ) + assert len(document_ids) == len( + sample_parts + ), f"len(document_ids) ({len(document_ids)}) != 
len(sample_parts) ({len(sample_parts)})" + + length = sum(map(len, sample_parts)) + + # Pad the sample if necessary + if length < (self.config.sequence_length + self.config.add_extra_token_to_sequence): + sample_parts.append( + [self._pad_token_id] + * (self.config.sequence_length + self.config.add_extra_token_to_sequence - length) + ) + + return ( + numpy.concatenate(sample_parts, dtype=numpy.int64), + numpy.array(document_ids, dtype=numpy.int64), + ) + + def _build_document_sample_shuffle_indices( + self, + ) -> Tuple[numpy.ndarray, numpy.ndarray, numpy.ndarray]: + """Build the document index, the sample index, and the shuffle index + + The document index: + -- 1-D + -- An ordered array of document ids + + The sample index: + -- 2-D + -- The document indices and offsets which mark the start of every sample + + The shuffle index: + -- 1-D + -- A random permutation of index range of the sample index + + Returns: + Tuple[numpy.ndarray, numpy.ndarray]: The document index, the sample index, and the shuffle index + """ + path_to_cache = self.config.path_to_cache + if path_to_cache is None and not self.config.mock: + path_to_cache = os.path.join( + self.dataset.path_prefix, "cache", f"{type(self).__name__}_indices" + ) + + if path_to_cache: + get_path_to = lambda suffix: os.path.join( + path_to_cache, + f"{self.unique_description_hash}-{type(self).__name__}-{self.index_split.name}-{suffix}", + ) + path_to_description = get_path_to("description.txt") + path_to_document_index = get_path_to("document_index.npy") + path_to_sample_index = get_path_to("sample_index.npy") + path_to_shuffle_index = get_path_to("shuffle_index.npy") + cache_hit = all( + map( + os.path.isfile, + [ + path_to_description, + path_to_document_index, + path_to_sample_index, + path_to_shuffle_index, + ], + ) + ) + else: + cache_hit = False + + if not path_to_cache or ( + not cache_hit + and (not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0) + ): + + log_single_rank( + logger, + logging.INFO, + f"Build and save the {type(self).__name__} {self.index_split.name} indices", + ) + self.built_anew_on_cache_miss = True + t_beg = time.time() + + sequence_length = self.config.sequence_length + num_tokens_per_epoch = self._get_num_tokens_per_epoch() + num_epochs = self._get_num_epochs(num_tokens_per_epoch) + + if num_epochs == 1: + separate_final_epoch = False + else: + # Get the number of samples for the last epoch + num_samples_sans_final_epoch = ( + (num_epochs - 1) * num_tokens_per_epoch + - self.config.add_extra_token_to_sequence + ) // sequence_length + num_samples_from_final_epoch = self.num_samples - num_samples_sans_final_epoch + num_samples_per_epoch = ( + num_tokens_per_epoch - self.config.add_extra_token_to_sequence + ) // sequence_length + + # num_samples_from_final_epoch should be non-negative + assert num_samples_from_final_epoch >= 0 + + # num_samples_from_final_epoch should not exceed max value + assert num_samples_from_final_epoch <= num_samples_per_epoch + 1 + + # Separate the final epoch if it falls below the threshold + threshold = 0.80 + separate_final_epoch = num_samples_from_final_epoch < int( + threshold * num_samples_per_epoch + ) + + log_single_rank( + logger, + logging.DEBUG, + f"> num_samples_from_final_epoch: {num_samples_from_final_epoch}", + ) + log_single_rank(logger, logging.DEBUG, f"> threshold: {threshold}") + log_single_rank( + logger, logging.DEBUG, f"> num_samples_per_epoch: {num_samples_per_epoch}" + ) + + log_single_rank( + logger, logging.DEBUG, f"> separate_final_epoch: 
{separate_final_epoch}" + ) + + numpy_random_state = numpy.random.RandomState(self.config.random_seed) + + # Build the document index + document_index = _build_document_index( + self.indices, num_epochs, numpy_random_state, separate_final_epoch + ) + + drop_last_partial_sequence = True + if self.index_split == Split.valid: + drop_last_partial_sequence = self.config.drop_last_partial_validation_sequence + + # Build the sample index + from megatron.core.datasets import helpers + + if self.index_split == Split.valid: + drop_last_partial_sequence = self.config.drop_last_partial_validation_sequence + else: + drop_last_partial_sequence = True + + assert document_index.dtype == numpy.int32 + assert self.dataset.sequence_lengths.dtype == numpy.int32 + if len(document_index) * 2 > len(self.dataset.sequence_lengths): + # Heuristic: if "access density" of sequence_lengths is relatively high, + # force loading the mmap-ed array into memory by taking a copy. + # System performance benefits come from two aspects: + # 1. **sequentially** pre-loading the whole file if we're gonna read a large fraction anyways. + # 2. GIL is held when calling into c++ code; making the c++ func faster improves parallelism. + sequence_lengths_for_cpp = self.dataset.sequence_lengths.copy() + else: + sequence_lengths_for_cpp = self.dataset.sequence_lengths + sample_index = helpers.build_sample_idx( + sequence_lengths_for_cpp, + document_index, + sequence_length, + num_epochs, + num_tokens_per_epoch, + drop_last_partial_sequence, + self.config.add_extra_token_to_sequence, + ) + + # Build the shuffle index + if separate_final_epoch: + shuffle_index = _build_shuffle_index( + num_samples_sans_final_epoch, sample_index.shape[0] - 1, numpy_random_state + ) + else: + shuffle_index = _build_shuffle_index( + sample_index.shape[0] - 1, sample_index.shape[0] - 1, numpy_random_state + ) + + if path_to_cache: + os.makedirs(path_to_cache, exist_ok=True) + # Write the description + with open(path_to_description, "wt") as writer: + writer.write(self.unique_description) + numpy.save(path_to_document_index, document_index, allow_pickle=True) + numpy.save(path_to_sample_index, sample_index, allow_pickle=True) + numpy.save(path_to_shuffle_index, shuffle_index, allow_pickle=True) + else: + log_single_rank( + logger, + logging.WARNING, + f"Unable to save the {type(self).__name__} indexes because path_to_cache is None", + ) + + t_end = time.time() + log_single_rank(logger, logging.DEBUG, f"\t> time elapsed: {t_end - t_beg:4f} seconds") + + log_single_rank( + logger, logging.INFO, f"> total number of samples: {sample_index.shape[0] - 1}" + ) + log_single_rank(logger, logging.INFO, f"> total number of epochs: {num_epochs}") + + return document_index, sample_index, shuffle_index + + log_single_rank( + logger, logging.INFO, f"Load the {type(self).__name__} {self.index_split.name} indices" + ) + + log_single_rank( + logger, + logging.INFO, + f"\tLoad the document index from {os.path.basename(path_to_document_index)}", + ) + t_beg = time.time() + document_index = numpy.load(path_to_document_index, allow_pickle=True, mmap_mode='r') + t_end = time.time() + log_single_rank(logger, logging.DEBUG, f"\t> time elapsed: {t_end - t_beg:4f} seconds") + + log_single_rank( + logger, + logging.INFO, + f"\tLoad the sample index from {os.path.basename(path_to_sample_index)}", + ) + t_beg = time.time() + sample_index = numpy.load(path_to_sample_index, allow_pickle=True, mmap_mode='r') + t_end = time.time() + log_single_rank(logger, logging.DEBUG, f"\t> time elapsed: 
{t_end - t_beg:4f} seconds") + + log_single_rank( + logger, + logging.INFO, + f"\tLoad the shuffle index from {os.path.basename(path_to_shuffle_index)}", + ) + t_beg = time.time() + shuffle_index = numpy.load(path_to_shuffle_index, allow_pickle=True, mmap_mode='r') + t_end = time.time() + log_single_rank(logger, logging.DEBUG, f"\t> time elapsed: {t_end - t_beg:4f} seconds") + + log_single_rank( + logger, logging.INFO, f"> total number of samples: {sample_index.shape[0] - 1}" + ) + + return document_index, sample_index, shuffle_index + + def _get_num_tokens_per_epoch(self) -> int: + """Calculate the number of tokens in a single epoch + + Returns: + int: The number of tokens in a single epoch + """ + return int(numpy.sum(self.dataset.sequence_lengths[self.indices])) + + def _get_num_epochs(self, num_tokens_per_epoch: int) -> int: + """Calculate the number of epochs + + Args: + num_tokens_per_epoch (int): The number of tokens in a single epoch + + Returns: + int: The number of epochs + """ + num_epochs = 1 + num_tokens = num_tokens_per_epoch + if self.num_samples is None: + return num_epochs + else: + num_tokens_requested = ( + self.num_samples * self.config.sequence_length + ) + self.config.add_extra_token_to_sequence + while num_tokens < num_tokens_requested: + num_epochs += 1 + num_tokens += num_tokens_per_epoch + return num_epochs + + +def _build_document_index( + documents: numpy.ndarray, + num_epochs: int, + numpy_random_state: numpy.random.RandomState, + separate_final_epoch: bool, +) -> numpy.ndarray: + """Build an array with length = num epochs * num documents + + Args: + documents (numpy.ndarray): the subset of exposed document indices + + num_epochs (int): The number of epochs + + numpy_random_state (numpy.random.RandomState): The NumPy random state + + separate_final_epoch (bool): Whether to exclude the last epoch from the global shuffle + + Returns: + numpy.ndarray: The document index + """ + if not separate_final_epoch or num_epochs == 1: + document_index = numpy.mgrid[0:num_epochs, 0 : len(documents)][1] + document_index[:] = documents + document_index = document_index.reshape(-1) + document_index = document_index.astype(numpy.int32) + numpy_random_state.shuffle(document_index) + return document_index + + doc_idx_first = _build_document_index(documents, num_epochs - 1, numpy_random_state, False) + doc_idx_last = _build_document_index(documents, 1, numpy_random_state, False) + return numpy.concatenate((doc_idx_first, doc_idx_last)) + + +def _build_shuffle_index( + num_samples: int, total_size: int, numpy_random_state: numpy.random.RandomState +) -> numpy.ndarray: + """Build the range [0, size) and shuffle + + Args: + num_samples (int): The size of the first shuffle range [0, num_samples) + + total_size (int): The size of the entire index. 
If larger than 'num_samples', it defines the second shuffle range [num_samples, total_size) + + numpy_random_state (numpy.random.RandomState): The NumPy random state + + Returns: + numpy.ndarray: The shuffle index + """ + dtype_ = numpy.uint32 + if total_size >= (numpy.iinfo(numpy.uint32).max - 1): + dtype_ = numpy.int64 + + shuffle_idx_first = numpy.arange(start=0, stop=num_samples, step=1, dtype=dtype_) + numpy_random_state.shuffle(shuffle_idx_first) + if num_samples == total_size: + return shuffle_idx_first + + shuffle_idx_last = numpy.arange(start=num_samples, stop=total_size, step=1, dtype=dtype_) + numpy_random_state.shuffle(shuffle_idx_last) + + return numpy.concatenate((shuffle_idx_first, shuffle_idx_last)) + + +def _get_ltor_masks_and_position_ids( + data: torch.Tensor, + eod_token: int, + reset_position_ids: bool, + reset_attention_mask: bool, + eod_mask_loss: bool, + create_attention_mask: bool, +): + """Build masks and position id for left to right model. + + Args: + data (torch.Tensor): The data tensor that holds the tokens from the dataset + + eod_token (int): ID of the token that is considered the EOD + + reset_position_ids (bool): Switch to reset the document position IDs + + reset_attention_mask (bool): Switch to reset the attention mask + + eod_mask_loss (bool): Switch to enable the EOD mask loss + + create_attention_mask (bool): Switch to enable the attention masks generation. Can be disabled if attention kernel generates masks by itself. + + Returns: + torch.Tensor: Attention mask needed to be used for Attention + + torch.Tensor: The mask used for loss value during training + + torch.Tensor: The position IDs of the token + """ + seq_length = data.numel() + + if create_attention_mask: + attention_mask = torch.tril( + torch.ones((seq_length, seq_length), device=data.device) + ).unsqueeze(0) + else: + attention_mask = None + + # Loss mask. + loss_mask = torch.ones(seq_length, dtype=torch.float, device=data.device) + if eod_mask_loss: + loss_mask[data == eod_token] = 0.0 + + # Position ids. + position_ids = torch.arange(seq_length, dtype=torch.long, device=data.device) + # We need to clone as the ids will be modified based on batch index. + if reset_position_ids: + position_ids = position_ids.clone() + + if reset_position_ids or reset_attention_mask: + # Find indices where EOD token is. + eod_index = position_ids[data == eod_token] + # Detach indices from positions if going to modify positions. + if reset_position_ids: + eod_index = eod_index.clone() + + # Loop through EOD indices: + prev_index = 0 + for j in range(eod_index.numel()): + i = eod_index[j] + # Mask attention loss. + if reset_attention_mask and attention_mask is not None: + attention_mask[0, (i + 1) :, : (i + 1)] = 0 + # Reset positions.
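+ # For example: with an EOD token at index i = 3 and prev_index = 0, the positions + # [0, 1, 2, 3, 4, 5, 6] become [0, 1, 2, 3, 0, 1, 2] after the shift below.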
+ if reset_position_ids: + position_ids[(i + 1) :] -= i + 1 - prev_index + prev_index = i + 1 + + if attention_mask is not None: + # Convert attention mask to binary: + attention_mask = attention_mask < 0.5 + + return attention_mask, loss_mask, position_ids + + +class MockGPTLowLevelDataset: + + seed: int = 0 + size: int = 100000 + max_sequence_length: int = 4096 + + def __init__(self, tokenizer: MegatronTokenizer) -> None: + self.tokenizer = tokenizer + rng = numpy.random.default_rng(seed=self.seed) + self.sequence_lengths = rng.integers( + low=1, high=self.max_sequence_length, size=self.size, dtype=numpy.int32 + ) + + def __len__(self) -> int: + return self.size + + def __getitem__(self, idx: int) -> numpy.number: + length = self.sequence_lengths[idx] + sample = numpy.int64( + numpy.concatenate([numpy.arange(length - 1) + 1, [self.tokenizer.eod]]) + ) + return sample + + def get(self, idx: int, offset: int = 0, length: Optional[int] = None) -> numpy.ndarray: + if length is None: + length = self.sequence_lengths[idx] - offset + return self[idx][offset : offset + length] + + +class MockGPTDataset(GPTDataset): + """The mock GPT dataset + + Args: + indexed_dataset (MockGPTLowLevelDataset): The MockGPTLowLevelDataset around which to build the MockGPTDataset + + dataset_path (Optional[str]): This argument is of no consequence for the MockGPTDataset + + indices (numpy.ndarray): The set of the dataset indices to expose + + num_samples (int): The number of samples to draw from the dataset + + index_split (Split): The indices Split + + config (GPTDatasetConfig): The config + """ + + def __init__( + self, + dataset: MockGPTLowLevelDataset, + dataset_path: Optional[str], + indices: numpy.ndarray, + num_samples: int, + index_split: Split, + config: GPTDatasetConfig, + ) -> None: + assert config.mock + + super().__init__(dataset, dataset_path, indices, num_samples, index_split, config) + + @staticmethod + def numel_low_level_dataset(low_level_dataset: MockGPTLowLevelDataset) -> int: + """Abstract method implementation + + Args: + low_level_dataset (MockGPTLowLevelDataset): The underlying MockGPTLowLevelDataset + + Returns: + int: The number of unique elements in the underlying MockGPTLowLevelDataset + """ + return len(low_level_dataset) + + @staticmethod + def build_low_level_dataset( + dataset_path: Optional[str], config: GPTDatasetConfig + ) -> MockGPTLowLevelDataset: + """Abstract method implementation + + Args: + dataset_path (Optional[str]): This argument is of no consequence for the MockGPTLowLevelDataset + + config (GPTDatasetConfig): The config + + Returns: + MockGPTLowLevelDataset: The underlying MockGPTLowLevelDataset + """ + return MockGPTLowLevelDataset(config.tokenizer) diff --git a/megatron/core/datasets/helpers.cpp b/megatron/core/datasets/helpers.cpp new file mode 100644 index 0000000000..0b05f09d7a --- /dev/null +++ b/megatron/core/datasets/helpers.cpp @@ -0,0 +1,839 @@ +/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. */ + +/* Helper methods for fast index mapping builds */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace py = pybind11; +using namespace std; + +const int32_t LONG_SENTENCE_LEN = 512; + + +void build_exhaustive_blending_indices(py::array_t &dataset_index, py::array_t &dataset_sample_index, const py::array_t &sizes, const int32_t num_datasets) { + /* + Build blending indices by sampling exactly as many samples from dataset[i] + as is requested by sizes[i] for all i in the range [0, num_datasets). 
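+ The selection below is greedy: at every step, draw from the dataset whose target share + (weights[i] * step) most exceeds its current sample count, and retire a dataset once its + requested size is exhausted, so exactly sizes[i] samples are taken from dataset i.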
+ */ + auto dataset_index_ptr = dataset_index.mutable_unchecked<1>(); + auto dataset_sample_index_ptr = dataset_sample_index.mutable_unchecked<1>(); + auto sizes_ptr = sizes.unchecked<1>(); + + int64_t total_size = 0; + int64_t dataset_sample_counts[num_datasets]; + std::set dataset_unspent_indices; + for (int32_t i = 0; i < num_datasets; ++i) { + total_size += sizes_ptr[i]; + dataset_sample_counts[i] = 0; + dataset_unspent_indices.insert(i); + } + + // still need fractional weights to sample in proportion to sizes + double weights[num_datasets]; + for (int32_t i = 0; i < num_datasets; ++i) { + weights[i] = sizes_ptr[i] / static_cast(total_size); + } + + int64_t index_sample = 0; + while (dataset_unspent_indices.size() > 0) { + double index_sample_double = std::max(static_cast(index_sample), 1.0); + + int64_t error_argmax; + double error_max = std::numeric_limits::lowest(); + + for (int32_t index_dataset : dataset_unspent_indices) { + double error = weights[index_dataset] * index_sample_double - static_cast(dataset_sample_counts[index_dataset]); + if (error > error_max) { + error_argmax = index_dataset; + error_max = error; + } + } + + // Populate the indices. + dataset_index_ptr[index_sample] = static_cast(error_argmax); + dataset_sample_index_ptr[index_sample] = dataset_sample_counts[error_argmax]; + + // Update the total samples. + dataset_sample_counts[error_argmax] += 1; + + if (sizes_ptr[error_argmax] - static_cast(dataset_sample_counts[error_argmax]) == 0) { + dataset_unspent_indices.erase(error_argmax); + } + + index_sample += 1; + } +} + +void build_blending_indices(py::array_t &dataset_index, + py::array_t &dataset_sample_index, + const py::array_t &weights, + const int32_t num_datasets, + const int64_t size, const bool verbose) +{ + /* Given multiple datasets and a weighting array, build samples + such that it follows those wieghts.*/ + + if (verbose) + { + std::cout << "> building indices for blended datasets ..." << std::endl; + } + + // Get the pointer access without the checks. + auto dataset_index_ptr = dataset_index.mutable_unchecked<1>(); + auto dataset_sample_index_ptr = dataset_sample_index.mutable_unchecked<1>(); + auto weights_ptr = weights.unchecked<1>(); + + // Initialize buffer for number of samples used for each dataset. + int64_t current_samples[num_datasets]; + for (int64_t i = 0; i < num_datasets; ++i) + { + current_samples[i] = 0; + } + + // For each sample: + for (int64_t sample_idx = 0; sample_idx < size; ++sample_idx) + { + + // Determine where the max error in sampling is happening. + auto sample_idx_double = std::max(static_cast(sample_idx), 1.0); + int64_t max_error_index = 0; + double max_error = weights_ptr[0] * sample_idx_double - + static_cast(current_samples[0]); + for (int64_t dataset_idx = 1; dataset_idx < num_datasets; ++dataset_idx) + { + double error = weights_ptr[dataset_idx] * sample_idx_double - + static_cast(current_samples[dataset_idx]); + if (error > max_error) + { + max_error = error; + max_error_index = dataset_idx; + } + } + + // Populate the indices. + dataset_index_ptr[sample_idx] = static_cast(max_error_index); + dataset_sample_index_ptr[sample_idx] = current_samples[max_error_index]; + + // Update the total samples. 
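+ // (This counter feeds back into the error term above, keeping the realized mixture close + // to the requested weights.)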
+ current_samples[max_error_index] += 1; + } + + // print info + if (verbose) + { + std::cout << " > sample ratios:" << std::endl; + for (int64_t dataset_idx = 0; dataset_idx < num_datasets; ++dataset_idx) + { + auto ratio = static_cast(current_samples[dataset_idx]) / + static_cast(size); + std::cout << " dataset " << dataset_idx << ", input: " << weights_ptr[dataset_idx] << ", achieved: " << ratio << std::endl; + } + } +} + +py::array build_sample_idx(const py::array_t &sizes_, + const py::array_t &doc_idx_, + const int32_t seq_length, + const int32_t num_epochs, + const int64_t tokens_per_epoch, + const bool drop_last_partial_sequence = true, + const int add_extra_token_to_sequence = 1) +{ + /* Sample index (sample_idx) is used for gpt2 like dataset for which + the documents are flattened and the samples are built based on this + 1-D flatten array. It is a 2D array with sizes [number-of-samples + 1, 2] + where [..., 0] contains the index into `doc_idx` and [..., 1] is the + starting offset in that document.*/ + + // Consistency checks. + assert(seq_length > 1); + assert(num_epochs > 0); + assert(tokens_per_epoch > 1); + + // Remove bound checks. + auto sizes = sizes_.unchecked<1>(); + auto doc_idx = doc_idx_.unchecked<1>(); + + // Mapping and it's length (1D). + int64_t num_samples = 0; + if (drop_last_partial_sequence == true) + { + num_samples = (num_epochs * tokens_per_epoch - add_extra_token_to_sequence) / seq_length; + } + else + { + num_samples = ceil(float(num_epochs * tokens_per_epoch - add_extra_token_to_sequence) / seq_length); + } + int64_t *sample_idx = new int64_t[2 * (num_samples + 1)]; + + // Index into sample_idx. + int64_t sample_index = 0; + // Index into doc_idx. + int64_t doc_idx_index = 0; + // Begining offset for each document. + int32_t doc_offset = 0; + // Start with first document and no offset. + sample_idx[2 * sample_index] = doc_idx_index; + sample_idx[2 * sample_index + 1] = doc_offset; + ++sample_index; + + while (sample_index <= num_samples) + { + // Start with a fresh sequence. + int32_t remaining_seq_length = seq_length + add_extra_token_to_sequence; + while (remaining_seq_length != 0) + { + // Get the document length. + auto doc_id = doc_idx[doc_idx_index]; + auto doc_length = sizes[doc_id] - doc_offset; + // And add it to the current sequence. + remaining_seq_length -= doc_length; + // If we have more than a full sequence, adjust offset and set + // remaining length to zero so we return from the while loop. + // Note that -1 here is for the same reason we have -1 in + // `_num_epochs` calculations. + if (remaining_seq_length <= 0) + { + doc_offset += (remaining_seq_length + doc_length - add_extra_token_to_sequence); + remaining_seq_length = 0; + } + else + { + // Otherwise, start from the begining of the next document. + if (doc_idx_index == (doc_idx_.shape(0) - 1)) + { + // If we have reached the end of the documents, break. + assert(sample_index == num_samples); + doc_offset = sizes[doc_idx[doc_idx_index]] - add_extra_token_to_sequence; + break; + } + ++doc_idx_index; + doc_offset = 0; + } + } + // Record the sequence. + sample_idx[2 * sample_index] = doc_idx_index; + sample_idx[2 * sample_index + 1] = doc_offset; + ++sample_index; + } + + // Method to deallocate memory. + py::capsule free_when_done(sample_idx, [](void *mem_) + { + int64_t *mem = reinterpret_cast(mem_); + delete[] mem; }); + + // Return the numpy array. 
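+ // The array below has shape [num_samples + 1, 2]; row k holds the position in doc_idx and + // the in-document offset at which sample k begins.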
+ const auto byte_size = sizeof(int64_t); + return py::array(std::vector{num_samples + 1, 2}, // shape + {2 * byte_size, byte_size}, // C-style contiguous strides + sample_idx, // the data pointer + free_when_done); // numpy array references +} + +inline int32_t get_target_sample_len(const int32_t short_seq_ratio, + const int32_t max_length, + std::mt19937 &rand32_gen) +{ + /* Training sample length. */ + if (short_seq_ratio == 0) + { + return max_length; + } + const auto random_number = rand32_gen(); + if ((random_number % short_seq_ratio) == 0) + { + return 2 + random_number % (max_length - 1); + } + return max_length; +} + +template +py::array build_mapping_impl(const py::array_t &docs_, + const py::array_t &sizes_, + const int32_t num_epochs, + const uint64_t max_num_samples, + const int32_t max_seq_length, + const double short_seq_prob, + const int32_t seed, + const bool verbose, + const int32_t min_num_sent) +{ + /* Build a mapping of (start-index, end-index, sequence-length) where + start and end index are the indices of the sentences in the sample + and sequence-length is the target sequence length. + */ + + // Consistency checks. + assert(num_epochs > 0); + assert(max_seq_length > 1); + assert(short_seq_prob >= 0.0); + assert(short_seq_prob <= 1.0); + assert(seed > 0); + + // Remove bound checks. + auto docs = docs_.unchecked<1>(); + auto sizes = sizes_.unchecked<1>(); + + // For efficiency, convert probability to ratio. Note: rand() generates int. + int32_t short_seq_ratio = 0; + if (short_seq_prob > 0) + { + short_seq_ratio = static_cast(round(1.0 / short_seq_prob)); + } + + if (verbose) + { + const auto sent_start_index = docs[0]; + const auto sent_end_index = docs[docs_.shape(0) - 1]; + const auto num_sentences = sent_end_index - sent_start_index; + cout << " using:" << endl + << std::flush; + cout << " number of documents: " << docs_.shape(0) - 1 << endl + << std::flush; + cout << " sentences range: [" << sent_start_index << ", " << sent_end_index << ")" << endl + << std::flush; + cout << " total number of sentences: " << num_sentences << endl + << std::flush; + cout << " number of epochs: " << num_epochs << endl + << std::flush; + cout << " maximum number of samples: " << max_num_samples << endl + << std::flush; + cout << " maximum sequence length: " << max_seq_length << endl + << std::flush; + cout << " short sequence probability: " << short_seq_prob << endl + << std::flush; + cout << " short sequence ration (1/prob): " << short_seq_ratio << endl + << std::flush; + cout << " seed: " << seed << endl + << std::flush; + } + + // Mapping and it's length (1D). + int64_t num_samples = -1; + DocIdx *maps = NULL; + + // Perform two iterations, in the first iteration get the size + // and allocate memory and in the second iteration populate the map. + bool second = false; + for (int32_t iteration = 0; iteration < 2; ++iteration) + { + + // Set the seed so both iterations produce the same results. + std::mt19937 rand32_gen(seed); + + // Set the flag on second iteration. + second = (iteration == 1); + + // Counters: + uint64_t empty_docs = 0; + uint64_t one_sent_docs = 0; + uint64_t long_sent_docs = 0; + + // Current map index. + uint64_t map_index = 0; + + // For each epoch: + for (int32_t epoch = 0; epoch < num_epochs; ++epoch) + { + if (map_index >= max_num_samples) + { + if (verbose && (!second)) + { + cout << " reached " << max_num_samples << " samples after " + << epoch << " epochs ..." 
<< endl + << std::flush; + } + break; + } + // For each document: + for (int32_t doc = 0; doc < (docs.shape(0) - 1); ++doc) + { + + // Document sentences are in [sent_index_first, sent_index_last) + const auto sent_index_first = docs[doc]; + const auto sent_index_last = docs[doc + 1]; + + // At the begining of the document previous index is the + // start index. + auto prev_start_index = sent_index_first; + + // Remaining documents. + auto num_remain_sent = sent_index_last - sent_index_first; + + // Some bookkeeping + if ((epoch == 0) && (!second)) + { + if (num_remain_sent == 0) + { + ++empty_docs; + } + if (num_remain_sent == 1) + { + ++one_sent_docs; + } + } + + // Detect documents with long sentences. + bool contains_long_sentence = false; + if (num_remain_sent > 1) + { + for (auto sent_index = sent_index_first; + sent_index < sent_index_last; ++sent_index) + { + if (sizes[sent_index] > LONG_SENTENCE_LEN) + { + if ((epoch == 0) && (!second)) + { + ++long_sent_docs; + } + contains_long_sentence = true; + break; + } + } + } + + // If we have more than two sentences. + if ((num_remain_sent >= min_num_sent) && (!contains_long_sentence)) + { + + // Set values. + auto seq_len = int32_t{0}; + auto num_sent = int32_t{0}; + auto target_seq_len = get_target_sample_len(short_seq_ratio, + max_seq_length, + rand32_gen); + + // Loop through sentences. + for (auto sent_index = sent_index_first; + sent_index < sent_index_last; ++sent_index) + { + + // Add the size and number of sentences. + seq_len += sizes[sent_index]; + ++num_sent; + --num_remain_sent; + + // If we have reached the target length. + // and if not only one sentence is left in the document. + // and if we have at least two sentneces. + // and if we have reached end of the document. + if (((seq_len >= target_seq_len) && + (num_remain_sent > 1) && + (num_sent >= min_num_sent)) || + (num_remain_sent == 0)) + { + + // Check for overflow. + if ((3 * map_index + 2) > + std::numeric_limits::max()) + { + cout << "number of samples exceeded maximum " + << "allowed by type int64: " + << std::numeric_limits::max() + << endl; + throw std::overflow_error("Number of samples"); + } + + // Populate the map. + if (second) + { + const auto map_index_0 = 3 * map_index; + maps[map_index_0] = static_cast(prev_start_index); + maps[map_index_0 + 1] = static_cast(sent_index + 1); + maps[map_index_0 + 2] = static_cast(target_seq_len); + } + + // Update indices / counters. + ++map_index; + prev_start_index = sent_index + 1; + target_seq_len = get_target_sample_len(short_seq_ratio, + max_seq_length, + rand32_gen); + seq_len = 0; + num_sent = 0; + } + + } // for (auto sent_index=sent_index_first; ... + } // if (num_remain_sent > 1) { + } // for (int doc=0; doc < num_docs; ++doc) { + } // for (int epoch=0; epoch < num_epochs; ++epoch) { + + if (!second) + { + if (verbose) + { + cout << " number of empty documents: " << empty_docs << endl + << std::flush; + cout << " number of documents with one sentence: " << one_sent_docs << endl + << std::flush; + cout << " number of documents with long sentences: " << long_sent_docs << endl + << std::flush; + cout << " will create mapping for " << map_index << " samples" << endl + << std::flush; + } + assert(maps == NULL); + assert(num_samples < 0); + maps = new DocIdx[3 * map_index]; + num_samples = static_cast(map_index); + } + + } // for (int iteration=0; iteration < 2; ++iteration) { + + // Shuffle. + // We need a 64 bit random number generator as we might have more + // than 2 billion samples. 
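+ // The loop below performs an in-place Fisher-Yates shuffle over the 3-element + // (start sentence index, end sentence index, target sequence length) records.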
+ std::mt19937_64 rand64_gen(seed + 1); + for (auto i = (num_samples - 1); i > 0; --i) + { + const auto j = static_cast(rand64_gen() % (i + 1)); + const auto i0 = 3 * i; + const auto j0 = 3 * j; + // Swap values. + swap(maps[i0], maps[j0]); + swap(maps[i0 + 1], maps[j0 + 1]); + swap(maps[i0 + 2], maps[j0 + 2]); + } + + // Method to deallocate memory. + py::capsule free_when_done(maps, [](void *mem_) + { + DocIdx *mem = reinterpret_cast(mem_); + delete[] mem; }); + + // Return the numpy array. + const auto byte_size = sizeof(DocIdx); + return py::array(std::vector{num_samples, 3}, // shape + {3 * byte_size, byte_size}, // C-style contiguous strides + maps, // the data pointer + free_when_done); // numpy array references +} + +py::array build_mapping(const py::array_t &docs_, + const py::array_t &sizes_, + const int num_epochs, + const uint64_t max_num_samples, + const int max_seq_length, + const double short_seq_prob, + const int seed, + const bool verbose, + const int32_t min_num_sent) +{ + + if (sizes_.size() > std::numeric_limits::max()) + { + if (verbose) + { + cout << " using uint64 for data mapping..." << endl + << std::flush; + } + return build_mapping_impl(docs_, sizes_, num_epochs, + max_num_samples, max_seq_length, + short_seq_prob, seed, verbose, + min_num_sent); + } + else + { + if (verbose) + { + cout << " using uint32 for data mapping..." << endl + << std::flush; + } + return build_mapping_impl(docs_, sizes_, num_epochs, + max_num_samples, max_seq_length, + short_seq_prob, seed, verbose, + min_num_sent); + } +} + +template +py::array build_blocks_mapping_impl(const py::array_t &docs_, + const py::array_t &sizes_, + const py::array_t &titles_sizes_, + const int32_t num_epochs, + const uint64_t max_num_samples, + const int32_t max_seq_length, + const int32_t seed, + const bool verbose, + const bool use_one_sent_blocks) +{ + /* Build a mapping of (start-index, end-index, sequence-length) where + start and end index are the indices of the sentences in the sample + and sequence-length is the target sequence length. + */ + + // Consistency checks. + assert(num_epochs > 0); + assert(max_seq_length > 1); + assert(seed > 0); + + // Remove bound checks. + auto docs = docs_.unchecked<1>(); + auto sizes = sizes_.unchecked<1>(); + auto titles_sizes = titles_sizes_.unchecked<1>(); + + if (verbose) + { + const auto sent_start_index = docs[0]; + const auto sent_end_index = docs[docs_.shape(0) - 1]; + const auto num_sentences = sent_end_index - sent_start_index; + cout << " using:" << endl + << std::flush; + cout << " number of documents: " << docs_.shape(0) - 1 << endl + << std::flush; + cout << " sentences range: [" << sent_start_index << ", " << sent_end_index << ")" << endl + << std::flush; + cout << " total number of sentences: " << num_sentences << endl + << std::flush; + cout << " number of epochs: " << num_epochs << endl + << std::flush; + cout << " maximum number of samples: " << max_num_samples << endl + << std::flush; + cout << " maximum sequence length: " << max_seq_length << endl + << std::flush; + cout << " seed: " << seed << endl + << std::flush; + } + + // Mapping and its length (1D). + int64_t num_samples = -1; + DocIdx *maps = NULL; + + // Acceptable number of sentences per block. + int min_num_sent = 2; + if (use_one_sent_blocks) + { + min_num_sent = 1; + } + + // Perform two iterations, in the first iteration get the size + // and allocate memory and in the second iteration populate the map. 
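+ // The first pass only counts qualifying blocks; maps is then allocated once at the exact + // size and populated on the second pass.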
+ bool second = false; + for (int32_t iteration = 0; iteration < 2; ++iteration) + { + + // Set the flag on second iteration. + second = (iteration == 1); + + // Current map index. + uint64_t map_index = 0; + + uint64_t empty_docs = 0; + uint64_t one_sent_docs = 0; + uint64_t long_sent_docs = 0; + // For each epoch: + for (int32_t epoch = 0; epoch < num_epochs; ++epoch) + { + // assign every block a unique id + int32_t block_id = 0; + + if (map_index >= max_num_samples) + { + if (verbose && (!second)) + { + cout << " reached " << max_num_samples << " samples after " + << epoch << " epochs ..." << endl + << std::flush; + } + break; + } + // For each document: + for (int32_t doc = 0; doc < (docs.shape(0) - 1); ++doc) + { + + // Document sentences are in [sent_index_first, sent_index_last) + const auto sent_index_first = docs[doc]; + const auto sent_index_last = docs[doc + 1]; + const auto target_seq_len = max_seq_length - titles_sizes[doc]; + + // At the begining of the document previous index is the + // start index. + auto prev_start_index = sent_index_first; + + // Remaining documents. + auto num_remain_sent = sent_index_last - sent_index_first; + + // Some bookkeeping + if ((epoch == 0) && (!second)) + { + if (num_remain_sent == 0) + { + ++empty_docs; + } + if (num_remain_sent == 1) + { + ++one_sent_docs; + } + } + // Detect documents with long sentences. + bool contains_long_sentence = false; + if (num_remain_sent >= min_num_sent) + { + for (auto sent_index = sent_index_first; + sent_index < sent_index_last; ++sent_index) + { + if (sizes[sent_index] > LONG_SENTENCE_LEN) + { + if ((epoch == 0) && (!second)) + { + ++long_sent_docs; + } + contains_long_sentence = true; + break; + } + } + } + // If we have enough sentences and no long sentences. + if ((num_remain_sent >= min_num_sent) && (!contains_long_sentence)) + { + + // Set values. + auto seq_len = int32_t{0}; + auto num_sent = int32_t{0}; + + // Loop through sentences. + for (auto sent_index = sent_index_first; + sent_index < sent_index_last; ++sent_index) + { + + // Add the size and number of sentences. + seq_len += sizes[sent_index]; + ++num_sent; + --num_remain_sent; + + // If we have reached the target length. + // and there are an acceptable number of sentences left + // and if we have at least the minimum number of sentences. + // or if we have reached end of the document. + if (((seq_len >= target_seq_len) && + (num_remain_sent >= min_num_sent) && + (num_sent >= min_num_sent)) || + (num_remain_sent == 0)) + { + + // Populate the map. + if (second) + { + const auto map_index_0 = 4 * map_index; + // Each sample has 4 items: the starting sentence index, ending sentence index, + // the index of the document from which the block comes (used for fetching titles) + // and the unique id of the block (used for creating block indexes) + + maps[map_index_0] = static_cast(prev_start_index); + maps[map_index_0 + 1] = static_cast(sent_index + 1); + maps[map_index_0 + 2] = static_cast(doc); + maps[map_index_0 + 3] = static_cast(block_id); + } + + // Update indices / counters. + ++map_index; + ++block_id; + prev_start_index = sent_index + 1; + seq_len = 0; + num_sent = 0; + } + } // for (auto sent_index=sent_index_first; ... 
+ } // if (num_remain_sent > 1) { + } // for (int doc=0; doc < num_docs; ++doc) { + } // for (int epoch=0; epoch < num_epochs; ++epoch) { + + if (!second) + { + if (verbose) + { + cout << " number of empty documents: " << empty_docs << endl + << std::flush; + cout << " number of documents with one sentence: " << one_sent_docs << endl + << std::flush; + cout << " number of documents with long sentences: " << long_sent_docs << endl + << std::flush; + cout << " will create mapping for " << map_index << " samples" << endl + << std::flush; + } + assert(maps == NULL); + assert(num_samples < 0); + maps = new DocIdx[4 * map_index]; + num_samples = static_cast(map_index); + } + + } // for (int iteration=0; iteration < 2; ++iteration) { + + // Shuffle. + // We need a 64 bit random number generator as we might have more + // than 2 billion samples. + std::mt19937_64 rand64_gen(seed + 1); + for (auto i = (num_samples - 1); i > 0; --i) + { + const auto j = static_cast(rand64_gen() % (i + 1)); + const auto i0 = 4 * i; + const auto j0 = 4 * j; + // Swap values. + swap(maps[i0], maps[j0]); + swap(maps[i0 + 1], maps[j0 + 1]); + swap(maps[i0 + 2], maps[j0 + 2]); + swap(maps[i0 + 3], maps[j0 + 3]); + } + + // Method to deallocate memory. + py::capsule free_when_done(maps, [](void *mem_) + { + DocIdx *mem = reinterpret_cast(mem_); + delete[] mem; }); + + // Return the numpy array. + const auto byte_size = sizeof(DocIdx); + return py::array(std::vector{num_samples, 4}, // shape + {4 * byte_size, byte_size}, // C-style contiguous strides + maps, // the data pointer + free_when_done); // numpy array references +} + +py::array build_blocks_mapping(const py::array_t &docs_, + const py::array_t &sizes_, + const py::array_t &titles_sizes_, + const int num_epochs, + const uint64_t max_num_samples, + const int max_seq_length, + const int seed, + const bool verbose, + const bool use_one_sent_blocks) +{ + + if (sizes_.size() > std::numeric_limits::max()) + { + if (verbose) + { + cout << " using uint64 for data mapping..." << endl + << std::flush; + } + return build_blocks_mapping_impl(docs_, sizes_, titles_sizes_, + num_epochs, max_num_samples, max_seq_length, seed, verbose, use_one_sent_blocks); + } + else + { + if (verbose) + { + cout << " using uint32 for data mapping..." << endl + << std::flush; + } + return build_blocks_mapping_impl(docs_, sizes_, titles_sizes_, + num_epochs, max_num_samples, max_seq_length, seed, verbose, use_one_sent_blocks); + } +} + +PYBIND11_MODULE(helpers, m) +{ + m.def("build_mapping", &build_mapping); + m.def("build_blocks_mapping", &build_blocks_mapping); + m.def("build_sample_idx", &build_sample_idx); + m.def("build_blending_indices", &build_blending_indices); + m.def("build_exhaustive_blending_indices", &build_exhaustive_blending_indices); +} diff --git a/megatron/core/datasets/indexed_dataset.py b/megatron/core/datasets/indexed_dataset.py new file mode 100644 index 0000000000..29975336f1 --- /dev/null +++ b/megatron/core/datasets/indexed_dataset.py @@ -0,0 +1,857 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ +# Essentially re-written in entirety + +import logging +import os +import shutil +import struct +import time +from abc import ABC, abstractmethod +from enum import Enum +from functools import lru_cache +from itertools import accumulate +from types import TracebackType +from typing import List, Optional, Tuple, Type, Union + +try: + import boto3 +except ModuleNotFoundError: + pass +import numpy +import torch + +from megatron.core.datasets.utils_s3 import ( + S3Config, + is_s3_path, + maybe_download_file, + object_exists, + parse_s3_path, +) +from megatron.core.utils import log_single_rank + +logger = logging.getLogger(__name__) + +_INDEX_HEADER = b"MMIDIDX\x00\x00" + + +class DType(Enum): + """The NumPy data type Enum for writing/reading the IndexedDataset indices""" + + uint8 = 1 + int8 = 2 + int16 = 3 + int32 = 4 + int64 = 5 + float64 = 6 + float32 = 7 + uint16 = 8 + + @classmethod + def code_from_dtype(cls, value: Type[numpy.number]) -> int: + """Get the code from the dtype + + Args: + value (Type[numpy.number]): The dtype + + Returns: + int: The code + """ + return cls[value.__name__].value + + @classmethod + def dtype_from_code(cls, value: int) -> Type[numpy.number]: + """Get the dtype from the code + + Args: + value (int): The code + + Returns: + Type[numpy.number]: The dtype + """ + return getattr(numpy, cls(value).name) + + @staticmethod + def size(key: Union[int, Type[numpy.number]]) -> int: + """Get the size of the dtype/code in bytes + + Args: + key (Union[int, Type[numpy.number]]): The dtype or code + + Raises: + ValueError: If the key is neither dtype nor integer code + + Returns: + int: The size of the dtype/code in in bytes + """ + if isinstance(key, int): + return DType.dtype_from_code(key)().itemsize + elif numpy.number in key.__mro__: + return key().itemsize + else: + raise ValueError + + @staticmethod + def optimal_dtype(cardinality: Optional[int]) -> Type[numpy.number]: + """Get the dtype to use for an index of a certain cardinality + + Args: + cardinality (Optional[int]): The number of elements to be indexed + + Returns: + Type[numpy.number]: The dtype to use for the index + """ + if cardinality is not None and cardinality < 65500: + return numpy.uint16 + else: + return numpy.int32 + + +class _IndexWriter(object): + """Object class to write the index (.idx) file + + Args: + idx_path (str): The path to the index file + + dtype (Type[numpy.number]): The dtype of the index file + """ + + def __init__(self, idx_path: str, dtype: Type[numpy.number]) -> None: + self.idx_path = idx_path + self.dtype = dtype + + def __enter__(self) -> "_IndexWriter": + """Enter the context introduced by the 'with' keyword + + Returns: + _IndexWriter: The instance + """ + self.idx_writer = open(self.idx_path, "wb") + # fixed, vestigial practice + self.idx_writer.write(_INDEX_HEADER) + # fixed, vestigial practice + self.idx_writer.write(struct.pack(" Optional[bool]: + """Exit the context introduced by the 'with' keyword + + Args: + exc_type (Optional[Type[BaseException]]): Exception type + + exc_val (Optional[BaseException]): Exception value + + exc_tb (Optional[TracebackType]): Exception traceback object + + Returns: + Optional[bool]: Whether to silence the exception + """ + self.idx_writer.close() + + def write( + self, + sequence_lengths: List[int], + sequence_modes: Optional[List[int]], + document_indices: List[int], + ) -> None: + """Write the index (.idx) file + + Args: + sequence_lengths (List[int]): The length of each sequence + + sequence_modes (Optional[List[int]]): The mode of each 
sequences + + document_indices (List[int]): The seqyebce indices demarcating the end of each document + """ + sequence_pointers = self._sequence_pointers(sequence_lengths) + + # the number of sequences in the dataset + sequence_count = len(sequence_lengths) + self.idx_writer.write(struct.pack(" List[int]: + """Build the sequence pointers per the sequence lengths and dtype size + + Args: + sequence_lengths (List[int]): The length of each sequence + + Returns: + List[int]: The pointer to the beginning of each sequence + """ + itemsize = DType.size(self.dtype) + curr_ptr = 0 + list_ptr = [] + for length in sequence_lengths: + list_ptr.append(curr_ptr) + curr_ptr += length * itemsize + return list_ptr + + +class _IndexReader(object): + """Object class to read the index (.idx) file + + Args: + idx_path (str): The path to the index file + + multimodal (bool): Whether the dataset is multimodal + """ + + def __init__(self, idx_path: str, multimodal: bool) -> None: + + log_single_rank(logger, logging.INFO, f"Load the {type(self).__name__} from {idx_path}") + + with open(idx_path, "rb") as stream: + header = stream.read(9) + assert header == _INDEX_HEADER, f"bad header, cannot read: {idx_path}" + + version = struct.unpack(" time elapsed: {t_end - t_beg:4f} seconds") + + log_single_rank(logger, logging.INFO, f"\tExtract the sequence pointers") + t_beg = time.time() + self.sequence_pointers = numpy.frombuffer( + self.bin_buffer, + dtype=numpy.int64, + count=self.sequence_count, + offset=offset + self.sequence_lengths.nbytes, + ) + t_end = time.time() + log_single_rank(logger, logging.DEBUG, f"\t> time elapsed: {t_end - t_beg:4f} seconds") + + log_single_rank(logger, logging.INFO, f"\tExtract the document indices") + t_beg = time.time() + self.document_indices = numpy.frombuffer( + self.bin_buffer, + dtype=numpy.int64, + count=self.document_count, + offset=offset + self.sequence_lengths.nbytes + self.sequence_pointers.nbytes, + ) + t_end = time.time() + log_single_rank(logger, logging.DEBUG, f"\t> time elapsed: {t_end - t_beg:4f} seconds") + + self.sequence_modes = None + if multimodal: + log_single_rank(logger, logging.INFO, f"\tExtract the sequence modes") + t_beg = time.time() + self.sequence_modes = numpy.frombuffer( + self.bin_buffer, + dtype=numpy.int8, + count=self.sequence_count, + offset=offset + + self.sequence_lengths.nbytes + + self.sequence_pointers.nbytes + + self.document_indices.nbytes, + ) + t_end = time.time() + log_single_rank(logger, logging.DEBUG, f"\t> time elapsed: {t_end - t_beg:4f} seconds") + + assert self.sequence_lengths.shape[0] == len(self) + assert self.sequence_lengths.shape[0] == self.sequence_count + assert self.sequence_lengths.shape[0] == self.document_indices[-1] + + log_single_rank(logger, logging.INFO, f"> total number of sequences: {len(self)}") + log_single_rank( + logger, + logging.INFO, + f"> total number of documents: {self.document_indices.shape[0] - 1}", + ) + + def __del__(self) -> None: + """Clean up the object""" + if hasattr(self, "bin_buffer_mmap"): + self.bin_buffer_mmap._mmap.close() + del self.bin_buffer_mmap + + def __len__(self) -> int: + """Return the length of the dataset + + Returns: + int: The length of the dataset + """ + return self.sequence_count + + @lru_cache(maxsize=8) + def __getitem__(self, idx: int) -> Tuple[numpy.int32, numpy.int64, Optional[numpy.int8]]: + """Return the pointer, length, and mode at the index + + Args: + idx (int): The index into the dataset + + Returns: + Tuple[numpy.int32, numpy.int64, Optional[numpy.int8]]: The 
pointer, length and mode at the index + """ + return ( + self.sequence_pointers[idx], + self.sequence_lengths[idx], + self.sequence_modes[idx] if self.sequence_modes is not None else None, + ) + + +class _BinReader(ABC): + """Abstract class to read the data (.bin) file""" + + @abstractmethod + def read(self, dtype: Type[numpy.number], count: int, offset: int) -> numpy.ndarray: + """Read bytes into a numpy array. + + Args: + dtype (Type[numpy.number]): Data-type of the returned array. + + count (int): Number of items to read. + + offset (int): Start reading from this offset (in bytes). + + Returns: + numpy.ndarray: An array with `count` items and data-type `dtype` constructed from reading bytes from the data file starting at `offset`. + """ + pass + + +class _MMapBinReader(_BinReader): + """A _BinReader that memory maps the data (.bin) file + + Args: + bin_path (str): bin_path (str): The path to the data (.bin) file. + """ + + def __init__(self, bin_path: str) -> None: + self._bin_buffer_mmap = numpy.memmap(bin_path, mode="r", order="C") + self._bin_buffer = memoryview(self._bin_buffer_mmap) + + def read(self, dtype: Type[numpy.number], count: int, offset: int) -> numpy.ndarray: + """Read bytes into a numpy array. + + Args: + dtype (Type[numpy.number]): Data-type of the returned array. + + count (int): Number of items to read. + + offset (int): Start reading from this offset (in bytes). + + Returns: + numpy.ndarray: An array with `count` items and data-type `dtype` constructed from reading bytes from the data file starting at `offset`. + """ + return numpy.frombuffer(self._bin_buffer, dtype=dtype, count=count, offset=offset) + + def __del__(self) -> None: + """Clean up the object.""" + if self._bin_buffer_mmap is not None: + self._bin_buffer_mmap._mmap.close() + del self._bin_buffer_mmap + + +class _FileBinReader(_BinReader): + """A _BinReader that reads from the data (.bin) file using a file pointer + + Args: + bin_path (str): bin_path (str): The path to the data (.bin) file. + """ + + def __init__(self, bin_path: str) -> None: + self._bin_path = bin_path + + def read(self, dtype: Type[numpy.number], count: int, offset: int) -> numpy.ndarray: + """Read bytes into a numpy array. + + Args: + dtype (Type[numpy.number]): Data-type of the returned array. + + count (int): Number of items to read. + + offset (int): Start reading from this offset (in bytes). + + Returns: + numpy.ndarray: An array with `count` items and data-type `dtype` constructed from reading bytes from the data file starting at `offset`. + """ + sequence = numpy.empty(count, dtype=dtype) + with open(self._bin_path, mode='rb', buffering=0) as bin_buffer_file: + bin_buffer_file.seek(offset) + bin_buffer_file.readinto(sequence) + return sequence + + +class _S3BinReader(_BinReader): + """A _BinReader that reads from the data (.bin) file from S3 + + Args: + bin_path (str): bin_path (str): The path to the data (.bin) file. + + bin_chunk_nbytes (int, optional): If not None, then maintain an in-memory cache to speed up calls to the `read` method. Furthermore, on a cache miss, download this number of bytes to refresh the cache. Otherwise (None), do not maintain an in-memory cache. A class that inherits from _BinReader may not implement caching in which case it should assert that `bin_chunk_nbytes` is None at initialization. 
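+ + For example, with a `bin_chunk_nbytes` of 262144 (256 KiB), a read at byte offset 300000 that + misses the cache triggers a download starting from byte offset 262144.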
+ """ + + def __init__(self, bin_path: str, bin_chunk_nbytes: int) -> None: + assert bin_chunk_nbytes > 0 + self._client = boto3.client("s3") + self._s3_bucket, self._s3_key = parse_s3_path(bin_path) + self._cache = None + self._cache_bytes_start = None + self._cache_bytes_end = None + self._cache_nbytes = bin_chunk_nbytes + + def _extract_from_cache(self, offset: int, size: int) -> bytes: + """Extract `size` bytes starting at `offset` bytes into the cache""" + start = offset - self._cache_bytes_start + assert start >= 0 + end = start + size + assert end <= len(self._cache) + return self._cache[start:end] + + def read(self, dtype: Type[numpy.number], count: int, offset: int) -> numpy.ndarray: + """Read bytes into a numpy array. + + Let `size` be the `count` * `DType.size(dtype)`. If the requested span of bytes [`offset`, + `offset` + `size`) is covered by the in-memory cache maintained by this class, then this + function extracts the requested span from that cache and returns it. Otherwise, this + function first refreshes the cache and then extracts the requested span from the refreshed + cache and returns it. + + The cache is refreshed based on `offset` and `size`. In particular, we divide all the bytes + in an S3 object into blocks, where each block contains `bin_chunk_nbytes` bytes. We assign + each block an index starting from 0. We take the block with index (`offset` // + `bin_chunk_nbytes`) to refresh the cache. If this new block still does not cover the + requested span, we extend it just enough to include `offset` + `size`. + + Args: + dtype (Type[numpy.number]): Data-type of the returned array. + + count (int): Number of items to read. + + offset (int): Start reading from this offset (in bytes). + + Returns: + numpy.ndarray: An array with `count` items and data-type `dtype` constructed from reading bytes from the data file starting at `offset`. + """ + size = count * DType.size(dtype) + if ( + self._cache is not None + and offset >= self._cache_bytes_start + and offset + size <= self._cache_bytes_end + ): + return numpy.frombuffer(self._extract_from_cache(offset, size), dtype=dtype) + + bytes_start = (offset // self._cache_nbytes) * self._cache_nbytes + assert bytes_start >= 0 + assert offset >= bytes_start + bytes_end = max(bytes_start + self._cache_nbytes, offset + size) + assert bytes_end >= 1 + self._cache = self._client.get_object( + Bucket=self._s3_bucket, + Key=self._s3_key, + # Subtract 1, because the end of Range is inclusive. + Range=f'bytes={bytes_start}-{bytes_end-1}', + )['Body'].read() + self._cache_bytes_start = bytes_start + self._cache_bytes_end = bytes_end + return numpy.frombuffer(self._extract_from_cache(offset, size), dtype=dtype) + + def __del__(self) -> None: + """Clean up the object""" + self._client.close() + + +class IndexedDataset(torch.utils.data.Dataset): + """The low-level interface dataset class + + Args: + path_prefix (str): The index (.idx) and data (.bin) prefix + + multimodal (bool): Whether the dataset is multimodal. Defaults to False. + + mmap (bool): Whether to mmap the .bin files. Defaults to True. + + s3_config (Optional[S3Config]): Supplied only for data stored on S3. IndexedDataset downloads the index (.idx) file to `s3_config.path_to_idx_cache` and streams data from the data (.bin) file in `s3_config.bin_chunk_nbytes` blocks. Note that `mmap` must be disabled for S3 data loading. Defaults to None. 
+ """ + + def __init__( + self, + path_prefix: str, + multimodal: bool = False, + mmap: bool = True, + s3_config: Optional[S3Config] = None, + ) -> None: + super().__init__() + self.path_prefix = None + self.multimodal = None + self.mmap = None + self.s3_config = None + + self.index = None + self.bin_reader = None + + if is_s3_path(path_prefix) and s3_config is not None: + idx_path = get_idx_path(path_prefix) + cache_idx_path = os.path.join(s3_config.path_to_idx_cache, os.path.basename(idx_path)) + maybe_download_file(idx_path, cache_idx_path) + + self.initialize(path_prefix, multimodal, mmap, s3_config) + + def initialize( + self, path_prefix: str, multimodal: bool, mmap: bool, s3_config: Optional[S3Config] + ) -> None: + """Initialize the dataset + + This method is called by IndexedDataset.__init__ during object creation and by + IndexedDataset.__setstate__ during un-pickling + + Args: + path_prefix (str): The index (.idx) and data (.bin) prefix + + multimodal (bool): Whether the dataset is multimodal + + mmap (bool): Whether to mmap the .bin file + + s3_config (Optional[S3Config]): See IndexedDataset docstring for details. + """ + idx_path = get_idx_path(path_prefix) + bin_path = get_bin_path(path_prefix) + if s3_config is None: + assert os.path.exists(idx_path) and os.path.exists( + bin_path + ), f"One or both of the .idx and .bin files cannot be found at the path prefix {path_prefix}" + self.path_prefix = path_prefix + self.multimodal = multimodal + self.mmap = mmap + self.s3_config = s3_config + if mmap: + assert not s3_config + self.bin_reader = _MMapBinReader(bin_path) + elif s3_config: + assert not mmap + self.bin_reader = _S3BinReader(bin_path, s3_config.bin_chunk_nbytes) + idx_path = os.path.join( + s3_config.path_to_idx_cache, os.path.basename(get_idx_path(path_prefix)) + ) + else: + self.bin_reader = _FileBinReader(bin_path) + self.index = _IndexReader(idx_path, self.multimodal) + + def __getstate__(self) -> Tuple[str, bool, bool, Optional[S3Config]]: + """Get the state during pickling + + Returns: + Tuple[str, bool, bool, Optional[S3Config]]: The state tuple + """ + return self.path_prefix, self.multimodal, self.mmap, self.s3_config + + def __setstate__(self, state: Tuple[str, bool, bool, Optional[S3Config]]) -> None: + """Set the state during un-pickling + + Args: + state (Tuple[str, bool, bool, Optional[S3Config]]): The state tuple + """ + path_prefix, multimodal, mmap, s3_config = state + self.initialize(path_prefix, multimodal, mmap, s3_config) + + def __del__(self) -> None: + """Clean up the object""" + del self.bin_reader + del self.index + + def __len__(self) -> int: + """Return the length of the dataset i.e. 
the number of sequences in the index + + Returns: + int: The length of the dataset + """ + return len(self.index) + + def __getitem__( + self, idx: Union[int, numpy.integer, slice] + ) -> Union[numpy.ndarray, Tuple[numpy.ndarray, numpy.ndarray]]: + """Return from the dataset + + Args: + idx (Union[int, numpy.integer, slice]): The index or index slice into the dataset + + Raises: + ValueError: When the index slice is non-contiguous + + TypeError: When the index is of an unexpected type + + Returns: + Union[numpy.ndarray, Tuple[numpy.ndarray, numpy.ndarray]]: The sequence tokens and modes at the index or index slice + """ + if isinstance(idx, (int, numpy.integer)): + sequence_pointer, sequence_length, sequence_mode = self.index[idx] + sequence = self.bin_reader.read( + dtype=self.index.dtype, count=sequence_length, offset=sequence_pointer + ) + return (sequence, sequence_mode) if sequence_mode is not None else sequence + elif isinstance(idx, slice): + start, stop, step = idx.indices(len(self)) + if step != 1: + raise ValueError("Slices into indexed_dataset must be contiguous") + sequence_lengths = self.index.sequence_lengths[idx] + sequence_modes = self.index.sequence_modes[idx] if self.multimodal else None + sequence_offsets = list(accumulate(sequence_lengths)) + sequences = numpy.split( + self.bin_reader.read( + dtype=self.index.dtype, + count=sum(sequence_lengths), + offset=self.index.sequence_pointers[start], + ), + sequence_offsets[:-1], + ) + return (sequences, sequence_modes) if sequence_modes is not None else sequences + else: + raise TypeError("Unexpected type received for idx: {}".format(type(idx))) + + def get(self, idx: int, offset: int = 0, length: Optional[int] = None) -> numpy.ndarray: + """Retrieve a single item from the dataset with the option to only + return a portion of the item. + + get(idx) is the same as [idx] but get() does not support slicing. + + Args: + idx (Union[int, numpy.integer]): The index into the dataset + + offset (int): The integer token offset in the sequence + + length (int): The number of tokens to grab from the sequence + + Returns: + Union[numpy.ndarray, Tuple[numpy.ndarray, numpy.ndarray]]: The sequence tokens and modes at the index + """ + sequence_pointer, sequence_length, sequence_mode = self.index[idx] + if length is None: + length = sequence_length - offset + sequence_pointer += offset * DType.size(self.index.dtype) + sequence = self.bin_reader.read( + dtype=self.index.dtype, count=length, offset=sequence_pointer + ) + return (sequence, sequence_mode) if sequence_mode is not None else sequence + + @property + def sequence_lengths(self) -> numpy.ndarray: + """Get the sequence lengths + + Returns: + numpy.ndarray: The sequence lengths + """ + return self.index.sequence_lengths + + @property + def document_indices(self) -> numpy.ndarray: + """Get the document indices + + Returns: + numpy.ndarray: The document indices + """ + return self.index.document_indices + + def get_document_indices(self) -> numpy.ndarray: + """Get the document indices + + This method is slated for deprecation. + + Returns: + numpy.ndarray: The document indices + """ + return self.index.document_indices + + def set_document_indices(self, document_indices: numpy.ndarray) -> None: + """Set the document indices + + This method is slated for deprecation. 
+ + Args: + document_indices (numpy.ndarray): The document indices + """ + self.index.document_indices = document_indices + + @property + def sequence_modes(self) -> numpy.ndarray: + """Get the sequence modes + + Returns: + numpy.ndarray: The sequence modes + """ + return self.index.sequence_modes + + @staticmethod + def exists(path_prefix: str) -> bool: + """Return whether the IndexedDataset exists on disk at the prefix + + Args: + path_prefix (str): The prefix to the index (.idx) and data (.bin) files + + Returns: + bool: Whether the IndexedDataset exists on disk at the prefix + """ + if is_s3_path(path_prefix): + s3_client = boto3.client("s3") + return object_exists(s3_client, get_idx_path(path_prefix)) and object_exists( + s3_client, get_bin_path(path_prefix) + ) + return os.path.exists(get_idx_path(path_prefix)) and os.path.exists( + get_bin_path(path_prefix) + ) + + +class IndexedDatasetBuilder(object): + """Builder class for the IndexedDataset class + + Args: + bin_path (str): The path to the data (.bin) file + + dtype (Type[numpy.number], optional): The dtype of the index file. Defaults to numpy.int32. + + multimodal (bool, optional): Whether the dataset is multimodal. Defaults to False. + """ + + def __init__( + self, bin_path: str, dtype: Type[numpy.number] = numpy.int32, multimodal: bool = False + ) -> None: + self.data_file = open(bin_path, "wb") + self.dtype = dtype + self.multimodal = multimodal + + self.sequence_lengths = [] + self.document_indices = [0] + self.sequence_modes = [] if self.multimodal else None + + def add_item(self, tensor: torch.Tensor, mode: int = 0) -> None: + """Add a single item to the dataset + + Args: + tensor (torch.Tensor): The item to add to the data file + + mode (int, optional): The mode for the item. Defaults to 0. + """ + np_array = numpy.array(tensor.numpy(), dtype=self.dtype) + self.data_file.write(np_array.tobytes(order="C")) + self.sequence_lengths.append(np_array.size) + if self.multimodal: + self.sequence_modes.append(mode) + + def add_document( + self, tensor: torch.Tensor, lengths: List[int], modes: Optional[List[int]] = None + ) -> None: + """Add an entire document to the dataset + + Args: + tensor (torch.Tensor): The document to add + + lengths (List[int]): The lengths of each item in the document + + modes (Optional[List[int]], optional): The modes for each item in the document. Defaults to None. 
+ """ + np_array = numpy.array(tensor, dtype=self.dtype) + self.data_file.write(np_array.tobytes(order="C")) + self.sequence_lengths.extend(lengths) + self.document_indices.append(len(self.sequence_lengths)) + if self.multimodal: + self.sequence_modes.extend(modes if modes is not None else [0] * lengths) + + def end_document(self) -> None: + """Finalize the document, for use with IndexedDatasetBuilder.add_item""" + self.document_indices.append(len(self.sequence_lengths)) + + def add_index(self, path_prefix: str) -> None: + """Add an entire IndexedDataset to the dataset + + Args: + path_prefix (str): The index (.idx) and data (.bin) prefix + """ + # Concatenate index + index = _IndexReader(get_idx_path(path_prefix), multimodal=self.multimodal) + assert index.dtype == self.dtype + + offset = len(self.sequence_lengths) + self.sequence_lengths.extend(index.sequence_lengths) + self.document_indices.extend((offset + index.document_indices)[1:]) + + if self.multimodal: + self.sequence_modes.extend(index.sequence_modes) + + # Concatenate data + with open(get_bin_path(path_prefix), "rb") as f: + shutil.copyfileobj(f, self.data_file) + + def finalize(self, idx_path: str) -> None: + """Clean up and write the index (.idx) file + + Args: + idx_path (str): The path to the index file + """ + self.data_file.close() + with _IndexWriter(idx_path, self.dtype) as writer: + writer.write(self.sequence_lengths, self.sequence_modes, self.document_indices) + + +def get_idx_path(path_prefix: str) -> str: + """Get the path to the index file from the prefix + + Args: + path_prefix (str): The prefix + + Returns: + str: The path to the index file + """ + return path_prefix + ".idx" + + +def get_bin_path(path_prefix: str) -> str: + """Get the path to the data file from the prefix + + Args: + path_prefix (str): The prefix + + Returns: + str: The path to the data file + """ + return path_prefix + ".bin" diff --git a/megatron/core/datasets/masked_dataset.py b/megatron/core/datasets/masked_dataset.py new file mode 100644 index 0000000000..9db6c67eb1 --- /dev/null +++ b/megatron/core/datasets/masked_dataset.py @@ -0,0 +1,423 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
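
The classes above form the low-level write/read API for the `.bin`/`.idx` pair. The snippet below is a minimal, hypothetical round trip through `IndexedDatasetBuilder` and `IndexedDataset`; the `/tmp/example_dataset` prefix and the token values are placeholders and are not part of this patch.

```python
# Hypothetical usage sketch for the indexed dataset API defined above.
import numpy
import torch

from megatron.core.datasets.indexed_dataset import (
    IndexedDataset,
    IndexedDatasetBuilder,
    get_bin_path,
    get_idx_path,
)

path_prefix = "/tmp/example_dataset"  # placeholder prefix

# Write two single-sequence documents to the data (.bin) file.
builder = IndexedDatasetBuilder(get_bin_path(path_prefix), dtype=numpy.int32)
for token_ids in ([1, 2, 3, 4], [5, 6, 7]):
    builder.add_item(torch.tensor(token_ids, dtype=torch.int32))
    builder.end_document()

# Close the data file and write the index (.idx) file.
builder.finalize(get_idx_path(path_prefix))

# Read the sequences back (memory-mapped by default).
dataset = IndexedDataset(path_prefix)
assert len(dataset) == 2
print(dataset[0])                          # [1 2 3 4]
print(dataset.get(1, offset=1, length=2))  # [6 7]
```

With `multimodal=True`, `add_item` additionally records a per-sequence mode, and `__getitem__` returns a `(sequence, mode)` tuple instead of a bare array.
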
+ +import logging +import os +import time +from abc import abstractmethod +from dataclasses import dataclass +from typing import List, Optional, Tuple, Union + +import numpy +import torch + +from megatron.core.datasets.blended_megatron_dataset_config import BlendedMegatronDatasetConfig +from megatron.core.datasets.indexed_dataset import IndexedDataset +from megatron.core.datasets.megatron_dataset import MegatronDataset +from megatron.core.datasets.utils import Split +from megatron.core.utils import log_single_rank + +logger = logging.getLogger(__name__) + + +@dataclass +class MaskedWordPieceDatasetConfig(BlendedMegatronDatasetConfig): + """Configuration object for Megatron Core Masked WordPiece datasets""" + + masking_probability: float = None + """The probability we mask a candidate N-gram""" + + short_sequence_probability: float = None + """The probability we return a sequence shorter than the target sequence length""" + + masking_max_ngram: int = None + """The maximum length N-gram to consider masking or permuting""" + + masking_do_full_word: bool = None + """Whether we mask the the whole word or its component parts""" + + masking_do_permutation: bool = None + """Whether we shuffle a subset of candidate N-grams in addition""" + + masking_use_longer_ngrams: bool = None + """Whether to favor longer N-grams over shorter N-grams""" + + masking_use_geometric_distribution: bool = None + """Whether to draw the size of the N-gram from a geometric distribution according to SpanBERT + https://arxiv.org/abs/1907.10529 (Section 3.1) + """ + + def __post_init__(self) -> None: + """Do asserts and set fields post init""" + super().__post_init__() + + assert self.tokenizer is not None + + assert self.masking_probability is not None + assert self.short_sequence_probability is not None + assert self.masking_max_ngram is not None + assert self.masking_do_full_word is not None + assert self.masking_do_permutation is not None + assert self.masking_use_longer_ngrams is not None + assert self.masking_use_geometric_distribution is not None + + assert self.masking_probability > 0 and self.masking_probability < 1.0 + assert self.short_sequence_probability >= 0 and self.short_sequence_probability <= 1.0 + assert self.masking_max_ngram > 0 + assert not (self.masking_use_geometric_distribution and self.masking_do_permutation) + + if self.masking_use_geometric_distribution and self.masking_use_longer_ngrams: + log_single_rank( + logger, + logging.WARNING, + "The use of a geometric distribution overrides the default distribution", + ) + + +class MaskedWordPieceDataset(MegatronDataset): + """The semi-abstract base class for masked WordPiece datasets + + This implementation makes the rigid assumption that all inheritor datasets are built upon the + IndexedDataset class. This assumption may be pushed down to the inheritors in future if + necessary. + + NB: WordPiece tokenization prepends a double hash "##" to all tokens/pieces in a word, save the + first token/piece. + + Args: + indexed_dataset (IndexedDataset): The IndexedDataset around which to build the MegatronDataset + + dataset_path (str): The real path on disk to the dataset, for bookkeeping + + indexed_indices (numpy.ndarray): The set of the documents indices to expose + + num_samples (Optional[int]): The number of samples to draw from the indexed dataset. When None, build as many samples as correspond to one epoch. 
+ + index_split (Split): The indexed_indices Split + + config (MaskedWordPieceDatasetConfig): The config + """ + + def __init__( + self, + indexed_dataset: IndexedDataset, + dataset_path: str, + indexed_indices: numpy.ndarray, + num_samples: Optional[int], + index_split: Split, + config: MaskedWordPieceDatasetConfig, + ) -> None: + super().__init__( + indexed_dataset, dataset_path, indexed_indices, num_samples, index_split, config + ) + + @staticmethod + def numel_low_level_dataset(low_level_dataset: IndexedDataset) -> int: + return low_level_dataset.document_indices.shape[0] - 1 + + @staticmethod + def build_low_level_dataset( + dataset_path: str, config: MaskedWordPieceDatasetConfig + ) -> IndexedDataset: + return IndexedDataset(dataset_path) + + @staticmethod + def _key_config_attributes() -> List[str]: + """Inherited method implementation + + Returns: + List[str]: The key config attributes + """ + return super(MaskedWordPieceDataset, MaskedWordPieceDataset)._key_config_attributes() + [ + "masking_probability", + "short_sequence_probability", + "masking_max_ngram", + "masking_do_full_word", + "masking_do_permutation", + "masking_use_longer_ngrams", + "masking_use_geometric_distribution", + ] + + def __len__(self) -> int: + return self.sample_index.shape[0] + + def _build_sample_index( + self, sequence_length: int, min_sentences_per_sample: int + ) -> numpy.ndarray: + path_to_cache = self.config.path_to_cache + if path_to_cache is None: + path_to_cache = os.path.join( + self.dataset.path_prefix, "cache", f"{type(self).__name__}_indices" + ) + + get_path_to = lambda suffix: os.path.join( + path_to_cache, f"{self.unique_description_hash}-{type(self).__name__}-{suffix}" + ) + path_to_description = get_path_to("description.txt") + path_to_sample_index = get_path_to("sample_index.npy") + cache_hit = all(map(os.path.isfile, [path_to_description, path_to_sample_index])) + + if self.num_samples is not None: + num_epochs = numpy.iinfo(numpy.int32).max - 1 + else: + num_epochs = 1 + + if not cache_hit and torch.distributed.get_rank() == 0: + log_single_rank( + logger, + logging.INFO, + f"Build and save the {type(self).__name__} {self.index_split.name} indices", + ) + self.built_anew_on_cache_miss = True + + os.makedirs(path_to_cache, exist_ok=True) + + # Write the description + with open(path_to_description, "wt") as writer: + writer.write(self.unique_description) + + # Build the sample index + log_single_rank( + logger, + logging.INFO, + f"\tBuild and save the sample index to {os.path.basename(path_to_sample_index)}", + ) + t_beg = time.time() + from megatron.core.datasets import helpers + + # Add +1 for access to document upper bound + indices = numpy.append(self.indices, self.indices[-1] + 1) + + sample_index = helpers.build_mapping( + self.dataset.document_indices[indices], + self.dataset.sequence_lengths, + num_epochs, + self.num_samples, + sequence_length, + self.config.short_sequence_probability, + self.config.random_seed, + False, + min_sentences_per_sample, + ) + numpy.save(path_to_sample_index, sample_index, allow_pickle=True) + t_end = time.time() + log_single_rank(logger, logging.DEBUG, f"\t> time elapsed: {t_end - t_beg:4f} seconds") + + log_single_rank( + logger, logging.INFO, f"> total number of samples: {sample_index.shape[0]}" + ) + log_single_rank(logger, logging.INFO, f"> total number of epochs: {num_epochs}") + + return sample_index + + log_single_rank( + logger, logging.INFO, f"Load the {type(self).__name__} {self.index_split.name} indices" + ) + + log_single_rank( + logger, + 
logging.INFO, + f"\tLoad the sample index from {os.path.basename(path_to_sample_index)}", + ) + t_beg = time.time() + sample_index = numpy.load(path_to_sample_index, allow_pickle=True, mmap_mode="r") + t_end = time.time() + log_single_rank(logger, logging.DEBUG, f"\t> time elapsed: {t_end - t_beg:4f} seconds") + + return sample_index + + def _create_masked_lm_predictions( + self, + token_ids: List[int], + target_sequence_length: int, + numpy_random_state: numpy.random.RandomState, + ) -> Tuple[List[int], List[int], List[int], List[int], List[Tuple[List[int], List[int]]]]: + """Creates the predictions for the masked LM objective + + Args: + token_ids (List[int]): The token ids + target_sequence_length (int): The target sequence length + numpy_random_state (numpy.random.RandomState): The NumPy random state + + Returns: + Tuple[List[int], List[int], List[int], List[int], List[Tuple[List[int], List[int]]]]: + 1. masked_token_ids -> The masked sequence + 2. masked_positions -> The indices for the masked token ids + 3. masked_labels -> The original token ids for the masked token ids + 4. boundaries -> The sentence and word boundaries for the sequence + 4. masked_spans -> The masked positions and labels with N-gram info intact + """ + # Build the token sentence and word boundaries and the masking candidates + # e.g. [cls, id, ##id, ##id, id, ##id, sep, id, ##id, sep] + # -> boundaries: [1, 1, 0, 0, 1, 0, 1, 1, 0, 1] + # -> candidates with whole word masking: [[1, 2, 3], [4, 5], [7, 8]] + # -> candidates sans whole word masking: [[1], [2], [3], [4], [5], [7], [8]] + boundaries = [] + candidates = [] + for i, token_id in enumerate(token_ids): + if token_id == self.config.tokenizer.cls or token_id == self.config.tokenizer.sep: + boundaries.append(1) + else: + if not self.config.tokenizer.inv_vocab[token_id].startswith("##"): + boundaries.append(1) + candidates.append([i]) + else: + boundaries.append(0) + if self.config.masking_do_full_word and len(candidates) > 0: + candidates[-1].append(i) + else: + candidates.append([i]) + + n_maskings = min( + self.config.masking_probability * target_sequence_length, + max(1, int(round(len(token_ids) * self.config.masking_probability))), + ) + + ngram_nvals = numpy.arange(self.config.masking_max_ngram, dtype=numpy.int64) + 1 + + # By default, the N-gram probabilites are inversely proportional to N + # e.g. N = 3 + # -> P = array([0.54545455, 0.27272727, 0.18181818]) + nprobs = 1.0 / ngram_nvals + nprobs = nprobs / nprobs.sum(keepdims=True) + if self.config.masking_use_longer_ngrams: + nprobs = nprobs[::-1] + + # Create a nested list of depth 3 + # layer 1: the candidate dimension + # layer 2: the N-gram dimension + # layer 3: the token dimension + candidate_ngrams = [ + [candidates[idx : idx + n] for n in ngram_nvals] for idx in range(len(candidates)) + ] + numpy_random_state.shuffle(candidate_ngrams) + + masked_token_ids = list(token_ids) + masked_positions_and_labels = [] + masked_spans = [] + masked_indices = set() + for candidate_idx in range(len(candidate_ngrams)): + n_ngrams = len(candidate_ngrams[candidate_idx]) + + # Stop when we hit our desired number of maskings + if len(masked_positions_and_labels) >= n_maskings: + break + + # Do nothing for candidates with no ngrams + if not candidate_ngrams[candidate_idx]: + continue + + # Choose the initial value of N + if self.config.masking_use_geometric_distribution: + # Sample N from a geometric distribution with p = 0.2 and clip + # i.e. 
SpanBERT + # -> https://arxiv.org/abs/1907.10529 (Section 3.1) + p = 0.2 + n = min(numpy_random_state.geometric(p), self.config.masking_max_ngram) + else: + p = nprobs[:n_ngrams] / nprobs[:n_ngrams].sum(keepdims=True) + n = numpy_random_state.choice(ngram_nvals[:n_ngrams], p=p) + + while True: + ngram_indices = sum(candidate_ngrams[candidate_idx][n - 1], []) + n = n - 1 + # Success: masking this N-gram puts us below the desired number of maskings + if n_maskings >= len(masked_positions_and_labels) + len(ngram_indices): + skip_candidate = False + break + # Failure: no N-grams remain for this candidate + if n == 0: + skip_candidate = True + break + + # Do nothing for candidates whose 1-gram is too long + if skip_candidate: + continue + + # Do nothing for candidate indices which have already been masked + if any(map(lambda idx: idx in masked_indices, ngram_indices)): + continue + + # Mask the tokens and record their original positions and values + for index in ngram_indices: + masked_indices.add(index) + mask = self._get_token_mask(numpy_random_state) + if mask is None: + masked_token_ids[index] = token_ids[index] + else: + masked_token_ids[index] = mask + masked_positions_and_labels.append((index, token_ids[index])) + + masked_spans.append((ngram_indices, [token_ids[index] for index in ngram_indices])) + + assert len(masked_positions_and_labels) <= n_maskings + + numpy_random_state.shuffle(candidate_ngrams) + + if self.config.masking_do_permutation: + + n_swappings = n_maskings + + permuted_indices = set() + for candidate_idx in range(len(candidate_ngrams)): + n_ngrams = len(candidate_ngrams[candidate_idx]) + + if len(permuted_indices) >= n_swappings: + break + + # Do nothing for candidates with no ngrams + if not candidate_ngrams[candidate_idx]: + continue + + p = nprobs[:n_ngrams] / nprobs[:n_ngrams].sum(keepdims=True) + n = numpy.random.choice(ngram_nvals[:n_ngrams], p=p) + + while True: + ngram_indices = sum(candidate_ngrams[candidate_idx][n - 1], []) + n = n - 1 + # Success: swapping this N-gram puts us below the desired number of swappings + if n_swappings >= len(permuted_indices) + len(ngram_indices): + skip_candidate = False + break + # Failure: no N-grams remain for this candidate + if n == 0: + skip_candidate = True + break + + # Do nothing for candidates whose 1-gram is too long + if skip_candidate: + continue + + # Do nothing for candidate indices which have already been masked or permuted + if any( + map(lambda idx: idx in masked_indices or idx in permuted_indices, ngram_indices) + ): + continue + + for index in ngram_indices: + permuted_indices.add(index) + + assert len(permuted_indices) <= n_swappings + + permuted_indices = sorted(permuted_indices) + permuted_indices_copy = list(permuted_indices) + numpy_random_state.shuffle(permuted_indices_copy) + masked_token_ids_copy = list(masked_token_ids) + + for idx, idx_copy in zip(permuted_indices, permuted_indices_copy): + masked_token_ids[idx] = masked_token_ids_copy[idx_copy] + masked_positions_and_labels.append((idx, masked_token_ids_copy[idx])) + + masked_positions_and_labels = sorted(masked_positions_and_labels, key=lambda x: x[0]) + masked_positions = [] + masked_labels = [] + for position, label in masked_positions_and_labels: + masked_positions.append(position) + masked_labels.append(label) + + masked_spans = sorted(masked_spans, key=lambda x: x[0][0]) + + return masked_token_ids, masked_positions, masked_labels, boundaries, masked_spans + + @abstractmethod + def _get_token_mask(self, numpy_random_state: 
numpy.random.RandomState) -> Optional[int]: + pass diff --git a/megatron/core/datasets/megatron_dataset.py b/megatron/core/datasets/megatron_dataset.py new file mode 100644 index 0000000000..15a9a53328 --- /dev/null +++ b/megatron/core/datasets/megatron_dataset.py @@ -0,0 +1,139 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +import hashlib +import json +from abc import ABC, abstractmethod +from collections import OrderedDict +from typing import Any, Dict, Iterable, List, Optional, Union + +import numpy +import torch + +from megatron.core.datasets.blended_megatron_dataset_config import BlendedMegatronDatasetConfig +from megatron.core.datasets.indexed_dataset import IndexedDataset +from megatron.core.datasets.utils import Split + +LowLevelDataset = Union[IndexedDataset, Iterable] + + +class MegatronDataset(ABC, torch.utils.data.Dataset): + """The highest level wrapper class from which all dataset classes should inherit + + Args: + dataset (LowLevelDataset): The dataset around which to build the MegatronDataset + + dataset_path (Optional[str]): The real path on disk to the dataset, for bookkeeping + + indices (numpy.ndarray): The set of the documents indices to expose + + num_samples (Optional[int]): The minimum number of samples to build from the indexed dataset. When None, build as many samples as correspond to one epoch. + + index_split (Split): The indices Split + + config (BlendedMegatronDatasetConfig): The config + """ + + def __init__( + self, + dataset: LowLevelDataset, + dataset_path: Optional[str], + indices: numpy.ndarray, + num_samples: Optional[int], + index_split: Split, + config: BlendedMegatronDatasetConfig, + ) -> None: + self.dataset = dataset + self.dataset_path = dataset_path + self.indices = indices + self.num_samples = num_samples + self.index_split = index_split + self.config = config + + self.unique_identifiers = OrderedDict() + + self.unique_identifiers["class"] = type(self).__name__ + self.unique_identifiers["dataset_path"] = self.dataset_path + self.unique_identifiers["num_samples"] = self.num_samples + self.unique_identifiers["index_split"] = self.index_split.name + for attr in self._key_config_attributes(): + self.unique_identifiers[attr] = getattr(self.config, attr) + + self.unique_description = json.dumps( + self.unique_identifiers, indent=4, default=lambda obj: obj.unique_identifiers + ) + self.unique_description_hash = hashlib.md5( + self.unique_description.encode("utf-8") + ).hexdigest() + + self.built_anew_on_cache_miss = False + + @staticmethod + def numel_low_level_dataset(low_level_dataset: LowLevelDataset) -> int: + """Return the number of elements in the underlying low level dataset for the purpose of + segregating the train/valid/test split indices + + It may be that the low level dataset can be split any number of ways, depending on the mid + level dataset it supports, which is why we define the "number of elements" function + separately from the __len__ function here in the mid level dataset class + + Args: + low_level_dataset (LowLevelDataset): The underlying low level dataset + + Returns: + int: The number of elements in the underlying low level dataset + """ + raise NotImplementedError + + @staticmethod + def build_low_level_dataset( + dataset_path: str, config: BlendedMegatronDatasetConfig + ) -> LowLevelDataset: + """Build the low level dataset via a function to be called from within + BlendedMegatronDatasetBuilder.build_generic_dataset + + It may be that the low level dataset spans any subset of train/valid/test splits, 
which is + why we define a static "build" function separately from the constructor in the mid level + dataset class + + Args: + dataset_path (str): The real path on disk to the dataset + + config (BlendedMegatronDatasetConfig): The dataset config + + Returns: + LowLevelDataset: The low level dataset + """ + raise NotImplementedError + + @staticmethod + def _key_config_attributes() -> List[str]: + """Return all config attributes which contribute to uniquely identifying the dataset. + + These attributes will be used to build a uniquely identifying string and MD5 hash which + will be used to cache/load dataset resources from run to run. + + Returns: + List[str]: The key config attributes + """ + return ["random_seed", "sequence_length", "split", "split_matrix", "tokenizer"] + + @abstractmethod + def __len__(self) -> int: + """Return the length of the dataset + + Returns: + int: See abstract implementation + """ + pass + + @abstractmethod + def __getitem__(self, idx: int) -> Dict[str, Union[torch.Tensor, numpy.ndarray]]: + """Return from the dataset + + Args: + idx (int): The index into the dataset + + Returns: + Dict[str, Union[torch.Tensor, numpy.ndarray]]: See abstract implementation + """ + pass diff --git a/megatron/core/datasets/megatron_tokenizer.py b/megatron/core/datasets/megatron_tokenizer.py new file mode 100644 index 0000000000..84f3546cf3 --- /dev/null +++ b/megatron/core/datasets/megatron_tokenizer.py @@ -0,0 +1,154 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +import json +from abc import ABC, abstractmethod +from collections import OrderedDict +from typing import Any + +import numpy + + +class MegatronTokenizer(ABC): + """Abstract class for tokenizer + + Absent a config or class-specific tracking of which objects are uniquely identifying, we must + include all key word arguments as unique identifiers + + Args: + tokenizer_paths (Tuple[str]): All tokenizer source paths or prefixes + + tokenizer_options (Dict[str, Any]): All tokenizer options + """ + + def __init__(self, *tokenizer_paths: str, **tokenizer_options: Any): + + self.unique_identifiers = OrderedDict() + self.unique_identifiers["class"] = type(self).__name__ + self.unique_identifiers["tokenizer_path"] = list(tokenizer_paths) + for option in tokenizer_options: + self.unique_identifiers[option] = str(tokenizer_options[option]) + + self.unique_description = json.dumps(self.unique_identifiers, indent=4) + + super().__init__() + + @abstractmethod + def tokenize(self, text: str) -> numpy.ndarray: + """Convert text to embedding ids + + Args: + text (str): The text to convert + + Returns: + numpy.ndarray: The converted embedding ids + """ + pass + + def detokenize(self, ids: numpy.ndarray) -> str: + """Convert embedding ids to text + + Args: + ids (numpy.ndarray): The ids to convert + + Returns: + str: The converted text + + Raises: + NotImplementedError: Non-abstract, optional method + """ + raise NotImplementedError("{} has no method 'detokenize'".format(type(self).__name__)) + + def offsets(self, ids: list[int], text: str) -> list[int]: + """Convert embedding ids to text offsets + + Args: + ids (list[int]): The ids to convert + text (str): The text to convert + + Returns: + list[int]: The converted offsets + + Raises: + NotImplementedError: Non-abstract, optional method + """ + raise NotImplementedError("{} has no method 'offsets'".format(type(self).__name__)) + + @property + @abstractmethod + def vocab(self): + """Dictionary from vocab text token to id token""" + pass + + @property + @abstractmethod + 
def inv_vocab(self): + """Dictionary from vocab id token to text token""" + pass + + @property + @abstractmethod + def vocab_size(self): + """The vocabulary size""" + pass + + @property + def cls(self): + """The CLS token id + + Raises: + NotImplementedError: Non-abstract, optional attribute + """ + raise NotImplementedError("{} has no attribute 'cls'".format(type(self).__name__)) + + @property + def sep(self): + """The SEP token id + + Raises: + NotImplementedError: Non-abstract, optional attribute + """ + raise NotImplementedError("{} has no attribute 'sep'".format(type(self).__name__)) + + @property + def pad(self): + """The PAD token id + + Raises: + NotImplementedError: Non-abstract, optional attribute + """ + raise NotImplementedError("{} has no attribute 'pad'".format(type(self).__name__)) + + @property + def eod(self): + """The EOD token id + + Raises: + NotImplementedError: Non-abstract, optional attribute + """ + raise NotImplementedError("{} has no attribute 'eod'".format(type(self).__name__)) + + @property + def bos(self): + """The BOS token id + + Raises: + NotImplementedError: Non-abstract, optional attribute + """ + raise NotImplementedError("{} has no attribute 'bos'".format(type(self).__name__)) + + @property + def eos(self): + """The EOS token id + + Raises: + NotImplementedError: Non-abstract, optional attribute + """ + raise NotImplementedError("{} has no attribute 'eos'".format(type(self).__name__)) + + @property + def mask(self): + """The MASK token id + + Raises: + NotImplementedError: Non-abstract, optional attribute + """ + raise NotImplementedError("{} has no attribute 'mask'".format(type(self).__name__)) diff --git a/megatron/core/datasets/multimodal_dataset.py b/megatron/core/datasets/multimodal_dataset.py new file mode 100644 index 0000000000..0a3e93a15b --- /dev/null +++ b/megatron/core/datasets/multimodal_dataset.py @@ -0,0 +1,62 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +from dataclasses import dataclass +from typing import Callable, Dict + +import torch + +from megatron.core.datasets.gpt_dataset import GPTDatasetConfig, MockGPTDataset + + +@dataclass +class MultimodalDatasetConfig(GPTDatasetConfig): + """Configuration object for Megatron Core Multimodal datasets. + + Note: This is unused at the moment and may be missing features. Follow-up changes will use this. + """ + + image_h: int = None + """Image height.""" + + image_w: int = None + """Image width.""" + + # Function to preprocess the data sample to a format expected by a specific model. By default, do nothing. + preprocess_func: Callable[[Dict[str, torch.Tensor]], Dict[str, torch.Tensor]] = lambda x: x + """Optional function to preprocess data samples for a specific model.""" + + def __post_init__(self) -> None: + super().__post_init__() + + assert self.image_h is not None + assert self.image_w is not None + + +class MockMultimodalDataset(MockGPTDataset): + """Mock multimodal dataset. + + + This is unused at the moment and may be missing features. Follow-up changes will use this. + """ + + def __getitem__(self, idx: int) -> Dict[str, torch.Tensor]: + """Return a sample that contains a dummy image, text sequence and the associated labels and cost and attention masks. + + Args: + idx (int): The integer seed for mock data generation. + + Returns: + Dict[str, torch.Tensor]: The mock data. + """ + # Get a text sample. + sample = super().__getitem__(idx) + + # Add mock input image. 
+ sample["image"] = torch.zeros( + (3, self.config.image_h, self.config.image_w), dtype=torch.float32 + ) + + # Run optional data preprocessing. + preprocess_func = self.config.preprocess_func + + return preprocess_func(sample) diff --git a/megatron/core/datasets/readme.md b/megatron/core/datasets/readme.md new file mode 100644 index 0000000000..12ade943b5 --- /dev/null +++ b/megatron/core/datasets/readme.md @@ -0,0 +1,193 @@ +# Data Pipeline + +## Data pre-processing + +Data preprocessing is built around the following classes: + +1. `IndexedDatasetBuilder` +2. `IndexedDataset` + +At the moment, an end-to-end data preprocessing implementation is left to the user. See the class docstring(s) for more details. + +#### IndexedDatasetBuilder + +The `IndexedDatasetBuilder` is capable of building and merging `IndexedDataset` instances. + +#### IndexedDataset + +The `IndexedDataset` class is the lowest-level data interface in Megatron Core. Internally, an `IndexedDataset` instance references two binaries: the data file (`.bin`) contains document/sequence data and the index file (`.idx`) contains document/sequence metadata. + +The index file stores dataset-level metadata first: +- The index header, for backward compatibility +- The index version, for backward compatibility +- A numeric code corresponding to the data type used to write data to the data file +- The number of sequences in the dataset +- The number of documents in the dataset + +The index file stores document-level and sequence-level metadata second: +- In order, the number of elements per sequence +- In order, the byte offset (pointer) per sequence +- In order, the consecutive sequence index range `[...)` per document +- In order, the mode per sequence (in the multimodal case) + +## Data loading: construction + +Building the data loaders is a distributed-aware process built around the following classes: + +1. `BlendedMegatronDatasetConfig` +2. `BlendedMegatronDatasetBuilder` +3. `IndexedDataset` +3. `MegatronDataset` +4. `BlendedDataset` + +See the class docstrings for more details. + +#### BlendedMegatronDatasetConfig (extendable) + +The `BlendedMegatronDatasetConfig` class parameterizes the `BlendedMegatronDatasetBuilder` and in turn the `MegatronDataset` and `BlendedDataset`. + +Different training/inference regimes will require different extensions e.g. the `GPTDatasetConfig` + +#### BlendedMegatronDatasetBuilder + +The `BlendedMegatronDatasetBuilder` class builds the highest-level data interfaces in Megatron Core. + +**NB:** All ranks should attempt to build the dataset via the `BlendedMegatronDatasetBuilder` or the program will hang. Which ranks follow through on their attempts can be controlled via the `BlendedMegatronDatasetConfig`. + +#### IndexedDataset + +The `IndexedDataset` class is the lowest-level data interface in Megatron Core. + +The `IndexedDataset` should already exist on disk before attempting to build any of the high-level data interfaces. + + +#### MegatronDataset (extendable) + +The `MegatronDataset` abstract class is a high-level data interface in Megatron Core. It is an abstraction built upon the `IndexedDataset`. + +Different training/inference regimes will require different extensions e.g. the `GPTDataset` + +#### BlendedDataset + +The `BlendedDataset` class is a high-level data interface in Megatron Core. It is an abstraction built upon the `MegatronDataset`. + +The `BlendedDataset` is only necessary when a blend multiple data distributions, i.e. 
multiple `MegatronDataset` instances, should contribute to a certain dataset split. The blend can be controlled via the `BlendedMegatronDatasetConfig`. + +## Data loading: implementation + +### GPTDataset + +The `GPTDataset` is parameterized by the following variables: the underlying `IndexedDataset` instance `indexed_dataset`, the split indices `indexed_indices` (the congituous subset of document or sequence indices used for training, validation, and testing), the number of samples `N`, the sequence length `S`, and the random seed `R`. + +The `GPTDataset` creates three index mappings to facilitate lookup: (1) the document index, (2) the sample index, and (3) the shuffle index. + +1. The document index _Do_idx_ is a 1-D array mapping from _i_ to document index of length `E * |indexed_indices|` where `E` corresponds to the minimum number of epochs such that `E * |indexed_indices| >= N`. The document index is shuffled according to `R`. + + ``` + Given: + + N = 15 + indexed_indices = [5, 6, 7, 8, 9] + E = 3 + + Then, for example: + + Do_idx = [8, 8, 9, 6, 7, 5, 8, 5, 6, 6, 5, 9, 7, 7, 9] + ``` + +2. The sample index _Sa_idx_ is a 2-D array mapping from _j_ to pairs of (_i_, _Do_idx_[ _i_ ] offset) of shape `[N + 1, 2]`. The rows _j_ and _j_ + 1 serve as the left and right bounds for the _j_-th sample. + + ``` + Given: + + S = 1024 + + Then, for example: + + Sa_idx[0] = (0, 0) + Sa_idx[1] = (0, 1024) => Do_idx[0] has length greater than S + Sa_idx[2] = (1, 512) => Do_idx[0] has length 1536 + Sa_idx[3] = (2, 0) => Do_idx[1] has length 1536 + Sa_idx[4] = (5, 300) => Do_idx[2:5] are shorter documents relative to Do_idx[0:2] + Sa_idx[5] = (6, 24) => Do_idx[5] has length 1300 + ``` + +3. The shuffle index _Sh_idx_ is a 1-D array mapping from _k_ to _j_ of length `N`. The shuffle index is shuffled according to `R`. + + ``` + Given + + N = 10 + + Then, for example: + + Sh_idx = [4, 0, 2, 6, 1, 9, 5, 8, 7, 3] + ``` + +To query the `GPTDataset` for the _k_-th sample we do the following + +- Use the shuffle index to get the index _j_ into the sample index. + + ``` + j = Sh_idx[k] + ``` +- Use the sample index to get the left and right sample-bounding indices into the document index and the starting token offset for each document. + + ``` + i, offset = Sa_idx[j] + i_next, offset_next = Sa_idx[j + 1] + ``` +- Use the document index to retrieve `S` tokens from consecutive (in the document index) documents. + + ``` + sample = [] + sample += indexed_dataset[Do_idx[i]][offset:] + if i != i_next: + sample += indexed_dataset[Do_idx[i + 1:i_next]] + sample += indexed_dataset[Do_idx[i_next]][:offset_next] + ``` + +To save time during initialization, each index is built/cached sequentially on one process rank and subsequently loaded in parallel on other process ranks. The cached indices are unique to a hash generated in the `MegatronDataset.__init__` function. + +### BlendedDataset + +The `BlendedDataset` is parameterized by the following variables: the underlying `MegatronDataset` instances `D`, the weights `W` (one per dataset), and the size `S`. The `BlendedDataset` will draw samples from contributing datasets in proportion to the weights until achieving a composite dataset of the desired size. During each sampling step, we draw a single sample from the dataset which has the greatest sampling error. + +The `BlendedDataset` creates two "blending" indices to facilitate lookup: (1) the dataset index and (2) the dataset sample index. + +1. 
The dataset index _Da_idx_ is a 1-D array mapping from _i_ to dataset index of length `S`. + + ``` + Given + + D = [d0, d1, d2] + W = [1/2, 1/4, 1/4] + S = 4 + + Then, for example: + + Da_idx = [0, 1, 2, 0] + + ``` + +2. The dataset sample index _Sa_idx_ is a 1-D mapping from _i_ to the sample index for dataset _Da_idx[i]_ of length `S`. + + ``` + Given + + Da_idx = [0, 1, 2, 0] + + Then, for example: + + Sa_idx = [0, 0, 0, 1] + ``` + +To query the `BlendedDataset` for the _k_-th sample we do the following + +- Use the dataset index to retrieve the corresponding dataset from `D` and the dataset sample index to retrieve the corresponding sample from that dataset. + + ``` + sample = D[Da_idx[k]][Sa_idx[k]] + ``` + +To save time during initialization, each index is built/cached sequentially on one process rank and subsequently loaded in parallel on other process ranks. The cached indices are unique to a hash generated in the `BlendedDataset.__init__` function. diff --git a/megatron/core/datasets/retro/__init__.py b/megatron/core/datasets/retro/__init__.py new file mode 100644 index 0000000000..7ce970c6e9 --- /dev/null +++ b/megatron/core/datasets/retro/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +from .config import RetroGPTChunkDatasets +from .query.multi_split_gpt_dataset import MultiSplitGPTDataset, MultiSplitGPTDatasetConfig +from .query.retro_dataset import get_retro_datasets diff --git a/megatron/core/datasets/retro/config/__init__.py b/megatron/core/datasets/retro/config/__init__.py new file mode 100644 index 0000000000..3635bedb3f --- /dev/null +++ b/megatron/core/datasets/retro/config/__init__.py @@ -0,0 +1,16 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +""" +Exports: + + - Embedder: Base class for all Bert embedders. + - RetroBertEmbedders: Container class for in-memory and on-disk embedders. + - RetroPreprocessingConfig: Configuration class for all of Retro preprocessing. + - RetroGPTChunkDatasets: Container class for train, valid, and test datasets. + - RetroTokenizers: Container class for GPT and Bert tokenizers. +""" + +from .bert_embedders import Embedder, RetroBertEmbedders +from .config import RetroPreprocessingConfig +from .gpt_chunk_datasets import RetroGPTChunkDatasets +from .tokenizers import RetroTokenizers diff --git a/megatron/core/datasets/retro/config/bert_embedders.py b/megatron/core/datasets/retro/config/bert_embedders.py new file mode 100644 index 0000000000..8f3fe85c4a --- /dev/null +++ b/megatron/core/datasets/retro/config/bert_embedders.py @@ -0,0 +1,48 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +"""Container dataclass for holding both in-memory and on-disk Bert embedders.""" + +import abc +from dataclasses import dataclass +from typing import Any + +import numpy as np +import torch + + +class Embedder(abc.ABC): + """Base class for all Bert embedders. + + All embedders should be able to embed either an entire text dataset (to a 2D + numpy array), or a single text string (to a 1D numpy array). + """ + + @abc.abstractmethod + def embed_text_dataset(self, text_dataset: torch.utils.data.Dataset) -> np.ndarray: + """Embed a text dataset. + + Args: + text_dataset (torch.utils.data.Dataset): Text dataset to embed. Each sample of the text dataset should output a dict with a key 'text' and a string value. + + Returns: + A 2D ndarray with shape (len(text_dataset), dimension(embedder)). 
+        """
+
+    @abc.abstractmethod
+    def embed_text(self, text: str) -> np.ndarray:
+        """Embed a simple string of text.
+
+        Args:
+            text (str): A single text sample.
+
+        Returns:
+            A 1D ndarray with shape (dimensions(embedder),).
+        """
+
+
+@dataclass
+class RetroBertEmbedders:
+    """Container dataclass for in-memory and on-disk Bert embedders."""
+
+    disk: Embedder
+    mem: Embedder
diff --git a/megatron/core/datasets/retro/config/config.py b/megatron/core/datasets/retro/config/config.py
new file mode 100644
index 0000000000..ac9ca84124
--- /dev/null
+++ b/megatron/core/datasets/retro/config/config.py
@@ -0,0 +1,135 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+
+"""Retro preprocessing config."""
+
+from dataclasses import dataclass
+
+from megatron.core.transformer import TransformerConfig
+
+from .bert_embedders import RetroBertEmbedders
+from .gpt_chunk_datasets import RetroGPTChunkDatasets
+from .tokenizers import RetroTokenizers
+
+
+@dataclass
+class RetroPreprocessingConfig(TransformerConfig):
+    """Configuration object for Retro preprocessing.
+
+    *Note* : Arguments prefixed with '--retro-gpt-*' or '--retro-bert-*' are
+    included and named as such to more easily handle managing both models
+    running at the same time. Megatron is not optimized to run two models at
+    once, so this naming convention makes it clearer.
+
+    Args:
+
+        retro_project_dir (str): Retro project directory, which contains the preprocessed data for pretraining. This directory is built during preprocessing (see tools/retro/README.md), and contains subdirectories for the chunk database and pretraining neighbors.
+        retro_tasks (str): Comma-separated list of tasks to run. Run the entire preprocessing pipeline by using '--retro-tasks build'. Alternatively, run individual stages with tasks (in this order) 'db-build', 'index-build', or 'query-pretraining-neighbors'. For example, '--retro-tasks db-build,index-build,query-pretraining-neighbors' is equivalent to '--retro-tasks build'; or the argument can contain a subset of these tasks. Stages must always be run in the correct order (listed above).
+        retro_task_validate (float): If defined, validate a randomly sampled subset of the existing results of the given task. Each task implements a 'validate' method that is responsible for sampling a `retro_task_validate` fraction of the existing results, and then checking for bitwise equality with the current code base. (E.g., `--retro-task-validate 0.01`.)
+        retro_block_size (int): Number of chunks to process at a time when generating Bert embeddings and querying the search index. Partial results for each block are generally saved to disk in separate files.
+        retro_doc_block_size (int): Number of documents to process at a time when processing token datasets into chunk databases. The partial chunk database for each block is saved into a separate file.
+        retro_gpt_seed (int): Random seed used for python, numpy, pytorch, and cuda.
+        retro_gpt_data_path (str): Path to the training dataset. Accepted format: 1) a single data path, 2) multiple datasets in the form: dataset1-weight dataset1-path dataset2-weight dataset2-path ... It is used with --split when a single dataset is used for all three: train, valid and test. It is exclusive to the other --*-data-path args.
+        retro_gpt_data_cache_path (str): Path to a directory to hold cached index files.
+        retro_gpt_split (str): Comma-separated list of proportions for training, validation, and test split.
For example the split `90,5,5` will use 90%% of data for training, 5%% for validation and 5%% for test. + retro_gpt_train_samples (int): Total number of samples to train over all training runs. + retro_gpt_eval_interval (int): GPT evaluation interval. + retro_gpt_eval_iters (int): GPT evaluation iterations. + retro_gpt_tokenizer_type (str): GPT tokenizer type. + retro_gpt_tokenizer_model (str): GPT tokenizer model file. + retro_gpt_vocab_file (str): GPT vocab file. + retro_gpt_merge_file (str): GPT merge file. + retro_gpt_seq_length (int): GPT sequence length. + retro_gpt_global_batch_size (int): GPT global batch size. + retro_gpt_chunk_length (int): GPT chunk length. + retro_bert_tokenizer_type (str): Bert tokenizer type (for when using '--bert-embedder-type megatron'). + retro_bert_vocab_file (str): Bert vocab file. + retro_bert_batch_size (int): Micro-batch size for processing Bert embeddings. + retro_bert_max_chunk_length (int): Maximum sequence length for Bert embeddings. (Named 'chunk' here in reference to these Bert sequences being converted from GPT chunks.) + retro_index_type (str): A 'faiss-base' index is a simple, un-optimized wrapper around a Faiss index. A 'faiss-par-add' index optimizes the 'add()' method by making it multi-node and multi-process, but with bit-wise equivalent results. + retro_index_str (str): Index string used for calling faiss.index_factory(). For example, 'IVF262144_HNSW32,Flat' or 'OPQ32_256,IVF4194304_HNSW32,PQ32'. + retro_index_ntrain (int): Number of database chunks to use for training the index. This value must be less or equal to the total number of chunks in the database. + retro_index_train_load_fraction (float): Fraction of sampled chunks to use for training the index. Useful when our total sampled embeddings use too much memory; lowering the load fraction is less costly than re-embedding a new sampled dataset from scratch. + retro_index_add_load_fraction (float): Fraction of database chunks to use for adding to the index. Useful when our total index size would use too much memory; lowering the load fraction is less costly than re-designing our token datasets. + retro_index_delete_training_embeddings (bool): Delete training embeddings for the search index. Useful for debugging. + retro_index_delete_added_codes (bool): Delete added codes for the search index. Useful for debugging. + retro_query_ef_search (int): Index ef-search parameter for Hierarchical Navigable Small Worlds (HNSW) during querying. + retro_query_nprobe (int): Index nprobe parameter for Inverted File (IVF) during querying. + retro_query_num_neighbors_query (int): Number of neighbors to retrieve when calling index.search(). + retro_query_num_neighbors_save (int): Number of neighbors to save to disk after the index's returned neighbors. If longer than target value, neighbors truncated; and if shorter than target value, neighbors are padded with -1's. + retro_bert_embedders (RetroBertEmbedders): Set of Bert embedders used for embedding chunks. Contains entries: 1) 'mem' for an in-memory embedder, and 2) 'disk' for an embedder that saves results in blocks to disk. + retro_gpt_chunk_datasets (RetroGPTChunkDatasets): GPT datasets for 'train', 'valid', and 'test'. + retro_tokenizers (RetroTokenizers): GPT ('gpt') and Bert ('bert') tokenizers. + """ + + # Basic. + retro_project_dir: str = None + retro_tasks: str = 'build' + retro_task_validate: float = None + retro_block_size: int = 100000 + retro_doc_block_size: int = 100000 + + # GPT. 
+ retro_gpt_seed: int = 1234 + retro_gpt_data_path: list = None # basic list here, for parsing purposes + retro_gpt_data_cache_path: str = None + retro_gpt_split: str = '969,30,1' + retro_gpt_train_samples: int = None + retro_gpt_eval_interval: int = None + retro_gpt_eval_iters: int = None + retro_gpt_tokenizer_type: str = None + retro_gpt_tokenizer_model: str = None + retro_gpt_vocab_file: str = None + retro_gpt_merge_file: str = None + retro_gpt_seq_length: int = None + retro_gpt_global_batch_size: int = None + retro_gpt_chunk_length: int = 64 + + # Bert. + retro_bert_tokenizer_type: str = None + retro_bert_vocab_file: str = None + retro_bert_batch_size: int = 128 + retro_bert_max_chunk_length: int = 256 + + # Index. + retro_index_type: str = 'faiss-par-add' + retro_index_str: str = None + retro_index_ntrain: int = None + retro_index_train_load_fraction: float = 1.0 + retro_index_add_load_fraction: float = 1.0 + retro_index_delete_training_embeddings: bool = True + retro_index_delete_added_codes: bool = True + + # Query. + retro_query_ef_search: int = 256 + retro_query_nprobe: int = 65536 + retro_query_num_neighbors_query: int = 200 + retro_query_num_neighbors_save: int = 20 + + # Tools. + retro_bert_embedders: RetroBertEmbedders = None + retro_gpt_chunk_datasets: RetroGPTChunkDatasets = None + retro_tokenizers: RetroTokenizers = None + + def __post_init__(self) -> None: + """Validate Retro config.""" + + # Validate required attributes. + assert self.retro_project_dir is not None + assert self.retro_tasks is not None + assert self.retro_gpt_data_path is not None or self.retro_gpt_data_cache_path is not None + assert self.retro_gpt_train_samples is not None + assert self.retro_gpt_eval_interval is not None + assert self.retro_gpt_eval_iters is not None + assert self.retro_gpt_tokenizer_type is not None + assert self.retro_gpt_tokenizer_model is not None or ( + self.retro_gpt_vocab_file is not None and self.retro_gpt_merge_file is not None + ) + assert self.retro_gpt_seq_length is not None + assert self.retro_gpt_global_batch_size is not None + assert self.retro_bert_tokenizer_type is not None + assert self.retro_bert_vocab_file is not None + assert self.retro_index_str is not None + assert self.retro_index_ntrain is not None + + # Split retro tasks. + self.retro_tasks = self.retro_tasks.split(",") diff --git a/megatron/core/datasets/retro/config/gpt_chunk_datasets.py b/megatron/core/datasets/retro/config/gpt_chunk_datasets.py new file mode 100644 index 0000000000..831b1d812b --- /dev/null +++ b/megatron/core/datasets/retro/config/gpt_chunk_datasets.py @@ -0,0 +1,15 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +"""Container dataclass for GPT chunk datasets (train, valid, and test).""" + +from dataclasses import dataclass + + +@dataclass +class RetroGPTChunkDatasets: + """Container dataclass for GPT chunk datasets.""" + + # Each dict contains 'dataset', 'neighbor_dir', and 'num_active_chunks'. + train: dict = None + valid: dict = None + test: dict = None diff --git a/megatron/core/datasets/retro/config/tokenizers.py b/megatron/core/datasets/retro/config/tokenizers.py new file mode 100644 index 0000000000..2e731c83b9 --- /dev/null +++ b/megatron/core/datasets/retro/config/tokenizers.py @@ -0,0 +1,15 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
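
`RetroPreprocessingConfig.__post_init__` above only verifies that the required fields are present before splitting `retro_tasks`. The sketch below shows roughly what a caller must supply to pass those checks; every path, tokenizer name, and numeric value is a placeholder, and the `num_layers`/`hidden_size`/`num_attention_heads` fields are assumed to be required by the parent `TransformerConfig`, which is outside this patch.

```python
# Hedged sketch: a minimal RetroPreprocessingConfig that satisfies __post_init__.
from megatron.core.datasets.retro.config import RetroPreprocessingConfig

config = RetroPreprocessingConfig(
    # Assumed TransformerConfig fields (not defined in this patch).
    num_layers=12,
    hidden_size=768,
    num_attention_heads=12,
    # Retro preprocessing fields checked in __post_init__ (placeholder values).
    retro_project_dir="/path/to/retro_project",
    retro_tasks="db-build,index-build",
    retro_gpt_data_path=["/path/to/corpus_text_document"],
    retro_gpt_train_samples=100_000,
    retro_gpt_eval_interval=1_000,
    retro_gpt_eval_iters=100,
    retro_gpt_tokenizer_type="GPT2BPETokenizer",
    retro_gpt_vocab_file="/path/to/gpt2-vocab.json",
    retro_gpt_merge_file="/path/to/gpt2-merges.txt",
    retro_gpt_seq_length=2048,
    retro_gpt_global_batch_size=256,
    retro_bert_tokenizer_type="BertWordPieceLowerCase",
    retro_bert_vocab_file="/path/to/bert-vocab.txt",
    retro_index_str="IVF262144_HNSW32,Flat",
    retro_index_ntrain=1_000_000,
)

# __post_init__ has validated the fields and split the task string.
assert config.retro_tasks == ["db-build", "index-build"]
```
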
+ +"""Container class for GPT and Bert tokenizers.""" + +from dataclasses import dataclass + +from megatron.core.datasets.megatron_tokenizer import MegatronTokenizer + + +@dataclass +class RetroTokenizers: + """Container class for GPT and Bert tokenizers.""" + + gpt: MegatronTokenizer = None + bert: MegatronTokenizer = None diff --git a/megatron/core/datasets/retro/db/__init__.py b/megatron/core/datasets/retro/db/__init__.py new file mode 100644 index 0000000000..f1f460b3b0 --- /dev/null +++ b/megatron/core/datasets/retro/db/__init__.py @@ -0,0 +1,9 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +""" +Exports: + + - build_db: Build a chunk database from a list of indexed datasets. +""" + +from .build import build_db diff --git a/megatron/core/datasets/retro/db/build.py b/megatron/core/datasets/retro/db/build.py new file mode 100644 index 0000000000..44b9038230 --- /dev/null +++ b/megatron/core/datasets/retro/db/build.py @@ -0,0 +1,633 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +"""Build a chunk database from a list of indexed datasets. + +Building a chunk database consists of. + + - Breaking each document of each indexed dataset into consecutive + retro_gpt_chunk_length chunks. + - Re-tokenize each chunk into Bert, and discard any chunks with empty Bert + tokens. + - Save chunk offsets to disk for each indexed dataset. +""" + +import glob +import os +import types +from concurrent.futures import ProcessPoolExecutor, as_completed +from typing import Dict, List, Tuple + +import numpy as np +import torch +from tqdm import tqdm + +from megatron.core.datasets.indexed_dataset import IndexedDataset +from megatron.core.datasets.retro.config import RetroPreprocessingConfig +from megatron.core.datasets.retro.external_libs import h5py +from megatron.core.datasets.retro.utils import ( + extract_data_config, + get_blocks_by_rank, + log_retro_rank_0, + retro_makedir, +) + +from .utils import ( + get_indexed_dataset_infos, + get_indexed_dataset_infos_path, + get_individual_chunk_db, + get_individual_db_dir, + get_individual_db_paths, + get_individual_doc_offsets, + get_merged_db_path_map, + init_indexed_dataset_infos, + load_indexed_datasets, + save_indexed_dataset_infos, +) + + +def build_partial_db( + config: types.SimpleNamespace, + dataset_idx: int, + n_datasets: int, + indexed_dataset: IndexedDataset, + block_id: int, + n_blocks: int, + block: dict, + proc_id: int, + n_procs: int, +) -> Tuple[int, list, list, dict]: + """Process a document index range of the indexed dataset. + + The chunk database is built in parallel blocks, since de-tokenizing & + re-tokenizing for Bert-length computation is expensive. This method + iterates each document and extracts sequential 'chunk-length' sequences + from each document. + + Args: + config (types.SimpleNamespace): Subset of Retro config, containing 'chunk_length', 'gpt_eod', 'gpt_detokenize', 'bert_tokenize', and 'task_validate'. + dataset_idx (int): Index of this dataset out of all blended datasets. + n_datasets (int): Total number of blended datasets. + indexed_dataset (IndexedDataset): Indexed dataset to be chunked. + block_id (int): Block index out of all blocks to be processed. + n_blocks (int): Total number of blocks to be processed. + block (dict): Range information such as start/end points for chunking idnexed dataset. + proc_id (int): Process ID for tracking parallel process order. + n_procs (int): Total number of parallel processes. + + Returns: + A tuple containing: + + - Process ID. 
+ - List of valid chunks. + - List of invalid chunks (i.e., chunks that converted to empty Bert embeddings.). + - Dict mapping document ID to number of valid chunks. + """ + + # Document start/end indexes. + doc_range = block["range"] + n_docs = doc_range[1] - doc_range[0] + n_docs_per_proc = int(np.ceil(n_docs / n_procs)) + doc_start_id = doc_range[0] + proc_id * n_docs_per_proc + doc_end_id = min(doc_range[1], doc_start_id + n_docs_per_proc) + + # Print progress. + progress_proc_ids = set(range(n_procs)) if torch.distributed.get_rank() == 0 else set() + if proc_id in progress_proc_ids: + log_retro_rank_0( + " > building partial chunk db, proc %d / %d, docs %d:%d / %d." + % (proc_id, n_procs, doc_start_id, doc_end_id, n_docs) + ) + + # Progress bars (snapshot of overall progress). + doc_id_iter = range(doc_start_id, doc_end_id) + pbar = ( + tqdm(doc_id_iter, "parse doc chunks", miniters=len(doc_id_iter) // 20) + if proc_id in progress_proc_ids + else doc_id_iter + ) + + # Iterate documents & parse chunks. + chunk_db_valid: List[Tuple] = [] + chunk_db_invalid: List[Tuple] = [] + doc_size_map = {} + for doc_id in pbar: + + # Progress description. + try: + pbar.set_description( + "%sds %d / %d, block %d / %d, proc %d / %d." + % ( + "" if config.task_validate is None else "[validate] ", + dataset_idx, + n_datasets, + block_id, + n_blocks, + proc_id, + n_procs, + ) + ) + except Exception: + pass + + # Remove EOD token. + doc = indexed_dataset.get(doc_id) + if doc[-1].item() == config.gpt_eod: + doc = doc[:-1] + doc_len = len(doc) + + # Chunk start/end indexes. + chunk_start_idxs = list(range(0, doc_len, config.chunk_length)) + chunk_end_idxs = [min(doc_len, s + config.chunk_length) for s in chunk_start_idxs] + + # Re-tokenize each chunk to Bert/Wordpiece (empty bert -> 'invalid'). + doc_size_map[doc_id] = 0 + for i, chunk_start_idx in enumerate(chunk_start_idxs): + + # Re-tokenize. + chunk_end_idx = chunk_end_idxs[i] + gpt_token_ids = indexed_dataset.get( + idx=doc_id, offset=chunk_start_idx, length=chunk_end_idx - chunk_start_idx + ) + text = config.gpt_detokenize(gpt_token_ids.tolist()) + bert_token_ids = config.bert_tokenize(text) + + # 'Valid' for non-empty Bert chunks; 'invalid' otherwise. + if len(bert_token_ids) == 0: + _chunk_db = chunk_db_invalid + else: + _chunk_db = chunk_db_valid + doc_size_map[doc_id] += 1 + _chunk_db.append((doc_id, chunk_start_idx, chunk_end_idx, len(bert_token_ids))) + + return proc_id, chunk_db_valid, chunk_db_invalid, doc_size_map + + +def build_block_db( + config: RetroPreprocessingConfig, + dataset_idx: int, + n_datasets: int, + indexed_dataset: IndexedDataset, + n_procs: int, + executor: ProcessPoolExecutor, + n_missing_blocks: int, + block_idx: int, + block: dict, +) -> Tuple[np.ndarray, np.ndarray, np.ndarray]: + """Split each document within block into consecutive retro_gpt_chunk_length size chunks. + + Args: + config (RetroPreprocessingConfig): For DB building, we make use of attributes 'chunk_length', 'gpt_eod', 'gpt_detokenize', 'bert_tokenize', and 'task_validate'. + dataset_idx (int): Index of this dataset out of all blended datasets. + n_datasets (int): Total number of blended datasets. + indexed_dataset (IndexedDataset): Indexed dataset to be chunked. + n_procs (int): Total number of parallel processes. + executor (ProcessPoolExecutor): Executor for launching parallel processes. + n_missing_blocks (int): Total number of blocks to be processed. + block_idx (int): Block index out of all blocks to be processed. 
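For intuition, the per-document chunking performed in `build_partial_db` above is plain fixed-stride index arithmetic; a standalone sketch with toy numbers (no Megatron imports):

    # Toy example: splitting a 150-token document into 64-token GPT chunks,
    # mirroring the chunk_start_idxs / chunk_end_idxs computation above.
    doc_len, chunk_length = 150, 64
    chunk_start_idxs = list(range(0, doc_len, chunk_length))
    chunk_end_idxs = [min(doc_len, s + chunk_length) for s in chunk_start_idxs]
    # -> [(0, 64), (64, 128), (128, 150)]; the shorter final chunk is kept, and is
    #    discarded later only if its Bert re-tokenization turns out to be empty.
    print(list(zip(chunk_start_idxs, chunk_end_idxs)))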
+ block (dict): Range information such as start/end points for chunking idnexed dataset. + + Returns: + A tuple containing: + + - List of valid chunks. + - List of invalid chunks (i.e., chunks that converted to empty Bert embeddings.). + - Dict mapping document ID to number of valid chunks. + """ + + # Build partial dbs. + log_retro_rank_0(' > build partial dbs.') + futures = [] + for proc_id in range(n_procs): # not true process id + futures.append( + executor.submit( + build_partial_db, + types.SimpleNamespace( + chunk_length=config.retro_gpt_chunk_length, + gpt_eod=config.retro_tokenizers.gpt.eod, + gpt_detokenize=config.retro_tokenizers.gpt.detokenize, + bert_tokenize=config.retro_tokenizers.bert.tokenize, + task_validate=config.retro_task_validate, + ), + dataset_idx, + n_datasets, + indexed_dataset, + block_idx, + n_missing_blocks, + block, + proc_id, + n_procs, + ) + ) + partial_chunk_dbs = [] + for future in as_completed(futures): + partial_chunk_dbs.append(future.result()) + + # Concatenate chunks. + partial_chunk_dbs.sort(key=lambda item: item[0]) # sort by proc_id + chunk_db_valid = [ + item for partial_chunk_db in partial_chunk_dbs for item in partial_chunk_db[1] + ] + chunk_db_invalid = [ + item for partial_chunk_db in partial_chunk_dbs for item in partial_chunk_db[2] + ] + + # Convert to numpy. + log_retro_rank_0(' > converting chunk db to numpy.') + chunk_db_valid = np.array(chunk_db_valid, dtype="uint32") + chunk_db_invalid = np.array(chunk_db_invalid, dtype="uint32") + + # Document offsets. + doc_sizes = [ + (d, s) for partial_chunk_db in partial_chunk_dbs for d, s in partial_chunk_db[3].items() + ] + doc_sizes.sort(key=lambda item: item[0]) + doc_offsets = np.cumsum([item[1] for item in doc_sizes]).astype("uint64") + doc_offsets = np.stack( + (np.array([item[0] for item in doc_sizes], dtype="uint64"), doc_offsets), axis=1 + ) + + return chunk_db_valid, chunk_db_invalid, doc_offsets + + +def save_block_db( + block: dict, chunk_db_valid: np.ndarray, chunk_db_invalid: np.ndarray, doc_offsets: np.ndarray +) -> None: + """Save block of chunked tokens to disk. These blocks are later used for + training and adding to the vector index. + + Args: + block (dict): Range information such as start/end points for chunking idnexed dataset. + chunk_db_valid (np.ndarray): Array of valid chunk indexes. + chunk_db_invalid (np.ndarray): Array of invalid chunk indexes. + doc_offsets (np.ndarray): Array of document offsets by chunks. + """ + log_retro_rank_0(" > saving individual db.") + with h5py.File(block["path"], "w") as f: + dset = f.create_dataset("chunks_valid", data=chunk_db_valid) + dset = f.create_dataset("chunks_invalid", data=chunk_db_invalid) + dset = f.create_dataset("doc_offsets", data=doc_offsets) + + +def build_individual_db( + config: RetroPreprocessingConfig, dataset_idx: int, n_datasets: int, dataset_info: dict +) -> None: + """Process a single indexed dataset & extract chunks. + + Args: + config (RetroPreprocessingConfig): Retro preprocessing config. + dataset_idx (int): Dataset index within blended dataset. + n_datasets (int): Total number of datasets within blended dataset. + dataset_info (dict): Metadata for dataset (see `save_indexed_dataset_infos()` in `utils.py` for more detail). + """ + + # Make directory. + db_dir = get_individual_db_dir(config.retro_project_dir, dataset_info["prefix"]) + retro_makedir(config, db_dir) + + # Indexed dataset. + indexed_dataset = dataset_info["dataset"] + + # Missing DB blocks (split by documents). 
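The `doc_offsets` array assembled in `build_block_db` above pairs each document id with a running total of its valid chunks; a small numpy sketch with made-up sizes:

    import numpy as np

    # (doc_id, n_valid_chunks) pairs as produced by the partial-DB workers (toy values).
    doc_sizes = [(7, 3), (8, 5), (9, 2)]
    doc_sizes.sort(key=lambda item: item[0])
    offsets = np.cumsum([n for _, n in doc_sizes]).astype("uint64")
    doc_offsets = np.stack(
        (np.array([d for d, _ in doc_sizes], dtype="uint64"), offsets), axis=1
    )
    # doc_offsets -> [[7, 3], [8, 8], [9, 10]]; column 1 is the cumulative chunk count.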
+ blocks = get_blocks_by_rank( + db_dir, + len(indexed_dataset), + config.retro_doc_block_size, + validate=lambda f: f["chunks_valid"].shape == (0,) or f["chunks_valid"].shape[1] == 4, + sample=config.retro_task_validate, + ) + if config.retro_task_validate is None: + active_blocks = blocks.missing + else: + assert blocks.n_missing_world == 0 + active_blocks = blocks.existing + + # Prevent missing-path-write race condition. + torch.distributed.barrier() + + # Nothing to do? + if config.retro_task_validate is None and not active_blocks: + return + + # Num processes. + if blocks.n_missing_world == 1: + n_procs = 128 + elif blocks.n_missing_world <= 2: + n_procs = 64 + elif blocks.n_missing_world <= 4: + n_procs = 32 + elif blocks.n_missing_world <= 8: + n_procs = 16 + else: + n_procs = 8 + + # Process documents in parallel. + with ProcessPoolExecutor(max_workers=n_procs) as executor: + for block_idx, block in enumerate(active_blocks): + + if block is not None: + + # Build block DB. + chunk_db_valid, chunk_db_invalid, doc_offsets = build_block_db( + config=config, + dataset_idx=dataset_idx, + n_datasets=n_datasets, + indexed_dataset=indexed_dataset, + n_procs=n_procs, + executor=executor, + n_missing_blocks=len(active_blocks), + block_idx=block_idx, + block=block, + ) + + if config.retro_task_validate is None: + # Save block DB. + save_block_db( + block=block, + chunk_db_valid=chunk_db_valid, + chunk_db_invalid=chunk_db_invalid, + doc_offsets=doc_offsets, + ) + + else: + + # Load existing block DB. + with h5py.File(block["path"]) as f: + existing_chunks_valid = np.copy(f["chunks_valid"]) + existing_chunks_invalid = np.copy(f["chunks_invalid"]) + existing_doc_offsets = np.copy(f["doc_offsets"]) + + # Check equality. + log_retro_rank_0(" > validate.") + assert np.array_equal(existing_chunks_valid, chunk_db_valid) + assert np.array_equal(existing_chunks_invalid, chunk_db_invalid) + assert np.array_equal(existing_doc_offsets, doc_offsets) + + # Wait for all ranks to finish block. + log_retro_rank_0(" > waiting for all ranks to finish block.") + torch.distributed.barrier() + + log_retro_rank_0(" > finished saving individual db.") + + +def build_individual_dbs( + config: RetroPreprocessingConfig, indexed_dataset_infos: List[Dict] +) -> None: + """Iterate each indexed dataset & process its chunks. + + Args: + config (RetroPreprocessingConfig): Retro preprocessing config. + indexed_dataset_infos (List[Dict]): Preprocessing metadata for each dataset. + """ + + # Build individual DBs. + log_retro_rank_0(" > build individual chunk dbs.") + for ds_idx, ds_info in enumerate(indexed_dataset_infos): + + # Progress. + log_retro_rank_0( + " > building individual db, dataset %d / %d ... '%s'." + % (ds_idx, len(indexed_dataset_infos), ds_info["prefix"]) + ) + + # Process single dataset. + build_individual_db(config, ds_idx, len(indexed_dataset_infos), ds_info) + + +def update_chunk_counts( + config: RetroPreprocessingConfig, indexed_dataset_infos: List[Dict] +) -> None: + """Set n_chunks_train & n_chunks sampled for each individual DB. + + Args: + config (RetroPreprocessingConfig): Retro preprocessing config. + indexed_dataset_infos (List[Dict]): Preprocessing metadata for each dataset (i.e., 'prefix', 'ratio', 'n_chunks', etc.). + """ + + if torch.distributed.get_rank() != 0: + return + + # Data ratio sum (for setting index training chunks). + data_ratio_sum = sum([d["ratio"] for d in indexed_dataset_infos]) + + # Training split size (split at document level). 
+ train_fraction = float(extract_data_config(config).split.split(",")[0]) / 100 + assert train_fraction > 0 and train_fraction <= 1 + + # Set n_chunks (including n_chunks_sampled for unambiguity). + log_retro_rank_0(" > compute n_chunks.") + for ds_index, ds_info in enumerate(indexed_dataset_infos): + + db_paths = get_individual_db_paths(config.retro_project_dir, ds_info["prefix"]) + + # Update counts. + ds_info["n_docs"] = len(ds_info["dataset"].document_indices) - 1 + ds_info["n_docs_train"] = int(train_fraction * ds_info["n_docs"]) + ds_info["n_chunks"] = 0 # previously, 'n_chunks_valid' + ds_info["n_chunks_train"] = 0 + ds_info["n_chunks_invalid"] = 0 + for db_path in tqdm( + db_paths, "%d/%d, %s" % (ds_index, len(indexed_dataset_infos), ds_info["prefix"]) + ): + with h5py.File(db_path, "r") as f: + ds_info["n_chunks"] += len(f["chunks_valid"]) + ds_info["n_chunks_invalid"] += len(f["chunks_invalid"]) + ds_info["n_chunks_train"] += ( + (np.copy(f["chunks_valid"][:, 0]) < ds_info["n_docs_train"]).sum().item() + ) + + ds_info["n_chunks_sampled"] = int( + config.retro_index_ntrain * ds_info["ratio"] / data_ratio_sum + ) + + # Verify counts. + assert ds_info["n_chunks_train"] <= ds_info["n_chunks"], "n_train (%d) > n_total (%d)." % ( + ds_info["n_chunks_train"], + ds_info["n_chunks"], + ) + assert ( + ds_info["n_chunks_sampled"] <= ds_info["n_chunks_train"] + ), "n_sampled (%d) > n_train (%d)." % ( + ds_info["n_chunks_sampled"], + ds_info["n_chunks_train"], + ) + + +def merge_dbs(project_dir: str, indexed_dataset_infos: List[Dict], db_type: str) -> None: + """Merge individual DBs into single DB. + + Args: + project_dir (str): Retro project dir. + indexed_dataset_infos (List[Dict]): Preprocessing metadata for each dataset (i.e., 'prefix', 'ratio', 'n_chunks', etc.). + db_type (str): DB type (e.g., 'sampled', 'train', or 'valid'). + """ + + if torch.distributed.get_rank() != 0: + return + + log_retro_rank_0(" > build %s chunk db." % db_type) + + # Count chunks. + if db_type == "sampled": + n_chunks_key = "n_chunks_sampled" + n_docs_key = None + elif db_type == "train": + n_chunks_key = "n_chunks_train" + n_docs_key = "n_docs_train" + elif db_type == "valid": + n_docs_key = None + else: + raise Exception("handle db_type '%s'." % db_type) + + if db_type == "valid": + n_chunks = sum(m["n_chunks"] - m["n_chunks_train"] for m in indexed_dataset_infos) + else: + n_chunks = sum(m[n_chunks_key] for m in indexed_dataset_infos) + n_docs = None if n_docs_key is None else sum(m[n_docs_key] for m in indexed_dataset_infos) + + # DB path. + db_path = get_merged_db_path_map(project_dir)[db_type] + + # Delete existing chunk db if incorrect size. + if os.path.exists(db_path): + + try: + + f = h5py.File(db_path) + n_alloc = len(f["chunks"]) # total allocated + n_written = f["n_written"][0].item() # total written + f.close() + + if n_chunks != n_alloc or n_chunks != n_written: + os.remove(db_path) + + except Exception as e: + if isinstance(e, OSError): + os.remove(db_path) + elif isinstance(e, KeyError): + f.close() + os.remove(db_path) + else: + raise e + + # Build merged chunk db. + if not os.path.exists(db_path): + + os.makedirs(os.path.dirname(db_path), exist_ok=True) + f = h5py.File(db_path, "w") + + # Initialize output arrays. 
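The `n_chunks_sampled` assignment in `update_chunk_counts` above apportions the index-training budget across datasets in proportion to their blend ratios; a toy sketch (dataset names and numbers are invented):

    # retro_index_ntrain chunks are split across datasets by ratio / ratio_sum.
    retro_index_ntrain = 1_000_000
    dataset_ratios = {"wiki": 0.3, "books": 0.5, "code": 0.2}  # hypothetical blend
    ratio_sum = sum(dataset_ratios.values())
    n_chunks_sampled = {
        name: int(retro_index_ntrain * ratio / ratio_sum)
        for name, ratio in dataset_ratios.items()
    }
    # -> {'wiki': 300000, 'books': 500000, 'code': 200000}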
+ merged_chunk_db: np.ndarray = f.create_dataset("chunks", (n_chunks, 5), dtype="uint32") + merged_doc_offsets: np.ndarray = ( + None + if n_docs_key is None + else f.create_dataset("doc_offsets", (n_docs, 3), dtype="uint64") + ) + n_written = f.create_dataset("n_written", (1,), dtype="uint64") + n_written[0] = 0 + + # Iterate indexed datasets & collect chunks. + chunk_start_index = 0 + doc_start_index = 0 + doc_start_offset = 0 + for ds_idx, ds_info in enumerate(indexed_dataset_infos): + log_retro_rank_0( + " > merging dbs; '%s', dataset %d / %d ... '%s'." + % (db_type, ds_idx, len(indexed_dataset_infos), ds_info["prefix"]) + ) + individual_chunk_db: np.ndarray = get_individual_chunk_db(project_dir, ds_idx, ds_info) + individual_doc_offsets: np.ndarray = ( + None + if n_docs_key is None + else get_individual_doc_offsets(project_dir, ds_idx, ds_info) + ) + + if db_type == "valid": + individual_chunk_db = individual_chunk_db[ds_info["n_chunks_train"] :] + if n_docs_key is None: + individual_doc_offsets = None + else: + train_doc_offset = individual_doc_offsets[ds_info["n_docs_train"] - 1, 2] + individual_doc_offsets = np.copy( + individual_doc_offsets[ds_info["n_docs_train"] :] + ) + individual_doc_offsets[:, 2] -= train_doc_offset + + log_retro_rank_0("~~~") + log_retro_rank_0(individual_doc_offsets) + log_retro_rank_0(train_doc_offset) + raise Exception("test me.") + else: + individual_chunk_db = individual_chunk_db[: ds_info[n_chunks_key]] + individual_doc_offsets = ( + None + if n_docs_key is None + else np.copy(individual_doc_offsets[: ds_info[n_docs_key]]) + ) + + merged_chunk_db[chunk_start_index : chunk_start_index + len(individual_chunk_db)] = ( + individual_chunk_db + ) + chunk_start_index += len(individual_chunk_db) + n_written[0] = chunk_start_index + if n_docs_key is not None: + individual_doc_offsets[:, 2] += doc_start_offset + doc_end_index = doc_start_index + individual_doc_offsets.shape[0] + merged_doc_offsets[doc_start_index:doc_end_index] = individual_doc_offsets + doc_start_index = doc_end_index + doc_start_offset = individual_doc_offsets[-1, 2].item() + + f.close() + + +def build_merged_dbs(project_dir: str, indexed_dataset_infos: List[Dict]) -> None: + """Merge individual dataset components into single database. + + This method merges databases for DB types: + - 'sampled': used for training the vector index. + - 'train': used for adding to the trained vector index. + - 'valid': can be used for validating/testing the vector index. + + Args: + project_dir (str): Retro project dir. + indexed_dataset_infos (List[Dict]): Preprocessing metadata for each dataset (i.e., 'prefix', 'ratio', 'n_chunks', etc.). + """ + merge_dbs(project_dir, indexed_dataset_infos, "sampled") + merge_dbs(project_dir, indexed_dataset_infos, "train") + merge_dbs(project_dir, indexed_dataset_infos, "valid") + + +def build_db(config: RetroPreprocessingConfig) -> None: + """Extract token chunks from each indexed dataset. + + Iterate each document of each indexed dataset, extract that document's chunks, and save to a 'DB' (hdf5 file). + + Args: + config (RetroPreprocessingConfig): Retro preprocessing config. + """ + + project_dir = config.retro_project_dir + + # Indexed dataset info. + if config.retro_task_validate is None: + indexed_dataset_infos = init_indexed_dataset_infos(config) + else: + indexed_dataset_infos = get_indexed_dataset_infos(config.retro_project_dir) + # Build individual dbs. + build_individual_dbs(config, indexed_dataset_infos) + + # If validating, return here. 
+ if config.retro_task_validate is not None: + return + + # Single-process going forward. + if torch.distributed.get_rank() != 0: + return + + # Update n_chunks & save indexed dataset infos. + if not os.path.exists(get_indexed_dataset_infos_path(project_dir)): + update_chunk_counts(config, indexed_dataset_infos) + save_indexed_dataset_infos(project_dir, indexed_dataset_infos) + indexed_dataset_infos = get_indexed_dataset_infos(project_dir) + + # Builded merged dbs. + build_merged_dbs(project_dir, indexed_dataset_infos) diff --git a/megatron/core/datasets/retro/db/dataset.py b/megatron/core/datasets/retro/db/dataset.py new file mode 100644 index 0000000000..f9053622ab --- /dev/null +++ b/megatron/core/datasets/retro/db/dataset.py @@ -0,0 +1,105 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +"""A DBDataset is for iterating the chunks of the chunk database. + +This dataset is used for both training a vector index, and adding vectors to a +trained index. +""" + +from typing import List + +import numpy as np +import torch +from tqdm import tqdm + +from megatron.core.datasets.indexed_dataset import IndexedDataset + + +class DBDataset(torch.utils.data.Dataset): + """Dataset for iterating chunks. + + Args: + db_path (str): Path of HDF5-format chunk database. + indexed_datasets (List[IndexedDataset]): Indexed datasets used to build database. + chunks (np.ndarray): Array of chunk indexes, for indexing into indexed datasets. Format [dataset_idx, doc_id, start_idx, end_idx, bert_length]. + chunk_length (int): Max GPT chunk length (e.g., 64). + eod_token_id (int): EOD token ID. + """ + + def __init__( + self, + db_path: str, + indexed_datasets: List[IndexedDataset], + chunks: np.ndarray, + chunk_length: int, + eod_token_id: int, + ): + + assert chunks.shape[1] == 5, ( + "expected 5 columns (dataset_idx, " + "doc_idx, token_start_idx, token_end_idx, bert_chunk_length); " + "found %d columns." % chunks.shape[1] + ) + + self.db_path = db_path + self.indexed_datasets = indexed_datasets + self.chunks = chunks + self.doc_chunk_map = None + + self.max_chunk_length = chunk_length + self.eod_token_id = eod_token_id + + def __len__(self) -> int: + """Length of DB dataset. + + Returns: + Number of chunks contained in the dataset. + """ + return self.chunks.shape[0] + + def __getitem__(self, chunk_id: int) -> dict: + """DB dataset sample. + + Args: + chunk_id (int): Index of chunk within dataset. + + Returns: + A dict containing: + - 'doc_id': Document index within indexed dataset. + - 'text': GPT token IDs. + """ + + # Chunk start/end indexes. + indexed_dataset_id, doc_id, token_start_idx, token_end_idx, _ = [ + value.item() for value in self.chunks[chunk_id] + ] + chunk_length = token_end_idx - token_start_idx + indexed_dataset = self.indexed_datasets[indexed_dataset_id] + + # Chunk token ids. + token_ids = indexed_dataset.get(doc_id, offset=token_start_idx, length=chunk_length) + + # Extend chunks to max_chunk_length by padding with EOD tokens. + if chunk_length != self.max_chunk_length: + assert chunk_length < self.max_chunk_length, "invalid chunk len." + token_ids = token_ids.tolist() + token_ids += [self.eod_token_id] * (self.max_chunk_length - chunk_length) + + return {"doc_id": doc_id, "text": np.array(token_ids, dtype=np.int64)} + + def load_doc_tuples(self) -> None: + """Load the dataset & document ids. + + Load the dataset id & document id of each chunk in the database, to + be used for causality filtering during querying. 
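As a quick illustration of the EOD padding in `DBDataset.__getitem__` above: chunks shorter than the max chunk length (typically only a document's final chunk) are right-padded with the EOD token id. The values below are toy placeholders:

    import numpy as np

    max_chunk_length, eod_token_id = 8, 0        # hypothetical values
    token_ids = [11, 12, 13, 14, 15]             # a 5-token tail chunk
    token_ids += [eod_token_id] * (max_chunk_length - len(token_ids))
    sample = {"doc_id": 42, "text": np.array(token_ids, dtype=np.int64)}
    # sample["text"] -> [11 12 13 14 15  0  0  0]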
+ """ + self.doc_tuples = np.zeros(shape=(len(self), 2), dtype="uint32") + block_size = int(1e6) + for start_idx in tqdm( + range(0, len(self), block_size), + "load doc tuples", + miniters=(len(self) // block_size) // 10, + disable=torch.distributed.get_rank() != 0, + ): + end_idx = min(len(self), start_idx + block_size) + self.doc_tuples[start_idx:end_idx] = self.chunks[start_idx:end_idx, :2] diff --git a/megatron/core/datasets/retro/db/utils.py b/megatron/core/datasets/retro/db/utils.py new file mode 100644 index 0000000000..e8578a09d5 --- /dev/null +++ b/megatron/core/datasets/retro/db/utils.py @@ -0,0 +1,367 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +"""Utilities for building a chunk database.""" + +import glob +import json +import os +from typing import Dict, List, Optional + +import numpy as np + +from megatron.core.datasets.indexed_dataset import IndexedDataset +from megatron.core.datasets.retro.config import RetroPreprocessingConfig +from megatron.core.datasets.retro.external_libs import h5py +from megatron.core.models.retro.utils import get_gpt_data_dir + +from .dataset import DBDataset + + +def get_db_dir(project_dir: str) -> str: + """Sub-directory for DB data. + + Args: + project_dir (str): Path to Retro project dir. + + Returns: + Path of the DB sub-directory within the project. + """ + return os.path.join(project_dir, "db") + + +def init_indexed_dataset_infos(config: RetroPreprocessingConfig) -> List[Dict]: + """Gather meta-info about each indexed dataset. + + The returned info array allows for easy access to the configuration, and + helps remove ambiguity. + + Args: + config (RetroPreprocessingConfig): Retro preprocessing config. + + Returns: + List of processing metadata for each dataset, including: + - ratio: Data split weight. + - prefix: Relative path to dataset under DB sub-directory. + """ + + data_dir = get_gpt_data_dir(config.retro_project_dir) + data_blend: List[str] = config.retro_gpt_data_path + assert len(data_blend) % 2 == 0, "currently, only blended dataset is supported." + + # Dataset infos. + infos = [] + for i in range(0, len(data_blend), 2): + ratio = float(data_blend[i]) + prefix = data_blend[i + 1] + path = os.path.join(data_dir, prefix + ".bin") + assert os.path.exists(path), "couldn't find '%s'." % path + infos.append({"ratio": ratio, "prefix": prefix}) + + # Load indexed datasets. + load_indexed_datasets(config.retro_project_dir, infos) + + return infos + + +def get_indexed_dataset_infos_path(project_dir: str) -> str: + """Path to indexed dataset meta-infos. + + Args: + project_dir (str): Path to Retro project dir. + + Returns: + Path to the `indexed_dataset_infos.json` file. + """ + return os.path.join(get_db_dir(project_dir), "indexed_dataset_infos.json") + + +def save_indexed_dataset_infos(project_dir: str, indexed_dataset_infos: List[Dict]) -> None: + """Save dataset order & meta-info. + + Args: + project_dir (str): Path to Retro project dir. + indexed_dataset_infos (List[Dict]): List of metadata for each dataset, with each entry containing: + + - ratio: Data split weight. + - prefix: Relative path to dataset under DB sub-directory. + - n_docs: Number of documents. + - n_docs_train: Number of documents used for pretraining. + - n_chunks: Number of valid chunks. + - n_chunks_train: Number of valid chunks used for pretraining. + - n_chunks_invalid: Number of invalid chunks. + - n_chunks_sampled: Number of valid chunks used for vector index training. + """ + + # Remove 'dataset' field. 
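For reference, `init_indexed_dataset_infos` above expects `retro_gpt_data_path` to alternate weight and dataset prefix; a standalone sketch of that parsing (the prefixes below are hypothetical):

    # Blended data path format: [ratio, prefix, ratio, prefix, ...].
    data_blend = ["0.5", "en-wiki/en-wiki-text", "0.5", "books/books-text"]
    assert len(data_blend) % 2 == 0
    infos = [
        {"ratio": float(data_blend[i]), "prefix": data_blend[i + 1]}
        for i in range(0, len(data_blend), 2)
    ]
    # -> [{'ratio': 0.5, 'prefix': 'en-wiki/en-wiki-text'},
    #     {'ratio': 0.5, 'prefix': 'books/books-text'}]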
+ clean_infos = [] + for info in indexed_dataset_infos: + info = dict(info) + del info["dataset"] + clean_infos.append(info) + + # Save. + with open(get_indexed_dataset_infos_path(project_dir), "w") as f: + json.dump(clean_infos, f, indent=4) + + +def load_indexed_datasets(project_dir: str, indexed_dataset_infos: List[Dict]) -> None: + """Loaded indexed datasets into memory-mapped datasets. + + Args: + project_dir (str): Path to Retro project dir. + indexed_dataset_infos (List[Dict]): List of metadata for each dataset (see `save_indexed_dataset_infos()` for more details. + """ + data_dir = get_gpt_data_dir(project_dir) + for info in indexed_dataset_infos: + info["dataset"] = IndexedDataset(os.path.join(data_dir, info["prefix"]), mmap=True) + + +def get_indexed_dataset_infos(project_dir: str) -> List[Dict]: + """Load indexed dataset meta-infos. + + Args: + project_dir (str): Path to Retro project dir. + + Returns: + List of metadata for each dataset (see `save_indexed_dataset_infos()` for more details. + """ + + # Load json. + path = get_indexed_dataset_infos_path(project_dir) + with open(path) as f: + infos = json.load(f) + + # Load indexed datasets. + load_indexed_datasets(project_dir, infos) + + return infos + + +def get_individual_db_dir(project_dir: str, prefix: str) -> str: + """Individual DB's directory. + + Args: + project_dir (str): Path to Retro project dir. + prefix (str): Unique relative path to dataset within project dir. + + Returns: + Path to the given datasets's chunk database. + """ + return os.path.join(get_db_dir(project_dir), "individual", prefix) + + +def get_individual_db_paths(project_dir: str, prefix: str) -> List[str]: + """Get paths of all database blocks of an individual dataset. + + Args: + project_dir (str): Path to Retro project dir. + prefix (str): Unique relative path to dataset within project dir. + + Returns: + Paths to each HDF5 chunk database files that comprises this datasets full chunk database. + """ + return sorted(glob.glob(get_individual_db_dir(project_dir, prefix) + "/*hdf5")) + + +def get_individual_chunk_db(project_dir: str, ds_id: int, ds_info: dict) -> np.ndarray: + """Load individual dataset's chunk DB. + + Args: + project_dir (str): Path to Retro project dir. + ds_id (int): Index of dataset within blended dataset. + ds_info (dict): Preprocessing metadata for dataset (see `save_indexed_dataset_infos()` for more detail). + + Returns: + Array of chunk start/end indexes for this dataset, where the chunk indexes can be used for indexing into the corresponding indexed dataset. + """ + paths = get_individual_db_paths(project_dir, ds_info["prefix"]) + # *Note*: convert to dataset, rather than copying to memory. + db = np.zeros((ds_info["n_chunks"], 5), dtype="uint32") + db[:, 0] = ds_id + start_idx = 0 + for path in paths: + f = h5py.File(path, "r") + n_chunks_current = f["chunks_valid"].shape[0] + db[start_idx : (start_idx + n_chunks_current), 1:] = f["chunks_valid"] + start_idx += n_chunks_current + f.close() + + assert start_idx == ds_info["n_chunks"] + + return db + + +def get_individual_doc_offsets(project_dir: str, ds_id: int, ds_info: dict) -> np.ndarray: + """Load individual dataset's document offsets. + + Args: + project_dir (str): Path to Retro project dir. + ds_id (int): Index of dataset within blended dataset. + ds_info (dict): Preprocessing metadata for dataset (see `save_indexed_dataset_infos()` for more detail). + + Returns: + Array of document offsets by chunk index for this dataset. 
+ """ + paths = get_individual_db_paths(project_dir, ds_info["prefix"]) + # *Note*: convert to dataset, rather than copying to memory. + doc_offsets = np.zeros((ds_info["n_docs"], 3), dtype="uint64") + doc_offsets[:, 0] = ds_id + start_idx = 0 + start_offset = 0 + for path in paths: + with h5py.File(path) as f: + current_doc_offsets = np.copy(f["doc_offsets"]) + current_doc_offsets[:, 1] += start_offset + current_ndocs = current_doc_offsets.shape[0] + doc_offsets[start_idx : (start_idx + current_ndocs), 1:] = current_doc_offsets + start_idx += current_ndocs + start_offset = current_doc_offsets[-1, 1].item() + + return doc_offsets + + +def get_merged_db_path_map(project_dir: str) -> dict: + """Paths to merged datasets. + + Args: + project_dir (str): Path to Retro project dir. + + Returns: + A dict of chunk databases, one for each of: + - sampled: Chunks used for training the vector index. + - train: Chunks used for pretraining 'train' dataset. + - valid: Chunks used for pretraining 'valid' dataset. + """ + base_dir = get_db_dir(project_dir) + return { + "sampled": os.path.join(base_dir, "merged", "sampled.hdf5"), + "train": os.path.join(base_dir, "merged", "train.hdf5"), + "valid": os.path.join(base_dir, "merged", "valid.hdf5"), + } + + +def get_merged_dataset( + project_dir: str, + chunk_length: int, + eod_token_id: int, + db_type: str, + indexed_dataset_infos: Optional[List[Dict]] = None, +) -> DBDataset: + """Get merged dataset. + + Args: + project_dir (str): Path to Retro project dir. + chunk_length (int): GPT chunk length (e.g., 64). + eod_token_id (int): EOD token ID. + db_type (str): DB type (e.g., 'sampled', 'train', or 'valid'). + indexed_dataset_infos (Optional[List[Dict]]): Optionally, pre-loaded list of dataset metadata (see `save_indexed_dataset_infos()` for more detail). If not provided, the indexed dataset infos will be loaded from disk. + + Returns: + A DBDataset, which is a dataset that wraps the HDF5 chunk index array. + """ + + if not indexed_dataset_infos: + indexed_dataset_infos = get_indexed_dataset_infos(project_dir) + + # Load chunks. + db_path = get_merged_db_path_map(project_dir)[db_type] + f = h5py.File(db_path, "r") + chunks = f["chunks"] + + # DB dataset. + indexed_datasets = [info["dataset"] for info in indexed_dataset_infos] + dataset = DBDataset( + db_path=db_path, + indexed_datasets=indexed_datasets, + chunks=chunks, + chunk_length=chunk_length, + eod_token_id=eod_token_id, + ) + + return dataset + + +def get_merged_sampled_dataset( + project_dir: str, + chunk_length: int, + eod_token_id: int, + indexed_dataset_infos: Optional[List[Dict]] = None, +) -> DBDataset: + """Get sampled dataset (for training the vector index). + + Args: + project_dir (str): Path to Retro project dir. + chunk_length (int): GPT chunk length (e.g., 64). + eod_token_id (int): EOD token ID. + indexed_dataset_infos (Optional[List[Dict]]): Optionally, pre-loaded list of dataset metadata (see `save_indexed_dataset_infos()` for more detail). If not provided, the indexed dataset infos will be loaded from disk. + + Returns: + A DBDataset, which is a dataset that wraps the HDF5 chunk index array. + """ + return get_merged_dataset( + project_dir, chunk_length, eod_token_id, "sampled", indexed_dataset_infos + ) + + +def get_merged_train_dataset( + project_dir: str, + chunk_length: int, + eod_token_id: int, + indexed_dataset_infos: Optional[List[Dict]] = None, +) -> DBDataset: + """Get training dataset (for adding to the vector index). + + Args: + project_dir (str): Path to Retro project dir. 
+ chunk_length (int): GPT chunk length (e.g., 64). + eod_token_id (int): EOD token ID. + indexed_dataset_infos (Optional[List[Dict]]): Optionally, pre-loaded list of dataset metadata (see `save_indexed_dataset_infos()` for more detail). If not provided, the indexed dataset infos will be loaded from disk. + + Returns: + A DBDataset, which is a dataset that wraps the HDF5 chunk index array. + """ + return get_merged_dataset( + project_dir, chunk_length, eod_token_id, "train", indexed_dataset_infos + ) + + +def get_merged_valid_dataset( + project_dir: str, + chunk_length: int, + eod_token_id: int, + indexed_dataset_infos: Optional[List[Dict]] = None, +) -> DBDataset: + """Get validation dataset (for testing the vector index). + + Args: + project_dir (str): Path to Retro project dir. + chunk_length (int): GPT chunk length (e.g., 64). + eod_token_id (int): EOD token ID. + indexed_dataset_infos (Optional[List[Dict]]): Optionally, pre-loaded list of dataset metadata (see `save_indexed_dataset_infos()` for more detail). If not provided, the indexed dataset infos will be loaded from disk. + + Returns: + A DBDataset, which is a dataset that wraps the HDF5 chunk index array. + """ + return get_merged_dataset( + project_dir, chunk_length, eod_token_id, "valid", indexed_dataset_infos + ) + + +def get_merged_datasets(project_dir: str, chunk_length: int, eod_token_id: int) -> dict: + """Get all merged datasets. + + Args: + project_dir (str): Path to Retro project dir. + chunk_length (int): GPT chunk length (e.g., 64). + eod_token_id (int): EOD token ID. + + Returns: + A dict mapping DB type ('sampled', 'train', or 'valid') to the corresponding DBDataset, which is a dataset that wraps the HDF5 chunk index array. + """ + fns = { + "sampled": get_merged_sampled_dataset, + "train": get_merged_train_dataset, + "valid": get_merged_valid_dataset, + } + datasets = {key: fn(project_dir, chunk_length, eod_token_id) for key, fn in fns.items()} + return datasets diff --git a/megatron/core/datasets/retro/external_libs.py b/megatron/core/datasets/retro/external_libs.py new file mode 100644 index 0000000000..c057eba25c --- /dev/null +++ b/megatron/core/datasets/retro/external_libs.py @@ -0,0 +1,15 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +"""Required external libraries for Retro preprocessing.""" + +import importlib + +required_libs = ["faiss", "h5py", "transformers"] # for huggingface bert + +for lib in required_libs: + try: + globals()[lib] = importlib.import_module(lib) + except ImportError as e: + raise Exception( + f"Missing one or more packages required for Retro preprocessing: {required_libs}. Tried importing '{lib}'." + ) diff --git a/megatron/core/datasets/retro/index/__init__.py b/megatron/core/datasets/retro/index/__init__.py new file mode 100644 index 0000000000..d069f55f22 --- /dev/null +++ b/megatron/core/datasets/retro/index/__init__.py @@ -0,0 +1,11 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +""" +Exports: + + - train_index: Train an index on representative vectors. + - add_to_index: Add vectors to a trained index. + - build_index: Wrapper function that calls above two functions. +""" + +from .build import add_to_index, build_index, train_index diff --git a/megatron/core/datasets/retro/index/build.py b/megatron/core/datasets/retro/index/build.py new file mode 100644 index 0000000000..1f310d89c3 --- /dev/null +++ b/megatron/core/datasets/retro/index/build.py @@ -0,0 +1,313 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+ +"""Construct an index. + +Constructing an index generally happens in two phases: + + - index.train(): Train an index on a representative set of vectors. + - index.add(): Add vectors to an index, to be available for retrieval. +""" + +import os +import shutil + +import numpy as np +import torch +from tqdm import tqdm + +from megatron.core.datasets.retro.config import RetroPreprocessingConfig +from megatron.core.datasets.retro.db.utils import ( + get_merged_sampled_dataset, + get_merged_train_dataset, +) +from megatron.core.datasets.retro.external_libs import h5py +from megatron.core.datasets.retro.utils import GPTToTextDataset + +from .factory import IndexFactory +from .utils import ( + get_training_data_block_dir, + get_training_data_block_paths, + get_training_data_merged_path, + get_training_data_root_dir, +) + +################################################## +# Train index. +################################################## + + +def get_empty_index_path(config: RetroPreprocessingConfig) -> str: + """Path of empty index. + + Args: + config (RetroPreprocessingConfig): Retro preprocessing config. + + Returns: + Path to the empty (trained, but without added samples) vector index. + """ + index = IndexFactory.get_index(config.retro_index_type) + empty_index_path = index.get_empty_index_path(config) + return empty_index_path + + +def get_block_nload(block_path: str, load_fraction: float) -> int: + """Compute number of blocks to load. + + This is computed by multiplying the total number of available blocks with the + fraction of blocks to load. + + Args: + block_path (str): Path to HDF5 file containing block of data. File must contain key 'data'. + load_fraction (float): Fraction (0 < load_fraction <= 1) of block samples to load. + + Returns: + Number of block samples to load. + """ + with h5py.File(block_path) as fi: + return int(load_fraction * fi["data"].shape[0]) + + +def merge_embedding_blocks(config: RetroPreprocessingConfig) -> None: + """Merge individual embedding blocks into a single binary mmap file. + + The embeddings are initially stored in block-sized (e.g., ~100k embeddings per + block) HDF5 files. These individual block files must be merged into a single + file before training, to be based as a numpy mmap array to the index. + + Args: + config (RetroPreprocessingConfig): Retro preprocessing config. + """ + + if torch.distributed.get_rank() != 0: + return + + # Get block, merged paths. + load_fraction = config.retro_index_train_load_fraction + block_paths = get_training_data_block_paths(config) + bin_path = get_training_data_merged_path(config) + + # Skip, if already built. + if os.path.exists(bin_path): + return + + # Merge blocks. + with open(bin_path, "wb") as fo: + byte_offset = 0 + for block_idx, block_path in enumerate( + tqdm( + block_paths, + "merge train embeddings", + miniters=len(block_paths) // 10, + disable=torch.distributed.get_rank() != 0, + ) + ): + with h5py.File(block_path) as fi: + + nload = get_block_nload(block_path, load_fraction) + block = np.array(fi["data"][:nload], copy=False) + + fo.write(block.tobytes()) + + byte_offset += block.size * block.itemsize + fo.seek(byte_offset) + + +def get_text_dataset_for_training(config: RetroPreprocessingConfig) -> GPTToTextDataset: + """Convert GPT token chunk dataset to a text dataset for passing to the + embedder. + + Args: + config (RetroPreprocessingConfig): Retro preprocessing config. + + Returns: + The text dataset consisting of tokens converted from sampled chunk database. 
+ """ + gpt_dataset = get_merged_sampled_dataset( + project_dir=config.retro_project_dir, + chunk_length=config.retro_gpt_chunk_length, + eod_token_id=config.retro_tokenizers.gpt.eod, + ) + text_dataset = GPTToTextDataset(gpt_dataset, config.retro_tokenizers.gpt) + return text_dataset + + +def embed_training_chunks(config: RetroPreprocessingConfig) -> None: + """Embed DB chunks. + + Store chunks in blocks on disk. These blocks will later be merged into + a single dataset for training the index. + + Args: + config (RetroPreprocessingConfig): Retro preprocessing config. + """ + + merged_train_data_path = get_training_data_merged_path(config) + if os.path.exists(merged_train_data_path): + return + + # Get training text dataset. + text_dataset = get_text_dataset_for_training(config) + + # Embed dataset. + embedder = config.retro_bert_embedders.disk + embedder.embed_text_dataset("index", get_training_data_block_dir(config), text_dataset) + + # Merge embeddings. + merge_embedding_blocks(config) + + +def train_on_embeddings(config: RetroPreprocessingConfig) -> None: + """Train index on embedded DB chunks. + + Args: + config (RetroPreprocessingConfig): Retro preprocessing config. + """ + index = IndexFactory.get_index(config.retro_index_type) + index.train(config) + + +def remove_embeddings(config: RetroPreprocessingConfig) -> None: + """Remove embeddings after training. + + Args: + config (RetroPreprocessingConfig): Retro preprocessing config. + """ + torch.distributed.barrier() + if torch.distributed.get_rank() != 0: + return + empty_index_path = get_empty_index_path(config) + assert os.path.isfile(empty_index_path) + shutil.rmtree(get_training_data_root_dir(config), ignore_errors=True) + + +def _train_index(config: RetroPreprocessingConfig) -> None: + """Train index on DB chunks. + + Args: + config (RetroPreprocessingConfig): Retro preprocessing config. + """ + + # Check if trained index already exists. + if not os.path.isfile(get_empty_index_path(config)): + + # Embed training chunks. + embed_training_chunks(config) + + # Train index on embeddings. + train_on_embeddings(config) + + # Wait for (single-process) training to complete. + torch.distributed.barrier() + + # Remove embeddings. + if config.retro_index_delete_training_embeddings: + remove_embeddings(config) + + +def train_index(config: RetroPreprocessingConfig) -> None: + """Entry point for training the index. + + We select whether to train a new index, or validate an existing index. + + Args: + config (RetroPreprocessingConfig): Retro preprocessing config. + """ + + # Train new index. + if config.retro_task_validate is None: + _train_index(config) + + # Validate existing trained index. + else: + from .validate import validate_training_embeddings + + validate_training_embeddings(config) + + +################################################## +# Add to index. +################################################## + + +def get_text_dataset_for_adding(config: RetroPreprocessingConfig) -> GPTToTextDataset: + """Convert GPT token chunk dataset to a text dataset for passing to the + embedder. + + Args: + config (RetroPreprocessingConfig): Retro preprocessing config. + + Returns: + The text dataset that consists of tokens converted from the 'train' chunk database. These are the chunks used for retrieval by the pretraining 'train' dataset. 
+ """ + gpt_dataset = get_merged_train_dataset( + project_dir=config.retro_project_dir, + chunk_length=config.retro_gpt_chunk_length, + eod_token_id=config.retro_tokenizers.gpt.eod, + ) + text_dataset = GPTToTextDataset(gpt_dataset, config.retro_tokenizers.gpt) + return text_dataset + + +def _add_to_index(config: RetroPreprocessingConfig) -> str: + """Add DB chunks to index. + + Args: + config (RetroPreprocessingConfig): Retro preprocessing config. + + Returns: + Path to the populated index. + """ + + # Get index. + index = IndexFactory.get_index(config.retro_index_type) + + # Get text dataset. + text_dataset = get_text_dataset_for_adding(config) + + # Add to index. + output_index_path = index.add(config, text_dataset) + + return output_index_path + + +def add_to_index(config: RetroPreprocessingConfig) -> None: + """Entry point for adding to the index. + + We select whether to add to a new index, or validate an existing index. + + Args: + config (RetroPreprocessingConfig): Retro preprocessing config. + """ + + # Add to new index. + if config.retro_task_validate is None: + _add_to_index(config) + + # Validate existing encodings. + else: + from .validate import validate_added_encodings + + validate_added_encodings(config) + + +################################################## +# Build index (train + add). +################################################## + + +def build_index(config: RetroPreprocessingConfig) -> None: + """Build index. + + Building index involves sequentially running stages above: + - Train index (on sampled training chunks). + - Add to index (on all training chunks). + + Args: + config (RetroPreprocessingConfig): Retro preprocessing config. + """ + + # Train index. + train_index(config) + + # Add to index. + add_to_index(config) diff --git a/megatron/core/datasets/retro/index/factory.py b/megatron/core/datasets/retro/index/factory.py new file mode 100644 index 0000000000..f88084ddb1 --- /dev/null +++ b/megatron/core/datasets/retro/index/factory.py @@ -0,0 +1,40 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +"""The IndexFactory constructs an index from an index type string.""" + +from megatron.core.datasets.retro.index.index import Index + +from .indexes import FaissBaseIndex, FaissParallelAddIndex + + +class IndexFactory: + """Get index. + + Index type generally read from argument '--retro-index-ty'. + """ + + @classmethod + def get_index_class(cls, index_type: str) -> type: + """Get an index class, given a type string. + + Args: + index_type (str): One of 'faiss-base' (naive Faiss index wrapper) or 'faiss-par-add' (Faiss index wrapper with near embarrassingly parallel index.add(). + + Returns: + An `Index` sub-type corresponding to the `index_type`. + """ + return {"faiss-base": FaissBaseIndex, "faiss-par-add": FaissParallelAddIndex}[index_type] + + @classmethod + def get_index(cls, index_type: str) -> Index: + """Construct an index from an index type string. + + Args: + index_type (str): One of 'faiss-base' (naive Faiss index wrapper) or 'faiss-par-add' (Faiss index wrapper with near embarrassingly parallel index.add(). + + Returns: + An `Index` instance corresponding to the `index_type`. 
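A minimal usage sketch for the `IndexFactory` above (assuming the Retro package and Faiss are installed; illustrative only):

    from megatron.core.datasets.retro.index.factory import IndexFactory

    index = IndexFactory.get_index("faiss-par-add")         # FaissParallelAddIndex instance
    index_cls = IndexFactory.get_index_class("faiss-base")  # FaissBaseIndex class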
+ """ + index_class = cls.get_index_class(index_type) + index = index_class() + return index diff --git a/megatron/core/datasets/retro/index/index.py b/megatron/core/datasets/retro/index/index.py new file mode 100644 index 0000000000..c6bd13fbee --- /dev/null +++ b/megatron/core/datasets/retro/index/index.py @@ -0,0 +1,133 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +"""Base class for all vector indexes. + +A vector index is a type of retrieval database that is queried using vectors, +and returns vectors that are 'similar' (e.g., by cosine distance) to the query +vector. The construction and usage of an index generally has the following +pattern: + + - Train the index on representative vectors. + - Add vectors to the index (i.e., vectors available for retrieval) + - Query index with new vector, to retrieve similar vector indexes. +""" + +import abc +import os +from typing import List, Tuple + +import numpy as np +import torch + +from megatron.core.datasets.retro.config import Embedder, RetroPreprocessingConfig +from megatron.core.datasets.retro.external_libs import faiss +from megatron.core.datasets.retro.utils import GPTToTextDataset + +from .utils import get_index_dir + + +class Index(abc.ABC): + """Abstract base class for indexes. + + *Note* : While currently only Faiss-based classes are implemented, in the + future, this class will be extended with other types of indexes that have + different performance-accuracy trade-offs. + + The primary methods to override are: + - train() : Train index on the sampled training chunks. + - add() : Add all training chunks to index. + """ + + @classmethod + def make_object_verbose(cls, index: faiss.Index, verbose: bool) -> None: + """Make index object verbose. + + Args: + index (faiss.Index): Faiss object to set verbose. + verbose (bool): Sets whether index should log status updates during training and adding. + """ + assert isinstance(verbose, bool) + faiss.ParameterSpace().set_index_parameter(index, "verbose", verbose) + + def get_empty_index_path(self, config: RetroPreprocessingConfig) -> str: + """Get file path to empty index (i.e., trained, but unpopulated). + + Args: + config (RetroPreprocessingConfig): Retro preprocessing config. + + Returns: + File path to empty index (i.e., this index has had index.train() called, but not yet index.add()). + """ + return os.path.join( + get_index_dir(config), "empty_%.3f.faissindex" % config.retro_index_train_load_fraction + ) + + def get_empty_index(self, config: RetroPreprocessingConfig) -> faiss.Index: + """Get empty index (i.e., trained, but unpopulated). + + Args: + config (RetroPreprocessingConfig): Retro preprocessing config. + + Returns: + Empty Faiss index, loaded from storage. + """ + return faiss.read_index(self.get_empty_index_path(config)) + + def get_added_index_path(self, config: RetroPreprocessingConfig) -> str: + """Get file path to index that has been populated with vectors. + + Args: + config (RetroPreprocessingConfig): Retro preprocessing config. + + Returns: + File path to added index (i.e., this index has had both index.train() and index.add() called). + """ + return os.path.join( + get_index_dir(config), + "added_%.3f_%.3f.faissindex" + % (config.retro_index_train_load_fraction, config.retro_index_add_load_fraction), + ) + + def get_added_index(self, config: RetroPreprocessingConfig) -> faiss.Index: + """Get index that has been populated with vectors. + + Args: + config (RetroPreprocessingConfig): Retro preprocessing config. 
+ + Returns: + 'Added' (i.e., populated) Faiss index, loaded from storage. + """ + return faiss.read_index(self.get_added_index_path(config)) + + @abc.abstractmethod + def train(self, config: RetroPreprocessingConfig) -> None: + """Train index on a representative set of vectors. + + Args: + config (RetroPreprocessingConfig): Retro preprocessing config. + """ + + @abc.abstractmethod + def add(self, config: RetroPreprocessingConfig, text_dataset: GPTToTextDataset) -> None: + """Add vectors to index. + + Args: + config (RetroPreprocessingConfig): Retro preprocessing config. + text_dataset (GPTToTextDataset): Text dataset that will be embedded and added to the index. + """ + + def embed_text_dataset_block( + self, embedder: Embedder, text_dataset: GPTToTextDataset, _range: Tuple[int, int] + ) -> np.ndarray: + """Embed a range of a text dataset. + + Args: + embedder (Embedder): Embedder used for embedding a text dataset. + text_dataset (GPTToTextDataset): Text dataset that will be embedded. + _range (Tuple[int, int]): Start/end sample indices within text dataset used for embedding. + + Returns: + An array of embeddings, with shape (len(text_dataset), dimension(embedder)). + """ + sub_dataset = torch.utils.data.Subset(text_dataset, range(*_range)) + return embedder.embed_text_dataset(sub_dataset) diff --git a/megatron/core/datasets/retro/index/indexes/__init__.py b/megatron/core/datasets/retro/index/indexes/__init__.py new file mode 100644 index 0000000000..c445909fea --- /dev/null +++ b/megatron/core/datasets/retro/index/indexes/__init__.py @@ -0,0 +1,10 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +""" +Exports: +- FaissBaseIndex: Unoptimized Faiss index wrapper +- FaissParallelAddIndex: Optimized index.add() for Faiss index. +""" + +from .faiss_base import FaissBaseIndex +from .faiss_par_add import FaissParallelAddIndex diff --git a/megatron/core/datasets/retro/index/indexes/faiss_base.py b/megatron/core/datasets/retro/index/indexes/faiss_base.py new file mode 100644 index 0000000000..c1daf3f533 --- /dev/null +++ b/megatron/core/datasets/retro/index/indexes/faiss_base.py @@ -0,0 +1,150 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +""" +This class implements a simple, un-optimized wrapper around a Faiss index, that +implements the Index interface (see ..index.py). While this class is +instantiable, it is meant to be extended with optimizations in classes that +inherit from this class (see FaissParAddIndex, for an example). +""" + +import os + +import numpy as np +import torch +from tqdm import tqdm + +from megatron.core.datasets.retro.config import RetroPreprocessingConfig +from megatron.core.datasets.retro.external_libs import faiss +from megatron.core.datasets.retro.index.index import Index +from megatron.core.datasets.retro.index.utils import ( + get_training_data_merged_path, + num_samples_to_block_ranges, +) +from megatron.core.datasets.retro.utils import GPTToTextDataset, log_retro_rank_0 + + +class FaissBaseIndex(Index): + """Base class for Faiss-base indexes. + + This class wraps a Faiss index, and adds additional functionality for training + and adding codes. This base class performs a naive sequential code adding, + while the optimized FaissParallelAddIndex class performs a parallel + index.add(). + """ + + def _train(self, config: RetroPreprocessingConfig) -> None: + """Train index (rank 0's method). + + Args: + config (RetroPreprocessingConfig): Retro preprocessing config. 
+ """ + + assert torch.distributed.get_rank() == 0 + + # Set num threads (torch.distributed reset it to 1). + faiss.omp_set_num_threads(64) + + empty_index_path = self.get_empty_index_path(config) + + # Index already exists? -> return. + if os.path.isfile(empty_index_path): + return + + # Load data. + merged_path = get_training_data_merged_path(config) + inp = np.memmap(merged_path, dtype="f4", mode="r").reshape((-1, config.hidden_size)) + + # Init index. + index = faiss.index_factory(config.hidden_size, config.retro_index_str) + + # Move to GPU. + log_retro_rank_0("> move faiss index to gpu.") + index_ivf = faiss.extract_index_ivf(index) + clustering_index = faiss.index_cpu_to_all_gpus(faiss.IndexFlatL2(index_ivf.d)) + index_ivf.clustering_index = clustering_index + log_retro_rank_0("> finished moving to gpu.") + self.make_object_verbose(index, True) + self.make_object_verbose(index_ivf, True) + self.make_object_verbose(index_ivf.quantizer, True) + self.make_object_verbose(index_ivf.clustering_index, True) + + # Train index. + index.train(inp) + + # Save index. + faiss.write_index(index, empty_index_path) + + def train(self, config: RetroPreprocessingConfig) -> None: + """Train index. + + Args: + config (RetroPreprocessingConfig): Retro preprocessing config. + """ + + # Single process only. + if torch.distributed.get_rank() == 0: + self._train(config) + + torch.distributed.barrier() + + def _add(self, config: RetroPreprocessingConfig, text_dataset: GPTToTextDataset) -> None: + """Add to index (rank 0's method). + + Args: + config (RetroPreprocessingConfig): Retro preprocessing config. + text_dataset (GPTToTextDataset): Text dataset that will be embedded and added to the index. + """ + + assert torch.distributed.get_rank() == 0 + + dataset_sample_ranges = num_samples_to_block_ranges(len(text_dataset)) + + # Set num threads (torch.distributed reset it to 1). + faiss.omp_set_num_threads(64) + + # Bert embedder. + embedder = config.bert_embedders.mem + + # Empty/added index paths. + empty_index_path = self.get_empty_index_path() + added_index_path = self.get_added_index_path() + + # Skip adding, if index exists. + if os.path.isfile(added_index_path): + return + + # Read trained index. + index = faiss.read_index(empty_index_path) + + # Iterate data blocks & add. + for sample_range in tqdm(dataset_sample_ranges, "faiss_base.add"): + + # Embed text. + embeds = self.embed_text_dataset_block(embedder, text_dataset, sample_range) + + # Add to index. + index.add(embeds) + + # Write index. + faiss.write_index(index, added_index_path) + + def add(self, config: RetroPreprocessingConfig, text_dataset: GPTToTextDataset) -> str: + """Add to index. + + Args: + config (RetroPreprocessingConfig): Retro preprocessing config. + text_dataset (GPTToTextDataset): Text dataset that will be embedded and added to the index. + + Returns: + File path to the populated index. + """ + + # Single process only. + if torch.distributed.get_rank() == 0: + self._add(config, text_dataset) + + # Wait for rank 0. + torch.distributed.barrier() + + # Get output index path, for return. + return self.get_added_index_path(config) diff --git a/megatron/core/datasets/retro/index/indexes/faiss_par_add.py b/megatron/core/datasets/retro/index/indexes/faiss_par_add.py new file mode 100644 index 0000000000..e014217262 --- /dev/null +++ b/megatron/core/datasets/retro/index/indexes/faiss_par_add.py @@ -0,0 +1,208 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +"""Multi-process & multi-node version of Faiss's index.add(). 
+ +This class inherits from FaissBaseIndex, and optimizes the 'add()' method by +making it multi-node and multi-process, with bit-wise equivalence to +FaissBaseIndex. This allows 'add()' to scale out to very large datasets, since +the vast majority of the computational effort is embarrassingly parallel. +""" + +import os +import shutil +from typing import Tuple + +import numpy as np +import psutil +import torch +from tqdm import tqdm + +from megatron.core.datasets.retro.config import Embedder, RetroPreprocessingConfig +from megatron.core.datasets.retro.external_libs import faiss, h5py +from megatron.core.datasets.retro.index.utils import get_added_code_paths, get_added_codes_dir +from megatron.core.datasets.retro.utils import ( + GPTToTextDataset, + get_blocks_by_rank, + log_retro_rank_0, + retro_makedir, +) + +from .faiss_base import FaissBaseIndex + + +class FaissParallelAddIndex(FaissBaseIndex): + """ + This class parallelizes both 1) encoding vectors, and 2) adding codes to the + index. This class is more performant than naive use of Faiss, because most + of the computational work is in encoding the vectors, which is an + embarassingly parallel operation. + """ + + def encode_block( + self, index: faiss.Index, embedder: Embedder, text_dataset: GPTToTextDataset, block: dict + ) -> Tuple[np.ndarray, np.ndarray]: + """Encode sub-dataset block, to be later added to index. + + Encode the data subset, generally in blocks of 1M vectors each. For + each block, the empty/trained index is loaded, codes are computed + via index.sa_encode(), and the resulting codes are saved to disk. + + Args: + index (faiss.Index): Faiss index object. + embedder (Embedder): Embedder used to embed text dataset. + text_dataset (GPTToTextDataset): Text dataset to be embedded and encoded. + block (dict): Range information specifying start/end indices within text dataset. + + Returns: + A tuple of (embeddings, encodings) for the given block subset of the text dataset. + """ + + # Embed block. + embeddings = self.embed_text_dataset_block(embedder, text_dataset, block["range"]) + + # Encode block. + log_retro_rank_0("encode.") + codes = index.sa_encode(embeddings) + + # Return embeddings for validation purposes. + return embeddings, codes + + def save_block(self, config: RetroPreprocessingConfig, block: dict, codes: np.ndarray) -> None: + """Save block of codes to disk. + + Args: + config (RetroPreprocessingConfig): Retro preprocessing config. + block (dict): Range information specifying the start/end indices within the encoded text dataset. Here, the 'path' item is used for writing the encodings to storage. + codes (np.ndarray): Block of encodings to be saved to storage. + """ + # Save neighbors. + log_retro_rank_0("save codes.") + retro_makedir(config, os.path.dirname(block["path"])) + with h5py.File(block["path"], "w") as f: + f.create_dataset("data", data=codes) + + def encode(self, config: RetroPreprocessingConfig, text_dataset: GPTToTextDataset) -> None: + """Encode text dataset, to be later added to index. + + Args: + config (RetroPreprocessingConfig): Retro preprocessing config. + text_dataset (GPTToTextDataset): Text dataset to be encoded by the index. + """ + + codes_dir = get_added_codes_dir(config) + retro_makedir(config, codes_dir) + + # Index. + index = self.get_empty_index(config) + + # Bert embedder. + embedder = config.retro_bert_embedders.mem + + # Missing code blocks. + def validate(f: h5py.File) -> None: + """Validation method for validating loaded encodings. 
+ + Args: + f (h5py.File): File that contains encodings. + """ + assert len(f["data"].shape) == 2 + + blocks = get_blocks_by_rank( + codes_dir, len(text_dataset), config.retro_block_size, validate=validate + ) + + # Encode each block. + for block_index, block in enumerate(blocks.missing): + + if block is not None: + + # Progress. + log_retro_rank_0( + "encode block %d / %d ... %s." + % (block_index, len(blocks.missing), block["path"]) + ) + + # Encode and save. + _, codes = self.encode_block(index, embedder, text_dataset, block) + self.save_block(config, block, codes) + + # Synchronize progress across all ranks. (for easier observation) + log_retro_rank_0(" > waiting for other ranks to finish block.") + torch.distributed.barrier() + + def add_codes(self, config: RetroPreprocessingConfig) -> None: + """Read codes from disk, and add them to the index. + + Args: + config (RetroPreprocessingConfig): Retro preprocessing config. + """ + + if torch.distributed.get_rank() != 0: + return + + added_index_path = self.get_added_index_path(config) + if os.path.exists(added_index_path): + return + + # Index. + log_retro_rank_0("read empty index.") + index = self.get_empty_index(config) + index_ivf = faiss.extract_index_ivf(index) + + # Add codes. + log_retro_rank_0("add codes.") + code_paths = get_added_code_paths(config) + pbar = tqdm(code_paths) + for code_path in pbar: + pbar.set_description( + "add codes, mem %.3f gb, %.1f%%" + % (psutil.virtual_memory()[3] / 1024**3, psutil.virtual_memory()[2]) + ) + with h5py.File(code_path) as f: + + nload = int(config.retro_index_add_load_fraction * f["data"].shape[0]) + offset = int(os.path.basename(code_path).split("-")[0]) + xids = np.arange(offset, offset + nload) + codes = np.copy(f["data"][:nload]) + index_ivf.add_sa_codes(codes, xids) + + # Update index's ntotal. + index.ntotal = index_ivf.ntotal + + # Write index. + log_retro_rank_0("write added index.") + faiss.write_index(index, added_index_path) + + def remove_codes(self, config: RetroPreprocessingConfig) -> None: + """Remove added codes after adding to index. + + Args: + config (RetroPreprocessingConfig): Retro preprocessing config. + """ + if torch.distributed.get_rank() != 0: + return + assert os.path.isfile(self.get_added_index_path(config)) + + if config.retro_index_delete_added_codes: + raise Exception("remove?") + shutil.rmtree(get_added_codes_dir(config), ignore_errors=True) + + def add(self, config: RetroPreprocessingConfig, text_dataset: GPTToTextDataset) -> None: + """Add vectors to index. + + Args: + config (RetroPreprocessingConfig): Retro preprocessing config. + text_dataset (GPTToTextDataset): Text dataset that will be embedded and added to the index. + """ + + # Encode chunks. + self.encode(config, text_dataset) + + # Add codes to index. + self.add_codes(config) + + # Wait for (single-process) adding to complete. + torch.distributed.barrier() + + # Remove codes. + self.remove_codes(config) diff --git a/megatron/core/datasets/retro/index/utils.py b/megatron/core/datasets/retro/index/utils.py new file mode 100644 index 0000000000..58229439ae --- /dev/null +++ b/megatron/core/datasets/retro/index/utils.py @@ -0,0 +1,126 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
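+
+# The helpers below define the on-disk layout used during index construction.
+# Roughly (paths relative to the Retro project directory; names illustrative):
+#
+#   index/<index_type>/<index_str>/            <- get_index_dir()
+#   index/train_emb/blocks/*.hdf5              <- saved training-embedding blocks
+#   index/train_emb/train_<load_frac>.bin      <- merged training embeddings
+#   index/<index_type>/<index_str>/add_codes/  <- encodings waiting to be added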
+ +"""Utilities for building an index.""" + +import glob +import os +from typing import List, Tuple + +from megatron.core.datasets.retro.config import RetroPreprocessingConfig +from megatron.core.datasets.retro.utils import retro_makedir + + +def get_index_dir(config: RetroPreprocessingConfig) -> str: + """Create sub-directory for this index. + + Args: + config (RetroPreprocessingConfig): Retro preprocessing config. + + Returns: + Path to index sub-directory within Retro project. + """ + + # Directory path. + index_dir_path = os.path.join( + config.retro_project_dir, "index", config.retro_index_type, config.retro_index_str + ) + + # Make directory. + retro_makedir(config, index_dir_path) + + return index_dir_path + + +def num_samples_to_block_ranges( + config: RetroPreprocessingConfig, num_samples: int +) -> List[Tuple[int, int]]: + """Split a range (length num_samples) into sequence of block ranges + of size block_size. + + Args: + config (RetroPreprocessingConfig): Retro preprocessing config. + num_samples (int): Split `num_samples` into consecutive block ranges, where each block is size `config.retro_block_size`. + + Returns: + A list of tuples where each item is the (start, end) index for a given block. + """ + block_size = config.retro_block_size + start_idxs = list(range(0, num_samples, block_size)) + end_idxs = [min(num_samples, s + block_size) for s in start_idxs] + ranges = list(zip(start_idxs, end_idxs)) + return ranges + + +def get_training_data_root_dir(config: RetroPreprocessingConfig) -> str: + """Get root directory for embeddings (blocks and merged data). + + Args: + config (RetroPreprocessingConfig): Retro preprocessing config. + + Returns: + Path to the training data directory, which contains both training embedding blocks and the final merged training embeddings. + """ + return os.path.join(config.retro_project_dir, "index", "train_emb") + + +def get_training_data_block_dir(config: RetroPreprocessingConfig) -> str: + """Get directory for of saved embedding blocks. + + Args: + config (RetroPreprocessingConfig): Retro preprocessing config. + + Returns: + Path to the directory containing the training embedding blocks, which will be later merged into a single embedding array. + """ + return os.path.join(get_training_data_root_dir(config), "blocks") + + +def get_training_data_block_paths(config: RetroPreprocessingConfig) -> List[str]: + """Get paths to saved embedding blocks. + + Args: + config (RetroPreprocessingConfig): Retro preprocessing config. + + Returns: + Paths of all training embedding blocks. + """ + return sorted(glob.glob(get_training_data_block_dir(config) + "/*.hdf5")) + + +def get_training_data_merged_path(config: RetroPreprocessingConfig) -> str: + """Get path to merged training embeddings. + + Args: + config (RetroPreprocessingConfig): Retro preprocessing config. + + Returns: + Path to the merged training embedding binary file. + """ + return os.path.join( + get_training_data_root_dir(config), + "train_%.3f.bin" % config.retro_index_train_load_fraction, + ) + + +def get_added_codes_dir(config: RetroPreprocessingConfig) -> str: + """Get directory of saved encodings. + + Args: + config (RetroPreprocessingConfig): Retro preprocessing config. + + Returns: + Path to the directory containing the vector encodings for adding to the index. + """ + return os.path.join(get_index_dir(config), "add_codes") + + +def get_added_code_paths(config: RetroPreprocessingConfig) -> List[str]: + """Get paths to all saved encodings. 
+ + Args: + config (RetroPreprocessingConfig): Retro preprocessing config. + + Returns: + Paths of all vector encoding blocks, for adding to the index. + """ + return sorted(glob.glob(get_added_codes_dir(config) + "/*.hdf5")) diff --git a/megatron/core/datasets/retro/index/validate.py b/megatron/core/datasets/retro/index/validate.py new file mode 100644 index 0000000000..57306707c4 --- /dev/null +++ b/megatron/core/datasets/retro/index/validate.py @@ -0,0 +1,191 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +"""Validate an index's data. + +This module contains functionality for checking for bitwise equality across code +changes. The training and adding steps of index construction can be validated +separately. The following high-level checks are supported: + + - Training: Validate that saved training embeddings are bitwise equal with a + sample set of freshly computed embeddings. (*Note*: + `--no-retro-index-delete-training-embeddings` must be used.) + - Adding: Validate that the saved encodings are bitwise equal with a sample of + sample set of freshly computed encodings. (*Note*: + `--no-retro-index-delete-added-codes` must be used.) +""" + +import typing + +import numpy as np +import torch +from torch.utils.data import Subset + +from megatron.core.datasets.retro.config import RetroPreprocessingConfig +from megatron.core.datasets.retro.external_libs import h5py +from megatron.core.datasets.retro.utils import ( + GPTToTextDataset, + get_blocks_by_rank, + log_retro_rank_0, +) + +from .build import get_text_dataset_for_adding, get_text_dataset_for_training +from .factory import IndexFactory +from .utils import get_added_codes_dir, get_training_data_block_dir + +################################################## +# Validate trained index. +################################################## + + +def validate_training_embeddings(config: RetroPreprocessingConfig) -> None: + """Validate training embeddings. + + Steps: + - Randomly sample subset of text dataset blocks. + - Embed each block. + - Compare against saved embeddings. + + Args: + config (RetroPreprocessingConfig): Retro preprocessing config. + """ + + # Training text dataset. + text_dataset = get_text_dataset_for_training(config) + + # Sample existing blocks. + blocks = get_blocks_by_rank( + dirname=get_training_data_block_dir(config), + n_samples=len(text_dataset), + block_size=config.retro_block_size, + validate=None, + sample=config.retro_task_validate, + ) + + assert blocks.n_missing_world == 0 + + # Embed & validate blocks. + embedder = config.retro_bert_embedders.mem + for block_idx, block in enumerate(blocks.existing): + + # Missing block lists are extended with None to have equal-length + # lists. Skip the Nones. + if block is not None: + + # Progress. (*note*: move world progress to here.) + log_retro_rank_0( + "embed training block %d / %d ... %s." + % (block_idx, len(blocks.existing), block["path"]) + ) + + # Load existing block embeddings. + with h5py.File(block["path"]) as f: + existing_embeddings = np.copy(f["data"]) + + # Embed block. + sub_dataset = Subset(text_dataset, range(*block["range"])) + embeddings = embedder.embed_text_dataset(sub_dataset, "train") + + # Check equality. + log_retro_rank_0(" > validate.") + assert np.array_equal(existing_embeddings, embeddings) + + # Synchronize progress across all ranks. 
(for easier observation) + log_retro_rank_0(" > waiting for other ranks to finish block.") + torch.distributed.barrier() + + log_retro_rank_0(" > finished validating training embeddings.") + + +################################################## +# Validate filled index. +################################################## + + +def validate_added_encodings(config: RetroPreprocessingConfig) -> None: + """Validate added encodings. + + Steps: + - Randomly sample subset of text dataset blocks. + - Encode each block. + - Compare against saved encodings. + + Args: + config (RetroPreprocessingConfig): Retro preprocessing config. + """ + + # Index. + index = IndexFactory.get_index(config.retro_index_type) + inner_index = index.get_empty_index(config) + + # Text dataset. + text_dataset = get_text_dataset_for_adding(config) + + # Sample existing blocks. + def validate(f: h5py.File) -> None: + """Validation method for validating encoding blocks. + + Args: + f (h5py.File): File with block of encodings. + """ + assert len(f["data"].shape) == 2 + + blocks = get_blocks_by_rank( + dirname=get_added_codes_dir(config), + n_samples=len(text_dataset), + block_size=config.retro_block_size, + validate=validate, + sample=config.retro_task_validate, + ) + + assert blocks.n_missing_world == 0 + + # Encode and validate blocks. + embedder = config.retro_bert_embedders.mem + for block_idx, block in enumerate(blocks.existing): + + if block is not None: + + # Progress. + log_retro_rank_0( + "encode block %d / %d ... %s." % (block_idx, len(blocks.existing), block["path"]) + ) + + # Load existing codes. + with h5py.File(block["path"]) as f: + existing_codes = np.copy(f["data"]) + + # Encode block. + embeddings, codes = index.encode_block(inner_index, embedder, text_dataset, block) + + # Check equality. + log_retro_rank_0(" > validate.") + assert np.array_equal(existing_codes, codes) + + # Synchronize progress across all ranks. (for easier observation) + log_retro_rank_0(" > waiting for other ranks to finish block.") + torch.distributed.barrier() + + log_retro_rank_0(" > finished validating added encodings.") + + +################################################## +# Validate index (trained + filled). +################################################## + + +def validate_index(config: RetroPreprocessingConfig) -> None: + """Validate index. + + Validating index involves sequentially running stages above: + - Validate trained index. + - Validate filled index. + + Args: + config (RetroPreprocessingConfig): Retro preprocessing config. + """ + + # Validate training embeddings. + validate_training_embeddings(config) + + # Validate added codes. + validate_added_encodings(config) diff --git a/megatron/core/datasets/retro/query/__init__.py b/megatron/core/datasets/retro/query/__init__.py new file mode 100644 index 0000000000..ac9483373c --- /dev/null +++ b/megatron/core/datasets/retro/query/__init__.py @@ -0,0 +1 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. diff --git a/megatron/core/datasets/retro/query/gpt_chunk_dataset.py b/megatron/core/datasets/retro/query/gpt_chunk_dataset.py new file mode 100644 index 0000000000..6191a30a31 --- /dev/null +++ b/megatron/core/datasets/retro/query/gpt_chunk_dataset.py @@ -0,0 +1,109 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +""" +A GPTChunkDataset is a wrapper around a regular GPTDataset, that sequentially +chunks the sample tokens into `retro_chunk_length` sized smaller samples. 
+ +For example, if the GPTDataset has 100 samples and a sequence length of 2048, and +retro_chunk_length is 64, then the GPTChunkDataset will contain 100*(2048/64) = +3200 samples, each with length 64. +""" + +import torch + +from megatron.core.datasets.gpt_dataset import GPTDataset +from megatron.core.datasets.retro.utils import get_num_chunks_per_sample + +from .utils import get_neighbor_dir + + +class GPTChunkDataset(torch.utils.data.Dataset): + """Pretraining chunk dataset wraps a standard GPT dataset. + + This dataset conceptually divides each sample (e.g., length 2048) + into chunks (e.g., length 64) and restructures them into a list of + chunks (e.g., length num_samples * num_chunks_per_sample). + + Args: + sample_dataset (GPTDataset): Original GPT dataset, with `sequence_length` size samples. + sample_length (int): Alias for `sequence_length`. + chunk_length (int): Retro chunk length (e.g., 64). + """ + + def __init__(self, sample_dataset: GPTDataset, sample_length: int, chunk_length: int): + + super().__init__() + + self.sample_dataset = sample_dataset + self.chunk_length = chunk_length + self.n_chunks_per_sample = get_num_chunks_per_sample(sample_length, chunk_length) + self.n_samples = len(sample_dataset) + self.n_chunks = self.n_samples * self.n_chunks_per_sample + + def __len__(self) -> int: + """Get dataset length. + + Returns: + Dataset length. + """ + return self.n_chunks + + def __getitem__(self, idx: int) -> dict: + """Get sample, including represented document IDs. + + Args: + idx (int): Sample index. + + Returns: + A sample, which contains both the chunk-length token sample ('text') along with all document_ids ('doc_ids') contained withing the full `sequence_length` sample. + """ + + # Convert global chunk index to global sample index & local chunk index. + sample_idx = idx // self.n_chunks_per_sample + chunk_idx = idx % self.n_chunks_per_sample + + # Extract sample data. + sample = self.sample_dataset[sample_idx] + sample_token_ids = sample["text"] + sample_doc_ids = sample["document_ids"] + + # Chunk start/end token idxs. + token_start_idx = chunk_idx * self.chunk_length + token_end_idx = token_start_idx + self.chunk_length + chunk_token_ids = sample_token_ids[token_start_idx:token_end_idx] + + # Sample. + return {"doc_ids": sample_doc_ids, "text": chunk_token_ids} + + +def build_gpt_chunk_datasets_from_gpt_datasets( + project_dir: str, gpt_datasets: dict, sample_length: int, chunk_length: int +) -> dict: + """Get train, valid, test GPT chunk datasets. + + Args: + project_dir (str): Retro project dir. + gpt_datasets (dict): Mapping of 'train', 'valid', and 'test' GPT datasets (original, unchunked datasets). + sample_length (int): Alias of `sequence_length`. + chunk_length (int): Retro chunk length (e.g., 64). + + Returns: + A ? + """ + + # GPT chunk datasets. 
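+    # For each split ('train', 'valid', 'test'), wrap the GPT dataset in a
+    # GPTChunkDataset and record where its queried neighbor IDs will live.
+    # Each entry is roughly (sketch, assuming a 2048-token sample length and
+    # 64-token chunks, i.e. 32 chunks per sample):
+    #
+    #   {
+    #       "dataset": GPTChunkDataset(sample_ds, 2048, 64),
+    #       "neighbor_dir": <project_dir>/query/<split>_<dataset hash>,
+    #       "num_active_chunks": num_active_samples * 32,
+    #   }
+    #
+    # Splits with no underlying GPT dataset map to None.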
+ chunk_datasets = { + key: ( + { + "dataset": GPTChunkDataset(sample_ds, sample_length, chunk_length), + "neighbor_dir": get_neighbor_dir(project_dir, key, sample_ds), + "num_active_chunks": num_active_samples + * get_num_chunks_per_sample(sample_length, chunk_length), + } + if sample_ds + else None + ) + for key, (sample_ds, num_active_samples) in gpt_datasets.items() + } + + return chunk_datasets diff --git a/megatron/core/datasets/retro/query/multi_split_gpt_dataset.py b/megatron/core/datasets/retro/query/multi_split_gpt_dataset.py new file mode 100644 index 0000000000..97a891fd14 --- /dev/null +++ b/megatron/core/datasets/retro/query/multi_split_gpt_dataset.py @@ -0,0 +1,107 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +"""A MultiSplitGPTDataset can handle multiple intersecting split strings, as well +as returning all of the document IDs of a sample.""" + +import logging +from dataclasses import dataclass +from typing import Dict, List + +import numpy + +from megatron.core.datasets.blended_megatron_dataset_config import ( + convert_split_vector_to_split_matrix, + parse_and_normalize_split, +) +from megatron.core.datasets.gpt_dataset import GPTDataset, GPTDatasetConfig +from megatron.core.datasets.indexed_dataset import IndexedDataset +from megatron.core.datasets.utils import Split +from megatron.core.utils import log_single_rank + +logger = logging.getLogger(__name__) + + +@dataclass +class MultiSplitGPTDatasetConfig(GPTDatasetConfig): + """Configuration object for Megatron Core blended and Retro datasets. + + Args: + return_document_ids (bool): Whether to return the document ids when querying the dataset. Turn this option on during preprocessing. + split_preprocessing (str): The Retro preprocessing split string. It follows the same pattern convention as 'split'. Not to be used with 'blend_per_split'. + """ + + return_document_ids: bool = None + + split_preprocessing: str = None + + def __post_init__(self) -> None: + """Validate config attributes.""" + super().__post_init__() + assert self.split is not None, "the Retro data pipeline does not support 'blend_per_split'" + assert self.return_document_ids is not None, "this attribute must be user defined" + assert self.split_preprocessing is not None, "this attribute must be user defined" + split_vector = parse_and_normalize_split(self.split) + split_preprocessing_vector = parse_and_normalize_split(self.split_preprocessing) + if not numpy.allclose(split_vector, split_preprocessing_vector): + self.split_matrix = convert_split_vector_to_split_matrix( + split_vector, split_preprocessing_vector + ) + log_single_rank( + logger, + logging.WARNING, + f"split =/= split_preprocessing. Let split_matrix = {self.split_matrix}", + ) + + +class MultiSplitGPTDataset(GPTDataset): + """Retro's customized GPT dataset. + + Args: + indexed_dataset (IndexedDataset): The IndexedDataset around which to build the MegatronDataset. + dataset_path (str): The real path on disk to the dataset, for bookkeeping. + indexed_indices (numpy.ndarray): The set of the documents indices to expose. + num_samples (int): The number of samples to draw from the indexed dataset. + index_split (Split): The indexed_indices Split. + config (MultiSplitGPTDatasetConfig): The Retro-specific container for all config sourced parameters. 
+ """ + + def __init__( + self, + indexed_dataset: IndexedDataset, + dataset_path: str, + indexed_indices: numpy.ndarray, + num_samples: int, + index_split: Split, + config: MultiSplitGPTDatasetConfig, + ) -> None: + super().__init__( + indexed_dataset, dataset_path, indexed_indices, num_samples, index_split, config + ) + + def __getitem__(self, idx: int) -> Dict[str, numpy.ndarray]: + """Get dataset sample. + + Args: + idx (int): The index into the dataset. + + Returns: + Dict[str, numpy.ndarray]: The text ids and (optionally) the document ids wrapped in a dictionary. + """ + text, document_ids = self._query_document_sample_shuffle_indices(idx) + if self.config.return_document_ids: + return {"text": text, "document_ids": document_ids} + else: + return {"text": text} + + @staticmethod + def _key_config_attributes() -> List[str]: + """Add custom attributes for building unique dataset hash. + + The preprocessing split used for preprocessing will constrain the samples available for pretraining. + + Returns: + List[str]: The key config attributes. + """ + return super(MultiSplitGPTDataset, MultiSplitGPTDataset)._key_config_attributes() + [ + "split_preprocessing" + ] diff --git a/megatron/core/datasets/retro/query/query.py b/megatron/core/datasets/retro/query/query.py new file mode 100644 index 0000000000..9da3381712 --- /dev/null +++ b/megatron/core/datasets/retro/query/query.py @@ -0,0 +1,393 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +"""Entry point for querying an index using a GPTChunkDataset. + +Querying involves: + + - Iterate all chunks in the GPTChunkDataset. + - Query index for neighbor chunk IDs (i.e., chunks from the chunk database). + - Save neighbor chunk IDs to disk, for use in building a RetroDataset sample + during pretraining. +""" + +import os +import time +import typing + +import numpy as np +import psutil +import torch +from tqdm import tqdm + +from megatron.core.datasets.retro.config import RetroPreprocessingConfig +from megatron.core.datasets.retro.db.dataset import DBDataset +from megatron.core.datasets.retro.db.utils import ( + get_merged_train_dataset as get_db_merged_train_dataset, +) +from megatron.core.datasets.retro.external_libs import faiss, h5py +from megatron.core.datasets.retro.index.factory import IndexFactory +from megatron.core.datasets.retro.index.index import Index +from megatron.core.datasets.retro.index.utils import get_index_dir +from megatron.core.datasets.retro.query.gpt_chunk_dataset import GPTChunkDataset +from megatron.core.datasets.retro.utils import ( + GPTToTextDataset, + get_blocks_by_rank, + log_retro_rank_0, + retro_makedir, +) + +from .gpt_chunk_dataset import build_gpt_chunk_datasets_from_gpt_datasets + + +def get_index(config: RetroPreprocessingConfig, ondisk: bool = False) -> faiss.Index: + """Read index from disk. + + Args: + config (RetroPreprocessingConfig): Retro preprocessing config. + ondisk (bool): If `ondisk = True`, memory map the index. (For debugging purposes only; very non-performant.) + + Returns: + A Faiss index, loaded from storage. + """ + + # Load index. + index_wrapper = IndexFactory.get_index(config.retro_index_type) + index_dir = get_index_dir(config) + added_index_path = index_wrapper.get_added_index_path(config) + if ondisk: + index = faiss.read_index(added_index_path, faiss.IO_FLAG_MMAP) + else: + index = faiss.read_index(added_index_path) + + # Search parameters. 
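+    # efSearch and nprobe trade query speed for recall: for an IVF-style index,
+    # nprobe is the number of inverted lists scanned per query, and efSearch is
+    # the search-queue size of the HNSW coarse quantizer (when one is used).
+    # Larger values return better neighbors at higher query cost.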
+ faiss.ParameterSpace().set_index_parameter(index, "efSearch", config.retro_query_ef_search) + faiss.ParameterSpace().set_index_parameter(index, "nprobe", config.retro_query_nprobe) + + return index + + +def embed_block( + config: RetroPreprocessingConfig, gpt_dataset: GPTChunkDataset, block: dict +) -> np.ndarray: + """Embed block of chunks. + + Args: + config (RetroPreprocessingConfig): Retro preprocessing config. + gpt_dataset (GPTChunkDataset): Chunk dataset to be embedded. + block (dict): Range information containing start/end indices of subset of chunk dataset. + + Returns: + Embeddings array, with shape (len(block["range"]), dimension(embedder)). + """ + text_block_dataset = torch.utils.data.Subset( + GPTToTextDataset(gpt_dataset, config.retro_tokenizers.gpt), range(*block["range"]) + ) + return config.retro_bert_embedders.mem.embed_text_dataset(text_block_dataset) + + +def query_embeddings( + config: RetroPreprocessingConfig, + db_dataset: DBDataset, + index: Index, + embeddings: np.ndarray, + chunk_id_range: range, + sample_map: dict, + n_chunks_per_sample: int, + verbose: bool = True, +) -> typing.Tuple[np.ndarray, np.ndarray]: + """Query neighbors of a block of embeddings. + + Querying includes: + - Query index for neighbor chunk IDs. + - Filter chunk IDs that have the same document ID as the queried embedding. + + Args: + config (RetroPreprocessingConfig): Retro preprocessing config. + db_dataset (DBDataset): Dataset containing chunk database entries. + index (Index): Vector index populated with chunk database indices. + embeddings (np.ndarray): Embeddings from GPT chunk dataset. + chunk_id_range (range): Chunk ID range from GPT chunk dataset. + sample_map (dict): Mapping of sample_idx to dataset_idx and document_ids. Used for document filtering. + n_chunks_per_sample (int): Number of chunks per sample (e.g., sequence_length / chunk_length). + verbose (bool): Log querying progress. + + Returns: + A tuple of original (unfiltered) neighbor IDs, and filtered (by document ID) neighbor IDs. + """ + + # Query neighbor ids. + if verbose: + log_retro_rank_0("search.") + t = time.time() + assert index.ntotal > 0, "check we don't accidentally have an empty index." + _, query_neighbor_ids = index.search(embeddings, config.retro_query_num_neighbors_query) + if verbose: + log_retro_rank_0(" time : %.3f sec." % (time.time() - t)) + + # Filter banned neighbor ids. + if verbose: + log_retro_rank_0("filter banned neighbor ids.") + filtered_neighbor_ids = np.full( + shape=(len(query_neighbor_ids), config.retro_query_num_neighbors_save), + fill_value=-1, + dtype="int64", + ) + min_chunk_id, max_chunk_id = chunk_id_range + for chunk_id in range(min_chunk_id, max_chunk_id): + + sample_id = chunk_id // n_chunks_per_sample + sample = sample_map[sample_id] + sample_dataset_idx = sample["dataset_idx"].item() + sample_doc_ids = sample["doc_ids"].tolist() + sample_doc_tuples = [(sample_dataset_idx, d) for d in sample_doc_ids] + + # Get valid neighbors (!= -1). + query_row = [i for i in query_neighbor_ids[chunk_id - min_chunk_id] if i >= 0] + + # Filter row. 
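+            # Drop any candidate chunk that comes from one of the query
+            # sample's own documents (same (dataset_idx, doc_id) tuple), so a
+            # chunk cannot retrieve text from its source document. The row is
+            # then truncated/padded with -1 to retro_query_num_neighbors_save.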
+ filtered_row = [ + i + for i in query_row + if tuple(db_dataset.doc_tuples[i].tolist()) not in sample_doc_tuples + ] + filtered_row = filtered_row[: config.retro_query_num_neighbors_save] + filtered_row += [-1] * (config.retro_query_num_neighbors_save - len(filtered_row)) + filtered_neighbor_ids[chunk_id - min_chunk_id] = filtered_row + + return query_neighbor_ids, filtered_neighbor_ids + + +def query_embedding_block( + config: RetroPreprocessingConfig, + db_dataset: DBDataset, + index: Index, + embeddings: np.ndarray, + chunk_id_range: range, + sample_map: dict, + n_chunks_per_sample: int, +) -> typing.Tuple[np.ndarray, np.ndarray]: + """Query a block of embeddings. + + The block is broken into smaller sub-blocks, for easier tracking of progress. + Both the raw neighbor IDs and the filtered neighbor IDs (i.e., chunks with the + same document ID are removed) are collected. + + Args: + config (RetroPreprocessingConfig): Retro preprocessing config. + db_dataset (DBDataset): Dataset containing chunk database entries. + index (Index): Vector index populated with chunk database indices. + embeddings (np.ndarray): Embeddings from GPT chunk dataset. + chunk_id_range (range): Chunk ID range from GPT chunk dataset. + sample_map (dict): Mapping of sample_idx to dataset_idx and document_ids. Used for document filtering. + n_chunks_per_sample (int): Number of chunks per sample (e.g., sequence_length / chunk_length). + + Returns: + A tuple of original (unfiltered) neighbor IDs, and filtered (by document ID) neighbor IDs. + """ + + query_neighbor_ids = [] + filtered_neighbor_ids = [] + + # Query in sub-blocks. + partial_block_size = 1000 + for partial_start_idx in tqdm( + range(0, len(embeddings), partial_block_size), + " search", + miniters=(len(embeddings) // partial_block_size) // 10, + disable=torch.distributed.get_rank() != 0, + ): + partial_end_idx = min(len(embeddings), partial_start_idx + partial_block_size) + partial_embeddings = embeddings[partial_start_idx:partial_end_idx] + partial_chunk_id_range = ( + chunk_id_range[0] + partial_start_idx, + chunk_id_range[0] + partial_end_idx, + ) + partial_query_neighbor_ids, partial_filtered_neighbor_ids = query_embeddings( + config, + db_dataset, + index, + partial_embeddings, + partial_chunk_id_range, + sample_map, + n_chunks_per_sample, + verbose=False, + ) + query_neighbor_ids.append(partial_query_neighbor_ids) + filtered_neighbor_ids.append(partial_filtered_neighbor_ids) + + # Concatenate. + query_neighbor_ids = np.concatenate(query_neighbor_ids, axis=0) + filtered_neighbor_ids = np.concatenate(filtered_neighbor_ids, axis=0) + + return query_neighbor_ids, filtered_neighbor_ids + + +def query_block_neighbors( + config: RetroPreprocessingConfig, + db_dataset: DBDataset, + query_dataset: GPTChunkDataset, + index: Index, + block: dict, +) -> None: + """Query neighbors of a dataset block (i.e., range). + + Args: + config (RetroPreprocessingConfig): Retro preprocessing config. + db_dataset (DBDataset): Dataset containing chunk database entries. + query_dataset (GPTChunkDataset): GPT chunk dataset to be queried. + index (Index): Vector index populated with chunk database indices. + block (dict): Range information containing start/end indices for querying GPT chunk dataset. + """ + + n_chunks_per_sample = query_dataset.n_chunks_per_sample + + # Sample map. 
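+    # Each chunk ID maps back to its parent sample via chunk_id // n_chunks_per_sample.
+    # Cache the dataset index and document IDs of every sample touched by this
+    # block, so the document filter in query_embeddings() can consult them.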
+ sample_ids = sorted( + list(set(chunk_id // n_chunks_per_sample for chunk_id in range(*block["range"]))) + ) + sample_map = {} + for i in sample_ids: + sample = query_dataset.sample_dataset[i] + sample_map[i] = {"dataset_idx": sample["dataset_id"], "doc_ids": sample["document_ids"]} + + # Embed block. + embeddings = embed_block(config, query_dataset, block) + + # Query embeddings. + _, filtered_neighbor_ids = query_embedding_block( + config, db_dataset, index, embeddings, block["range"], sample_map, n_chunks_per_sample + ) + + if config.retro_task_validate is None: + # Save neighbors. + log_retro_rank_0("save neighbors.") + retro_makedir(config, os.path.dirname(block["path"])) + f = h5py.File(block["path"], "w") + f.create_dataset("neighbors", data=filtered_neighbor_ids) + f.close() + + else: + # Validate neighbors. + with h5py.File(block["path"]) as f: + existing_neighbor_ids = np.copy(f["neighbors"]) + assert np.array_equal(existing_neighbor_ids, filtered_neighbor_ids) + + +def query_dataset_neighbors( + config: RetroPreprocessingConfig, + db_dataset: DBDataset, + query_dataset: GPTChunkDataset, + num_active_chunks: int, + prefix: str, + neighbor_dir: str, + index: Index, +) -> None: + """Query neighbors of each chunk within a dataset. + + Args: + config (RetroPreprocessingConfig): Retro preprocessing config. + db_dataset (DBDataset): Dataset containing chunk database entries. + query_dataset (GPTChunkDataset): GPT chunk dataset to be queried. + num_active_chunks (int): The 'active' chunks are the subset of the GPT chunk dataset that aren't being queried. This argument is used when validating the correctness of a subset of the GPT chunk dataset. + prefix (str): Extra string for logging progress. + neighbor_dir (str): File path to directory for saving neighbor IDs. + index (Index): Vector index populated with chunk database indices. + """ + + def validate(f: h5py.File) -> None: + """Validation method for validating saved neighbor IDs. + + Args: + f (h5py.File): File containing save neighbor IDs. + """ + assert ( + f["neighbors"].shape[1] == config.retro_query_num_neighbors_save + ), "neighbors.shape == %s; num_neighbors_target == %d." % ( + str(f["neighbors"].shape), + config.retro_num_neighbors_target, + ) + + if config.retro_task_validate is None: + retro_makedir(config, neighbor_dir) + blocks = get_blocks_by_rank( + neighbor_dir, num_active_chunks, config.retro_block_size, validate=validate + ) + active_blocks = blocks.missing + else: + blocks = get_blocks_by_rank( + neighbor_dir, + num_active_chunks, + config.retro_block_size, + validate=validate, + sample=config.retro_task_validate, + ) + assert blocks.n_missing_world == 0 + active_blocks = blocks.existing + + # Query each block. + for block_index, block in enumerate(active_blocks): + + if block is not None: + + # Progress. + log_retro_rank_0( + "%squery '%s' block %d / %d ... %s ... mem %.3f gb, %.1f%%." + % ( + "" if config.retro_task_validate is None else "[validate] ", + prefix, + block_index, + len(active_blocks), + os.path.basename(block["path"]), + psutil.virtual_memory()[3] / 1024**3, + psutil.virtual_memory()[2], + ) + ) + + # Query block neighbors. + query_block_neighbors(config, db_dataset, query_dataset, index, block) + + # Synchronize progress across all ranks. (for easier observation) + log_retro_rank_0(" > waiting for other ranks to finish block.") + torch.distributed.barrier() + + +def query_neighbors(config: RetroPreprocessingConfig) -> None: + """Query pretraining datasets (train & valid). 
+ + Args: + config (RetroPreprocessingConfig): Retro preprocessing config. + """ + + # Num threads. + faiss.omp_set_num_threads(64) + + # Load chunk db dataset. + log_retro_rank_0("load chunk db dataset.") + db_dataset = get_db_merged_train_dataset( + project_dir=config.retro_project_dir, + chunk_length=config.retro_gpt_chunk_length, + eod_token_id=config.retro_tokenizers.gpt.eod, + ) + db_dataset.load_doc_tuples() + + # Load index. + log_retro_rank_0(" > get index.") + index = get_index(config) + + # Query each (i.e., train, valid, test) dataset. + log_retro_rank_0(" > query.") + for prefix, info in vars(config.retro_gpt_chunk_datasets).items(): + if info is None: + continue + log_retro_rank_0( + " > query '%s' dataset ... %d samples." % (prefix, info["num_active_chunks"]) + ) + query_dataset_neighbors( + config, + db_dataset, + info["dataset"], + info["num_active_chunks"], + prefix, + info["neighbor_dir"], + index, + ) diff --git a/megatron/core/datasets/retro/query/retro_dataset.py b/megatron/core/datasets/retro/query/retro_dataset.py new file mode 100644 index 0000000000..6c3b9ae60c --- /dev/null +++ b/megatron/core/datasets/retro/query/retro_dataset.py @@ -0,0 +1,238 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +""" +A RetroDataset wraps both: + + - A GPTDataset (which is nested as GPTChunkDataset -> MultiSplitGPTDataset -> + GPTDataset). + - Neighbor IDs of chunks in the chunk database, that were saved during + preprocessing. + +Both the GPT sample data and the neighbor IDs are returned within a sample from +this dataset. +""" + +import os +from typing import Any, Dict, Optional, Tuple + +import numpy as np +import torch + +from megatron.core.datasets.retro.db.dataset import DBDataset +from megatron.core.datasets.retro.db.utils import get_merged_train_dataset as get_db_dataset +from megatron.core.datasets.retro.external_libs import h5py +from megatron.core.datasets.retro.utils import BlockPathMap, log_retro_rank_0 +from megatron.core.models.retro import RetroConfig + +from .gpt_chunk_dataset import GPTChunkDataset, build_gpt_chunk_datasets_from_gpt_datasets +from .utils import get_query_dir + + +class RetroDataset(torch.utils.data.Dataset): + """Dataset of retro samples. + + Each sample contains the original GPT sample, along with the token IDs + of each neighbor of each chunk within the sequence. Neighbor array has + shape (num_chunks_per_sample, num_neighbors, num_retrieved_tokens). + + ** Note: chunk dataset wraps original GPT dataset (see gpt_chunk_dataset.py). + + Args: + num_queried_samples (int): Total number of queried samples. + num_neighbors (int): Total number of saved neighbors. + num_retrieved_chunks (int): Number of retrieved chunks (e.g., 2 for neighbor + continuation). + block_size (int): Number of neighbor entries per file. + db_dataset (DBDataset): Chunk database used for retrieval. + chunk_dataset (GPTChunkDataset): GPT chunk dataset, which is a wrapper around a standard GPT dataset that breaks each sample into chunks. + neighbor_path_map (BlockPathMap): Mapping of neighbor ID to file path. 
+ """ + + def __init__( + self, + num_queried_samples: int, + num_neighbors: int, + num_retrieved_chunks: int, + block_size: int, + db_dataset: DBDataset, + chunk_dataset: GPTChunkDataset, + neighbor_path_map: BlockPathMap, + ): + super().__init__() + + self.num_queried_samples = num_queried_samples + self.num_neighbors = num_neighbors + self.num_retrieved_chunks = num_retrieved_chunks + self.block_size = block_size + self.db_dataset = db_dataset + self.chunk_dataset = chunk_dataset + self.neighbor_path_map = neighbor_path_map + + def __len__(self) -> int: + """Dataset length. + + Returns: + Number of samples in dataset. + """ + return len(self.chunk_dataset.sample_dataset) + + def __getitem__(self, sample_idx: int) -> dict: + """Get dataset sample. + + Args: + sample_idx (int): Index of sample in dataset. + + Returns: + A dict consisting of GPT sample (attribute 'text') and corresponding neighbor chunk IDs ('neighbor_chunks', for indexing chunk database) and neighbor token IDs (corresponding chunk database GPT tokens). + """ + n_chunks_per_sample = self.chunk_dataset.n_chunks_per_sample + + # Wrap sample idx around number of queried samples. + sample_idx = sample_idx % self.num_queried_samples + + # Get standard sample. + sample = self.chunk_dataset.sample_dataset[sample_idx] + + # Sample idx to chunk idxs. + chunk_idxs = list( + range(sample_idx * n_chunks_per_sample, (sample_idx + 1) * n_chunks_per_sample) + ) + + # Collect retrieved tokens. + all_retrieved_chunk_ids = [] + all_retrieved_token_ids = [] + for chunk_idx in chunk_idxs: + + # Neighbor chunk ids. + neighbor_path = self.neighbor_path_map[chunk_idx] + with h5py.File(neighbor_path, "r") as f: + neighbor_chunk_ids = f["neighbors"][ + chunk_idx % self.block_size, : self.num_neighbors + ].tolist() + + # Retrieved (neighbor + continuation) token ids. + retrieved_chunk_ids = [] + retrieved_token_ids = [] + for neighbor_chunk_id in neighbor_chunk_ids: + current_chunk_ids = [ + i % len(self.db_dataset) + for i in range(neighbor_chunk_id, neighbor_chunk_id + self.num_retrieved_chunks) + ] + current_token_ids = [self.db_dataset[ci]["text"] for ci in current_chunk_ids] + retrieved_chunk_ids.append(current_chunk_ids) + retrieved_token_ids.append(current_token_ids) + + # Collect retrieved tokens. + all_retrieved_chunk_ids.append(retrieved_chunk_ids) + all_retrieved_token_ids.append(retrieved_token_ids) + + # Reshape retrieved tokens. + all_retrieved_chunk_ids = np.array(all_retrieved_chunk_ids).reshape( + (n_chunks_per_sample, self.num_neighbors, -1) + ) + all_retrieved_token_ids = np.array(all_retrieved_token_ids).reshape( + (n_chunks_per_sample, self.num_neighbors, -1) + ) + + # Sample. + sample: Dict[str, np.ndarray] = { + **sample, + "neighbor_chunks": all_retrieved_chunk_ids, + "neighbor_tokens": all_retrieved_token_ids, + } + + return sample + + +def get_retro_datasets( + config: RetroConfig, gpt_datasets: dict, sample_length: int, eod_token_id: int +) -> Tuple[Optional[RetroDataset], Optional[RetroDataset], Optional[RetroDataset]]: + """Get train, valid, test retro datasets. + + Args: + config (RetroConfig): Retro preprocessing config. + gpt_datasets (dict): Mapping of data split key ('train', 'valid', or 'test') to the original sequence-length GPT dataset (i.e., not the chunk dataset). + sample_length (int): Alias to `sequence_length`. + eod_token_id (int): GPT EOD token ID. + + Returns: + A tuple of 'train', 'valid', and 'test' `RetroDataset`s. + """ + + # DB dataset. 
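+    # Merged training chunk database built during preprocessing; the neighbor
+    # IDs saved at query time index into this dataset to fetch neighbor (and
+    # continuation) token IDs when a RetroDataset sample is assembled.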
+ db_dataset = get_db_dataset( + project_dir=config.retro_project_dir, + chunk_length=config.retro_chunk_length, + eod_token_id=eod_token_id, + ) + + # GPT chunk datasets. + chunk_ds_info_map = build_gpt_chunk_datasets_from_gpt_datasets( + project_dir=config.retro_project_dir, + gpt_datasets=gpt_datasets, + sample_length=sample_length, + chunk_length=config.retro_chunk_length, + ) + + # Retro datasets. + retro_dataset_map: Dict[str, Optional[RetroDataset]] = {} + query_dir = get_query_dir(config.retro_project_dir) + for data_key, chunk_ds_info in chunk_ds_info_map.items(): + + # Skip unused datasets. + if chunk_ds_info is None: + retro_dataset_map[data_key] = None + continue + + # For consistency with preprocessing, the neighbor_dir is overwritten + # (from its setting in `build_gpt_chunk_datasets_from_gpt_datasets()` + # above). This is one piece -- along with setting data_path and + # train_samples from config.json -- of ensuring consistency between + # preprocessing and pretraining. + chunk_dataset = chunk_ds_info["dataset"] + chunk_ds_info["neighbor_dir"] = os.path.join( + query_dir, config.retro_neighbor_dirs[data_key] + ) + neighbor_dir = chunk_ds_info["neighbor_dir"] + neighbor_path_map = BlockPathMap.from_dir( + dir=neighbor_dir, block_size=config.retro_block_size + ) + + # Verify num chunks. + n_active_chunks = chunk_ds_info["num_active_chunks"] + n_neighbor_chunks = neighbor_path_map.max_idx + + if not os.path.isdir(neighbor_dir): + if torch.distributed.get_rank() == 0: + raise Exception( + "neighbor directory '%s' not found; please " + "compare --train-samples, --seq-length, --seed, " + "--eval-iters, and --eval-interval, with " + "retro preprocessing args." % neighbor_dir + ) + torch.distributed.barrier() + exit() + + if config.retro_verify_neighbor_count and n_active_chunks != n_neighbor_chunks: + if torch.distributed.get_rank() == 0: + log_retro_rank_0("neighbor_dir : %s" % neighbor_dir) + log_retro_rank_0("neighbor_path_map : %s" % neighbor_path_map) + raise Exception( + "num sampled chunks (%d) != num neighbor chunks " + "(%d); did you complete querying the entire " + "pretraining dataset?" % (n_active_chunks, n_neighbor_chunks) + ) + torch.distributed.barrier() + exit() + + # Retro dataset. + retro_dataset_map[data_key] = RetroDataset( + num_queried_samples=gpt_datasets[data_key][1], + num_neighbors=config.retro_num_neighbors, + num_retrieved_chunks=config.retro_num_retrieved_chunks, + block_size=config.retro_block_size, + db_dataset=db_dataset, + chunk_dataset=chunk_dataset, + neighbor_path_map=neighbor_path_map, + ) + + return (retro_dataset_map["train"], retro_dataset_map["valid"], retro_dataset_map["test"]) diff --git a/megatron/core/datasets/retro/query/utils.py b/megatron/core/datasets/retro/query/utils.py new file mode 100644 index 0000000000..b4e0c67009 --- /dev/null +++ b/megatron/core/datasets/retro/query/utils.py @@ -0,0 +1,35 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +"""Utilities for querying the pretraining dataset.""" + +import os + +from megatron.core.datasets.megatron_dataset import MegatronDataset + + +def get_query_dir(project_dir: str) -> str: + """Get root directory of all saved query data. + + Args: + project_dir (str): Retro project dir. + + Returns: + Path to query sub-directory in Retro project. + """ + return os.path.join(project_dir, "query") + + +def get_neighbor_dir(project_dir: str, key: str, dataset: MegatronDataset) -> str: + """Get directory containing neighbor IDs for a dataset (i.e., train, valid, or test). 
+ + Args: + project_dir (str): Retro project dir. + key (str): Dataset split key; 'train', 'valid', or 'test'. + dataset (MegatronDataset): Dataset containing unique hash for finding corresponding neighbors. + + Returns: + Path to directory containing this dataset's neighbors within Retro project. + """ + return os.path.join( + get_query_dir(project_dir), os.path.basename(f"{key}_{dataset.unique_description_hash}") + ) diff --git a/megatron/core/datasets/retro/utils.py b/megatron/core/datasets/retro/utils.py new file mode 100644 index 0000000000..31c0be14c8 --- /dev/null +++ b/megatron/core/datasets/retro/utils.py @@ -0,0 +1,349 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +"""Utilities for Retro preprocessing.""" + +import glob +import logging +import os +from collections import defaultdict +from types import SimpleNamespace +from typing import Any, Callable, Dict, List, Optional + +import numpy as np +import torch +from tqdm import tqdm + +from megatron.core import parallel_state +from megatron.core.datasets.retro.config import RetroPreprocessingConfig +from megatron.core.datasets.retro.query.multi_split_gpt_dataset import ( + MultiSplitGPTDataset, + MultiSplitGPTDatasetConfig, +) +from megatron.core.utils import log_single_rank + +from .external_libs import h5py + +logger = logging.getLogger(__name__) + + +def log_retro_rank_0(message: str) -> None: + """Log on rank 0. + + Args: + message (str): Message to log. + """ + log_single_rank(logger, logging.INFO, "[RETRO] " + message) + + +def retro_makedir(config: RetroPreprocessingConfig, path: str) -> None: + """Make a directory, conditional on not being in validation mode. + + Args: + config (RetroPreprocessingConfig): Retro preprocessing config. + path (str): Path to directory. + """ + if config.retro_task_validate is None: + os.makedirs(path, exist_ok=True) + + +def extract_data_config(config: RetroPreprocessingConfig) -> MultiSplitGPTDatasetConfig: + """Extract data config from dataset. + + Args: + config (RetroPreprocessingConfig): Retro preprocessing config. + + Returns: + The config object used to build the dataset. + """ + return config.retro_gpt_chunk_datasets.train["dataset"].sample_dataset.config + + +def get_num_chunks_per_sample(sample_length: int, chunk_length: int) -> int: + """Compute seq_length // chunk_length. + + Args: + sample_length (int): Alias of `sequence_length`. + chunk_length (int): Retro chunk length (e.g., 64). + + Returns: + Number of chunks per sample (i.e., `sequence_length` / `chunk_length`). + """ + assert sample_length % chunk_length == 0 + return sample_length // chunk_length + + +class GPTToTextDataset(torch.utils.data.Dataset): + """Dataset to convert GPT tokens to text. + + Args: + gpt_dataset (MultiSplitGPTDataset): GPT dataset, which outputs GPT token samples. + gpt_tokenizer (Any): GPT tokenizer. + """ + + def __init__(self, gpt_dataset: MultiSplitGPTDataset, gpt_tokenizer: Any): + + super().__init__() + + self.gpt_dataset = gpt_dataset + self.gpt_tokenizer = gpt_tokenizer + + def __len__(self) -> int: + """Dataset length. + + Returns: + Number of samples in the dataset. + """ + return len(self.gpt_dataset) + + def __getitem__(self, idx: int) -> dict: + """Get dataset sample. + + Args: + idx (int): Index of sample. + + Returns: + A dict containing attribute 'text' of type string. 
+ """ + gpt_token_ids = self.gpt_dataset[idx]["text"].tolist() + text = self.gpt_tokenizer.detokenize(gpt_token_ids) + return {"text": text} + + +def get_blocks( + dirname: str, n_samples: int, block_size: int, validate: Callable = None +) -> SimpleNamespace: + """Divide range [0, num_samples) to sequence of block ranges. + + This is a core method within the concept of block processing. The idea + is to divide a range (size n_samples) into a sequence of blocks. Each + block corresponds to a file within 'dirname' with name + '{start_idx}-{end_idx}.hdf5'. This method checks for the existence of + these files, and returns two lists, one for existing blocks and one for + missing blocks. + + Args: + dirname (str): Path to directory containing block files. + n_samples (int): Ideal number of samples. The total number of saved block data is <=n_samples. + block_size (int): Max number of samples per block file (e.g., 100000). + validate (Callable): Method for validating each block file during load. + + Returns: + A namespace consisting of 2 lists: existing blocks, and missing blocks. The total number of samples between the existing and missing blocks should equal n_samples above. + """ + + assert os.path.isdir(dirname), "missing directory '%s.'" % dirname + + # Block ranges. + block_start_idxs = list(range(0, n_samples, block_size)) + block_end_idxs = [min(n_samples, i + block_size) for i in block_start_idxs] + block_ranges = list(zip(block_start_idxs, block_end_idxs)) + + # All block files (existing + missing). + n_digits = int(np.ceil(np.log(n_samples) / np.log(10)) + 1) + all_blocks = [ + { + "range": r, + "path": os.path.join( + dirname, "%s-%s.hdf5" % tuple([str(i).zfill(n_digits) for i in r]) + ), + } + for r in block_ranges + ] + all_block_path_set = set(block["path"] for block in all_blocks) + + # Validate function. + validate = (lambda f: None) if validate is None else validate + + # Delete corrupt files. + if torch.distributed.get_rank() == 0: + existing_block_paths = [ + block["path"] for block in all_blocks if os.path.exists(block["path"]) + ] + for index, path in enumerate(tqdm(existing_block_paths, "validating block.")): + + assert path in all_block_path_set, "unexpected filename, '%s'." % path + + try: + f = h5py.File(path, "r") + except Exception: + os.remove(path) + continue + + try: + validate(f) + except Exception: + os.remove(path) + finally: + f.close() + + # Wait for files to be deleted. + torch.distributed.barrier() + + # Collect blocks. + blocks = SimpleNamespace( + existing=[b for b in all_blocks if os.path.exists(b["path"])], + missing=[b for b in all_blocks if not os.path.exists(b["path"])], + ) + + return blocks + + +def get_blocks_by_rank( + dirname: str, + n_samples: int, + block_size: int, + validate: Callable = None, + sample: Optional[float] = None, +) -> SimpleNamespace: + """Divide existing and missing blocks evenly across all ranks. + + See 'get_blocks()' above for description. The returned lists of existing and + missing blocks are split evenly across ranks via interleaving. This way, + each rank has a roughly equal number of blocks to process for a + downstream operation. + + Args: + dirname (str): Path to directory containing block files. + n_samples (int): Ideal number of samples. The total number of saved block data is <=n_samples. + block_size (int): Max number of samples per block file (e.g., 100000). + validate (Callable): Method for validating each block file during load. + sample (Optional[float]): If provided, sample a random subset of the blocks. 
Used for validating preprocessing correctness. + + Returns: + A namespace consisting of 2 lists: existing blocks, and missing blocks. Each of these two lists is potentially a sub-sample of the total set of existing and missing blocks, depending on whether sampling is used. Additionally, the attributes n_existing_world and n_missing_world are the total number of existing and missing blocks, independent of samples. Therefore, (n_existing_world + n_missing_world) * block_size == n_samples. + """ + + # Get world blocks. + blocks = get_blocks(dirname, n_samples, block_size, validate) + + # This rank's existing and missing files. + data_parallel_rank = parallel_state.get_data_parallel_rank() + data_parallel_world_size = parallel_state.get_data_parallel_world_size() + rank_existing_blocks = blocks.existing[ + data_parallel_rank : len(blocks.existing) : data_parallel_world_size + ] + rank_missing_blocks = blocks.missing[ + data_parallel_rank : len(blocks.missing) : data_parallel_world_size + ] + + # Extend rank's existing and missing blocks (with None) such that all ranks + # have equal length lists. This allows for easier tracking of global progress. + def get_world_max(n: int) -> int: + """Get max value across ranks. + + Args: + n (int): Value on this rank. + + Returns: + Max value across all ranks. + """ + n_tensor = torch.cuda.LongTensor([n]) + torch.distributed.all_reduce(n_tensor, op=torch.distributed.ReduceOp.MAX) + return n_tensor.item() + + max_n_existing = get_world_max(len(rank_existing_blocks)) + max_n_missing = get_world_max(len(rank_missing_blocks)) + + rank_existing_blocks += [None] * (max_n_existing - len(rank_existing_blocks)) + rank_missing_blocks += [None] * (max_n_missing - len(rank_missing_blocks)) + + # Collect blocks. + blocks = SimpleNamespace( + n_existing_world=len(blocks.existing), + n_missing_world=len(blocks.missing), + existing=rank_existing_blocks, + missing=rank_missing_blocks, + ) + + if sample is not None: + # Sample existing and missing blocks evenly across all ranks. The + # returned lists of blocks are randomly sampled (without replacement) + # to yield `sample * len(blocks)` number of blocks. + + # Randomly sample blocks. + def sample_blocks(_blocks: List[Optional[Dict]]) -> List[Optional[Dict]]: + """Sample a random subset of all blocks. + + Args: + _blocks (List[Optional[Dict]]): List of all blocks. + + Returns: + A random subset of the blocks. + """ + n_blocks_sample = int(np.ceil(sample * len(_blocks))) + sampled_blocks: List[Optional[Dict]] = [b for b in _blocks if b is not None] + + np.random.seed(None) + np.random.shuffle(sampled_blocks) + + sampled_blocks = sampled_blocks[:n_blocks_sample] + sampled_blocks += [None] * (n_blocks_sample - len(sampled_blocks)) + + return sampled_blocks + + blocks.existing = sample_blocks(blocks.existing) + blocks.missing = sample_blocks(blocks.missing) + + return blocks + + +class BlockPathMap: + """Map an index to its containing block path. + + The common use for this class is to have a directory of files containing + blocks of processed data, of uniform block size (e.g., 100k samples per + file). Each file must follow a naming convention of 'startIdx-endIdx.[ext]', + where 'endIdx' minus 'startIdx' must equal the block size, with the possible + exception of the final block. Given an input index, this class maps the + index to the containing block file. + + Args: + block_paths (List[str]): List of paths to saved block files. + block_size (int): Max number of samples per block file (e.g., 100000). 
+ """ + + @classmethod + def from_dir(cls, dir: str, block_size: int, ext: str = "hdf5") -> Any: + """Get list of block files, and create map. + + Args: + dir (str): Path to directory containing saved block files. + block_size (int): Max number of samples per block file (e.g., 100000). + ext (str): Block file extension (e.g., 'hdf5'). + + Returns: + A mapping of sample index to block file path. + """ + assert os.path.isdir(dir), f"directory not found, '{dir}'." + return cls(sorted(glob.glob(dir + f"/*.{ext}")), block_size) + + def __init__(self, block_paths: List[str], block_size: int): + self.max_idx = 0 + self.block_path_map = {} + for block_path in block_paths: + name = os.path.splitext(os.path.basename(block_path))[0] + start_idx, end_idx = [int(i) for i in name.split("-")] + self.block_path_map[start_idx] = block_path + self.max_idx = max(self.max_idx, end_idx) + self.block_size = block_size + + def __str__(self) -> str: + """Stringify the mapping. + + Returns: + A string representation of this block path map. + """ + return "%d paths" % len(self.block_path_map) + + def __getitem__(self, idx: int) -> str: + """Get block path from index. + + Args: + idx (int): Index of sample. + + Returns: + The path to the block file containing the sample index. + """ + block_start_idx = self.block_size * (idx // self.block_size) + block_path = self.block_path_map[block_start_idx] + return block_path diff --git a/megatron/core/datasets/t5_dataset.py b/megatron/core/datasets/t5_dataset.py new file mode 100644 index 0000000000..b54e4f5315 --- /dev/null +++ b/megatron/core/datasets/t5_dataset.py @@ -0,0 +1,228 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +from collections import deque +from dataclasses import dataclass, field +from typing import Dict, List, Optional, Union + +import numpy + +from megatron.core.datasets.indexed_dataset import IndexedDataset +from megatron.core.datasets.masked_dataset import ( + MaskedWordPieceDataset, + MaskedWordPieceDatasetConfig, +) +from megatron.core.datasets.utils import Split + + +@dataclass +class T5MaskedWordPieceDatasetConfig(MaskedWordPieceDatasetConfig): + """Configuration object for Megatron Core T5 WordPiece datasets + + NB: As a temporary holdover from Megatron-LM. The T5 tokenizer has an attribute which defines + a number of special sentinel tokens used during sampling. The assert in __post_init__ serves to + preserve compatibility with Megatron-LM until the T5 tokenizer is in Megatron Core. + """ + + sequence_length_encoder: Optional[int] = field(init=False, default=None) + """A sequence_length alias and the sequence length for the encoder""" + + sequence_length_decoder: int = None + """The sequence length for the decoder""" + + def __post_init__(self) -> None: + """Do asserts and set fields post init""" + super().__post_init__() + + self.sequence_length_encoder = self.sequence_length + + assert self.sequence_length_encoder is not None + assert self.sequence_length_decoder is not None + + assert len(self.tokenizer.additional_special_tokens_ids) > 0 + + +class T5MaskedWordPieceDataset(MaskedWordPieceDataset): + """The T5 dataset that assumes WordPiece tokenization + + Args: + indexed_dataset (IndexedDataset): The IndexedDataset around which to build the MegatronDataset + + dataset_path (str): The real path on disk to the dataset, for bookkeeping + + indexed_indices (numpy.ndarray): The set of the documents indices to expose + + num_samples (Optional[int]): The number of samples to draw from the indexed dataset. 
When None, build as many samples as correspond to one epoch. + + index_split (Split): The indexed_indices Split + + config (T5MaskedWordPieceDatasetConfig): The config + """ + + def __init__( + self, + indexed_dataset: IndexedDataset, + dataset_path: str, + indexed_indices: numpy.ndarray, + num_samples: Optional[int], + index_split: Split, + config: T5MaskedWordPieceDatasetConfig, + ) -> None: + super().__init__( + indexed_dataset, dataset_path, indexed_indices, num_samples, index_split, config + ) + + self.token_lookup = list(self.config.tokenizer.inv_vocab.keys()) + # Account for the single and single token ids + self.sample_index = self._build_sample_index(self.config.sequence_length - 2, 1) + + @staticmethod + def _key_config_attributes() -> List[str]: + """Inherited method implementation + + Returns: + List[str]: The key config attributes + """ + return super( + T5MaskedWordPieceDataset, T5MaskedWordPieceDataset + )._key_config_attributes() + ["sequence_length_decoder"] + + def __getitem__(self, idx: int) -> Dict[str, Union[int, numpy.ndarray]]: + """Abstract method implementation + + Args: + idx (int): The index into the dataset + + Returns: + Dict[str, Union[int, numpy.ndarray]]: The + """ + idx_beg, idx_end, target_sequence_length = self.sample_index[idx] + sample = [self.dataset[i] for i in range(idx_beg, idx_end)] + + numpy_random_state = numpy.random.RandomState(seed=(self.config.random_seed + idx) % 2**32) + + assert target_sequence_length <= self.config.sequence_length + + # Flatten the sample into a list of tokens + tokens = [token for sentence in sample for token in sentence] + + # Truncate the list of tokens to a desired length + truncated = len(tokens) > target_sequence_length + tokens = tokens[:target_sequence_length] + + # Masking + (tokens, _, _, _, masked_spans) = self._create_masked_lm_predictions( + tokens, target_sequence_length, numpy_random_state + ) + + # Prepare the encoder input and decoder input and output + sentinels = deque(self.config.tokenizer.additional_special_tokens_ids) + encoder_input = [] + decoder_input = [self.config.tokenizer.bos] + decoder_output = [] + idx_beg = 0 + for indices, labels in masked_spans: + sentinel = sentinels.popleft() + + # set the end index + idx_end = indices[0] + + encoder_input.extend(tokens[idx_beg:idx_end]) + encoder_input.append(sentinel) + + decoder_input.append(sentinel) + decoder_input.extend(labels) + + decoder_output.append(sentinel) + decoder_output.extend(labels) + + # set the start index + idx_beg = indices[-1] + 1 + + encoder_input.extend(tokens[idx_beg:]) + decoder_output.append(self.config.tokenizer.eos) + + # Pad the sequences and convert to NumPy + length_toks_encoder = len(encoder_input) + length_toks_decoder = len(decoder_input) + length_pads_encoder = self.config.sequence_length_encoder - length_toks_encoder + length_pads_decoder = self.config.sequence_length_decoder - length_toks_decoder + assert length_pads_encoder >= 0 + assert length_pads_decoder >= 0 + + encoder_input = numpy.array(encoder_input, dtype=numpy.int64) + encoder_input = numpy.pad( + encoder_input, (0, length_pads_encoder), constant_values=self.config.tokenizer.pad + ) + + decoder_input = numpy.array(decoder_input, dtype=numpy.int64) + decoder_input = numpy.pad( + decoder_input, (0, length_pads_decoder), constant_values=self.config.tokenizer.pad + ) + + # Create attention and history masks + mask_encoder = self._make_attention_mask(encoder_input, encoder_input) + mask_encoder_decoder = self._make_attention_mask(decoder_input, 
encoder_input) + mask_decoder = self._make_attention_mask(decoder_input, decoder_input) + mask_decoder = mask_decoder * self._make_history_mask(decoder_input) + + # Mask the labels + decoder_output = numpy.array(decoder_output, dtype=numpy.int64) + decoder_output = numpy.pad(decoder_output, (0, length_pads_decoder), constant_values=-1) + + # Get the loss mask + loss_mask = numpy.zeros(self.config.sequence_length_decoder, dtype=numpy.int64) + loss_mask[:length_toks_decoder] = 1 + + return { + "text_enc": encoder_input, + "text_dec": decoder_input, + "labels": decoder_output, + "loss_mask": loss_mask, + "truncated": int(truncated), + "enc_mask": mask_encoder, + "dec_mask": mask_decoder, + "enc_dec_mask": mask_encoder_decoder, + } + + @staticmethod + def _make_attention_mask( + source_block: numpy.ndarray, target_block: numpy.ndarray + ) -> numpy.ndarray: + """Return a 2-D attention mask + + Args: + source_block (numpy.ndarray): A 1-D array + target_block (numpy.ndarray): A 1-D array + + Returns: + numpy.ndarray: The 2-D attention mask + """ + mask = (target_block[None, :] >= 1) * (source_block[:, None] >= 1) + return mask.astype(numpy.int64) + + @staticmethod + def _make_history_mask(block: numpy.ndarray) -> numpy.ndarray: + """Return a 2-D history (lower-left-triangular) mask + + Args: + block (numpy.ndarray): A 1-D array + + Returns: + numpy.ndarray: The 2-D history (lower-left-triangular) mask + """ + arange = numpy.arange(block.shape[0]) + mask = arange[None,] <= arange[:, None] + return mask.astype(numpy.int64) + + def _get_token_mask(self, numpy_random_state: numpy.random.RandomState) -> int: + """Abstract method implementation + + 100% of the time, replace the token id with mask token id. + + Args: + numpy_random_state (RandomState): The NumPy random state + + Returns: + int: The mask token id + """ + return self.config.tokenizer.mask diff --git a/megatron/core/datasets/utils.py b/megatron/core/datasets/utils.py new file mode 100644 index 0000000000..8d887d4a4a --- /dev/null +++ b/megatron/core/datasets/utils.py @@ -0,0 +1,87 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +import logging +from enum import Enum +from typing import List, Optional, Tuple + +import numpy +import torch + +from ..utils import log_single_rank + +logger = logging.getLogger(__name__) + + +class Split(Enum): + train = 0 + valid = 1 + test = 2 + + +def compile_helpers(): + """Compile C++ helper functions at runtime. Make sure this is invoked on a single process.""" + import os + import subprocess + + command = ["make", "-C", os.path.abspath(os.path.dirname(__file__))] + if subprocess.run(command).returncode != 0: + import sys + + log_single_rank(logger, logging.ERROR, "Failed to compile the C++ dataset helper functions") + sys.exit(1) + + +def normalize(weights: List[float]) -> List[float]: + """Do non-exponentiated normalization + + Args: + weights (List[float]): The weights + + Returns: + List[float]: The normalized weights + """ + w = numpy.array(weights, dtype=numpy.float64) + w_sum = numpy.sum(w) + w = (w / w_sum).tolist() + return w + + +def get_blend_from_list( + blend: Optional[List[str]], +) -> Optional[Tuple[List[str], Optional[List[float]]]]: + """Get the megatron.core.datasets.blended_megatron_dataset_config.BlendedMegatronDatasetConfig blend from the blend list + + Args: + blend (Optional[List[str]]): The blend list, which can be either (1) a list of prefixes, e.g. 
["path/to/dataset_1_prefix", "path/to/dataset_2_prefix"], or (2) a flattened, zipped list of weights and prefixes, e.g. ["30", "path/to/dataset_1_prefix", "70", "path/to/dataset_2_prefix"] + + Returns: + Optional[Tuple[List[str], Optional[List[float]]]]: The blend, consisting of a list of dataset prefixes and optionally a list of dataset weights, e.g. [["path/to/dataset_1_prefix", "path/to/dataset_2_prefix"], [30.0, 70.0]]. + """ + if blend is None: + return None + + if len(blend) % 2 == 1: + weight_per_dataset = None + raw_prefix_per_dataset = blend + else: + raw_weight_per_dataset, raw_prefix_per_dataset = zip( + *[(blend[i], blend[i + 1]) for i in range(0, len(blend), 2)] + ) + + weight_per_dataset = [] + for rwpd in raw_weight_per_dataset: + try: + weight = float(rwpd) + except ValueError: + weight = None + weight_per_dataset.append(weight) + + is_none = map(lambda _: _ is None, weight_per_dataset) + if any(is_none): + assert all(is_none) + weight_per_dataset = None + raw_prefix_per_dataset = blend + + prefix_per_dataset = [rppd.strip() for rppd in raw_prefix_per_dataset] + + return prefix_per_dataset, weight_per_dataset diff --git a/megatron/core/datasets/utils_s3.py b/megatron/core/datasets/utils_s3.py new file mode 100644 index 0000000000..61103b429d --- /dev/null +++ b/megatron/core/datasets/utils_s3.py @@ -0,0 +1,164 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +import os +from typing import Any, Dict, NamedTuple, Protocol, Tuple + +import torch + +try: + import boto3 + import botocore.exceptions as exceptions +except ModuleNotFoundError: + pass + +S3_PREFIX = "s3://" + + +class S3Config(NamedTuple): + """Config when the data (.bin) file and the index (.idx) file are in S3 + + TODO: These parameters are few and can be consolidated with parameters specific to bin reader + classes - @jkamalu + + Attributes: + + path_to_idx_cache (str): The local directory where we will store the index (.idx) file + + bin_chunk_nbytes (int): If the number of bytes is too small, then we send a request to S3 at each call of the `read` method in _S3BinReader, which is slow, because each request has a fixed cost independent of the size of the byte range requested. If the number of bytes is too large, then we only rarely have to send requests to S3, but it takes a lot of time to complete the request when we do, which can block training. We've found that 256 * 1024 * 1024 (i.e., 256 MiB) has worked well (though we have not put that much effort into tuning it), so we default to it. + """ + + path_to_idx_cache: str + + bin_chunk_nbytes: int = 256 * 1024 * 1024 + + +class S3Client(Protocol): + """The protocol which all s3 clients should abide by""" + + def download_file(self, Bucket: str, Key: str, Filename: str) -> None: ... + + def upload_file(self, Filename: str, Bucket: str, Key: str) -> None: ... + + def head_object(self, Bucket: str, Key: str) -> Dict[str, Any]: ... + + def get_object(self, Bucket: str, Key: str, Range: str) -> Dict[str, Any]: ... + + def close(self) -> None: ... + + +def is_s3_path(path: str) -> bool: + """Ascertain whether a path is in S3 + + Args: + path (str): The path + + Returns: + bool: True if the path is in S3, False otherwise + """ + return path.startswith(S3_PREFIX) + + +def parse_s3_path(path: str) -> Tuple[str, str]: + """Parses the given S3 path returning correspsonding bucket and key. 
+
+    Args:
+        path (str): The S3 path
+
+    Returns:
+        Tuple[str, str]: A (bucket, key) tuple
+    """
+    assert is_s3_path(path)
+    parts = path.replace(S3_PREFIX, "").split("/")
+    bucket = parts[0]
+    if len(parts) > 1:
+        key = "/".join(parts[1:])
+        assert S3_PREFIX + bucket + "/" + key == path
+    else:
+        key = ""
+    return bucket, key
+
+
+def object_exists(client: S3Client, path: str) -> bool:
+    """Ascertain whether the object at the given S3 path exists in S3
+
+    Args:
+        client (S3Client): The S3 client
+
+        path (str): The S3 path
+
+    Raises:
+        botocore.exceptions.ClientError: If the error code is not 404
+
+    Returns:
+        bool: True if the object exists in S3, False otherwise
+    """
+    parsed_s3_path = parse_s3_path(path)
+    try:
+        client.head_object(Bucket=parsed_s3_path[0], Key=parsed_s3_path[1])
+    except exceptions.ClientError as e:
+        if e.response["Error"]["Code"] != "404":
+            raise e
+        return False
+
+    return True
+
+
+def _download_file(client: S3Client, s3_path: str, local_path: str) -> None:
+    """Download the object at the given S3 path to the given local file system path
+
+    Args:
+        client (S3Client): The S3 client
+
+        s3_path (str): The S3 source path
+
+        local_path (str): The local destination path
+    """
+    dirname = os.path.dirname(local_path)
+    os.makedirs(dirname, exist_ok=True)
+    parsed_s3_path = parse_s3_path(s3_path)
+    client.download_file(parsed_s3_path[0], parsed_s3_path[1], local_path)
+
+
+def maybe_download_file(s3_path: str, local_path: str) -> None:
+    """Download the object at the given S3 path to the given local file system path
+
+    In a distributed setting, downloading the S3 object proceeds in stages in order
+    to try to have the minimum number of processes download the object in order for
+    all the ranks to have access to the downloaded object.
+
+    Args:
+        s3_path (str): The S3 source path
+
+        local_path (str): The local destination path
+    """
+
+    if torch.distributed.is_initialized():
+        rank = torch.distributed.get_rank()
+        local_rank = rank % torch.cuda.device_count()
+    else:
+        rank = 0
+        local_rank = 0
+
+    s3_client = boto3.client("s3")
+
+    if (not os.path.exists(local_path)) and (rank == 0):
+        _download_file(s3_client, s3_path, local_path)
+
+    if torch.distributed.is_initialized():
+        torch.distributed.barrier()
+
+    # If the `local_path` is in a file system that is not
+    # shared across all the ranks, then we assume it's in the
+    # host file system and each host needs to download the file.
+    if (not os.path.exists(local_path)) and (local_rank == 0):
+        _download_file(s3_client, s3_path, local_path)
+
+    if torch.distributed.is_initialized():
+        torch.distributed.barrier()
+
+    # If the `local_path` still does not exist, then we assume
+    # each rank needs to download to a separate location.
+    if not os.path.exists(local_path):
+        _download_file(s3_client, s3_path, local_path)
+
+    if torch.distributed.is_initialized():
+        torch.distributed.barrier()
+
+    assert os.path.exists(local_path)
diff --git a/megatron/core/dist_checkpointing/__init__.py b/megatron/core/dist_checkpointing/__init__.py
new file mode 100644
index 0000000000..a065b5f36a
--- /dev/null
+++ b/megatron/core/dist_checkpointing/__init__.py
@@ -0,0 +1,11 @@
+# Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
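+# Example usage of the public API re-exported below (a minimal sketch: `model` is
+# assumed to expose `sharded_state_dict()`, and `ckpt_dir` is a placeholder path):
+#
+#     from megatron.core import dist_checkpointing
+#
+#     # Save: every rank passes its sharded state dict and a common directory.
+#     dist_checkpointing.save(model.sharded_state_dict(), ckpt_dir)
+#
+#     # Load: the sharded state dict acts as a template describing which shards this
+#     # rank needs; the result is a regular state dict of plain tensors.
+#     state_dict = dist_checkpointing.load(model.sharded_state_dict(), ckpt_dir)
+#     model.load_state_dict(state_dict)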
+ +from .core import check_is_distributed_checkpoint +from .mapping import LocalNonpersistentObject, LocalNonpersitentObject, ShardedTensor +from .serialization import ( + load, + load_common_state_dict, + load_plain_tensors, + load_tensors_metadata, + save, +) diff --git a/megatron/core/dist_checkpointing/core.py b/megatron/core/dist_checkpointing/core.py new file mode 100644 index 0000000000..af6ebff6ec --- /dev/null +++ b/megatron/core/dist_checkpointing/core.py @@ -0,0 +1,77 @@ +# Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. + +""" Module for managing distributed checkpoints metadata. """ + +import json +from dataclasses import asdict, dataclass +from pathlib import Path +from typing import Optional + +CONFIG_FNAME = 'metadata.json' + + +class CheckpointingException(Exception): + """Base checkpointing related exception""" + + pass + + +@dataclass +class CheckpointingConfig: + """Documents backends used in the checkpoint. + + Checkpoint config keeps track of formats used for storing the sharded tensors + (sharded_backend) and other objects (common_backend). + + Note that versioning is not for the checkpoint content (which is application specific), + but for the checkpoint format itself. + """ + + sharded_backend: str + sharded_backend_version: int = 1 + common_backend: str = 'torch' + common_backend_version: int = 1 + + +def check_is_distributed_checkpoint(checkpoint_dir): + """Checks if `metadata.json` exists in the checkpoint and is a valid config. + + Args: + checkpoint_dir: checkpoint directory + + Returns: + bool: True if `metadata.json` exists in the checkpoint and is a valid config. + """ + return maybe_load_config(checkpoint_dir) is not None + + +def maybe_load_config(checkpoint_dir: str) -> Optional[CheckpointingConfig]: + """Returns checkpoint config if `checkpoint_dir` is a distributed checkpoint and None otherwise + + Args: + checkpoint_dir: checkpoint directory + + Returns: + CheckpointingConfig (optional): None if checkpoint is not a valid distributed checkpoint + """ + config_path = Path(checkpoint_dir, CONFIG_FNAME) + if not config_path.exists(): + return None + with config_path.open() as f: + config_dict = json.load(f) + return CheckpointingConfig(**config_dict) + + +def save_config(config: CheckpointingConfig, checkpoint_dir: str): + """Save given config to checkpoint directory. + + Args: + config: checkpoint config + checkpoint_dir: checkpoint directory + + Returns: + None + """ + config_path = Path(checkpoint_dir, CONFIG_FNAME) + with config_path.open('w') as f: + json.dump(asdict(config), f) diff --git a/megatron/core/dist_checkpointing/dict_utils.py b/megatron/core/dist_checkpointing/dict_utils.py new file mode 100644 index 0000000000..438925112c --- /dev/null +++ b/megatron/core/dist_checkpointing/dict_utils.py @@ -0,0 +1,245 @@ +# Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. + +""" Utilities for operating with dicts and lists. + +All functions in this module handle nesting of dicts and lists. +Other objects (e.g. tuples) are treated as atomic leaf types that cannot be traversed. +""" + +from collections import defaultdict +from typing import Any, Callable, Dict, Iterable, List, Tuple, TypeVar, Union + +import numpy as np +import torch + +U, V = TypeVar("U"), TypeVar("V") + + +def extract_matching_values( + x: Union[dict, list], predicate: Callable[[Any], bool], return_lists_as_dicts: bool = False +) -> Tuple[Union[dict, list], Union[dict, list]]: + """Return matching and nonmatching values. Keeps hierarchy. 
+ + Args: + x (Union[dict, list]) : state dict to process. Top-level argument must be a dict or list + predicate (object -> bool): determines matching values + return_lists_as_dicts (bool): if True, matching lists will be turned + into dicts, with keys indicating the indices of original elements. + Useful for reconstructing the original hierarchy. + """ + + def _set_elem(target, k, v): + if return_lists_as_dicts: + target[k] = v + else: + target.append(v) + + if isinstance(x, dict): + matching_vals = {} + nonmatching_vals = {} + for k, v in x.items(): + if isinstance(v, (list, dict)): + match, nonmatch = extract_matching_values(v, predicate, return_lists_as_dicts) + if match: + matching_vals[k] = match + if nonmatch or not v: + nonmatching_vals[k] = nonmatch + elif predicate(v): + matching_vals[k] = v + else: + nonmatching_vals[k] = v + elif isinstance(x, list): # type: ignore + matching_vals = {} if return_lists_as_dicts else [] + nonmatching_vals = {} if return_lists_as_dicts else [] + for ind, v in enumerate(x): + if isinstance(v, (list, dict)) and v: + match, nonmatch = extract_matching_values(v, predicate, return_lists_as_dicts) + if match: + _set_elem(matching_vals, ind, match) + if nonmatch or not v: + _set_elem(nonmatching_vals, ind, nonmatch) + else: + target = matching_vals if predicate(v) else nonmatching_vals + _set_elem(target, ind, v) + else: + raise ValueError(f'Unexpected top-level object type: {type(x)}') + return matching_vals, nonmatching_vals + + +def diff(x1: Any, x2: Any, prefix: Tuple = ()) -> Tuple[list, list, list]: + """Recursive diff of dicts. + + Args: + x1 (object): left dict + x2 (object): right dict + prefix (tuple): tracks recursive calls. Used for reporting differing keys. + + Returns: + Tuple[list, list, list]: tuple of: + - only_left: Prefixes present only in left dict + - only_right: Prefixes present only in right dict + - mismatch: values present in both dicts but not equal across dicts. + For tensors equality of all elems is checked. + Each element is a tuple (prefix, type of left value, type of right value). 
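+
+    Example (illustrative; a single differing value under key 'b'):
+        >>> diff({'a': 1, 'b': 2}, {'a': 1, 'b': 3})
+        ([], [], [(('b',), <class 'int'>, <class 'int'>)])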
+    """
+    mismatch = []
+    if isinstance(x1, dict) and isinstance(x2, dict):
+        only_left = [prefix + (k,) for k in x1.keys() - x2.keys()]
+        only_right = [prefix + (k,) for k in x2.keys() - x1.keys()]
+        for k in x2.keys() & x1.keys():
+            _left, _right, _mismatch = diff(x1[k], x2[k], prefix + (k,))
+            only_left.extend(_left)
+            only_right.extend(_right)
+            mismatch.extend(_mismatch)
+    elif isinstance(x1, list) or isinstance(x1, tuple) or isinstance(x1, np.ndarray):
+        assert type(x1) == type(x2)
+        only_left = list(range(len(x1) - 1, len(x2) - 1, -1))
+        only_right = list(range(len(x2) - 1, len(x1) - 1, -1))
+        for i, (v1, v2) in enumerate(zip(x1, x2)):
+            _left, _right, _mismatch = diff(v1, v2, prefix + (i,))
+            only_left.extend(_left)
+            only_right.extend(_right)
+            mismatch.extend(_mismatch)
+    else:
+        only_left = []
+        only_right = []
+        if isinstance(x1, torch.Tensor) and isinstance(x2, torch.Tensor):
+            _is_mismatch = not torch.all(x1 == x2)
+        # TODO: change with concrete type that has both replica_id and data attrs
+        elif hasattr(x1, 'replica_id') and hasattr(x2, 'replica_id'):
+            assert type(x1) == type(x2)
+            only_left, only_right, mismatch = diff(
+                x1.data, x2.data, prefix + (type(x1),)
+            )  # type: ignore
+            _is_mismatch = False
+        else:
+            try:
+                _is_mismatch = bool(x1 != x2)
+            except RuntimeError:
+                _is_mismatch = True
+
+        if _is_mismatch:
+            mismatch.append((prefix, type(x1), type(x2)))
+
+    return only_left, only_right, mismatch
+
+
+def inspect_types(x: Any, prefix: Tuple = (), indent: int = 4):
+    """Helper to print types of (nested) dict values."""
+    print_indent = lambda: print(' ' * indent * len(prefix), end='')
+    if isinstance(x, dict):
+        print()
+        for k, v in x.items():
+            print_indent()
+            print(f'> {k}: ', end='')
+            inspect_types(v, prefix + (k,), indent)
+    elif isinstance(x, list):
+        print()
+        for i, v in enumerate(x):
+            print_indent()
+            print(f'- {i}: ', end='')
+            inspect_types(v, prefix + (i,), indent)
+    else:
+        if isinstance(x, torch.Tensor):
+            print(f'Tensor of shape {x.shape}')
+        else:
+            try:
+                x_str = str(x)
+            except:
+                x_str = '<no string repr>'
+            if len(x_str) > 30:
+                x_str = x_str[:30] + '... 
(truncated)' + print(f'[{type(x)}]: {x_str}') + + +def nested_values(x: Union[dict, list]): + """Returns iterator over (nested) values of a given dict or list.""" + x_iter = x.values() if isinstance(x, dict) else x + for v in x_iter: + if isinstance(v, (dict, list)): + yield from nested_values(v) + else: + yield v + + +def nested_items_iter(x: Union[dict, list]): + """Returns iterator over (nested) tuples (container, key, value) of a given dict or list.""" + x_iter = x.items() if isinstance(x, dict) else enumerate(x) + for k, v in x_iter: + if isinstance(v, (dict, list)): + yield from nested_items_iter(v) + else: + yield x, k, v + + +def dict_map(f: Callable, d: dict): + """`map` equivalent for dicts.""" + for sub_d, k, v in nested_items_iter(d): + sub_d[k] = f(v) + + +def dict_map_with_key(f: Callable, d: dict): + """`map` equivalent for dicts with a function that accepts tuple (key, value).""" + for sub_d, k, v in nested_items_iter(d): + sub_d[k] = f(k, v) + + +def dict_list_map_inplace(f: Callable[[U], V], x: Union[Dict, List, U]): + """Maps dicts and lists *in-place* with a given function.""" + if isinstance(x, dict): + for k, v in x.items(): + x[k] = dict_list_map_inplace(f, v) + elif isinstance(x, list): + x[:] = (dict_list_map_inplace(f, v) for v in x) + else: + return f(x) + return x + + +def dict_list_map_outplace(f: Callable[[U], V], x: Union[Dict, List, U]) -> Union[Dict, List, V]: + """Maps dicts and lists *out-of-place* with a given function.""" + if isinstance(x, dict): + return {k: dict_list_map_outplace(f, v) for k, v in x.items()} + elif isinstance(x, list): + return [dict_list_map_outplace(f, v) for v in x] + else: + return f(x) + + +def merge(x1: Union[dict, list], x2: Union[dict, list], key: Tuple[Union[str, int], ...] = ()): + """Merges dicts and lists recursively.""" + if isinstance(x1, dict) and isinstance(x2, dict): + for k, v2 in x2.items(): + if k not in x1: + x1[k] = v2 + else: + x1[k] = merge(x1[k], v2, key=key + (k,)) + elif isinstance(x1, list) and isinstance(x2, list): + if len(x1) != len(x2): + raise ValueError( + f'Cannot merge two lists with different lengths ({len(x1)} and {len(x2)}, ' + f'encountered at level {key})' + ) + for i, v2 in enumerate(x2): + x1[i] = merge(x1[i], v2, key=key + (i,)) + else: + raise ValueError( + f'Duplicate non-dict and non-list values encountered: `{x1}` and `{x2}` ' + f'(at level {key})' + ) + return x1 + + +def map_reduce( + xs: Iterable, + key_fn: Callable = lambda x: x, + value_fn: Callable = lambda x: x, + reduce_fn: Callable = lambda x: x, +) -> dict: + """Simple map-reduce implementation following `more_itertools.map_reduce` interface.""" + res = defaultdict(list) + for x in xs: + res[key_fn(x)].append(value_fn(x)) + for k in res: + res[k] = reduce_fn(res[k]) + return dict(res) diff --git a/megatron/core/dist_checkpointing/exchange_utils.py b/megatron/core/dist_checkpointing/exchange_utils.py new file mode 100644 index 0000000000..2106fe574c --- /dev/null +++ b/megatron/core/dist_checkpointing/exchange_utils.py @@ -0,0 +1,519 @@ +# Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. 
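+# A rough sketch of how the functions below are typically combined during a
+# distributed checkpoint load (variable names here are hypothetical; the actual
+# call sites live elsewhere in the dist_checkpointing package):
+#
+#     distribution = determine_main_replica_uniform_distribution(sharded_state_dict, group)
+#     loaded_tensors = {...}  # shards this rank has read from storage
+#     all_needed = exchange_by_distribution(
+#         loaded_tensors, unloaded_shards, distribution, group, exchange_algo='broadcast'
+#     )
+#
+# 'broadcast' is the default exchange algorithm; 'gather_rounds' and 'gather_object'
+# are the alternatives implemented below.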
+ +"""Utilities for exchanging data between ranks.""" + +import logging +from collections import defaultdict +from functools import reduce +from itertools import zip_longest +from time import time +from typing import Dict, List, NamedTuple, Optional, Set, Tuple, TypeVar, cast + +import numpy as np +import torch + +from .core import CheckpointingException +from .dict_utils import nested_values +from .mapping import ShardedStateDict, ShardedTensor, is_main_replica +from .utils import _sharded_tensor_shard_id, _ShardId + +# TODO: remove TE references once the TE bug is fixed +# Check if Transformer Engine has Float8Tensor class +HAVE_TE_FLOAT8TENSOR = False +try: + from transformer_engine.pytorch.float8_tensor import Float8Tensor + + HAVE_TE_FLOAT8TENSOR = True +except (ImportError, ModuleNotFoundError): + # Float8Tensor not found + pass + + +def is_float8tensor(tensor: torch.Tensor) -> bool: + """Check if a tensor is a Transformer Engine Float8Tensor""" + return HAVE_TE_FLOAT8TENSOR and isinstance(tensor, Float8Tensor) + + +logger = logging.getLogger(__name__) + + +class ShardDistribution(NamedTuple): + """Represents a distribution of ShardedTensors. + + Given distribution is valid only for a specific parallelization group, + which is implicit here (not referenced by this class). + + Args: + main_rank_for_shard (Dict[_ShardId, int]): specifies which rank should hold + the main replica for a given shard + shards_in_this_group (Set[_ShardId]): which shards have a main replica + in this parallelization group + shard_to_metadata (Dict[_ShardId, ShardedTensor]): maps ShardedTensor + identifier to the original ShardedTensor + all_ranks_for_shard (Dict[_ShardId, List[int]]): specifies which ranks + need a given shard in a given parallelization group + + """ + + main_rank_for_shard: Dict[_ShardId, int] + shards_in_this_group: Set[_ShardId] + shard_to_metadata: Dict[_ShardId, ShardedTensor] + all_ranks_for_shard: Dict[_ShardId, List[int]] + + +def _shard_size(sh_ten: ShardedTensor): + """Returns size in bytes of a given sharded tensor.""" + if sh_ten.flattened_range is None: + numel = np.product(sh_ten.local_shape) + else: + numel = sh_ten.flattened_range.stop - sh_ten.flattened_range.start + return numel * torch._utils._element_size(sh_ten.dtype) + + +def _get_empty_tensor_for_exchange( + shard_id: _ShardId, + needed_shards: Dict[_ShardId, ShardedTensor], + unneeded_shards: Dict[_ShardId, ShardedTensor], + loaded_tensors: Dict[_ShardId, torch.Tensor], +) -> Tuple[torch.Tensor, Optional[torch.device]]: + """Determines the empty tensor to use for exchange. + + If shard_id is needed by this rank, it will be in the `unloaded_shards`. 
+ Otherwise, the metadata for this tensor can be found in `shard_to_metadata` + + Args: + shard_id (_ShardId): shard_id that will be exchanged + needed_shards (Dict[_ShardId, ShardedTensor]): mapping from shard ids + to metadata for shards needed by this rank + unneeded_shards (Dict[_ShardId, ShardedTensor]): mapping from shard ids + to metadata for shards that can be discarded after exchange + loaded_tensors (Dict[_ShardId, torch.Tensor]): mapping where useful tensors + are placed in + + Returns: + Tuple[torch.Tensor, Optional[torch.device]]: empty CUDA tensor to be exchanged, + and the device of the original state dict tensor (if there was any) + """ + local_unloaded_sh_ten = needed_shards.get(shard_id) + if local_unloaded_sh_ten is None: + orig_device = None # this tensor will be discarded anyway + sh_ten = unneeded_shards[shard_id] + if sh_ten.data is None: + sh_ten.init_data('cuda') + tensor = sh_ten.data + sh_ten.data = None # won't be used. free memory + else: + tensor = sh_ten.data + if tensor.device.type == 'cpu': + tensor = torch.empty_like(tensor, device='cuda') + else: + local_unloaded_sh_ten.init_data('cuda') + orig_device = local_unloaded_sh_ten.data.device + tensor = local_unloaded_sh_ten.data + if tensor.device.type == 'cpu': + tensor = torch.empty_like(tensor, device='cuda') + loaded_tensors[shard_id] = tensor + return tensor, orig_device + + +T = TypeVar('T') + + +def distribute_shards_to_ranks( + shard_to_ranks: Dict[T, List[int]], shard_to_size: Dict[T, int], num_ranks: int +) -> Dict[T, int]: + """Computes uniform distribution of workload across ranks, based on sizes. + + Currently, the assignment is greedy, based on: + 1. Firstly, the coverage of each shard + (how many ranks the shard is available on; lower coverage is assigned first) + 2. Secondly, the size of each shard (larger size is assigned first) + 3. Finally, shard id for differentiation. + + Third step is added because we rely on the fact that + the assignment is deterministic on all ranks. + + Args: + shard_to_ranks (Dict[T, List[int]]): mapping of rank access to shards + shard_to_size (Dict[T, int]): sizes of each shard + num_ranks (int): number of ranks in the parallelization group + + Returns (Dict[T, int]): assignment of shard to rank (which rank should do the work + to achieve maximal uniformity) + """ + shard_to_ranks = {k: tuple(v) for k, v in shard_to_ranks.items()} + shard_to_saving_rank = {} + rank_sizes = [(0, rank) for rank in range(num_ranks)] + + # start from tensors of lowest coverage, then go by tensor size from largest (hence minus size) + for shard_id, shard_ranks in sorted( + shard_to_ranks.items(), + key=lambda sh_id_ranks: ( + len(sh_id_ranks[1]), + -shard_to_size[sh_id_ranks[0]], + sh_id_ranks[0], + ), + ): + # assign greedily to the least occupied rank + size, rank = min((size, rank) for size, rank in rank_sizes if rank in shard_ranks) + + shard_to_saving_rank[shard_id] = rank + rank_sizes[rank] = (size + shard_to_size[shard_id], rank) + + logger.debug(f'distribute_shards_to_ranks distribution: {rank_sizes}') + + return shard_to_saving_rank + + +def determine_main_replica_uniform_distribution( + sharded_state_dict: ShardedStateDict, + parallelization_group: torch.distributed.ProcessGroup, + ignore_groups: bool = False, +) -> Optional[ShardDistribution]: + """Computes the save distribution. + + Should be used in conjunction with `distribute_main_replicas_with_precomputed_distribution` + which applies the computed save distribution. 
+ + We rely on the fact that the assignment algorithm is deterministic on all ranks, + so there is no extra communication needed after metadata exchange. + + Args: + sharded_state_dict (ShardedStateDict): state dict to compute the distribution of + parallelization_group (ProcessGroup): distribution will be computed + within this process group + ignore_groups (bool, optional): whether the distribution defines groups. + This option is primarily used during loading, as it ensures that all replicas, + including non-main ones, are loaded by this parallelization group + Defaults to False. + + Returns (ShardDistribution, optional): distribution that can be used to apply the + parallelization. Returns None if the process_group is trivial (1 rank) + + """ + group_size = torch.distributed.get_world_size(group=parallelization_group) + if group_size <= 1: + return + local_shards = list( + sh_base + for sh_base in nested_values(sharded_state_dict) + if isinstance(sh_base, ShardedTensor) + ) + local_shards_no_data = [ten.without_data() for ten in local_shards] + + all_shards = [None] * torch.distributed.get_world_size(group=parallelization_group) + torch.distributed.all_gather_object( + all_shards, local_shards_no_data, group=parallelization_group + ) + + shard_to_ranks = defaultdict(list) + shard_to_size = {} + shard_to_metadata = {} + shards_in_this_parallelization_group: Set[_ShardId] = set() + for rank, rank_shards in enumerate(all_shards): + for sh_ten in rank_shards: + shard_id = _sharded_tensor_shard_id(sh_ten) + shard_to_ranks[shard_id].append(rank) + if shard_id not in shard_to_size: + shard_to_size[shard_id] = _shard_size(sh_ten) + shard_to_metadata[shard_id] = sh_ten + if is_main_replica(sh_ten.replica_id) or ignore_groups: + shards_in_this_parallelization_group.add(shard_id) + + shard_to_ranks = { + k: v for k, v in shard_to_ranks.items() if k in shards_in_this_parallelization_group + } + + shard_to_saving_rank = distribute_shards_to_ranks( + shard_to_ranks, shard_to_size, len(all_shards) + ) + + return ShardDistribution( + shard_to_saving_rank, + shards_in_this_parallelization_group, + shard_to_metadata, + shard_to_ranks, + ) + + +@torch.no_grad() +def exchange_loaded_tensors_gather_rounds( + loaded_tensors: Dict[_ShardId, torch.Tensor], + unloaded_shards: Dict[_ShardId, ShardedTensor], + shard_distribution: ShardDistribution = None, + parallelization_group: Optional[torch.distributed.ProcessGroup] = None, +) -> Dict[_ShardId, torch.Tensor]: + """Exchange the tensors loaded by different ranks with several all_gather calls. + + Groups tensors by dtype, divide tensors that will be exchanged into rounds + and execute all_gather for tensors from each round. + + Note: the loading is distributed across ranks based on total loaded size + in bytes, so there is no guarantee that number of rounds needed for each + rank will be similar, which might result in a lot of almost empty + all_gathers. The solution would be to group all tensors into a one + bytes tensor and do a single all_gather (with similarly sized messages). + + Args: + loaded_tensors (Dict[_ShardId, torch.Tensor]): mapping from ShardedTensor + shard ids to tensors already loaded by this rank. + unloaded_shards (Dict[_ShardId, torch.Tensor]): mapping from ShardedTensor + shard ids to ShardedTensors that aren't loaded yet. + shard_distribution (ShardDistribution): distribution of all shards + parallelization_group (ProcessGroup, optional): process group used for load + distribution. 
Tensors will be exchanged within this group + + Returns: + Dict[_ShardId, torch.Tensor]: dictionary mapping shard ids to tensors + needed by this rank to load a given state dict. Includes + previously loaded tensors (from `loaded_tensors` input) + """ + main_rank_for_shard, _, shard_to_metadata, all_ranks_for_shard = shard_distribution + local_rank = torch.distributed.get_rank(group=parallelization_group) + + all_loaded_tensors = dict(loaded_tensors) + + # Group by dtype so that we all_gather tensors of the same dtype + for dtype in sorted(set(map(lambda sh_ten: sh_ten.dtype, shard_to_metadata.values())), key=str): + + start = time() + # shards_by_rank maps rank to tensors loaded by this rank + shards_by_rank: List[List[torch.Tensor]] = [ + [] for _ in range(torch.distributed.get_world_size(group=parallelization_group)) + ] + for shard_id, rank in main_rank_for_shard.items(): + if len(all_ranks_for_shard[shard_id]) == 1: + assert all_ranks_for_shard[shard_id][0] == main_rank_for_shard[shard_id], ( + f'When there is only 1 ranks that needs a given shard,' + f' it should be the loading rank.' + f' Got: needs [{all_ranks_for_shard[shard_id][0]}]' + f' vs loads [{main_rank_for_shard[shard_id]}]' + ) + # Skipping the exchange since only the loading rank needs this tensor + # TODO: we can employ some optimizations even for `len(shard_to_ranks) > 1` + # case, e.g. P2P exchange. Currently handling this case saves most of the + # work though. + continue + if shard_to_metadata[shard_id].dtype == dtype: + shards_by_rank[rank].append(shard_id) + + # Transpose `shards_by_rank` to form exchange rounds + shards_by_round = zip_longest(*shards_by_rank, fillvalue=None) + for round_idx, round_shard_ids in enumerate(shards_by_round): + round_tensors = [] + orig_devices = {} + for rank, shard_id in enumerate(round_shard_ids): + if shard_id is None: + # if no more useful data, the given rank will exchange empty tensor + local_ten = torch.empty(0, dtype=dtype, device='cuda') + orig_device = None + else: + assert isinstance(shard_id, tuple), type(shard_id) + if rank == local_rank: + assert shard_id in all_loaded_tensors, (shard_id, all_loaded_tensors.keys()) + orig_device = all_loaded_tensors[shard_id] + all_loaded_tensors[shard_id] = all_loaded_tensors[shard_id].cuda() + local_ten = all_loaded_tensors[shard_id] + else: + local_ten, orig_device = _get_empty_tensor_for_exchange( + shard_id, unloaded_shards, shard_to_metadata, all_loaded_tensors + ) + # Because of a TE bug, we have to exchange a nominal dtype instead of FP8 + # It's ok to keep the nominal dtype after exchange, because TE will handle + # this during state dict load. 
+ # TODO: remove it once the bug is fixed + if is_float8tensor(local_ten): + local_ten = local_ten.from_float8() + all_loaded_tensors[shard_id] = local_ten + + round_tensors.append(local_ten) + if orig_device is not None: + orig_devices[shard_id] = orig_device + + torch.distributed.all_gather( + list(round_tensors), + round_tensors[local_rank], + group=parallelization_group, + async_op=False, + ) + + # Move tensors back to CPU if originally was on CPU + for shard_id, orig_device in orig_devices.items(): + all_loaded_tensors[shard_id] = all_loaded_tensors[shard_id].to(orig_device) + + del round_tensors # remove tensor references + + end = time() + if torch.distributed.get_rank() == 0: + logger.debug(f'{dtype} exchange rounds all_gather schedule took {end - start}s') + + return all_loaded_tensors + + +def exchange_loaded_tensors_gather_object( + loaded_tensors: Dict[_ShardId, torch.Tensor], + unloaded_shards: Dict[_ShardId, ShardedTensor], + shard_distribution: ShardDistribution, + parallelization_group: Optional[torch.distributed.ProcessGroup] = None, +) -> Dict[_ShardId, torch.Tensor]: + """Exchange the tensors loaded by different ranks with a simple all_gather_object call. + + This version can be used for debugging purposes do to its simplistic + implementation. Shouldn't be used if performance is important. + + Args: + loaded_tensors (Dict[_ShardId, torch.Tensor]): mapping from ShardedTensor + shard ids to tensors already loaded by this rank. + unloaded_shards (Dict[_ShardId, torch.Tensor]): mapping from ShardedTensor + shard ids to ShardedTensors that aren't loaded yet. + shard_distribution (ShardDistribution): distribution of all shards + parallelization_group (ProcessGroup, optional): process group used for load + distribution. Tensors will be exchanged within this group + + Returns: + Dict[_ShardId, torch.Tensor]: dictionary mapping shard ids to tensors + needed by this rank to load a given state dict. Includes + previously loaded tensors (from `loaded_tensors` input) + + """ + all_loaded_tensors_list = [None] * torch.distributed.get_world_size(group=parallelization_group) + torch.distributed.all_gather_object( + all_loaded_tensors_list, loaded_tensors, group=parallelization_group + ) + all_loaded_tensors_list = cast(List[Dict[_ShardId, torch.Tensor]], all_loaded_tensors_list) + all_loaded_tensors = reduce(lambda x, y: {**x, **y}, all_loaded_tensors_list) + + # Error checks + if len(all_loaded_tensors) != sum(map(len, all_loaded_tensors_list)): + err_msg = 'Duplicate shard ids loaded by different ranks' + if torch.distributed.get_rank() == 0: + logger.error( + f'{err_msg}. Shards ids by rank:' + f' {[lt.keys() for lt in all_loaded_tensors_list]}' + ) + raise CheckpointingException(err_msg) + + return all_loaded_tensors + + +@torch.no_grad() +def exchange_loaded_tensors_broadcast( + loaded_tensors: Dict[_ShardId, torch.Tensor], + unloaded_shards: Dict[_ShardId, ShardedTensor], + shard_distribution: ShardDistribution, + parallelization_group: Optional[torch.distributed.ProcessGroup] = None, +) -> Dict[_ShardId, torch.Tensor]: + """Exchange the tensors loaded by different ranks by a series of broadcasts. + + For each rank for each loaded tensor do a broadcast to the whole group. + A reasonable tradeoff in terms of performance and simplicity. + + Args: + loaded_tensors (Dict[_ShardId, torch.Tensor]): mapping from ShardedTensor + shard ids to tensors already loaded by this rank. 
+ unloaded_shards (Dict[_ShardId, ShardedTensor]): mapping from ShardedTensor + shard ids to ShardedTensors that aren't loaded yet. + shard_distribution (ShardDistribution): distribution of all shards + parallelization_group (ProcessGroup, optional): process group used for load + distribution. Tensors will be exchanged within this group + + Returns: + Dict[_ShardId, torch.Tensor]: dictionary mapping shard ids to tensors + needed by this rank to load a given state dict. Includes + previously loaded tensors (from `loaded_tensors` input) + """ + main_rank_for_shard, _, shard_to_metadata, all_ranks_for_shard = shard_distribution + local_rank = torch.distributed.get_rank(group=parallelization_group) + + all_loaded_tensors = dict(loaded_tensors) + + start = time() + + for idx, (shard_id, rank) in enumerate(main_rank_for_shard.items()): + if len(all_ranks_for_shard[shard_id]) == 1: + assert all_ranks_for_shard[shard_id][0] == main_rank_for_shard[shard_id], ( + f'When there is only 1 ranks that needs a given shard,' + f' it should be the loading rank.' + f'Got: needs [{all_ranks_for_shard[shard_id][0]}]' + f' vs loads [{main_rank_for_shard[shard_id]}]' + ) + # Skipping the exchange since only the loading rank needs this tensor + # TODO: we can employ some optimizations even for `len(shard_to_ranks) > 1` case, + # e.g. P2P exchange. Currently handling this case saves most of the work though. + continue + if rank == local_rank: + assert shard_id in all_loaded_tensors, (shard_id, all_loaded_tensors.keys()) + orig_device = all_loaded_tensors[shard_id].device + local_ten = all_loaded_tensors[shard_id].cuda() + else: + local_ten, orig_device = _get_empty_tensor_for_exchange( + shard_id, unloaded_shards, shard_to_metadata, all_loaded_tensors + ) + + # Because of a TE bug, we have to exchange a nominal dtype instead of FP8 + # It's ok to keep the nominal dtype after exchange, because TE will handle + # this during state dict load. + # TODO: remove it once the bug is fixed + if is_float8tensor(local_ten): + local_ten = local_ten.from_float8() + all_loaded_tensors[shard_id] = local_ten + + global_src_rank = ( + rank + if parallelization_group == None + else torch.distributed.get_global_rank(parallelization_group, rank) + ) + # We can do async_op=True only if there is no CPU-copy follow-up + torch.distributed.broadcast( + local_ten, + src=global_src_rank, + group=parallelization_group, + async_op=orig_device is None, + ) + # Move tensor back to CPU if originally was on CPU + if orig_device is not None: + all_loaded_tensors[shard_id] = local_ten.to(orig_device) + del local_ten + + end = time() + if torch.distributed.get_rank() == 0: + logger.debug(f'exchange broadcast schedule took {end - start}s') + + return all_loaded_tensors + + +def exchange_by_distribution( + loaded_tensors: Dict[_ShardId, torch.Tensor], + unloaded_shards: Dict[_ShardId, ShardedTensor], + shard_distribution: ShardDistribution = None, + parallelization_group: Optional[torch.distributed.ProcessGroup] = None, + exchange_algo='broadcast', +) -> Dict[_ShardId, torch.Tensor]: + """Exchange tensors loaded by different ranks using the specified exchange_algo. + + Args: + loaded_tensors (Dict[_ShardId, torch.Tensor]): mapping from ShardedTensor + shard ids to tensors already loaded by this rank. + unloaded_shards (Dict[_ShardId, ShardedTensor]): mapping from ShardedTensor + shard ids to ShardedTensors that aren't loaded yet. 
+ shard_distribution (ShardDistribution): distribution of all shards + parallelization_group (ProcessGroup, optional): process group used for load + distribution. Tensors will be exchanged within this group + exchange_algo (str): The algorithm used for performing exchanges. + Defaults to 'broadcast'. + + Returns: + Dict[_ShardId, torch.Tensor]: dictionary mapping shard ids to tensors + needed by this rank to load a given state dict. Includes + previously loaded tensors (from `loaded_tensors` input) + """ + + if exchange_algo == 'gather_object': + exchange_fn = exchange_loaded_tensors_gather_object + elif exchange_algo == 'gather_rounds': + exchange_fn = exchange_loaded_tensors_gather_rounds + elif exchange_algo == 'broadcast': + exchange_fn = exchange_loaded_tensors_broadcast + else: + raise NotImplementedError(f'Unrecognized gather algorithm: {exchange_algo}') + return exchange_fn(loaded_tensors, unloaded_shards, shard_distribution, parallelization_group) diff --git a/megatron/core/dist_checkpointing/mapping.py b/megatron/core/dist_checkpointing/mapping.py new file mode 100644 index 0000000000..90d4fcdc22 --- /dev/null +++ b/megatron/core/dist_checkpointing/mapping.py @@ -0,0 +1,729 @@ +# Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. + +""" Core library classes for representing sharding of tensors and objects. + +The main expected usage is wrapping torch.Tensors in state dicts with +ShardedTensor class (mostly with the ShardedTensor.from_rank_offsets classmethod). +""" + +import logging +from abc import ABC, abstractmethod +from dataclasses import dataclass, field, replace +from itertools import chain +from typing import Any, Callable, Dict, List, Optional, Tuple, Union + +import numpy as np +import torch + +from .core import CheckpointingException +from .dict_utils import dict_list_map_inplace + +logger = logging.getLogger(__name__) + +# These type definitions are just hints to differentiate a plain model state +# dict (StateDict) from a state dict with tensors replaced with ShardedTensors +# (ShardedStateDict). +StateDict = Dict[str, Any] +ShardedStateDict = Dict[str, Any] +ReplicaId = Union[int, Tuple[int, ...]] + + +class ShardedBase(ABC): + """Base class for ShardedTensor and ShardedStateDict.""" + + key: str + data: object + replica_id: ReplicaId + + @abstractmethod + def validate_metadata_integrity(self): + """Codifies the constraints on metadata attributes.""" + + @abstractmethod + def without_data(self) -> 'ShardedBase': + """Returns a new ShardedBase instance with data=None.""" + raise NotImplementedError + + +@dataclass +class ShardedTensor(ShardedBase): + """Represents a mapping between a local tensor and a global tensor. + + Global tensor is assumed to consist of many local tensors distributed + between different processes. + + Args: + key: unique identifier of a global tensor + data: local tensor data. Can be None only for consistency validation + dtype: tensor dtype + local_shape: local tensor shape + global_shape: global tensor shape + global_offset: offset of a local tensor in a global tensor, + specified in number of tensor elements + axis_fragmentations: global tensor fragmentation of each axis + replica_id: indicates given local tensor's replication wrt. + local tensors in different processes + prepend_axis_num: number of axes prepended to the local tensor to + reflect global tensor shape. The behavior is similar to + unsqueezing the local tensor. 
+ allow_shape_mismatch: if True, during loading, the global shape of + a stored tensor does not have to match the expected global shape. + Useful for representing tensors with flexible shape, + e.g. padded. + flattened_range: specifies a slice that should be applied to a + flattened tensor with `local_shape` in order to get + the tensor stored as `data` + """ + + key: str + data: Optional[torch.Tensor] = field(repr=False) + dtype: torch.dtype + local_shape: Tuple[int, ...] + global_shape: Tuple[int, ...] + global_offset: Tuple[int, ...] + axis_fragmentations: Optional[Tuple[int, ...]] + replica_id: ReplicaId = 0 + prepend_axis_num: int = 0 + allow_shape_mismatch: bool = False + flattened_range: Optional[slice] = None + + def __post_init__(self): + self.validate_metadata_integrity() + + def validate_metadata_integrity(self) -> None: + """Codifies the constraints on metadata attributes. + + Meeting those constraints is guaranteed when instantiating a ShardedTensor + class with `from_rank_offsets` or `from_rank_offsets_flat` constructors. + + Returns: + None + """ + has_flattened_range = self.flattened_range is not None + if self.data is not None: + if self.data.dtype != self.dtype: + raise CheckpointingException( + f'Data dtype should match `dtype` attribute for {self}' + ) + if not has_flattened_range and self.data.shape != self.local_shape: + raise CheckpointingException( + f'Data shape should match `local_shape` attribute for {self}' + ) + if has_flattened_range: + if self.data.ndim != 1: + raise CheckpointingException(f'Data should be 1D for a flattened {self}') + real_data = self.data + try: + self.data = None + self.init_data(device='meta') + if self.data.shape != real_data.shape: + raise CheckpointingException( + f'Data shape doesnt match expected {self.data.shape} for {self}' + ) + finally: + self.data = real_data + + if len(self.global_shape) != len(self.global_offset): + raise CheckpointingException( + f'Global offset dimensions should be equal to global shape dimensions for {self}' + ) + if len(self.local_shape) + self.prepend_axis_num != len(self.global_shape): + raise CheckpointingException( + f'Local shape together with `prepend_axis_num` dimensions should be ' + f'equal to global shape dimensions for {self}' + ) + + for off, sh in zip(self.global_offset[self.prepend_axis_num :], self.local_shape): + if off % sh != 0: + raise CheckpointingException( + f'Global offset ({off}) must be divisible by local shape ({sh}) for {self}.' + ) + + if has_flattened_range and self.flattened_range.step is not None: + raise CheckpointingException( + f'`step` argument in the flattened range of a ShardedTensor is not supported.' + ) + + def global_slice(self) -> Tuple[Union[int, slice], ...]: + """ + Returns a tuple of int and slice objects representing a slice of the + global tensor that this ShardedTensor corresponds to. + """ + assert len(self.global_offset) == len(self.local_shape) + self.prepend_axis_num + return tuple( + chain( + (off for off in self.global_offset[: self.prepend_axis_num]), + ( + slice(off, off + sh) + for off, sh in zip( + self.global_offset[self.prepend_axis_num :], self.local_shape + ) + ), + ) + ) + + def global_coordinates(self) -> Tuple[np.ndarray, ...]: + """ + Returns a tuple of np.ndarrays representing the coordinates of the global tensor + that this ShardedTensor corresponds to. 
+ """ + if self.flattened_range is None: + raise CheckpointingException( + f'`global_coordinates` is undefined for' + f' {self.__class__.__name__} without `flattened_range`' + ) + + local_coords = self.local_coordinates() + assert len(local_coords) + self.prepend_axis_num == len(self.global_offset), ( + len(local_coords), + self, + ) + global_coords = tuple( + c + off + for c, off in zip((0,) * self.prepend_axis_num + local_coords, self.global_offset) + ) + return global_coords + + def local_coordinates(self) -> Tuple[np.ndarray, ...]: + """ + Returns a tuple of np.ndarrays representing the coordinates of the local tensor + that this ShardedTensor corresponds to. + """ + if self.flattened_range is None: + raise CheckpointingException( + f'`local_coordinates` is undefined for' + f' {self.__class__.__name__} without `flattened_range`' + ) + + # TODO: np.unravel_index? + mask = np.zeros(np.product(self.local_shape), dtype=bool) + mask[self.flattened_range] = True + return np.nonzero(mask.reshape(self.local_shape)) + + def local_chunk_offset_in_global(self) -> Tuple[int, ...]: + """Offset of a local chunk in a global array of chunks. + + Returns: + Tuple[int, ...]: the offset of the whole local chunk in a global array of chunks. + """ + assert len(self.global_offset) == len(self.local_shape) + self.prepend_axis_num + chunk_offset = list(self.global_offset[: self.prepend_axis_num]) + for off, sh in zip(self.global_offset[self.prepend_axis_num :], self.local_shape): + assert off % sh == 0, str(self) + chunk_offset.append(off // sh) + return tuple(chunk_offset) + + def max_allowed_chunks(self) -> Tuple[int, ...]: + """ + Returns the maximum allowed chunks for this ShardedTensor. + """ + chunks = [] + for axis_sh, axis_fragm in zip(self.global_shape, self.axis_fragmentations): + if not self.allow_shape_mismatch and axis_sh % axis_fragm != 0: + raise CheckpointingException( + f'Axis shape ({axis_sh}) not divisible by axis fragmentation ({axis_fragm}' + ) + axis_chunk_size = axis_sh // axis_fragm + chunks.append(axis_chunk_size) + return tuple(chunks) + + def without_data(self): + return replace(self, data=None) + + @classmethod + def from_rank_offsets( + cls, + key: str, + data: torch.Tensor, + *rank_offsets: Tuple[int, int, int], + replica_id: ReplicaId = 0, + prepend_axis_num: int = 0, + flattened_range: None = None, + **init_kwargs, + ): + """Allows to construct the ShardedTensor given offset specified in process ranks. + + Args: + key (str): unique key + data (torch.Tensor): local tensor data + rank_offsets (Tuple[int, int, int]): each tuple + (axis, axis_rank_offset, axis_fragm) says that if + global tensor is divided into `axis_fragm` fragment along `axis` + axis, then local tensor data corresponds to the `axis_rank_offset` chunk. + replica_id (ReplicaId): see ShardedTensor + prepend_axis_num (int): see ShardedTensor + flattened_range (None): must be None when using this constructor + init_kwargs: passed to ShardedTensor.__init__ + """ + if flattened_range is not None: + raise ValueError( + 'Cannot instantiate a flat ShardedTensor with `from_rank_offsets` method.' 
+ ' Use `from_rank_offsets_flat` instead' + ) + global_offset = [0] * (data.ndim + prepend_axis_num) + global_shape = ([1] * prepend_axis_num) + list(data.shape) + axis_fragmentations = [1] * (data.ndim + prepend_axis_num) + _seen_axis = set() + for axis, axis_rank_offset, axis_fragm in rank_offsets: + assert axis >= 0 and axis_rank_offset >= 0 and axis_fragm >= 0, ( + axis, + axis_rank_offset, + axis_fragm, + ) + assert ( + axis_rank_offset < axis_fragm + ), 'Rank offset must be lower than axis fragmentation' + if axis in _seen_axis: + raise CheckpointingException('Duplicated axis specified') + _seen_axis.add(axis) + + local_axis_shape = 1 if axis < prepend_axis_num else data.shape[axis - prepend_axis_num] + global_shape[axis] = axis_fragm * local_axis_shape + global_offset[axis] = axis_rank_offset * local_axis_shape + axis_fragmentations[axis] = axis_fragm + + return cls( + key, + data, + data.dtype, + tuple(data.shape), + tuple(global_shape), + tuple(global_offset), + tuple(axis_fragmentations), + replica_id, + prepend_axis_num, + flattened_range=flattened_range, + **init_kwargs, + ) + + @classmethod + def from_rank_offsets_flat( + cls, + key: str, + data: torch.Tensor, + non_flat_local_shape: Tuple[int, ...], + *args, + flattened_range: Optional[slice] = None, + **kwargs, + ): + """Allows to construct a *flattened* ShardedTensor given offset specified in process ranks. + + Args: + key (str): + data (torch.Tensor): this should be a flattened data tensor + non_flat_local_shape (Tuple[int, ...]): expected local shape of a non-flat chunk + *args: passed unchanged to the `from_rank_offsets` constructor + flattened_range (slice): see ShardedTensor. Defaults to None, but must be set to + a non-None slice. + **kwargs: + + Returns: + ShardedTensor: constructed ShardedTensor instance + """ + if flattened_range is None: + raise CheckpointingException( + 'Cannot instantiate a non-flat ShardedTensor with `from_rank_offsets_flat` method.' + ' Use `from_rank_offsets` instead' + ) + if data.ndim != 1: + raise CheckpointingException( + f'Flattened ShardedTensor requires 1D data, got shape: {data.shape}' + ) + if flattened_range.stop - flattened_range.start != data.numel(): + raise CheckpointingException( + f'Flattened ShardedTensor data length ({data.numel()}) must meet the ' + f'slice length: {flattened_range.stop - flattened_range.start}' + ) + + non_flat_data_meta = torch.empty(*non_flat_local_shape, dtype=data.dtype, device='meta') + sh_ten = cls.from_rank_offsets(key, non_flat_data_meta, *args, **kwargs) + instance = replace(sh_ten, data=data, flattened_range=flattened_range) + instance.validate_metadata_integrity() + return instance + + def init_data(self, device: Union[str, torch.device], init_fn=torch.empty): + """ + Initialize the tensor data of this ShardedTensor. + + Only called if `data` attribute is None. + + Args: + device (Union[str, torch.device]): device to place the tensor on + init_fn (Callable, optional): function to use to initialize the tensor. + Defaults to `torch.empty`. + """ + if self.data is not None: + return + self.data = init_fn(self.local_shape, dtype=self.dtype, device=device) + if self.flattened_range is not None: + self.data = self.data.flatten()[self.flattened_range.start : self.flattened_range.stop] + + def narrow(self, dim: int, start: int, length: int) -> List['ShardedTensor']: + """This is an analogue of torch.narrow for ShardedTensors. + + Narrowing assumes that we narrow a local tensor on each rank. 
+ This has consequences on local_shape, global_shape, global_offset, etc. + + Args: + dim (int): dimension to narrow. Doesn't include prepended axes. + start (int): start element + length (int): length of the slice + + Returns: + List[ShardedTensor]: narrowed ShardedTensors. For non-flat tensors, + the list will always have 1 element. For flat ShardedTensors the number of + elements varies depending on `dim` and on overlap, because flat + tensors must be contiguous. In particular the list can be empty. + """ + prepended_dim = dim + self.prepend_axis_num + local_length_along_dim = self.local_shape[dim] + + def _update_tuple(x, ind, val): + x = list(x) + x[ind] = val + return tuple(x) + + def _safe_div(x, y): + assert x % y == 0, (x, y) + return x // y + + # Decrease global shape and global offset by `length / local_length_along_dim` + assert ( + self.global_shape[prepended_dim] % local_length_along_dim == 0 + ), f'Only regular grid of local tensors is supported for narrowing, got: {self}' + assert ( + self.global_offset[prepended_dim] % local_length_along_dim == 0 + ), f'Only regular grid of local tensors is supported for narrowing, got: {self}' + global_shape = _update_tuple( + self.global_shape, + prepended_dim, + _safe_div(self.global_shape[prepended_dim] * length, local_length_along_dim), + ) + global_offset = _update_tuple( + self.global_offset, + prepended_dim, + _safe_div(self.global_offset[prepended_dim] * length, local_length_along_dim), + ) + + if self.flattened_range is None: + new_data = self.data.narrow(dim, start, length) + # always a single result tensor + return [ + replace( + self, + data=new_data, + local_shape=new_data.shape, + global_shape=global_shape, + global_offset=global_offset, + ) + ] + else: + if dim != 0: + raise CheckpointingException( + f'Narrowing along the first axis is supported for now only, got dim={dim}' + ) + + # If dim=0, we will always get 0 or 1 resulting tensor. + # If dim>1, in general there can be more result tensors (e.g. max 3 for dim=1) + + # For on original flat ShardedTensor of local shape [3, 4] and + # flattened_range=slice(5, 10), + # the X signs mark the actual (flat) data in `self.data` + # notice 12 (3*4) total "virtual" elements, out of which 5 is actual data. + # flat original: [.....XXXXX..] + + # If we narrow to start=1, length=1 in the original local shape dimensions, + # the overlapping flat slice would be: + # narrow to: [....XXXX....] + # flat overlap: [.....XXX....] 
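+            # (Illustrative note, not part of the original comments: for dim > 0 the
+            # overlap between the narrow window and the stored flat slice is in general
+            # non-contiguous, so it cannot be expressed as a single `flattened_range`;
+            # this is why only dim == 0 is handled in this branch.)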
+ + # Now `data` is flattened and sliced, so we must compute local_shape manually + local_shape = _update_tuple(self.local_shape, dim, length) + other_dims_volume = np.prod( + _update_tuple(local_shape, dim, 1) + ) # 4 in the example above + volume_before_split = other_dims_volume * start # 4 in the example above + volume_of_split = other_dims_volume * length # 4 in the example above + + flat_slice_start_shifted = ( + self.flattened_range.start - volume_before_split + ) # 5 - 4 = 1 in the example above + flat_slice_stop_shifted = ( + self.flattened_range.stop - volume_before_split + ) # 10 - 4 = 6 in the example above + + # Find an intersection of + # (flat_slice_start_shifted, flat_slice_stop_shifted) vs (0, volume_of_split) + + if flat_slice_stop_shifted <= 0 or flat_slice_start_shifted >= volume_of_split: + return [] # no intersection + + # new_flattened_range = slice(1, 4) in the example above + new_flattened_range = slice( + max(flat_slice_start_shifted, 0), min(flat_slice_stop_shifted, volume_of_split) + ) + # Apply the intersection to the flattened data tensor. + # Compute start and slice appropriate length + intersection_slice_start = ( + new_flattened_range.start - flat_slice_start_shifted + ) # 0 in the example above + new_data = self.data[ + intersection_slice_start : intersection_slice_start + + new_flattened_range.stop + - new_flattened_range.start + ] + + return [ + replace( + self, + data=new_data, + local_shape=local_shape, + global_shape=global_shape, + global_offset=global_offset, + flattened_range=new_flattened_range, + ) + ] + + +def is_main_replica(replica_id: ReplicaId): + """Checks if given `replica_id` is considered as main. + + "Main" replica is: + - integer 0 + - or an iterable with all 0 elements + + It is the application responsibility to set correct replicas for sharded tensors. + + Args: + replica_id (Union[int, Tuple[int, ...]]): replica id + + Returns: + (bool): True for a "main" replica + """ + if isinstance(replica_id, int): + return replica_id == 0 + return all(r == 0 for r in replica_id) + + +class LocalNonpersistentObject: + """Object that should not be stored in a checkpoint, but restored locally. + + Wrapping any object inside the state dict with LocalNonpersistentObject + will result in: + - during saving, this object will *not* be stored in the checkpoint + - during loading, a local version of this object will be placed in a state dict + """ + + def __init__(self, obj): + self.obj = obj + + def unwrap(self): + """Returns the original object.""" + return self.obj + + +# TODO: Delete once NeMo fixes typo. +LocalNonpersitentObject = LocalNonpersistentObject + + +@dataclass +class ShardedObject(ShardedBase): + """Represents a mapping between a local object and a global object. + + Global object is assumed to consist of many local objects distributed + between different processes. + + NOTE: Contrary to ShardedTensor, it's impossible to change global object + sharding. Conceptually, ShardedObject is a fully-sharded ShardedTensor + with atomic arbitrary typed elements. + + Args: + key: unique identifier of a global tensor + data: local object data. Can be None only for consistency validation + global_shape: global object shape + global_offset: offset of a local object in a global object, specified in number of shards + replica_id: indicates local object replication wrt. local objects in different processes + """ + + key: str + data: object + global_shape: Tuple[int, ...] + global_offset: Tuple[int, ...] 
+ replica_id: ReplicaId = 0 + + def __post_init__(self): + self.validate_metadata_integrity() + + def validate_metadata_integrity(self): + if len(self.global_shape) != len(self.global_offset): + raise CheckpointingException( + f'Global offset dimensions should be equal to global shape dimensions for {self}' + ) + + def without_data(self): + return replace(self, data=None) + + @property + def unique_key(self): + """returns a unique key for this object""" + return ( + f'{self.key}/shard_' + f'{".".join(map(str, self.global_offset))}_' + f'{".".join(map(str, self.global_shape))}' + ) + + def __str__(self): + return f'{self.__class__.__name__}(key=\'{self.key}\')' + + @classmethod + def empty_from_unique_key(cls, unique_key, replica_id: ReplicaId = 0) -> 'ShardedObject': + """Instantiates a ShardedObject from a unique key. + + Args: + unique_key: a string of the form + /shard__ + replica_id: indicates local object replication wrt. + local objects in different processes + + Returns: + a ShardedObject with data=None + """ + key, shard_key = unique_key.split('/') + shard_str, offset, shape = shard_key.split('_') + assert shard_str == 'shard' + offset = tuple(map(int, offset.split('.'))) + shape = tuple(map(int, shape.split('.'))) + if len(shape) + 1 == len(offset): + # This is a backward-compatible fix. We don't know the last + # element of global shape so set it to -1. + shape += (-1,) + return cls(key, None, shape, offset, replica_id) + + +FactoryBuildFn = Callable[[str, torch.Tensor, ReplicaId, Optional[slice]], ShardedStateDict] +FactoryMergeFn = Callable[[StateDict], torch.Tensor] + + +@dataclass +class ShardedTensorFactory(ShardedBase): + """Allows to apply transformations to tensors before/after serialization. + + The essence of those transformations is that they can be applied to + optimizer states the same way they are applied to the model params. + The ultimate state dict with sharded tensors must depend functionally on + `build_fn` arguments (key, data, replica_id, flattened_range), + which will be provided by the optimizer. + + Builder creates a sub-state-dict out of a tensor before saving, and merger + merges the corresponding state dict after loading. + + Args: + key (str): unique identifier of the factory + data (torch.Tensor): original model parameter that will be further + transformed by this factory + build_fn (callable): function that transforms the original tensor + to a sharded state dict + merge_fn (callable): function that transforms loaded subtree back + into a single tensor (inverse of `build_fn`) + replica_id (ReplicaId): indicates factory replication wrt. + factories in different processes + flattened_range (slice, optional): indicates additional flattening + applied to the ShardedTensors produced by the factory + """ + + key: str + data: torch.Tensor + build_fn: FactoryBuildFn + merge_fn: FactoryMergeFn + replica_id: ReplicaId = 0 + flattened_range: Optional[slice] = None + + def build(self): + """Builds a ShardedStateDict from the original tensor""" + return self.build_fn(self.key, self.data, self.replica_id, self.flattened_range) + + def validate_metadata_integrity(self): + """No reasonable checks can be applied""" + pass + + def without_data(self): + return replace(self, data=None) + + +def apply_factories(sharded_state_dict: ShardedStateDict): + """Turn ShardedTensorFactories into ShardedTensors *in-place*. 
+ + Args: + sharded_state_dict (ShardedStateDict): state dict possibly + containing ShardedTensorFactory objects + + Returns: + None: state dict is modified in place + """ + + def apply(x): + if isinstance(x, ShardedTensorFactory): + x = x.build() + return x + + dict_list_map_inplace(apply, sharded_state_dict) + + +def apply_factory_merges( + x1: StateDict, x2: ShardedStateDict, key: Tuple[str, ...] = () +) -> StateDict: + """Apply merges defined by ShardedTensorFactories *in-place*. + + Args: + x1 (StateDict): state dict loaded from the checkpoint + x2 (ShardedStateDict): subset of `x1` (in terms of dict keys) + with ShardedTensorFactory + as (possibly nested) values that define how to + merge objects from the `x1` state dict + key (Tuple[str, ...]): current key in a recursive call. + Used only for reporting meaningful errors + + Returns: + StateDict: `x1` modified in-place + """ + if isinstance(x2, ShardedTensorFactory): + return x2.merge_fn(x1) + + # There rest is almost the same as the `merge` function from `dict_utils` + if isinstance(x1, dict) and isinstance(x2, dict): + for k, v2 in x2.items(): + if k not in x1: + raise ValueError( + f'Different dict keys encountered in `apply_factory_merges` ' + f'({x1.keys()} vs {x2.keys()})' + ) + else: + x1[k] = apply_factory_merges(x1[k], v2, key=key + (k,)) + elif isinstance(x1, list) and isinstance(x2, list): + if len(x1) != len(x2): + err_msg = ( + f'Cannot merge two lists with different lengths ' + f'({len(x1)} and {len(x2)}, encountered at key {key})' + ) + logger.error(err_msg + f'\nx1: {x1}\nx2: {x2}') + raise ValueError(err_msg) + for i, v2 in enumerate(x2): + x1[i] = apply_factory_merges(x1[i], v2, key=key + (i,)) + elif isinstance(x1, list) and isinstance(x2, dict): + for k, v2 in x2.items(): + if not isinstance(k, int): + raise ValueError( + f'Invalid dict key {k} non-integer type encountered ' + f'in a list-dict merge at level {key}' + ) + if k >= len(x1): + raise ValueError( + f'Dict key {k} out of bound for list of length' + f'{len(x1)} (encountered at level {key})' + ) + x1[k] = apply_factory_merges(x1[k], v2, key=key + (k,)) + else: + raise ValueError( + f'Duplicate non-dict and non-list values encountered: `{x1}` and `{x2} (at key {key})`' + ) + return x1 diff --git a/megatron/core/dist_checkpointing/optimizer.py b/megatron/core/dist_checkpointing/optimizer.py new file mode 100644 index 0000000000..2d231a24ff --- /dev/null +++ b/megatron/core/dist_checkpointing/optimizer.py @@ -0,0 +1,129 @@ +# Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. + +""" Helpers for defining sharding for optimizer states based on existing sharding for model parameters. 
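+
+A typical flow is sketched below for illustration only; the `model` and `optimizer`
+objects (and the availability of `model.sharded_state_dict()`) are assumptions, while
+`get_param_id_to_sharded_param_map` and `optim_state_to_sharding_state` are defined
+in this module:
+
+    # the model sharded state dict must reference the actual parameters (keep_vars=True)
+    model_sharded_sd = model.sharded_state_dict()
+    id_to_sharded_param = get_param_id_to_sharded_param_map(
+        model_sharded_sd, (p for g in optimizer.param_groups for p in g['params'])
+    )
+    optim_sd = optimizer.state_dict()
+    # in-place: optimizer state tensors become ShardedTensors keyed consistently with the model
+    optim_state_to_sharding_state(optim_sd, id_to_sharded_param)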
""" + +import logging +from copy import deepcopy +from dataclasses import replace +from itertools import chain +from typing import Dict, Iterable, List, Tuple, Union + +logger = logging.getLogger(__name__) + +import torch + +from .dict_utils import nested_values +from .mapping import ( + LocalNonpersistentObject, + ShardedStateDict, + ShardedTensor, + ShardedTensorFactory, + StateDict, +) +from .utils import extract_sharded_tensors_and_factories + + +def get_optim_param_to_id_map(optim_params_iter: Iterable[torch.nn.Parameter]) -> Dict[int, int]: + param_mappings = {} + for i, param in enumerate(optim_params_iter): + if id(param) not in param_mappings: + param_mappings[id(param)] = i + return param_mappings + + +def get_param_id_to_sharded_param_map( + model_sharded_state_dict: ShardedStateDict, optim_params_iter: Iterable[torch.nn.Parameter] +) -> Dict[int, Union[ShardedTensor, ShardedTensorFactory]]: + """Generate mapping from optimizer state ids to model sharded parameters. + + Args: + model_sharded_state_dict: sharded state dict with all model sharded tensors (can have any structure) + optim_params_iter: iterable which iterates over model parameters tracked by the optimizer. + The iteration must be in the same order as in the optimizer parameters. + + Returns: + Dict[int, Union[ShardedTensor, ShardedTensorFactory]]: mapping from optimizer state ids + to model sharded parameters. + """ + model_sharded_state_dict, _ = extract_sharded_tensors_and_factories(model_sharded_state_dict) + id_to_sharded_param_map = {} + param_to_id_map = get_optim_param_to_id_map(optim_params_iter) + for ten in nested_values(model_sharded_state_dict): + if id(ten.data) in param_to_id_map: + id_to_sharded_param_map[param_to_id_map[id(ten.data)]] = ten + else: + logger.debug(f'{ten} is not tracked by the optimizer') + + if not id_to_sharded_param_map: + logger.warning( + "Sharded parameters mapping is empty. It means tensors in model state dict" + " do not correspond to tensors in optimizer parameters map." + " Make sure to call state_dict with `keep_vars=True`." + ) + return id_to_sharded_param_map + + +def make_sharded_optimizer_tensor( + model_param: Union[ShardedTensor, ShardedTensorFactory], optim_param: torch.Tensor, prefix: str +) -> Union[ShardedTensor, ShardedTensorFactory]: + """Build a ShardedTensor or ShardedTensorFactory for optimizer param based on model param + + Args: + model_param (Union[ShardedTensor, ShardedTensorFactory]): model param + optim_param (torch.Tensor): corresponding optimizer param + prefix (str): optimizer prefix for the ShardedTensor or ShardedTensorFactory + + Returns: + Union[ShardedTensor, ShardedTensorFactory]: wrapped optimizer parameter + """ + if isinstance(model_param, ShardedTensorFactory): + return replace(model_param, key=f'{prefix}.{model_param.key}', data=optim_param) + + assert ( + tuple(optim_param.shape) == model_param.local_shape + ), f'Optimizer shape ({tuple(optim_param.shape)} does not match model shape ({model_param.local_shape})' + sh_ten = replace( + model_param, key=f'{prefix}.{model_param.key}', data=optim_param, dtype=optim_param.dtype + ) + sh_ten.validate_metadata_integrity() + return sh_ten + + +def optim_state_to_sharding_state( + optim_state_dict: StateDict, + id_to_sharded_param_map: Dict[int, ShardedTensor], + exclude_keys: Tuple[str] = (), +): + """Turn optimizer state dict to sharded state dict based on model state dict *in-place*. + + Can be used to add sharding information to most common optimizer state dict. 
+ Creates separate ShardedTensors for each key in `optim_state_dict['state']` + (e.g. for torch.optim.Adam there will be separate tensors for `exp_avg` and `exp_avg_sq`) + + Args: + optim_state_dict (StateDict): optimizer state dict with + state parameters under `state` key and group hyperparameters under `param_groups` -> `params` key. + id_to_sharded_param_map (Dict[int, ShardedTensor]): mapping from optimizer param ids to model sharded tensors. + Can be generated with `get_param_id_to_sharded_param_map` function + exclude_keys (Tuple[str]): optimizer state keys to exclude from the final state dict. + + Returns: + None: state dict is modified in place + """ + sharded_state = {} + for param_id, param_state in optim_state_dict['state'].items(): + sharded_state[param_id] = {} + for state_key, param in param_state.items(): + if state_key in exclude_keys: + continue + if param_id in id_to_sharded_param_map: + sharded_state[param_id][state_key] = make_sharded_optimizer_tensor( + id_to_sharded_param_map[param_id], param, prefix=f'optimizer.state.{state_key}' + ) + else: + raise ValueError(f'Param id {param_id} does not match any model sharded param') + + optim_state_dict['param_groups'] = deepcopy(optim_state_dict['param_groups']) + for group in optim_state_dict['param_groups']: + group['params'] = LocalNonpersistentObject(group['params']) + optim_state_dict['state'] = sharded_state diff --git a/megatron/core/dist_checkpointing/serialization.py b/megatron/core/dist_checkpointing/serialization.py new file mode 100644 index 0000000000..5493c96bbd --- /dev/null +++ b/megatron/core/dist_checkpointing/serialization.py @@ -0,0 +1,411 @@ +# Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. + +""" Entrypoints for saving and loading the distributed checkpoints. + +Functions `load` and `save` are equivalents of `torch.load` and `torch.save` +but expect torch.Tensors to be wrapped with classes from the `mapping module`. +Additionally, `load` expects the sharded state dict argument as a guidance for +loading the sharded tensors. +""" + +import logging +from pathlib import Path +from typing import Dict, Optional, Set, Tuple, Union + +import torch + +from . 
import ShardedTensor +from .core import CheckpointingConfig, save_config +from .dict_utils import extract_matching_values, merge +from .mapping import ( + CheckpointingException, + ShardedObject, + ShardedStateDict, + StateDict, + apply_factory_merges, +) +from .state_dict_transformation import load_preprocess, save_preprocess +from .strategies.async_utils import AsyncRequest +from .strategies.base import ( + AsyncSaveShardedStrategy, + LoadCommonStrategy, + LoadShardedStrategy, + SaveCommonStrategy, + SaveShardedStrategy, + StrategyAction, + get_default_strategy, +) +from .utils import extract_sharded_base +from .validation import ( + StrictHandling, + determine_global_metadata, + parse_strict_flag, + validate_integrity_and_strict_load, + validate_sharded_objects_handling, + verify_checkpoint_and_load_strategy, +) + +logger = logging.getLogger(__name__) + + +# flat state dict with sharded objects without any data +CkptShardedMetadata = Dict[str, Union[ShardedTensor, ShardedObject]] + + +def load( + sharded_state_dict: ShardedStateDict, + checkpoint_dir: str, + sharded_strategy: Union[LoadShardedStrategy, Tuple[str, int], None] = None, + common_strategy: Union[LoadCommonStrategy, Tuple[str, int], None] = None, + validate_access_integrity: bool = True, + strict: Union[str, StrictHandling] = StrictHandling.ASSUME_OK_UNEXPECTED, +) -> Union[StateDict, Tuple[StateDict, Set[str], Set[str]]]: + """Loading entrypoint. + + In the steps below, the following verbs refer to corresponding objects: + - load = load from checkpoint + - extract = extract from sharded_state_dict + - add = add to the final state dict + Steps: + 1. Load common state dict and form the base of the result state dict + 2. Apply factories to sharded_state_dict + 3. Extract LocalNonPersistentObject and add + 4. (optional) Extract ShardedObjects, load and add + 5. Extract ShardedBase, load, apply factory merges and add + + Args: + sharded_state_dict (ShardedStateDict): state dict of the existing model + populated with ShardedTensors. Used as a mapping to determine which + parts of global tensors stored in the checkpoint should be loaded. + checkpoint_dir (str): directory with the checkpoint + sharded_strategy (LoadShardedStrategy, Tuple[str, int], optional): + configures loading behavior for sharded tensors + common_strategy (LoadCommonStrategy, Tuple[str, int], optional): + configures loading behavior for common data + validate_access_integrity (bool default = True): checks if each tensor shard is accessed + exactly once (as main replica) by some process + strict (StrictHandling, str, optional): determines the behavior in case of a mismatch + between the requested sharded state dict and the checkpoint. See `StrictHandling` docs + for more details. Some values affect the return value of this function + (missing and unexpected keys are returned). + Defaults to `True` (StrictHandling.ASSUME_OK_UNEXPECTED) which doesn't + incur any performance overhead. Other recommended values + are: `False` (StrictHandling.LOG_UNEXPECTED) which logs only unexpected keys + or `StrictHandling.RETURN_ALL` which returns all mismatch keys. + + Returns: + StateDict or Tuple[StateDict, Set[str], Set[str]]: in most cases only + the loaded state dict is returned. 
If `strict` flag was set to + """ + sharded_strategy, common_strategy = verify_checkpoint_and_load_strategy( + checkpoint_dir, sharded_strategy, common_strategy + ) + + checkpoint_dir = Path(checkpoint_dir) + common_state_dict = common_strategy.load_common(checkpoint_dir) + if not sharded_state_dict: + return common_state_dict + + sharded_state_dict, nonpersistent_state_dict, sh_ten_factories = load_preprocess( + sharded_state_dict + ) + merge(common_state_dict, nonpersistent_state_dict) + + # At this point we are only dealing with ShardedBase objects + sharded_state_dict, _ = extract_sharded_base(sharded_state_dict) + + # Validation + ckpt_sharded_metadata = None + local_metadata, global_metadata = None, None + strict = parse_strict_flag(strict) + if StrictHandling.requires_explicit_ckpt_mismatch_check(strict): + ckpt_sharded_metadata = load_sharded_metadata( + str(checkpoint_dir), sharded_strategy, common_strategy + ) + if validate_access_integrity or StrictHandling.requires_global_app_metadata(strict): + local_metadata, global_metadata = determine_global_metadata(sharded_state_dict) + + sharded_state_dict, missing_keys, unexpected_keys = validate_integrity_and_strict_load( + sharded_state_dict, + strict, + validate_access_integrity, + local_metadata, + global_metadata, + ckpt_sharded_metadata, + ) + + # ShardedBase loading + if not sharded_strategy.can_handle_sharded_objects: + validate_sharded_objects_handling(sharded_strategy, common_strategy) + sharded_objects_state_dict, sharded_state_dict = extract_matching_values( + sharded_state_dict, lambda v: isinstance(v, ShardedObject) + ) + sharded_objects = common_strategy.load_sharded_objects( + sharded_objects_state_dict, checkpoint_dir + ) + merge(common_state_dict, sharded_objects) + + loaded_state_dict = sharded_strategy.load(sharded_state_dict, checkpoint_dir) + + merge(common_state_dict, loaded_state_dict) + + loaded_state_dict = apply_factory_merges(common_state_dict, sh_ten_factories) + + if StrictHandling.requires_returning_mismatch_keys(strict): + return common_state_dict, missing_keys, unexpected_keys + else: + return common_state_dict + + +def load_common_state_dict(checkpoint_dir: Path) -> StateDict: + """Load common (non-sharded) objects state dict from the checkpoint. + + Args: + checkpoint_dir (Path): checkpoint directory + + Returns: + StateDict: state dict with non-sharded objects from the checkpoint + """ + sharded_strategy, common_strategy = verify_checkpoint_and_load_strategy(str(checkpoint_dir)) + return common_strategy.load_common(checkpoint_dir) + + +def load_tensors_metadata( + checkpoint_dir: str, sharded_strategy: Union[LoadShardedStrategy, None] = None +) -> CkptShardedMetadata: + """Load tensors metadata from the checkpoint. + + Returns a dictionary similar to a sharded state dict, but note that + the dictionary keys are simply ShardedTensor keys (contrary to the + actual sharded state dicts where keys correspond to state dict keys). + + Dict values are ShardedTensors without any sharding (so, the only useful + information is tensors global shape and dtype). + + Concrete implementation depends on the loading strategy. If no strategy is + given, a default for a given backend is used. + + Args: + checkpoint_dir (str): checkpoint directory to load from + sharded_strategy (LoadShardedStrategy, optional): sharded strategy to load metadata. + Defaults to None - in this case a default load strategy for a given checkpoint type + is used. 
+ + Returns: + CkptShardedMetadata: flat state dict without data describing ShardedTensors + in the checkpoint + """ + sharded_strategy, common_strategy = verify_checkpoint_and_load_strategy( + checkpoint_dir, sharded_strategy + ) + return sharded_strategy.load_tensors_metadata(Path(checkpoint_dir)) + + +def load_sharded_metadata( + checkpoint_dir: str, + sharded_strategy: Union[LoadShardedStrategy, None] = None, + common_strategy: Union[LoadCommonStrategy, None] = None, +) -> CkptShardedMetadata: + """Load sharded metadata from the checkpoint. + + Similar to `load_tensors_metadata`, but includes also ShardedObjects. + + Returns a dictionary similar to a sharded state dict, but note that + the dictionary keys are simply ShardedTensor keys (contrary to the + actual sharded state dicts where keys correspond to state dict keys). + + Dict values are ShardedTensors without any sharding (so, the only useful + information is tensors global shape and dtype). + + Concrete implementation depends on the loading strategy. If no strategy is + given, a default for a given backend is used. + + Args: + checkpoint_dir (str): checkpoint directory to load from + sharded_strategy (LoadShardedStrategy, optional): sharded strategy to load metadata. + Defaults to None - in this case a default load strategy for a given checkpoint type + is used. + common_strategy (LoadCommonStrategy, optional): common strategy to load metadata. + Defaults to None - in this case a default load strategy for a given checkpoint type is + used. This strategy won't be used unless `sharded_strategy` can't handle ShardedObjects + + Returns: + CkptShardedMetadata: flat state dict without data describing ShardedTensors + and ShardedObjects in the checkpoint + """ + sharded_strategy, common_strategy = verify_checkpoint_and_load_strategy( + checkpoint_dir, sharded_strategy, common_strategy + ) + sharded_metadata = sharded_strategy.load_sharded_metadata(Path(checkpoint_dir)) + if not sharded_strategy.can_handle_sharded_objects: + validate_sharded_objects_handling(sharded_strategy, common_strategy) + common_metadata = common_strategy.load_sharded_metadata(Path(checkpoint_dir)) + sharded_metadata = merge(sharded_metadata, common_metadata) + return sharded_metadata + + +def load_plain_tensors(checkpoint_dir: str) -> StateDict: + """Load checkpoint tensors without any sharding and plain structure. + + NOTE: common state dict is NOT included. + + Args: + checkpoint_dir (str): checkpoint directory to load the tensors from. + + Returns: + StateDict: checkpoint state dict containing only torch.Tensors. + """ + sharded_state_dict = load_tensors_metadata(checkpoint_dir) + # Don't validate integrity because shards will be overlapped + # if world_size > 1 (all processes load whole tensors) + return load(sharded_state_dict, checkpoint_dir, validate_access_integrity=False) + + +# +# def load_plain_tensors_and_objects(checkpoint_dir: str) -> StateDict: +# """Load checkpoint tensors and objects without any sharding and plain structure. +# +# NOTE: state dict structure might be different than the one used for checkpoint saving. +# NOTE: common state dict is NOT included. +# +# Args: +# checkpoint_dir (str): checkpoint directory to load the state dict from. +# +# Returns: +# StateDict: complete checkpoint state dict without any sharding. 
+# """ +# sharded_state_dict = load_tensors_metadata(checkpoint_dir) +# # Don't validate integrity because shards will be overlapped +# # if world_size > 1 (all processes load whole tensors) +# return load(sharded_state_dict, checkpoint_dir, validate_access_integrity=False) + + +def save( + sharded_state_dict: ShardedStateDict, + checkpoint_dir: str, + sharded_strategy: Union[SaveShardedStrategy, Tuple[str, int], None] = None, + common_strategy: Union[SaveCommonStrategy, Tuple[str, int], None] = None, + validate_access_integrity: bool = True, + async_sharded_save: bool = False, +) -> Optional[AsyncRequest]: + """Saving entrypoint. + + Extracts ShardedTensors from the given state dict. Rank 0 saves the + "regular" part of the checkpoint to common torch file. + The ShardedTensors are saved according to a strategy specified by the + config. + + Steps: + 1. Apply factories + 2. Extract and discard LocalNonPersistentObject + 3. Extract all ShardedBase object + 4. Save all other objects to common.pt + 5. (optional) Extract and save ShardedObjects + 6. Save all ShardedBase objects + 7. Write metadata.json file with backend and version metadata. + + Step (6) can be performed asynchronously (see `async_sharded_save`), in this + case the actual save is embodied in the returned async request and can be + scheduled by the external caller. For async request, step (7) is added as + one of the finalization functions, so that metadata.json is written only + if the checkpoint is complete. + + Args: + sharded_state_dict (ShardedStateDict): state dict of the populated with + ShardedTensors. Used as a mapping to determine how local tensors + should be saved as global tensors in the checkpoint. + checkpoint_dir (str): directory to save the checkpoint to + sharded_strategy (SaveShardedStrategy, Tuple[str, int], optional): + configures sharded tensors saving behavior and backend + common_strategy (SaveCommonStrategy, Tuple[str, int], optional): + configures common data saving behavior and backend + validate_access_integrity (bool default = True): checks if each tensor shard is accessed + exactly once (as main replica) by some process + async_sharded_save (bool, optional): if True, for the sharded state dict part + an async save implementation will be called, with the AsyncRequest + being returned to the caller. Note that it is the caller responsibility to + actually schedule the async save. Defaults to False. + + Returns: + AsyncRequest (optional): if `async_sharded_save` is True, returns + async request that should be scheduled by the caller of this function. + None otherwise. 
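+
+    Example (an illustrative sketch of the async path; the surrounding training-loop
+    objects are assumptions, `AsyncCallsQueue` comes from `strategies.async_utils`):
+
+        async_queue = AsyncCallsQueue()
+        request = save(model_sharded_sd, ckpt_dir, async_sharded_save=True)
+        async_queue.schedule_async_request(request)
+        # ... training continues while the checkpoint is written in the background ...
+        async_queue.maybe_finalize_async_calls(blocking=True)  # also writes metadata.json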
+ """ + checkpoint_dir = Path(checkpoint_dir) + + if torch.distributed.get_rank() == 0: + if not checkpoint_dir.exists(): + raise CheckpointingException( + f'Checkpoint destination directory does not exist: {checkpoint_dir}' + ) + + if next(checkpoint_dir.iterdir(), None) is not None: + raise CheckpointingException( + f'Checkpoint destination directory ({checkpoint_dir}) is not empty' + ) + + if common_strategy is not None: + raise NotImplementedError('The only supported common strategy is torch') + + if sharded_strategy is None: + sharded_strategy = get_default_save_sharded_strategy() + if not isinstance(sharded_strategy, SaveShardedStrategy): + assert isinstance(sharded_strategy, tuple), type(sharded_strategy) + sharded_strategy = get_default_strategy(StrategyAction.SAVE_SHARDED, *sharded_strategy) + + if common_strategy is None: + common_strategy = get_default_save_common_strategy() + if not isinstance(common_strategy, SaveCommonStrategy): + assert isinstance(common_strategy, tuple), type(common_strategy) + common_strategy = get_default_strategy(StrategyAction.SAVE_COMMON, *common_strategy) + + sharded_state_dict, state_dict = save_preprocess(sharded_state_dict, validate_access_integrity) + + common_strategy.save_common(state_dict, checkpoint_dir) + + if not sharded_strategy.can_handle_sharded_objects: + validate_sharded_objects_handling(sharded_strategy, common_strategy) + sharded_objects_state_dict, sharded_state_dict = extract_matching_values( + sharded_state_dict, lambda v: isinstance(v, ShardedObject) + ) + common_strategy.save_sharded_objects(sharded_objects_state_dict, checkpoint_dir) + + def metadata_finalize_fn(): + if torch.distributed.get_rank() == 0: + save_config( + CheckpointingConfig(sharded_strategy.backend, sharded_strategy.version), + checkpoint_dir, + ) + torch.distributed.barrier() + + if not async_sharded_save: + sharded_strategy.save(sharded_state_dict, checkpoint_dir) + metadata_finalize_fn() + return + + if not isinstance(sharded_strategy, AsyncSaveShardedStrategy): + raise CheckpointingException( + f'Cannot apply async_save to non-async strategy {sharded_strategy}' + ) + async_request = sharded_strategy.async_save(sharded_state_dict, checkpoint_dir) + async_request.finalize_fns.append(metadata_finalize_fn) + return async_request + + +def get_default_save_sharded_strategy( + backend: str = 'torch_dist', version: int = 1 +) -> SaveShardedStrategy: + """Get default save sharded strategy.""" + return get_default_strategy(StrategyAction.SAVE_SHARDED, backend, version) + + +def get_default_save_common_strategy( + backend: str = 'torch', version: int = 1 +) -> SaveCommonStrategy: + """Get default save common strategy.""" + return get_default_strategy(StrategyAction.SAVE_COMMON, backend, version) + + +def get_default_load_sharded_strategy(checkpoint_dir: str) -> LoadShardedStrategy: + """Get default load sharded strategy.""" + return verify_checkpoint_and_load_strategy(checkpoint_dir)[0] diff --git a/megatron/core/dist_checkpointing/state_dict_transformation.py b/megatron/core/dist_checkpointing/state_dict_transformation.py new file mode 100644 index 0000000000..ebb960e384 --- /dev/null +++ b/megatron/core/dist_checkpointing/state_dict_transformation.py @@ -0,0 +1,253 @@ +# Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. 
+ +""" Utilities for transforming state_dict, including a tensor-aware implementation.""" + +import logging +from time import time +from typing import Any, Optional + +import torch + +from .dict_utils import dict_list_map_inplace, extract_matching_values, merge, nested_values +from .exchange_utils import determine_main_replica_uniform_distribution, exchange_by_distribution +from .mapping import ( + ShardedObject, + ShardedStateDict, + ShardedTensor, + ShardedTensorFactory, + apply_factories, + apply_factory_merges, +) +from .utils import ( + _sharded_object_id, + _sharded_tensor_shard_id, + extract_nonpersistent, + extract_sharded_base, +) +from .validation import determine_global_metadata, validate_sharding_integrity + +logger = logging.getLogger(__name__) + + +def save_preprocess(sharded_state_dict: ShardedStateDict, validate_access_integrity: bool = True): + """Preprocesses the given state dictionary by applying factories, + discarding non-persistent data and extracting the common state dictionary. + Optionally, it can validate sharding integrity. + + Args: + sharded_state_dict (ShardedStateDict): The initial state dictionary to be preprocessed. + validate_access_integrity (bool): If True, triggers validation of sharding integrity. + + Returns: + Tuple[ShardedStateDict, dict]: + The preprocessed sharded state dictionary and the common state dictionary. + """ + apply_factories(sharded_state_dict) + _, sharded_state_dict = extract_nonpersistent(sharded_state_dict) + sharded_part, common_state_dict = extract_sharded_base(sharded_state_dict) + if validate_access_integrity: + validate_sharding_integrity(determine_global_metadata(sharded_part)[1]) + return sharded_part, common_state_dict + + +def load_preprocess(sharded_state_dict: ShardedStateDict): + """Preprocesses the given state dictionary by applying factories + and extracting non-persistent data, without modifying the original dictionary. + + Args: + sharded_state_dict (ShardedStateDict): + The initial state dictionary to be processed (remains unchanged). + + Returns: + Tuple[ShardedStateDict, dict, dict]: + - A preprocessed copy of the sharded state dictionary. + - A dictionary containing non-persistent state data. + - A dictionary of `ShardedTensorFactory` instances. + """ + # Create a copy of sharded_state_dict as the passed in state dict may have + # references that prevent tensors from being deallocated + sharded_state_dict, _ = extract_matching_values(sharded_state_dict, lambda x: True) + + sh_ten_factories, _ = extract_matching_values( + sharded_state_dict, + lambda x: isinstance(x, ShardedTensorFactory), + return_lists_as_dicts=True, + ) + apply_factories(sharded_state_dict) + + # Data inside sh_ten_factories no longer needed so delete them to reduce memory usage + dict_list_map_inplace(ShardedTensorFactory.without_data, sh_ten_factories) + # Non-persistent objects + nonpersistent_state_dict, sharded_state_dict = extract_nonpersistent(sharded_state_dict) + dict_list_map_inplace(lambda o: o.unwrap(), nonpersistent_state_dict) + return sharded_state_dict, nonpersistent_state_dict, sh_ten_factories + + +def prepare_state_dict_for_save( + sharded_state_dict: ShardedStateDict, + async_prepare: bool = False, + algo: str = 'atomic', + validate_access_integrity: bool = True, + parallelization_group: Optional[torch.distributed.ProcessGroup] = None, + to_cpu: bool = True, +): + """Creates a tensor-aware state dictionary that can be saved using the Local Checkpoint Manager. 
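+
+    The returned dictionary is a plain state dict with `raw_tensors`, `raw_objects`,
+    `common` and `sharded_state_dict` entries (plus `shard_to_rank` and
+    `shard_to_metadata` maps when the `fully_parallel` algorithm is used), as assembled
+    at the end of this function.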
+ + Args: + sharded_state_dict (ShardedStateDict): The initial state dictionary. + async_prepare (bool): If True, enables asynchronous preparation. + algo (str): The algorithm used to create the tensor-aware state dictionary. + validate_access_integrity (bool): If True, validates sharding integrity. + parallelization_group (torch.distributed.ProcessGroup): + The process group used for exchanges to avoid duplications. + to_cpu (bool): If True, moves all tensors from device to CPU. + + Returns: + ShardedStateDict: The tensor-aware state dictionary. + """ + + _start = time() + + if async_prepare: + raise NotImplementedError('Async state_dict preparation is not yet implemented') + if algo != 'atomic' and algo != 'fully_parallel': + raise NotImplementedError( + 'Only "atomic" and "fully_parallel" sharding algorithms are supported.' + ) + fully_parallel = algo == 'fully_parallel' + + sharded_part, common_state_dict = save_preprocess(sharded_state_dict, validate_access_integrity) + sharded_tensors = [] + sharded_objects = [] + for sh_base in nested_values(sharded_part): + if isinstance(sh_base, ShardedTensor): + sharded_tensors.append(sh_base) + else: + assert isinstance(sh_base, ShardedObject) + sharded_objects.append(sh_base) + if fully_parallel: + shard_to_saving_rank, _, shard_to_metadata = determine_main_replica_uniform_distribution( + sharded_part, parallelization_group, True + ) + + raw_tensors, raw_objects = {}, {} + for ten in sharded_tensors: + shard_id = _sharded_tensor_shard_id(ten) + if not fully_parallel or shard_to_saving_rank[shard_id] == torch.distributed.get_rank(): + # TODO cover creating copies on host in CheckpointManager.save() + if to_cpu: + raw_tensors[shard_id] = ten.data.to("cpu", non_blocking=True) + else: + raw_tensors[shard_id] = ten.data + ten.data = None + for obj in sharded_objects: + raw_objects[_sharded_object_id(obj)] = obj.data + obj.data = None + + logger.debug(f'prepare_state_dict_for_save took {time() - _start}') + + state_dict_for_save = { + 'raw_tensors': raw_tensors, + 'raw_objects': raw_objects, + 'common': common_state_dict, + 'sharded_state_dict': sharded_part, + } + if fully_parallel: + state_dict_for_save['shard_to_rank'] = shard_to_saving_rank + state_dict_for_save['shard_to_metadata'] = shard_to_metadata + return state_dict_for_save + + +def recreate_state_dict_after_load( + sharded_state_dict: ShardedStateDict, + loaded_state_dict: ShardedStateDict, + algo: str = 'atomic', + exchange_algo: str = 'broadcast', + validate_access_integrity: bool = True, + parallelization_group: Optional[torch.distributed.ProcessGroup] = None, +): + """Creates a final sharded state dictionary from a tensor-aware state dictionary. + + Args: + sharded_state_dict (ShardedStateDict): + The initial sharded state dictionary generated from the model. + loaded_state_dict (ShardedStateDict): + Tensor-aware state dictionary used to fill in missing data in the sharded state. + algo (str): The algorithm used to reconstruct the state dictionary + from the tensor-aware state dictionary. + exchange_algo (str): The algorithm used for tensor exchanges during retrieval. + validate_access_integrity (bool): If True, performs validation of sharding integrity. + parallelization_group (torch.distributed.ProcessGroup): + The process group used for efficient exchanges during retrieval. + + Returns: + ShardedStateDict: The finalized sharded state dictionary. 
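+
+    Example (an illustrative round trip; the component persisting `tensor_aware_sd`
+    between the two calls, e.g. a local checkpoint manager, is an assumption):
+
+        tensor_aware_sd = prepare_state_dict_for_save(model.sharded_state_dict())
+        # ... tensor_aware_sd is saved and later loaded by the checkpoint manager ...
+        state_dict = recreate_state_dict_after_load(model.sharded_state_dict(),
+                                                    tensor_aware_sd)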
+ """ + + if algo != 'atomic' and algo != 'fully_parallel': + raise NotImplementedError( + 'Only "atomic" and "fully_parallel" sharding algorithms are supported.' + ) + fully_parallel = algo == 'fully_parallel' + + # __adding__ common part + recreated_state_dict, _ = extract_matching_values(loaded_state_dict["common"], lambda x: True) + + if not sharded_state_dict: + return recreated_state_dict + # TODO validate laoded_state_dict["sharded_state_dict"] and sharded_state_dict are compatible + + sharded_state_dict, nonpersistent_state_dict, sh_ten_factories = load_preprocess( + sharded_state_dict + ) + # __adding__ nonpersistent part + merge(recreated_state_dict, nonpersistent_state_dict) + + sharded_part, _ = extract_sharded_base(sharded_state_dict) + if validate_access_integrity: + validate_sharding_integrity(determine_global_metadata(sharded_part)[1]) + + # load sharded tensors and sharded objects to sharded_part + loaded_tensors = loaded_state_dict['raw_tensors'] + # TODO cover restoring the original device (H2D) in CheckpointManager.load() + for k, v in loaded_tensors.items(): + loaded_tensors[k] = v.cuda() # H2D + if fully_parallel: + distribution = ( + loaded_state_dict['shard_to_rank'], + None, + loaded_state_dict['shard_to_metadata'], + ) + unloaded_shards = {} + for sh_base in nested_values(sharded_part): + if isinstance(sh_base, ShardedTensor): + shard_id = _sharded_tensor_shard_id(sh_base) + if shard_id not in loaded_tensors: + unloaded_shards[shard_id] = sh_base + loaded_tensors = exchange_by_distribution( + loaded_tensors, unloaded_shards, distribution, parallelization_group, exchange_algo + ) + loaded_objects = loaded_state_dict['raw_objects'] + + def load_sharded_base(x: Any): + if isinstance(x, ShardedTensor): + shard_id = _sharded_tensor_shard_id(x) + if shard_id not in loaded_tensors: + raise Exception( + 'The current local checkpoint implementation assumes' + 'consistent tensor sharding during load and save operations.' + f'However, the expected shard {x} (ID: {shard_id})' + f'was not found in the checkpoint. (IDs: {loaded_tensors.keys()})' + ) + x = loaded_tensors[shard_id] + if isinstance(x, ShardedObject): + object_id = _sharded_object_id(x) + assert object_id in loaded_objects, (x, object_id, loaded_objects.keys()) + x = loaded_objects[object_id] + return x + + dict_list_map_inplace(load_sharded_base, sharded_part) + sharded_part = apply_factory_merges(sharded_part, sh_ten_factories) + # __adding__ sharded_part + merge(recreated_state_dict, sharded_part) + return recreated_state_dict diff --git a/megatron/core/dist_checkpointing/strategies/__init__.py b/megatron/core/dist_checkpointing/strategies/__init__.py new file mode 100644 index 0000000000..a786b8e84a --- /dev/null +++ b/megatron/core/dist_checkpointing/strategies/__init__.py @@ -0,0 +1,7 @@ +# Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. + +""" Various loading and saving strategies """ +from megatron.core.dist_checkpointing.strategies.common import register_default_common_strategies + +# We load "common" strategies by default to be always available +register_default_common_strategies() diff --git a/megatron/core/dist_checkpointing/strategies/async_utils.py b/megatron/core/dist_checkpointing/strategies/async_utils.py new file mode 100644 index 0000000000..7cdda8ac32 --- /dev/null +++ b/megatron/core/dist_checkpointing/strategies/async_utils.py @@ -0,0 +1,224 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+ +""" +This module provides an async utilities which allow to start +a checkpoint save process in the background. +""" +import logging +from collections import deque +from time import time +from typing import Callable, List, NamedTuple, Optional, Tuple + +import torch +from torch import multiprocessing as mp + +logger = logging.getLogger(__name__) + + +class AsyncRequest(NamedTuple): + """Represents an async request that needs to be scheduled for execution. + + Args: + async_fn (Callable, optional): async function to call. None represents noop. + async_fn_args (Tuple): args to pass to `async_fn`. + finalize_fns (List[Callable]): list of functions to call to finalize the request. + These functions will be called synchronously after `async_fn` is done + *on all ranks*. + """ + + async_fn: Optional[Callable] + async_fn_args: Tuple + finalize_fns: List[Callable] + is_frozen: bool = False + + def add_finalize_fn(self, fn: Callable) -> None: + """Adds a new finalize function to the request. + + Args: + fn (Callable): function to add to the async request. This function + will be called *after* existing finalization functions. + + Returns: + None + """ + if self.is_frozen: + raise RuntimeError('Cannot add finalization functions to a frozen AsyncRequest') + self.finalize_fns.append(fn) + + def execute_sync(self) -> None: + """Helper to synchronously execute the request. + + This logic is equivalent to what should happen in case of the async call. + """ + if self.async_fn is not None: + self.async_fn(*self.async_fn_args) + torch.distributed.barrier() + for finalize_fn in self.finalize_fns: + finalize_fn() + + def freeze(self) -> 'AsyncRequest': + """Freezes the async request, disallowing adding new finalization functions. + + Returns: + AsyncRequest: new async request with all same fields except for the + `is_frozen` flag. + """ + return self._replace(is_frozen=True) + + +class DistributedAsyncCaller: + """Wrapper around mp.Process that ensures correct semantic of distributed finalization. + + Starts process asynchronously and allows checking if all processes on all ranks are done. + """ + + def __init__(self): + self.process: Optional[mp.Process] = None + self.start_time: Optional[float] = None + + def schedule_async_call(self, async_fn: Optional[Callable], save_args: Tuple) -> None: + """Spawn a process with `async_fn` as the target. + + This method must be called on all ranks. + + Args: + async_fn (Callable, optional): async function to call. If None, + no process will be started. + save_args (Tuple): async function args. + """ + if async_fn is None: + return # nothing to do + start_sync = time() + torch.cuda.synchronize() + end_sync = time() + logger.debug( + f"rank: {torch.distributed.get_rank()}, takes {end_sync - start_sync} to finish D2H " + ) + + ctx = mp.get_context('fork') + self.start_time = time() + self.process = ctx.Process(target=async_fn, args=save_args) + self.process.start() + init_time = time() + logger.debug( + f"rank: {torch.distributed.get_rank()}, takes {init_time - self.start_time} to schedule async ckpt " + ) + + def is_current_async_call_done(self, blocking=False) -> bool: + """Check if async save is finished on all ranks. + + For semantic correctness, requires rank synchronization in each check. + This method must be called on all ranks. + + Args: + blocking (bool, optional): if True, will wait until the call is done + on all ranks. Otherwise, returns immediately if at least one rank + is still active. Defaults to False. 
+ + Returns: + bool: True if all ranks are done (immediately of after active wait + if `blocking` is True), False if at least one rank is still active. + """ + # The following takes the same overhead as torch.distributed.barrier (single integer all-reduce) + is_alive = int(self.process.is_alive()) if self.process is not None else 0 + ten = torch.tensor([is_alive], dtype=torch.int, device=torch.cuda.current_device()) + logger.debug( + f"rank: {torch.distributed.get_rank()}, DistributedAsyncCaller is_alive: {is_alive}" + ) + torch.distributed.all_reduce(ten) + if ten[0] > 0 and not blocking: + return False + else: + if self.process is not None: + logger.debug(f"rank: {torch.distributed.get_rank()}, joining self.process") + self.process.join() + self.process = None + + logger.debug( + f"DistributedAsyncCaller: Async process join finished after {time() - self.start_time:.2f}s from forking" + ) + self.start_time = None + return True + + +class _ActiveAsyncRequest(NamedTuple): + """Helper to represent an active async call. + + Args: + idx (int): index of the call (starting from 0) + async_caller (DistributedAsyncCaller): async caller instance that represents + the async process handling the async request + async_request (AsyncRequest): async request that is being called + """ + + idx: int + async_caller: DistributedAsyncCaller + async_request: AsyncRequest + + +class AsyncCallsQueue: + """Manages a queue of async calls. + + Allows adding a new async call with `schedule_async_request` and finalizing + active calls with `maybe_finalize_async_calls`. + """ + + def __init__(self): + self.async_calls: deque[_ActiveAsyncRequest] = deque([]) + self.call_idx: int = -1 + + def schedule_async_request(self, async_request: AsyncRequest) -> int: + """Start a new async call and add it to a queue of active async calls. + + This method must be called on all ranks. + + Args: + async_request (AsyncRequest): async request to start. + + Returns: + int: index of the async call that was started. + This can help the user keep track of the async calls. + """ + self.call_idx += 1 + async_caller = DistributedAsyncCaller() + async_request = async_request.freeze() + async_caller.schedule_async_call(async_request.async_fn, async_request.async_fn_args) + self.async_calls.append(_ActiveAsyncRequest(self.call_idx, async_caller, async_request)) + return self.call_idx + + def maybe_finalize_async_calls(self, blocking=False) -> List[int]: + """Finalizes all available calls. + + This method must be called on all ranks. + + Args: + blocking (bool, optional): if True, will wait until all active requests + are done. Otherwise, finalizes only the async request that already + finished. Defaults to False. + Returns: + List[int]: list of indices (as returned by `schedule_async_request`) + of async calls that have been successfully finalized. + """ + call_idx_finalized = [] + while self.async_calls: + next_async_done = self.async_calls[0].async_caller.is_current_async_call_done(blocking) + if not next_async_done: + break + call_idx, _, async_request = self.async_calls.popleft() + for finalize_fn in async_request.finalize_fns: + finalize_fn() + ten = torch.tensor([call_idx], dtype=torch.int, device=torch.cuda.current_device()) + torch.distributed.all_reduce(ten, op=torch.distributed.ReduceOp.MAX) + assert ( + ten.item() == call_idx + ), 'Unmatched async calls. 
That probably means not all ranks are participating in async finalization' + call_idx_finalized.append(call_idx) + return call_idx_finalized + + def get_num_unfinalized_calls(self): + """Get the number of active async calls.""" + return len(self.async_calls) + + def close(self): + """Finalize all calls upon closing.""" + self.maybe_finalize_async_calls(blocking=True) diff --git a/megatron/core/dist_checkpointing/strategies/base.py b/megatron/core/dist_checkpointing/strategies/base.py new file mode 100644 index 0000000000..35fca1f350 --- /dev/null +++ b/megatron/core/dist_checkpointing/strategies/base.py @@ -0,0 +1,223 @@ +# Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. + +""" Strategies base interfaces. """ + +from abc import ABC, abstractmethod +from collections import defaultdict +from enum import Enum +from pathlib import Path +from typing import Any, DefaultDict, Union + +from ..mapping import CheckpointingException, ShardedStateDict, StateDict +from .async_utils import AsyncCallsQueue, AsyncRequest + + +class StrategyAction(Enum): + """Specifies save vs load and sharded vs common action.""" + + LOAD_COMMON = 'load_common' + LOAD_SHARDED = 'load_sharded' + SAVE_COMMON = 'save_common' + SAVE_SHARDED = 'save_sharded' + + +default_strategies: DefaultDict[str, dict[tuple, Any]] = defaultdict(dict) + +async_calls = AsyncCallsQueue() + + +def get_default_strategy(action: StrategyAction, backend: str, version: int): + """Retrieves a default strategy for a given action, backend and version.""" + try: + if backend == 'zarr': + error_hint = ' Please install `zarr` and `tensorstore<=0.1.45` packages' + from .tensorstore import register_default_tensorstore_strategies + + register_default_tensorstore_strategies() + from .zarr import register_default_zarr_strategies + + register_default_zarr_strategies() + elif backend == 'torch_dist': + error_hint = ' Please use PyTorch version >=2.1' + from .torch import register_default_torch_strategies + + register_default_torch_strategies() + except ImportError as e: + raise CheckpointingException( + f'Cannot import a default strategy for: {(action.value, backend, version)}. ' + f'Error: {e}. Hint: {error_hint}' + ) from e + try: + return default_strategies[action.value][(backend, version)] + except KeyError as e: + raise CheckpointingException( + f'Cannot find a default strategy for: {(action.value, backend, version)}' + ) from e + + +def register_default_strategy( + action: StrategyAction, + backend: str, + version: int, + strategy: Union['SaveStrategyBase', 'LoadStrategyBase'], +): + """Adds a given strategy to the registry of default strategies. + + Args: + action (StrategyAction): specifies save/load and sharded/common + backend (str): backend that the strategy becomes a default for + version (int): version that the strategy becomes a default for + strategy (SaveStrategyBase, LoadStrategyBase): strategy to register + """ + default_strategies[action.value][(backend, version)] = strategy + + +class LoadStrategyBase(ABC): + """Base class for a load strategy. 
Requires implementing checks for compatibility with a + given checkpoint version.""" + + @abstractmethod + def check_backend_compatibility(self, loaded_backend): + """Verifies if this strategy is compatible with `loaded_backend`.""" + raise NotImplementedError + + @abstractmethod + def check_version_compatibility(self, loaded_version): + """Verifies if this strategy is compatible with `loaded_version`.""" + raise NotImplementedError + + @property + def can_handle_sharded_objects(self): + """Returns whether or not this strategy can handle loading ShardedObjects.""" + return False + + +class SaveStrategyBase(ABC): + """Base class for a save strategy. Requires defining a backend type and + version of the saved format.""" + + def __init__(self, backend: str, version: int): + self.backend = backend + self.version = version + + @property + def can_handle_sharded_objects(self): + """Returns whether or not this strategy can handle saving ShardedObjects.""" + return False + + def __str__(self): + return f'{self.__class__.__name__}({self.backend}, {self.version})' + + +class LoadCommonStrategy(LoadStrategyBase): + """Load strategy for common (non-sharded) objects""" + + @abstractmethod + def load_common(self, checkpoint_dir: Path): + """Load common part of the checkpoint.""" + raise NotImplementedError + + @abstractmethod + def load_sharded_objects( + self, sharded_objects_state_dict: ShardedStateDict, checkpoint_dir: Path + ): + """Load sharded objects from the checkpoint.""" + raise NotImplementedError + + def load_sharded_metadata(self, checkpoint_dir: Path) -> ShardedStateDict: + """Load just the metadata from the checkpoint.""" + if not self.can_handle_sharded_objects: + return {} + raise NotImplementedError + + +class LoadShardedStrategy(LoadStrategyBase): + """Load strategy for sharded tensors""" + + @abstractmethod + def load(self, sharded_state_dict: ShardedStateDict, checkpoint_dir: Path): + """Load the sharded part of the checkpoint.""" + raise NotImplementedError + + @abstractmethod + def load_tensors_metadata(self, checkpoint_dir: Path): + """Load tensors metadata from the checkpoint for ShardedTensors. + + Returns a dictionary similar to a sharded state dict, but note that + the dictionary keys are simply ShardedTensor keys (contrary to the + actual sharded state dicts where keys correspond to state dict keys). + + Dict values are ShardedTensors without any data and sharding (so, the + only useful information is tensors global shape and dtype). + """ + raise NotImplementedError( + f'Loading only tensors metadata not implemented for {self.__class__.__name__}' + ) + + def load_sharded_metadata(self, checkpoint_dir: Path): + """Load sharded metadata from the checkpoint for ShardedTensors and ShardedObjects. + + Returns a dictionary similar to a sharded state dict, but note that + the dictionary keys are simply sharded keys (contrary to the + actual sharded state dicts where keys correspond to state dict keys). + + Dict values are ShardedTensors or ShardedObjects without any data and sharding. 
+ """ + if not self.can_handle_sharded_objects: + return self.load_tensors_metadata(checkpoint_dir) + raise NotImplementedError( + f'Loading only sharded metadata not implemented for {self.__class__.__name__}' + ) + + +class SaveCommonStrategy(SaveStrategyBase): + """Save strategy for common (non-sharded) objects""" + + @abstractmethod + def save_common(self, common_state_dict: StateDict, checkpoint_dir: Path): + """Save common part of the state dict.""" + raise NotImplementedError + + def save_sharded_objects( + self, sharded_objects_state_dict: ShardedStateDict, checkpoint_dir: Path + ): + """Save sharded objects from the state dict.""" + raise NotImplementedError + + +class SaveShardedStrategy(SaveStrategyBase): + """Save strategy for sharded tensors""" + + @abstractmethod + def save(self, sharded_state_dict: ShardedStateDict, checkpoint_dir: Path): + """Save the sharded part of the state dict.""" + raise NotImplementedError + + +class AsyncSaveShardedStrategy(SaveShardedStrategy): + """Save strategy suitable for async save.""" + + @abstractmethod + def async_save( + self, sharded_state_dict: ShardedStateDict, checkpoint_dir: Path + ) -> AsyncRequest: + """Perform preparation and return an AsyncRequest to the external caller. + + Args: + sharded_state_dict (ShardedStateDict): sharded state dict to save + checkpoint_dir (Path): checkpoint target directory + + Returns: + AsyncRequest: represents the async save function and finalization function. + It is the caller responsibility to actually schedule the async save. + """ + raise NotImplementedError + + def save(self, sharded_state_dict: ShardedStateDict, checkpoint_dir: Path): + """Each async strategy can be trivially used as a sync strategy.""" + async_request = self.async_save(sharded_state_dict, checkpoint_dir) + # multiprocessing routines may cause issue when called on parent process + # We keep this verbose call for now + global async_calls + async_calls.schedule_async_request(async_request) + async_calls.maybe_finalize_async_calls(blocking=True) diff --git a/megatron/core/dist_checkpointing/strategies/common.py b/megatron/core/dist_checkpointing/strategies/common.py new file mode 100644 index 0000000000..f2c87b4d60 --- /dev/null +++ b/megatron/core/dist_checkpointing/strategies/common.py @@ -0,0 +1,157 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +""" Common strategies. 
""" + +import logging +import os +from pathlib import Path + +import torch + +from megatron.core.dist_checkpointing.mapping import ShardedStateDict, StateDict +from megatron.core.dist_checkpointing.strategies.base import ( + SaveCommonStrategy, + StrategyAction, + register_default_strategy, +) + +from ..dict_utils import dict_list_map_inplace, nested_values +from ..mapping import CheckpointingException, ShardedObject, is_main_replica +from ..strategies.base import LoadCommonStrategy + +COMMON_STATE_FNAME = 'common.pt' + +logger = logging.getLogger(__name__) + + +def register_default_common_strategies(): + """Register default common strategies.""" + register_default_strategy(StrategyAction.LOAD_COMMON, 'torch', 1, TorchCommonLoadStrategy()) + register_default_strategy( + StrategyAction.SAVE_COMMON, 'torch', 1, TorchCommonSaveStrategy('torch', 1) + ) + + +class TorchCommonSaveStrategy(SaveCommonStrategy): + """Common save strategy leveraging native torch save/load.""" + + def save_common(self, common_state_dict: StateDict, checkpoint_dir: Path): + """Save common part of the state dict.""" + if torch.distributed.get_rank() == 0: + torch.save(common_state_dict, checkpoint_dir / COMMON_STATE_FNAME) + + def save_sharded_objects( + self, sharded_objects_state_dict: ShardedStateDict, checkpoint_dir: Path + ): + """Save sharded objects from the state dict.""" + for sh_obj in nested_values(sharded_objects_state_dict): + if is_main_replica(sh_obj.replica_id): + save_path = checkpoint_dir / f'{sh_obj.unique_key}.pt' + os.makedirs(save_path.parent, exist_ok=True) + torch.save(sh_obj.data, save_path) + + def can_handle_sharded_objects(self): + """This strategy can handle ShardedObjects.""" + return True + + +class TorchCommonLoadStrategy(LoadCommonStrategy): + """Common load strategy leveraging native torch save/load.""" + + def load_common(self, checkpoint_dir: Path): + """Load common (non-sharded) objects state dict from the checkpoint. + + Args: + checkpoint_dir (Path): checkpoint directory + + Returns: + StateDict: state dict with non-sharded objects from the checkpoint + """ + load_path = Path(checkpoint_dir) / COMMON_STATE_FNAME + try: + return torch.load(load_path, map_location='cpu') + except FileNotFoundError as e: + err_msg = f'Common file {load_path} does not exist' + ckpt_files = [f.name for f in checkpoint_dir.iterdir()] + logger.debug(f'{err_msg}. Checkpoint directory content: {ckpt_files}') + raise CheckpointingException(err_msg) from e + + def load_sharded_objects( + self, sharded_objects_state_dict: ShardedStateDict, checkpoint_dir: Path + ): + """Replaces all ShardedObject from a given state dict with values loaded from the + checkpoint. + + Args: + sharded_objects_state_dict (ShardedStateDict): + sharded state dict defining what objects should be loaded. + checkpoint_dir (Path): checkpoint directory + + Returns: + None: sharded state dict is modified in place + """ + + def load_sharded_object(sh_obj: ShardedObject): + sh_obj.data = None + load_path = checkpoint_dir / f'{sh_obj.unique_key}.pt' + try: + loaded_obj = torch.load(load_path) + except FileNotFoundError as e: + # Backward compatible logic: previously the save format was incorrect + old_load_path = (checkpoint_dir / sh_obj.unique_key).with_suffix('.pt') + try: + loaded_obj = torch.load(old_load_path) + except FileNotFoundError: + err_msg = f'Object shard {load_path} not found' + obj_subdir = checkpoint_dir / sh_obj.key + if obj_subdir.exists(): + obj_files = [f.name for f in obj_subdir.iterdir()] + logger.debug( + f'{err_msg}. 
Object {sh_obj.key} directory content: {obj_files}' + ) + else: + ckpt_files = [f.name for f in checkpoint_dir.iterdir()] + logger.debug( + f'{err_msg}. Object {sh_obj.key} directory does not exist. Checkpoint' + f' directory content: {ckpt_files}' + ) + raise CheckpointingException(err_msg) from e + return loaded_obj + + return dict_list_map_inplace(load_sharded_object, sharded_objects_state_dict) + + def load_sharded_metadata(self, checkpoint_dir: Path) -> ShardedStateDict: + sharded_metadata = {} + for subdir in checkpoint_dir.iterdir(): + if not subdir.is_dir(): + continue + shard_files = list(subdir.glob('shard_*.pt')) + if not shard_files: + continue + sh_objs = [] + for shard_file in shard_files: + full_key = f'{subdir.name}/{shard_file.stem}' + sh_objs.append(ShardedObject.empty_from_unique_key(full_key)) + + # This is a backward-compatibility fix, where the last global shape is missing in the + # name + if sh_objs[0].global_shape[-1] < 0: + max_last_offset = max(map(lambda sh_obj: sh_obj.global_offset[-1], sh_objs)) + for sh_obj in sh_objs: + sh_obj.global_shape = (*sh_obj.global_shape[:-1], max_last_offset + 1) + + # Update the sharded state dict + for sh_obj in sh_objs: + sharded_metadata[sh_obj.unique_key] = sh_obj + return sharded_metadata + + @property + def can_handle_sharded_objects(self): + """This strategy can handle ShardedObjects.""" + return True + + def check_backend_compatibility(self, loaded_version): + pass + + def check_version_compatibility(self, loaded_version): + pass diff --git a/megatron/core/dist_checkpointing/strategies/filesystem_async.py b/megatron/core/dist_checkpointing/strategies/filesystem_async.py new file mode 100644 index 0000000000..9d0be4d6e7 --- /dev/null +++ b/megatron/core/dist_checkpointing/strategies/filesystem_async.py @@ -0,0 +1,386 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +""" Storage writer for PyT Distributed format allowing asynchronous save. """ +import gc +import logging +import os +import queue +from contextlib import contextmanager +from itertools import chain +from pathlib import Path +from time import time +from typing import Callable, Dict, List, Optional, Tuple, Union + +import psutil +import torch +from torch import multiprocessing as mp +from torch.distributed.checkpoint import FileSystemWriter +from torch.distributed.checkpoint.filesystem import DEFAULT_SUFFIX, _StoragePrefix, _write_item +from torch.distributed.checkpoint.planner import SavePlan, SavePlanner, WriteItem, WriteItemType +from torch.distributed.checkpoint.storage import WriteResult +from torch.futures import Future + +logger = logging.getLogger(__name__) + +WriteBucket = Tuple[Path, str, Tuple[list, list]] # represents writes to a single file + +_results_queue = None + + +def _get_write_results_queue(): + global _results_queue + if _results_queue is None: + ctx = mp.get_context('spawn') + _results_queue = ctx.Manager().Queue() + return _results_queue + + +@contextmanager +def _disable_gc(): + """Temporarily disables GC.""" + gc_enabled = gc.isenabled() + try: + if gc_enabled: + gc.disable() + yield + finally: + if gc_enabled: + gc.enable() + + +class FileSystemWriterAsync(FileSystemWriter): + """ + Async-enabled implementation of FileSystemWriter using file IO. + + This class doesn't spawn the async process itself, relies on the external async mechanism. + + Flow: + 1. Call `write_data` + 2. Externally start async process with `get_save_function_and_args` function and args + 3. 
The async function to call is `writer_proxy_func` which calls + `write_preloaded_data` in multiple processes + + After saving is finalized on all ranks: + 4. Call `super().finish` with the results gathered in `self.writer_result` + + Note that step (3) above can also be called synchronously. + + Currently, it's assumed that a separate writer is created for each ckpt save + (intermediate state is stored as writer attributes). + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + if not self.single_file_per_rank: + raise NotImplementedError( + 'single_file_per_rank flag not supported for FileSystemWriterAsync' + ) + + # Intermediate state between preparation and finalization + self.write_buckets: Optional[List[WriteBucket]] = None + self.results_queue: Optional[mp.Queue] = None + + def prepare_write_data(self, plan: SavePlan, planner: SavePlanner) -> None: + """ + First stage of async saving. Copy data to CPU and plan the local saving. + + Args: + plan (SavePlan): save plan generated by the PyT Distributed compatible planner + planner (SavePlanner): save planner used to resolve the bytes and tensor data + + Returns: None, but stores the save plan in `self.write_buckets` + """ + storage_plan: _StoragePrefix = plan.storage_data + start = time() + logger.debug(f"thread_count: {self.thread_count}, time: {start}") + item_buckets = _split_by_size_and_type(self.thread_count, plan.items) + logger.debug(f"bucket_prep, time: {time() - start}") + + start = time() + # move tensors from GPU to CPU before starting async writing + # We do D2H synchronously for now + file_count = 0 + + def gen_file(): + nonlocal file_count + file_name = f"{storage_plan.prefix}{file_count}{DEFAULT_SUFFIX}" + file_count += 1 + return file_name + + # Prepare bytes / tensor data in each bucket, which will be assigned to each writer process + self.write_buckets = [] + for bucket in item_buckets: + bytes_data = [ + (item, planner.resolve_data(item)) + for item in bucket + if item.type == WriteItemType.BYTE_IO + ] + tensor_data = [ + (item, planner.resolve_data(item).detach().to("cpu", non_blocking=True)) + for item in bucket + if item.type != WriteItemType.BYTE_IO + ] + if len(bytes_data) > 0 or len(tensor_data) > 0: + file_name = gen_file() + self.write_buckets.append( + (self.path / file_name, file_name, (bytes_data, tensor_data)) + ) + + # Check if there is anything to write on this rank + if len(self.write_buckets) > 0: + assert len(self.write_buckets) <= self.thread_count, ( + len(self.write_buckets), + self.thread_count, + ) + self.results_queue = _get_write_results_queue() + else: + self.results_queue = None + end = time() + logger.debug(f"D2H and push, time: {end - start}") + + def get_save_function_and_args(self) -> Tuple[Optional[Callable], Tuple]: + """ + Get function that saves the data to storage along with its arguments. + Allows the external caller to apply the save function synchronously or asynchronously. + + Returns: None (if there is nothing to write on this rank) or a tuple of: + - the function that saves the data + - arguments to that function + """ + if not self.write_buckets: + return None, () + return (self.write_preloaded_data_multiproc, (self.write_buckets, self.results_queue)) + + @staticmethod + @_disable_gc() + def write_preloaded_data_multiproc( + write_buckets: List[WriteBucket], global_results_queue: mp.Queue + ) -> None: + """ + Performs saving data to storage with multiple processes. 
+ + Starts predefined number of processes and uses 2 queues to make sure the results + are complete: + - local_results_queue - to send the actual results + - count_queue - small queue to mark worker as completed + + Using just one queue disallowed proper exception handling. + + This method is meant to be run in a forked subprocess. + Triggering GC during execution leads to CUDA errors + (cleaning up tensors owned by the parent process). + To prevent this, we disable the GC explicitly for this function with _disable_gc. + + Args: + write_buckets (List[WriteBucket]): write plan + global_results_queue (mp.Queue): mp.Queue to collect Dict[List[WriteResults]] (or an Exception) + from parallel write processes to the main training process + Returns: None + """ + w_start = time() + write_results_or_exc: Union[dict, Exception] = dict() + ctx = mp.get_context('fork') + local_results_queue = ctx.Queue() + count_queue = ctx.JoinableQueue() + p_list = [] + for i, write_bucket in enumerate(write_buckets): + try: + count_queue.put(i) + p_list.append( + ctx.Process( + target=FileSystemWriterAsync.write_preloaded_data, + args=(i, write_bucket, local_results_queue, count_queue, True), + ) + ) + except Exception as e: + err_msg = f'An error is caught while a proc {i} is created, error: {e}' + logger.error(err_msg) + write_results_or_exc = RuntimeError(err_msg) + + if not isinstance(write_results_or_exc, Exception): + for p in p_list: + p.start() + + logger.debug('FileSystemWriterAsync: collecting worker results...') + + # To make sure all nodes are completed + count_queue.join() + # At this point, all workers completed, so the queue should have exactly `len(write_buckets)` items + for proc_idx in range(len(write_buckets)): + try: + local_proc_idx, local_results_or_exc = local_results_queue.get() + except queue.Empty: + write_results_or_exc = RuntimeError( + f'Unexpected empty `local_results_queue` (got only {proc_idx}/{len(write_buckets)} items)' + ) + break + else: + if isinstance(local_results_or_exc, Exception): + err_msg = f"Local process {local_proc_idx} encountered an error: {local_results_or_exc}" + logger.error(err_msg) + write_results_or_exc = local_results_or_exc + break + else: + assert isinstance(local_results_or_exc, list), type(local_results_or_exc) + write_results_or_exc[local_proc_idx] = local_results_or_exc + p_list[local_proc_idx].join() + + logger.debug('FileSystemWriterAsync: collected worker results successfully') + + global_results_queue.put(write_results_or_exc) + + w_end = time() + logger.debug( + f"{w_end}, rank: {torch.distributed.get_rank()}, write(sync,parallel): {w_end - w_start}" + ) + + @staticmethod + @_disable_gc() + def write_preloaded_data( + local_proc_idx: int, + write_bucket: WriteBucket, + results_queue: mp.SimpleQueue, + count_queue: mp.JoinableQueue, + use_fsync: bool, + ) -> None: + """ + Performs actual data saving to storage. + + Args: + local_proc_idx (int): index of a local process that performs writing + write_bucket (WriteBucket): data to write to storage + results_queue (mp.Queue): queue to return the write results to the proxy checkpoint process. 
+ count_queue (mp.JoinableQueue): queue to marks worker task as completed + use_fsync (bool): if True, calls os.fsync at the end of saving + + Returns: None, the write result are put into the `queue` + """ + mem_before = _process_memory() + + local_results = [] + try: + file_name, storage_key, (bytes_data, tensor_data) = write_bucket + with open(file_name, "wb") as stream: + for write_item, data in bytes_data: + local_results.append(_write_item(stream, data, write_item, storage_key)) + + for write_item, tensor in tensor_data: + assert tensor.is_cpu + local_results.append(_write_item(stream, tensor, write_item, storage_key)) + + if use_fsync: + os.fsync(stream.fileno()) + local_output = (local_proc_idx, local_results) + except Exception as e: + local_output = (local_proc_idx, e) + + results_queue.put(local_output) + # Signal this process is done. + count_queue.get() + count_queue.task_done() + + mem_after = _process_memory() + logger.debug( + f"{local_proc_idx} consumed: {mem_after - mem_before}, before: {mem_before}, after: {mem_after}" + ) + + def write_data(self, plan: SavePlan, planner: SavePlanner) -> Future[List[WriteResult]]: + raise NotImplementedError('write_data not implemented for FileSystemWriterAsync') + + def retrieve_write_results(self) -> List[WriteResult]: + """ + Turn the latest dict including write results from `self.results_queue` into a single results lists. Includes error check. + + Returns (List[WriteResult]): the list of write results from all local processes performing the save. + + """ + assert self.write_buckets is not None + + if self.results_queue is None: + write_results_or_exc = {} + else: + try: + write_results_or_exc = self.results_queue.get_nowait() + except queue.Empty: + raise RuntimeError(f'results_queue should not be empty') + + if isinstance(write_results_or_exc, Exception): + raise RuntimeError(f'Worker failure: {write_results_or_exc}') from write_results_or_exc + write_results: dict = write_results_or_exc + if len(write_results) != len(self.write_buckets): + raise RuntimeError( + f'Incomplete worker results (expected {len(self.write_buckets)}, got {len(write_results)}.' + f' This probably indicates a worker failure.' + ) + return list(chain.from_iterable(write_results.values())) + + +def _split_by_size_and_type(bins: int, items: List[WriteItem]) -> List[List[WriteItem]]: + """ + Splits write items according to item size into close to uniform bins. + + Same as torch.distributed.checkpoint.filesystem._split_by_size_and_type, + but with a fixed _item_size function. + + Args: + bins (int): numbers of bins to split to + items (List[WriteItem]): list of write items + + Returns (List[List[WriteItem]]): write items split to bins + """ + if bins == 1: + return [items] + + bytes_items = [wi for wi in items if wi.type == WriteItemType.BYTE_IO] + tensor_items = [wi for wi in items if wi.type != WriteItemType.BYTE_IO] + + buckets: List[List[WriteItem]] = [[] for _ in range(bins)] + bucket_sizes = [0 for _ in range(bins)] + + tensor_items.sort(key=_item_size, reverse=True) + + # Assign bytes with a simple round-robin + for i, item in enumerate(bytes_items): + buckets[i % bins].append(item) + + # Then, assign tensors according to their sizes + for item in tensor_items: + # TODO replace with headq + idx = min(enumerate(bucket_sizes), key=lambda x: x[1])[0] + buckets[idx].append(item) + bucket_sizes[idx] += _item_size(item) + + return buckets + + +def _item_size(item: WriteItem) -> int: + """ + Calculates size (in bytes) of a single write item. 
+ + Same as torch.distributed.checkpoint.filesystem._item_size, + but fixes computing chunk size (with item.tensor_data.chunk.sizes) + + Args: + item (WriteItem): write item to compute the size of + + Returns (int): size of an item in bytes + """ + size = 1 + assert item.tensor_data is not None + # can't use math.prod as PT needs to support older python + for s in item.tensor_data.chunk.sizes: + size *= s + + dtype = item.tensor_data.properties.dtype + return size * torch._utils._element_size(dtype) + + +def _process_memory() -> int: + """ + Get memory used by current process. + + Returns (int): memory used by current process + """ + process = psutil.Process(os.getpid()) + mem_info = process.memory_info() + return mem_info.rss diff --git a/megatron/core/dist_checkpointing/strategies/fully_parallel.py b/megatron/core/dist_checkpointing/strategies/fully_parallel.py new file mode 100644 index 0000000000..56523daf1c --- /dev/null +++ b/megatron/core/dist_checkpointing/strategies/fully_parallel.py @@ -0,0 +1,439 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +import logging +from pathlib import Path +from time import time +from typing import Dict, Optional, Tuple + +import torch +import torch.distributed as dist + +from megatron.core.dist_checkpointing import ShardedTensor +from megatron.core.dist_checkpointing.core import CheckpointingException +from megatron.core.dist_checkpointing.dict_utils import ( + dict_list_map_inplace, + extract_matching_values, + merge, + nested_values, +) +from megatron.core.dist_checkpointing.exchange_utils import ( + ShardDistribution, + determine_main_replica_uniform_distribution, + exchange_by_distribution, +) +from megatron.core.dist_checkpointing.mapping import ShardedStateDict, StateDict, is_main_replica +from megatron.core.dist_checkpointing.strategies.base import ( + AsyncSaveShardedStrategy, + LoadShardedStrategy, + SaveShardedStrategy, +) +from megatron.core.dist_checkpointing.utils import _sharded_tensor_shard_id, _ShardId +from megatron.core.dist_checkpointing.validation import ( + determine_global_metadata, + validate_sharding_integrity, +) + +logger = logging.getLogger(__name__) + + +class FullyParallelSaveStrategyWrapper(AsyncSaveShardedStrategy): + """Wraps arbitrary strategy and distributes the save during `save`. + + The save distribution happens without any *data* communication. + Only the *metadata* is exchanged and based on data replication on different + ranks, we try to distribute the save as uniformly as possible. + + This wrapper assumes, that setting `replica_id` to 0 will make the + underlying strategy do the saving on current rank. All the other `replica_id`s + are set to 1. + + Currently, the save distribution is realized with a greedy algorithm + described in `distribute_shards_to_ranks`. + + Args: + strategy (SaveShardedStrategy): base strategy to wrap + parallelization_group (ProcessGroup, optional): process group to use for save + distribution. Note that this doesn't have to match exactly the + data distribution, but should cover the replication pattern + to maximize performance. Defaults to the whole world. + do_cache_distribution (bool, optional): whether to cache the save distribution + from previous calls. Should be set to True only if the state dict + structure between the calls is always the same. Defaults to True. 
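+
+    A minimal usage sketch (illustrative only; the backend/version passed to
+    `get_default_strategy` and the `data_parallel_group` handle are assumptions,
+    not part of this module):
+
+        base_strategy = get_default_strategy(StrategyAction.SAVE_SHARDED, 'torch_dist', 1)
+        wrapped = FullyParallelSaveStrategyWrapper(
+            base_strategy, parallelization_group=data_parallel_group
+        )
+        wrapped.save(sharded_state_dict, Path('/path/to/checkpoint'))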
+ """ + + def __init__( + self, + strategy: SaveShardedStrategy, + parallelization_group: Optional[torch.distributed.ProcessGroup] = None, + do_cache_distribution: bool = False, + ): + super().__init__(strategy.backend, strategy.version) + self.base_strategy = strategy + self.parallelization_group = parallelization_group + self.do_cache_distribution = do_cache_distribution + + self.cached_distribution: Optional[ShardDistribution] = None + + def async_save(self, sharded_state_dict: ShardedStateDict, checkpoint_dir: Path): + if not isinstance(self.base_strategy, AsyncSaveShardedStrategy): + raise CheckpointingException( + f'Cannot apply async_save to non-async base strategy {self.base_strategy}' + ) + self.apply_saving_parallelization(sharded_state_dict) + return self.base_strategy.async_save(sharded_state_dict, checkpoint_dir) + + def save(self, sharded_state_dict: ShardedStateDict, checkpoint_dir: Path): + self.apply_saving_parallelization(sharded_state_dict) + return self.base_strategy.save(sharded_state_dict, checkpoint_dir) + + def apply_saving_parallelization(self, sharded_state_dict: ShardedStateDict) -> None: + """Distributes the save across ranks by exchanging metadata. + + Exchanges metadata from the state dict and computes the uniform + (as close as possible) distribution of saves among the ranks. + + If `self.do_cache_distribution` is True, caches the distribution between + the calls and subsequent distributions happen without any inter-rank + communication. + + Args: + sharded_state_dict (ShardedStateDict): state dict to distribute the saving + + Returns: None + """ + start = time() + if self.do_cache_distribution and self.cached_distribution is not None: + logger.debug(f'Apply *cached* save parallelization') + precomputed_distribution = self.cached_distribution + else: + logger.debug(f'Apply save parallelization') + precomputed_distribution = determine_main_replica_uniform_distribution( + sharded_state_dict, self.parallelization_group + ) + + distribute_main_replicas_with_precomputed_distribution( + sharded_state_dict, self.parallelization_group, precomputed_distribution + ) + if self.cached_distribution is None: + # First time applying the parallelization + validate_sharding_integrity(determine_global_metadata(sharded_state_dict)[1]) + if self.do_cache_distribution: + self.cached_distribution = precomputed_distribution + end = time() + logger.debug(f"parallel save sharding, time: {end - start}") + + @property + def can_handle_sharded_objects(self): + return self.base_strategy.can_handle_sharded_objects + + +class FullyParallelLoadStrategyWrapper(LoadShardedStrategy): + """Wraps arbitrary load strategy and distributes the load during `load`. + + See `load` method docs for details. + + Args: + strategy (LoadShardedStrategy): base strategy to wrap + parallelization_group (ProcessGroup, optional): process group to use for load + distribution. Note that this doesn't have to match exactly the + data distribution, but should cover the replication pattern + to maximize performance. Defaults to the whole world. + In most cases, it's recommended to set it to the DP group. + do_cache_distribution (bool, optional): whether to cache the load distribution + from previous calls. Should be set to True only if the state dict + structure between the calls is always the same. Defaults to False, + since the loading in general happens only once during training. + Note that the load distribution *cannot* be reused as a save distribution, + because save/load is not fully symmetrical. 
+ exchange_algo (str): algorithm to use for exchanging the data. + Options: + - broadcast - each rank broadcasts individual tensors to others + - gather_object (default) - ranks all_gather_object the whole loaded state dicts + - gather_rounds (default) - ranks all gather individual tensors in rounds + See method docs for more details. + """ + + def __init__( + self, + strategy: LoadShardedStrategy, + parallelization_group: Optional[torch.distributed.ProcessGroup] = None, + do_cache_distribution: bool = False, + exchange_algo: str = 'broadcast', + ): + super().__init__() + self.base_strategy = strategy + if parallelization_group is None: + parallelization_group = ( + dist.GroupMember.WORLD + ) # explicit group needed for torch.distributed.get_global_rank call + self.parallelization_group = parallelization_group + self.do_cache_distribution = do_cache_distribution + self.exchange_algo = exchange_algo + + self.cached_distribution: Optional[ShardDistribution] = None + + def load(self, sharded_state_dict: ShardedStateDict, checkpoint_dir: Path) -> StateDict: + """Distributes the load and calls underlying strategy only for parts of the state dict. + + Steps: + 1. Load metadata is exchanged between the ranks in the parallelization group. + 2. Each rank deterministically plans the load for the whole workload + so that the loads are as uniform as possible. + 3. Each ranks loads its planned shard of the checkpoint. + 4. All ranks exchange the loaded shards. + + Internode communication is involved in steps (1) (with metadata) + and (4) (with actual data). Storage interaction is involved in step (3). + + Currently, the load distribution (step 2) is realized with a greedy algorithm + described in `distribute_shards_to_ranks` (same as for saving distribution). + + Currently, the shards are all gathered between all ranks in the parallelization + group. This might not be optimal (some ranks do not need all tensors), + but it's a reasonable approximation for an optimal exchange in most scenarios. + + Args: + sharded_state_dict (ShardedStateDict): sharded state dict to load + checkpoint_dir (Path): checkpoint directory to load from + + Returns: + StateDict: loaded state dict. The state dict should be equivalent to + a state dict that would be loaded with the underlying strategy + without this wrapper. + """ + if torch.distributed.get_world_size(self.parallelization_group) <= 1: + return self.base_strategy.load(sharded_state_dict, checkpoint_dir) + + # Step 1 and 2: exchange load metadata and distribute the load + start = time() + precomputed_distribution = self.apply_loading_parallelization(sharded_state_dict) + assert ( + precomputed_distribution is not None + ), 'Expecting non-trivial distribution for non-trivial parallelization group' + end = time() + logger.debug(f'self.apply_loading_parallelization took {end - start}s') + start = end + + # Step 3: load part of the checkpoint. + # Load only sharded objects first. 
ShardedTensors will be loaded separately + # so that we can keep track of sharded tensors loaded by this rank + (sharded_tensors, sharded_state_dict, to_load_shards, unloaded_shards) = ( + self._defer_loading_sharded_tensors(sharded_state_dict) + ) + loaded_state_dict = self.base_strategy.load(sharded_state_dict, checkpoint_dir) + + end = time() + logger.debug(f'Base load of ShardedObjects took {end - start}s') + start = end + + # Load sharded tensors separately + loaded_tensors = self.base_strategy.load(to_load_shards, checkpoint_dir) + + end = time() + logger.debug(f'Base load of ShardedTensors took {end - start}s') + start = end + + # Step 4: exchange data between ranks + logger.debug(f'Applying parallel load with algo {self.exchange_algo}') + all_loaded_tensors = exchange_by_distribution( + loaded_tensors, + unloaded_shards, + precomputed_distribution, + self.parallelization_group, + self.exchange_algo, + ) + if not set(unloaded_shards.keys()).issubset(all_loaded_tensors.keys()): + missing_shards = set(unloaded_shards.keys()) - all_loaded_tensors.keys() + raise CheckpointingException( + f'Missing shards after fully parallel loading: {missing_shards}' + ) + + sync_start = time() + torch.cuda.synchronize() + end = time() + logger.debug(f'torch.cuda.synchronize took {end - sync_start}s') + logger.debug(f'self.exchange_loaded_tensors took {end - start}s') + + self.fill_in_deferred_sharded_tensors(sharded_tensors, all_loaded_tensors) + merge(loaded_state_dict, sharded_tensors) + return loaded_state_dict + + def _defer_loading_sharded_tensors( + self, sharded_state_dict: ShardedStateDict + ) -> Tuple[ + ShardedStateDict, + ShardedStateDict, + Dict[_ShardId, ShardedTensor], + Dict[_ShardId, ShardedTensor], + ]: + """Divides state dict into parts loaded by this vs other ranks. + + ShardedTensors with main replica_id will be loaded by this rank, + others will be received by other ranks (after loading from storage). + + Args: + sharded_state_dict (ShardedStateDict): state dict with ShardedTensor + that will be divided. + + Returns: a tuple of: + - ShardedStateDict: sub-state dict only with ShardedTensors + - ShardedStateDict: sub-state dict with non-ShardedTensors + - Dict[_ShardId, ShardedTensor]: ShardedTensor are uniquely identified + by shard ids. This is a mapping from shard id to a corresponding + ShardedTensor for tensors loaded by *this* rank + - Dict[_ShardId, ShardedTensor]: mapping from shard id to a corresponding + ShardedTensor for tensors loaded by *other* ranks + """ + to_load_shards = {} + unloaded_shards = {} + + sharded_tensors, sharded_state_dict = extract_matching_values( + sharded_state_dict, lambda v: isinstance(v, ShardedTensor) + ) + + def wrap_non_main_replicas(x): + if isinstance(x, ShardedTensor): + # Assign shard to be loaded or not + if is_main_replica(x.replica_id): + to_load_shards[_sharded_tensor_shard_id(x)] = x + else: + unloaded_shards[_sharded_tensor_shard_id(x)] = x + return x + + dict_list_map_inplace(wrap_non_main_replicas, sharded_tensors) + return sharded_tensors, sharded_state_dict, to_load_shards, unloaded_shards + + def apply_loading_parallelization( + self, sharded_state_dict: ShardedStateDict + ) -> Optional[ShardDistribution]: + """Distributes the load across ranks by exchanging metadata. + + Exchanges metadata from the state dict and computes the uniform + (as close as possible) distribution of loads among the ranks. + Marks ShardedTensors to be loaded by the current rank with replica_id 0 + (and others with non 0 values). 
+ + If `self.do_cache_distribution` is True, caches the distribution between + the calls and subsequent distributions happen without any inter-rank + communication. + + Args: + sharded_state_dict (ShardedStateDict): state dict to distribute the loading + + Returns: + ShardDistribution (optional): the computed loading distribution + """ + if self.do_cache_distribution and self.cached_distribution is not None: + logger.debug(f'Apply *cached* load parallelization') + precomputed_distribution = self.cached_distribution + else: + logger.debug(f'Apply load parallelization') + precomputed_distribution = determine_main_replica_uniform_distribution( + sharded_state_dict, self.parallelization_group, True + ) + + distribute_main_replicas_with_precomputed_distribution( + sharded_state_dict, self.parallelization_group, precomputed_distribution + ) + if self.do_cache_distribution: + self.cached_distribution = precomputed_distribution + + return precomputed_distribution + + def fill_in_deferred_sharded_tensors( + self, sharded_state_dict: ShardedStateDict, loaded_tensors: Dict[_ShardId, torch.Tensor] + ) -> None: + """Fill in tensors not loaded by current rank with tensors from `loaded_tensors` map. + + Args: + sharded_state_dict (ShardedStateDict): sharded state dict to fill in. + ShardedTensors are completely replaced with corresponding torch.Tensors. + loaded_tensors (Dict[_ShardId, torch.Tensor]): dict allowing to map + ShardedTensor from the sharded_state_dict to loaded tensors. + + Returns: + + """ + + def fill_in_sharded_tensor(x): + if isinstance(x, ShardedTensor): + try: + x = loaded_tensors[_sharded_tensor_shard_id(x)] + except KeyError as e: + raise CheckpointingException( + f'Missing loaded tensor shard: {_sharded_tensor_shard_id(x)}' + ) from e + + return x + + dict_list_map_inplace(fill_in_sharded_tensor, sharded_state_dict) + + @property + def can_handle_sharded_objects(self): + return self.base_strategy.can_handle_sharded_objects + + def load_tensors_metadata(self, checkpoint_dir: Path): + return self.base_strategy.load_tensors_metadata(checkpoint_dir) + + def load_sharded_metadata(self, checkpoint_dir: Path): + return self.base_strategy.load_sharded_metadata(checkpoint_dir) + + def check_backend_compatibility(self, loaded_version): + return self.base_strategy.check_backend_compatibility(loaded_version) + + def check_version_compatibility(self, loaded_version): + return self.base_strategy.check_version_compatibility(loaded_version) + + +def distribute_main_replicas_with_precomputed_distribution( + sharded_state_dict: ShardedStateDict, + parallelization_group: torch.distributed.ProcessGroup, + precomputed_distribution: Optional[ShardDistribution], +): + """Applies the save distribution computed with `determine_main_replica_uniform_distribution`. + + Based on rank assignment, sets replica ids of the shards saved by current rank to 0 + and all the other replica ids to 1. + + Args: + sharded_state_dict (ShardedStateDict): state dict to apply the save distribution to + parallelization_group (ProcessGroup): distribution will be applied within this + process group. Must match with the process group passed to + `determine_main_replica_uniform_distribution`. 
+ precomputed_distribution (ShardDistribution): distribution computed with + `determine_main_replica_uniform_distribution` + + Returns: None + + Example replica ids of tensors A, B, C before distribution: + rank0: A: (0, 0, 0), B: (0, 0, 0), C: (0, 0, 0) + rank1: A: (0, 0, 1), B: (0, 0, 1), C: (0, 0, 1) + rank2: A: (0, 0, 2), B: (0, 0, 2), C: (0, 0, 2) + + Replicas after distribution for the example above: + rank0: A: 0, B: 1, C: 1 + rank1: A: 1, B: 0, C: 1 + rank2: A: 1, B: 1, C: 0 + """ + if torch.distributed.get_world_size(group=parallelization_group) <= 1: + return + if precomputed_distribution is None: + raise ValueError( + 'precomputed_distribution must be not None for non-trivial parallelization group' + ) + + local_shards = list( + sh_base + for sh_base in nested_values(sharded_state_dict) + if isinstance(sh_base, ShardedTensor) + ) + + rank_within_dp_group = torch.distributed.get_rank(parallelization_group) + for sh_ten in local_shards: + shard_id = _sharded_tensor_shard_id(sh_ten) + if ( + shard_id in precomputed_distribution.shards_in_this_group + and rank_within_dp_group == precomputed_distribution.main_rank_for_shard[shard_id] + ): + sh_ten.replica_id = 0 + else: + sh_ten.replica_id = 1 diff --git a/megatron/core/dist_checkpointing/strategies/resharding.py b/megatron/core/dist_checkpointing/strategies/resharding.py new file mode 100644 index 0000000000..c1c2bcec84 --- /dev/null +++ b/megatron/core/dist_checkpointing/strategies/resharding.py @@ -0,0 +1,315 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +""" Performant resharding of flattened tensors. + +Tensors that are first sharded (e.g. across TP) and then flattened cause +very irregular access patterns during loading. The idea for performant save/load +is to store tensors with global shape [X, Y, Z] and local shape [x, y, z] +as tensors with global shape [X // x, Y // y, Z // z, x * y * z] and +local shape [1, 1, 1, x * y * z]. This allows parallel save of tensors along the +last (flattened) dimension. During loading, some additional resharding is needed. +""" +import logging +import math +from dataclasses import dataclass +from itertools import product +from typing import Any, Dict, Optional, Tuple, Union + +import numpy as np +import torch +from torch.distributed.checkpoint import ChunkStorageMetadata +from torch.distributed.checkpoint.resharding import _shards_get_overlap_region_wrt_saved_tensor + +from megatron.core.dist_checkpointing import ShardedTensor +from megatron.core.dist_checkpointing.core import CheckpointingException +from megatron.core.dist_checkpointing.dict_utils import ( + dict_list_map_inplace, + extract_matching_values, +) +from megatron.core.dist_checkpointing.mapping import ( + ReplicaId, + ShardedStateDict, + ShardedTensorFactory, + StateDict, + apply_factories, + apply_factory_merges, +) + +logger = logging.getLogger(__name__) + + +@dataclass +class TensorReformulationMetadata: + """Metadata needed to restore the original tensor shape. + + Args: + ckpt_orig_global_shape (Tuple[int, ...]): original global shape of the tensor + saved in the checkpoint. This is the global shape of the application, + further reformulated into `ckpt_reform_global_shape` while saving. + ckpt_reform_global_shape (Tuple[int, ...]): reformulated global shape of the tensor + saved in the checkpoint. This is the actual saved shape. + """ + + ckpt_orig_global_shape: Tuple[int, ...] + ckpt_reform_global_shape: Tuple[int, ...] 
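+
+    # Illustrative example (hypothetical shapes, not from the original code): a tensor
+    # with ckpt_orig_global_shape == (8, 16) saved from local shards of shape (4, 8)
+    # gets ckpt_reform_global_shape == (8 // 4, 16 // 8, 4 * 8) == (2, 2, 32).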
+ + def __post_init__(self): + assert self.ckpt_orig_global_shape + + +def nd_flattened_tensor_reformulated_global_shape(sh_ten: ShardedTensor) -> Tuple[int, ...]: + """Reformulated global shape of the flattened N-D ShardedTensor. + + N-D tensor global shape [X, Y, Z] and local shape [x, y, z] + is reformulated into global shape [X // x, Y // y, Z // z, x * y * z] and + local shape [1, 1, 1, x * y * z], to allow parallel save of tensors along the + last (flattened) dimension. + + Args: + sh_ten (ShardedTensor): flattened N-D ShardedTensor (N > 1) + + Returns: + Tuple[int, ...]: reformulated tensor shape + """ + assert is_nd_flattened_tensor(sh_ten), sh_ten + return sh_ten.axis_fragmentations + (int(np.prod(sh_ten.local_shape)),) + + +def is_nd_flattened_tensor(sh_ten: Any) -> bool: + """Checks if ShardedTensor is flattened and more than 1-dimensional + + Args: + sh_ten (Any): any object + + Returns: + bool: whether the given object is a flattened ShardedTensor and is N-dimensional (N > 1) + """ + return ( + isinstance(sh_ten, ShardedTensor) + and sh_ten.flattened_range is not None + and len(sh_ten.global_shape) > 1 + ) + + +# information needed to restore. With current implementation, this is a nested state dict +# with ShardedTensorFactories which is basically a ShardedStateDict type +ReformulationRestoreMetadata = ShardedStateDict + + +def apply_nd_flattened_tensors_reformulation( + sharded_state_dict: ShardedStateDict, + reformulation_metadata: Dict[str, TensorReformulationMetadata], +) -> Tuple[ShardedStateDict, ReformulationRestoreMetadata]: + """Applies N-D reformulation to a given sharded state dict. + + After applying the method and loading the reformulated state dict, + the `restore_nd_flattened_tensors_formulation` needs to be applied. + + Current implementation uses ShardedTensorFactories for convenience of + restoring the original structure, but it's just an implementation detail. + Turns N-D ShardedTensors into factories and immediately applies them, + keeping the data needed to restore the original structure. + + Args: + sharded_state_dict (ShardedStateDict): sharded state dict potentially + with tensors to reformulate. + reformulation_metadata (Dict[str, TensorReformulationMetadata]): dict + containing all metadata needed for reformulating tensors in `sharded_state_dict`. + for each N-D flattened tensor `sh_ten` in `sharded_state_dict` there must be an + entry with `sh_ten.key`. + + Returns: + tuple: + ShardedStateDict - reformulated sharded state dict + ReformulationRestoreMetadata - data needed to restore the original formulation + with `restore_nd_flattened_tensors_formulation` + """ + + def maybe_reformulate_nd_flattened_tensor(sh_ten: Any): + if not isinstance(sh_ten, ShardedTensor) or not is_nd_flattened_tensor(sh_ten): + return sh_ten + # N-D flattened ShardedTensor + try: + sh_ten_reformulation_metadata = reformulation_metadata[sh_ten.key] + except KeyError as e: + raise CheckpointingException( + f'Missing reformulation metadata for tensor {sh_ten}. 
Existing keys: {reformulation_metadata.keys()}' + ) from e + + ckpt_actual_saved_shape = sh_ten_reformulation_metadata.ckpt_reform_global_shape + app_actual_load_shape = nd_flattened_tensor_reformulated_global_shape(sh_ten) + if ckpt_actual_saved_shape == app_actual_load_shape: + # Same shape - no need to reshard + return sh_ten + + return reformulate_single_nd_flattened_tensor(sh_ten, sh_ten_reformulation_metadata) + + # Turn N-D tensors into factories and immediately apply them + dict_list_map_inplace(maybe_reformulate_nd_flattened_tensor, sharded_state_dict) + sh_ten_factories, _ = extract_matching_values( + sharded_state_dict, + lambda x: isinstance(x, ShardedTensorFactory), + return_lists_as_dicts=True, + ) + apply_factories(sharded_state_dict) + + # Unlink `data` pointers to free memory + def unlink_data(x): + x.data = None + return x + + dict_list_map_inplace(unlink_data, sh_ten_factories) + return sharded_state_dict, sh_ten_factories + + +def restore_nd_flattened_tensors_formulation( + state_dict: StateDict, formulation_restore_metadata: ReformulationRestoreMetadata +) -> StateDict: + """Restores the original state dict from a reformulated form. + + Inverse of `apply_nd_flattened_tensors_reformulation`. + + Args: + state_dict (StateDict): state dict obtained by loading a reformulated + sharded state dict. + formulation_restore_metadata (ReformulationRestoreMetadata): metadata returned by + `apply_nd_flattened_tensors_reformulation` function + + Returns: + StateDict: state dict with the original tensors formulation restored + """ + return apply_factory_merges(state_dict, formulation_restore_metadata) + + +def reformulate_single_nd_flattened_tensor( + sh_ten: ShardedTensor, reformulation_metadata: TensorReformulationMetadata +) -> Union[Any, ShardedTensorFactory]: + """Reformulates shapes of a single N-D flattened ShardedTensor. + + We need to define a pair of transformations: + - turn N-D ShardedTensor with original formulation into multiple reformulated ShardedTensors + - merge multiple reformulated loaded torch.Tensors into a single original tensor + Current implementation uses ShardedTensorFactories as a convenient mechanism + for specifying and keeping track of those transformations. + + Args: + sh_ten (ShardedTensor): sharded tensor to reformulate. + reformulation_metadata (TensorReformulationMetadata): metadata needed to + perform the reformulation + + Returns: + ShardedTensorFactory: factory that keeps information how to reformulate + (build) the ShardedTensor and then restore original formulation (merge) + after loading. + """ + rmd = reformulation_metadata + # Data won't be needed - remove unnecessary tensor references + sh_ten = sh_ten.without_data() + + # Based on reformulation_metadata, determine other tensor shapes and metadata + ckpt_axis_fragmentation = rmd.ckpt_reform_global_shape[:-1] + for sh, fragm in zip(rmd.ckpt_orig_global_shape, ckpt_axis_fragmentation): + assert sh % fragm == 0, (sh_ten, rmd.ckpt_reform_global_shape) + ckpt_local_shape_with_prepended_axis = tuple( + sh // fragm for sh, fragm in zip(rmd.ckpt_orig_global_shape, ckpt_axis_fragmentation) + ) + assert ( + ckpt_local_shape_with_prepended_axis[: sh_ten.prepend_axis_num] + == (1,) * sh_ten.prepend_axis_num + ), (ckpt_local_shape_with_prepended_axis, sh_ten) + ckpt_local_shape = ckpt_local_shape_with_prepended_axis[sh_ten.prepend_axis_num :] + + # Iterate over reformulated shapes needed by the application and from checkpoint, + # and generate new ShardedTensors that match the checkpoint sharding. 
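+    # Illustrative example (hypothetical numbers): with ckpt_fragm == 4 and app_fragm == 2,
+    # the application chunk at offset 1 overlaps checkpoint chunks
+    # range(int(4 / 2 * 1), math.ceil(4 / 2 * 2)) == range(2, 4), i.e. chunks 2 and 3.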
+ overlap_dim_offsets = [] + assert len(ckpt_axis_fragmentation) == len(sh_ten.axis_fragmentations), ( + ckpt_axis_fragmentation, + sh_ten, + ) + for dim, (app_chunk_dim_offset, ckpt_fragm, app_fragm) in enumerate( + zip( + sh_ten.local_chunk_offset_in_global(), + ckpt_axis_fragmentation, + sh_ten.axis_fragmentations, + ) + ): + # without `int`, it's an exact offset of the app shard expressed in ckpt_local_shape units + first_overlap_dim_offset = int(ckpt_fragm / app_fragm * app_chunk_dim_offset) + # `math.ceil` argument is an exact offset of the app next shard expressed in ckpt_local_shape units + next_overlap_dim_offset = math.ceil(ckpt_fragm / app_fragm * (app_chunk_dim_offset + 1)) + overlap_dim_offsets.append(range(first_overlap_dim_offset, next_overlap_dim_offset)) + + logger.debug( + f'Generated the following number of overlap shards for each dimension: {list(map(len, overlap_dim_offsets))}' + f' for fragmentation ckpt {ckpt_axis_fragmentation} vs app {sh_ten.axis_fragmentations} and chunk offset {sh_ten.local_chunk_offset_in_global()}' + ) + reformulated_sh_tens = {} + for chunk_offset in product(*overlap_dim_offsets): + global_offset = tuple( + chunk_off * chunk_shape + for chunk_off, chunk_shape in zip(chunk_offset, ckpt_local_shape_with_prepended_axis) + ) + reformulated_sh_tens[(global_offset, ckpt_local_shape)] = ShardedTensor( + sh_ten.key, + None, + sh_ten.dtype, + ckpt_local_shape, + rmd.ckpt_orig_global_shape, + global_offset, + ckpt_axis_fragmentation, + sh_ten.replica_id, + sh_ten.prepend_axis_num, + sh_ten.allow_shape_mismatch, + flattened_range=slice(0, rmd.ckpt_reform_global_shape[-1]), # whole ckpt shard + ) + + # Now, we have to define the transformations from application sharding + # to checkpoint sharding. + + @torch.no_grad() + def sh_ten_build_fn(*args, **kwargs): + # Here we simply return the precomputed tensors. + return reformulated_sh_tens + + @torch.no_grad() + def sh_ten_merge_fn(sub_state_dict): + # This is the non-flattened local tensor with original formulation + # that we are going to fill with shards loaded from the checkpoint. 
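+        # Each checkpoint shard below is un-flattened back to ckpt_local_shape, narrowed to
+        # the region it overlaps with this rank's application shard, and copied into place;
+        # the filled tensor is then re-flattened and sliced to this rank's flattened_range.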
+ app_non_flat_ten = torch.empty( + sh_ten.local_shape, + dtype=sh_ten.dtype, + device=sh_ten.data.device if sh_ten.data is not None else None, + ) + + assert len(sub_state_dict) > 0 + for (ckpt_global_offset, ckpt_local_shape), ckpt_ten in sub_state_dict.items(): + # For each ckpt shard, we fill the appropriate application shard part + dest_ten = app_non_flat_ten + src_ten = ckpt_ten.view(ckpt_local_shape) + # We don't need narrowing over `prepend_axis_num` axes so we take the [sh_ten.prepend_axis_num:] offsets slice + for ( + dim, + offset_for_saved_tensor, + offset_for_current_tensor, + length, + ) in _shards_get_overlap_region_wrt_saved_tensor( + saved_shard=ChunkStorageMetadata( + ckpt_global_offset[sh_ten.prepend_axis_num :], ckpt_local_shape + ), + current_shard=ChunkStorageMetadata( + sh_ten.global_offset[sh_ten.prepend_axis_num :], sh_ten.local_shape + ), + ): + src_ten = src_ten.narrow(dim, offset_for_saved_tensor, length) + dest_ten = dest_ten.narrow(dim, offset_for_current_tensor, length) + dest_ten.copy_(src_ten) + return app_non_flat_ten.flatten()[sh_ten.flattened_range] + + return ShardedTensorFactory( + sh_ten.key, + sh_ten.data, + sh_ten_build_fn, + sh_ten_merge_fn, + sh_ten.replica_id, + sh_ten.flattened_range, + ) diff --git a/megatron/core/dist_checkpointing/strategies/state_dict_saver.py b/megatron/core/dist_checkpointing/strategies/state_dict_saver.py new file mode 100644 index 0000000000..7b35209f21 --- /dev/null +++ b/megatron/core/dist_checkpointing/strategies/state_dict_saver.py @@ -0,0 +1,162 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +""" State dict saver for PyT Distributed format allowing asynchronous save. """ + +from logging import getLogger +from time import time +from typing import TYPE_CHECKING, Optional, Tuple, cast + +import torch +import torch.distributed as dist +from torch.distributed.checkpoint import CheckpointException +from torch.distributed.checkpoint.default_planner import DefaultSavePlanner +from torch.distributed.checkpoint.metadata import STATE_DICT_TYPE, Metadata +from torch.distributed.checkpoint.planner import SavePlan, SavePlanner +from torch.distributed.checkpoint.utils import _DistWrapper, _get_failure_dict + +if TYPE_CHECKING: + from .filesystem_async import FileSystemWriterAsync + + +logger = getLogger(__name__) + + +def save_state_dict_async_plan( + state_dict: STATE_DICT_TYPE, + storage_writer: 'FileSystemWriterAsync', + process_group: Optional[dist.ProcessGroup] = None, + coordinator_rank: int = 0, + planner: Optional[SavePlanner] = None, + cached_ckpt_structure: Optional[Tuple[SavePlan, SavePlan, bool]] = None, +) -> Tuple[Tuple['FileSystemWriterAsync', Metadata, _DistWrapper], SavePlan, bool]: + """ + First stage of saving a state dict to storage. + + This is an async adjustment of torch.distributed.checkpoint.state_dict_saver. + In order to support async save, saving should be split into three parts: + 1. Planning + 2. Actual saving + 3. Finalization + + Out of these, step (2) *must* happen asynchronously. + The first step is realized with this function. 
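+
+    A rough sketch of the intended end-to-end flow (variable names are illustrative):
+
+        writer = FileSystemWriterAsync(checkpoint_dir)
+        (writer, metadata, dist_wrapper), central_plan, local_plan, cache_hit = \
+            save_state_dict_async_plan(state_dict, writer)               # 1. planning
+        async_fn, async_args = writer.get_save_function_and_args()
+        if async_fn is not None:
+            async_fn(*async_args)                                        # 2. saving (may run async)
+        save_state_dict_async_finalize(writer, metadata, dist_wrapper)   # 3. finalization
+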
+ + The planning part consists of several steps, described here: + https://pytorch.org/docs/stable/distributed.checkpoint.html#torch.distributed.checkpoint.SavePlanner + + Args: + state_dict (STATE_DICT_TYPE): state dict to save + storage_writer (FileSystemWriterAsync): in current version only an instance of + FileSystemWriterAsync + process_group (dist.ProcessGroup, optional): process group used for save planning + coordinator_rank (int, optional): coordinator rank for planning. Defaults to 0. + planner (SavePlanner, optional): save planner for torch.distributed.checkpoint format + cached_ckpt_structure (Tuple[SavePlan, SavePlan, bool], Optional): + Each object of this tuple will be used in the order as following + cached_central_plan (SavePlan): a globally coordinated save plan + cached in the previous iteration + cached_local_plan (SavePlan): a local plan + cached in the previous iteration + validated_cache_reuse (bool): boolean value to tell global_metadata and planning dict + is consistent over iterations + + Returns: Tuple of: + - storage writer (the one passed as input) + - metadata from planning + - distributed wrapper used for planning + The return value of this function should be passed as an input to + `save_state_dict_async_finalize` and cached_plan to skip `reduce_scatter` at planning. + """ + cached_central_plan, cached_local_plan, validated_cache_reuse = (None, None, False) + if cached_ckpt_structure: + cached_central_plan, cached_local_plan, validated_cache_reuse = cached_ckpt_structure + + rank = torch.distributed.get_rank() if torch.distributed.is_initialized() else 0 + dist_wrapper = _DistWrapper(process_group, True, coordinator_rank) + if planner is None: + planner = DefaultSavePlanner() + assert planner is not None + + global_metadata = None + logger.debug(f"rank: {rank}, starting state dict save") + local_plan = cached_local_plan + + def local_step(): + nonlocal local_plan + assert planner is not None + # PyTorch 2.4 introduced additional `metadata` argument, + # we have to reference `is_coordinator` args by name + planner.set_up_planner(state_dict, is_coordinator=dist_wrapper.is_coordinator) + storage_writer.set_up_storage_writer(dist_wrapper.is_coordinator) + if not validated_cache_reuse and local_plan is None: + local_plan = planner.create_local_plan() + local_plan = storage_writer.prepare_local_plan(local_plan) + return local_plan + + def global_step(all_local_plans): + nonlocal global_metadata + assert planner is not None + all_local_plans, global_metadata = planner.create_global_plan(all_local_plans) + all_local_plans = storage_writer.prepare_global_plan(all_local_plans) + return all_local_plans + + # Execute local and global planning + start_plan = time() + if validated_cache_reuse and cached_central_plan: + logger.debug(f"rank: {rank}, Passed cache reusable") + local_step() + central_plan = cached_central_plan + else: + central_plan = dist_wrapper.reduce_scatter("plan", local_step, global_step) + central_plan = planner.finish_plan(central_plan) + end_plan = time() + logger.debug(f"rank: {rank}, plan time: {end_plan - start_plan}") + # Prepare async writing of tensors. 
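+    # Note: `prepare_write_data` below copies tensor data from GPU to CPU synchronously;
+    # only the actual file writes are deferred to the asynchronous stage.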
+ # The `storage_writer` will store the information about tensors it needs to save + start = time() + storage_writer.prepare_write_data(central_plan, planner) + end = time() + logger.debug(f"{time()} rank: {rank}, write(async) time: {end - start}") + return ( + (storage_writer, cast(Metadata, global_metadata), dist_wrapper), + central_plan, + local_plan, + cached_central_plan == central_plan, + ) + + +def save_state_dict_async_finalize( + storage_writer: 'FileSystemWriterAsync', global_metadata: Metadata, dist_wrapper: _DistWrapper +) -> None: + """ + Finalization of save_state_dict_async_plan. + + The input arguments are the same as the save_state_dict_async_plan output, + the `write_results` are retrieved from the storage_writer. + + Args: + storage_writer (FileSystemWriterAsync): storage writer used for planning + global_metadata (Metadata): metadata created during planning + dist_wrapper (_DistWrapper): distributed wrapper created during planning + + Returns: None + """ + write_results = storage_writer.retrieve_write_results() + + # Gather the write results that will be saved to the metadata file. + gather_start = time() + all_results = dist_wrapper.gather_object(write_results) + gather_end = time() + logger.debug(f"{gather_end}, {torch.distributed.get_rank()}, gather: {gather_end-gather_start}") + + # Store the metadata on coordinator rank + if dist_wrapper.is_coordinator: + node_failures = _get_failure_dict(all_results) + if len(node_failures) == 0: + assert global_metadata is not None + write_start = time() + storage_writer.finish(global_metadata, all_results) + write_end = time() + logger.debug(f"{write_end}, metadata_write: {write_end - write_start}") + else: + raise CheckpointException("write", node_failures) diff --git a/megatron/core/dist_checkpointing/strategies/tensorstore.py b/megatron/core/dist_checkpointing/strategies/tensorstore.py new file mode 100644 index 0000000000..0b20bf5e77 --- /dev/null +++ b/megatron/core/dist_checkpointing/strategies/tensorstore.py @@ -0,0 +1,128 @@ +# Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. + +""" Strategies using TensorStore to load and save Zarr arrays. 
""" + +from functools import partial +from itertools import starmap +from logging import getLogger +from pathlib import Path + +import tensorstore as ts +import torch + +from ..core import CheckpointingException +from ..dict_utils import dict_list_map_inplace +from ..mapping import ShardedStateDict, ShardedTensor +from .base import LoadShardedStrategy, StrategyAction, register_default_strategy +from .zarr import load_zarr_based_sharded_metadata, postprocess_numpy_array + +logger = getLogger(__name__) + + +def register_default_tensorstore_strategies(): + """Register default strategies leveraging tensorstore.""" + register_default_strategy( + StrategyAction.LOAD_SHARDED, 'zarr', 1, TensorStoreLoadShardedStrategy() + ) + + +class TensorStoreLoadShardedStrategy(LoadShardedStrategy): + """Load strategy for Zarr backend using `tensorstore` for loading.""" + + def __init__(self, load_directly_on_device: bool = False): + super().__init__() + self.load_directly_on_device = load_directly_on_device + + def load(self, sharded_state_dict: ShardedStateDict, checkpoint_dir: Path): + if torch.distributed.get_rank() == 0: + print(f'Loading distributed checkpoint with {self.__class__.__name__}') + if self.load_directly_on_device: + print(f'Loading distributed checkpoint directly on the GPU') + load_fn = partial( + _load_from_array, + checkpoint_dir=checkpoint_dir, + load_directly_on_device=self.load_directly_on_device, + ) + dict_list_map_inplace(load_fn, sharded_state_dict) + return sharded_state_dict + + def load_tensors_metadata(self, checkpoint_dir: Path): + def get_ts_shape_dtype(path): + arr = open_ts_array(path) + return arr.shape, arr.dtype.numpy_dtype + + return load_zarr_based_sharded_metadata(checkpoint_dir, get_ts_shape_dtype) + + def check_backend_compatibility(self, loaded_version): + pass # TODO + + def check_version_compatibility(self, loaded_version): + pass # TODO + + +def merge_global_slice_with_shape(global_slice, actual_shape, key): + """Intersects the global slice with the actual shape (prevent overflow).""" + + def _merge_slice(dim_slice, dim_size): + if isinstance(dim_slice, slice): + assert ( + dim_slice.start < dim_size + ), f'Got empty slice for ShardedTensor {key} ({dim_slice}, {dim_size})' + if dim_slice.stop > dim_size: + dim_slice = slice(dim_slice.start, dim_size, dim_slice.step) + return dim_slice + + assert len(global_slice) == len(actual_shape), (global_slice, actual_shape, key) + return tuple(starmap(_merge_slice, zip(global_slice, actual_shape))) + + +def _load_from_array( + sharded_tensor: ShardedTensor, + checkpoint_dir: Path, + load_directly_on_device: bool = False, + apply_flattened_range: bool = True, +): + x = _load_regular_chunk(sharded_tensor, checkpoint_dir) + ten = postprocess_numpy_array(x, sharded_tensor, apply_flattened_range) + if load_directly_on_device: + sharded_tensor.data.data.copy_(ten) + return sharded_tensor.data + else: + return ten + + +def _load_regular_chunk(sharded_tensor: ShardedTensor, checkpoint_dir: Path): + assert isinstance(sharded_tensor, ShardedTensor), type(sharded_tensor) + arr = open_ts_array(checkpoint_dir / sharded_tensor.key) + if sharded_tensor.global_shape == arr.shape: + x = ( + arr[sharded_tensor.global_slice()].read().result() + ) # flattened tensors loading is delayed + elif sharded_tensor.allow_shape_mismatch: + global_slice = merge_global_slice_with_shape( + sharded_tensor.global_slice(), arr.shape, sharded_tensor.key + ) + x = arr[global_slice].read().result() # flattened tensors loading is delayed + else: + _msg = ( + 
f'Global shape mismatch for loaded ({arr.shape})' + f' and expected ({sharded_tensor.global_shape}) tensor' + f' for key {sharded_tensor.key}' + ) + raise CheckpointingException(_msg) + return x + + +def open_ts_array(arr_path: Path): + """Opens a Zarr file array with Tensorstore with basic setting. + + Args: + arr_path (Path): path to a Zarr (Tensorstore) array + """ + spec = {'driver': 'zarr', 'metadata_key': '.zarray', 'kvstore': {}} + spec['kvstore'] = {'driver': 'file', 'path': str(arr_path)} + try: + arr = ts.open(ts.Spec(spec), open=True).result() + except Exception as e: + raise CheckpointingException(f'Array {arr_path} could not be loaded. Error: {e}') from e + return arr diff --git a/megatron/core/dist_checkpointing/strategies/torch.py b/megatron/core/dist_checkpointing/strategies/torch.py new file mode 100644 index 0000000000..e1bd843417 --- /dev/null +++ b/megatron/core/dist_checkpointing/strategies/torch.py @@ -0,0 +1,840 @@ +# Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. + +""" Strategies using PyTorch distributed.checkpoint as an underlying format. """ +import io +from collections import ChainMap, defaultdict +from dataclasses import dataclass +from itertools import product +from logging import getLogger +from pathlib import Path +from typing import Any, Dict, Iterable, List, Optional, Tuple, Union, cast + +import torch +from packaging.version import Version as PkgVersion +from torch.distributed import checkpoint +from torch.distributed._shard.metadata import ShardMetadata +from torch.distributed._shard.sharded_tensor import Shard, ShardedTensorMetadata, TensorProperties +from torch.distributed._sharded_tensor import ShardedTensor as TorchShardedTensor +from torch.distributed._tensor import DTensor +from torch.distributed.checkpoint import ( + BytesStorageMetadata, + DefaultLoadPlanner, + DefaultSavePlanner, + FileSystemReader, + LoadPlan, + Metadata, + ReadItem, + SavePlan, + TensorStorageMetadata, + WriteItem, +) +from torch.distributed.checkpoint._nested_dict import FLATTEN_MAPPING, unflatten_state_dict +from torch.distributed.checkpoint._traverse import OBJ_PATH, traverse_state_dict +from torch.distributed.checkpoint.metadata import Metadata +from torch.distributed.checkpoint.planner_helpers import _create_write_items + +from ..core import CheckpointingException +from ..dict_utils import nested_values +from ..mapping import ( + ShardedBase, + ShardedObject, + ShardedStateDict, + ShardedTensor, + StateDict, + is_main_replica, +) +from .async_utils import AsyncRequest +from .base import ( + AsyncSaveShardedStrategy, + LoadShardedStrategy, + StrategyAction, + register_default_strategy, +) +from .filesystem_async import FileSystemWriterAsync +from .resharding import ( + TensorReformulationMetadata, + apply_nd_flattened_tensors_reformulation, + is_nd_flattened_tensor, + nd_flattened_tensor_reformulated_global_shape, + restore_nd_flattened_tensors_formulation, +) +from .state_dict_saver import save_state_dict_async_finalize, save_state_dict_async_plan + +try: + if not torch.cuda.is_available(): + raise ImportError + from transformer_engine.pytorch.float8_tensor import Float8Tensor + + HAVE_TE = True +except ImportError: + HAVE_TE = False + + +def register_default_torch_strategies(): + """Register default strategies related to PyT Distributed backend.""" + register_default_strategy( + StrategyAction.LOAD_SHARDED, 'torch_dist', 1, TorchDistLoadShardedStrategy() + ) + register_default_strategy( + StrategyAction.SAVE_SHARDED, 'torch_dist', 1, 
TorchDistSaveShardedStrategy('torch_dist', 1) + ) + + +logger = getLogger(__name__) + + +def flatten_state_dict( + state_dict: ShardedStateDict, +) -> Tuple[ShardedStateDict, Dict[str, OBJ_PATH]]: + """Flattens state dict into a single level dict. + + It's a copy of torch.distributed.checkpoint._nested_dict.flatten_state_dict + which also accepts ShardedBase tensors as terminal objects + + Args: + state_dict (ShardedStateDict): state dict to be flattened + + Returns (tuple): flattened state dict and a mapping allowing to recreate the original one + + """ + flattened = {} + mappings = {} + + def flat_copy(path: OBJ_PATH, value: Any) -> None: + new_fqn = ".".join(map(str, path)) + if new_fqn in flattened: + raise ValueError(f"duplicated flatten key {new_fqn}") + flattened[new_fqn] = value + mappings[new_fqn] = path + + traverse_state_dict(state_dict, flat_copy, lambda x: isinstance(x, (torch.Tensor, ShardedBase))) + return flattened, mappings + + +def sharded_tensor_to_torch_sharded_tensor( + sh_tens: List[ShardedTensor], rank: Optional[int] = None +) -> TorchShardedTensor: + """Convert MCore ShardedTensor to PyT ShardedTensor. PyT requires information about all chunks. + + On high-level, this function follows the logic of + torch.distributed.fsdp._shard_utils._create_chunk_sharded_tensor. + Additionally, it saves `prepend_axis_num` and `has_flattened_range` (specific to MCore) + as attributes for further restoration in `_unwrap_pyt_sharded_tensor`. + + NOTE: this function assumes regular (grid) sharding of the MCore ShardedTensor. + The only local irregularities could be introduced with a `flattened_range` attribute. + + This function handles 3 different type of ShardedTensors: + 1. Non-flat regular ShardedTensors (`not has_flattened_range`) + 2. 1D flattened ShardedTensors (`is_flattened_range_1d`) + 3. N-D flattened ShardedTensors (`has_flattened_range`) + + (1) and (2) type are saved according to their original shape. + Type (3) however requires global shape adjustment for efficiency: + we treat [X, Y, Z] global shape tensor with local shape [x, y, z] + as a [X // x, Y // y, Z // z, x * y * z] tensor with last axis + partitioned according to `flattened_range` slices. + This will need special handling while resharding. + + Args: + sh_tens (List[ShardedTensor]): list of sharded tensors to convert + rank (int, optional): current process rank passed to PyT ShardedTensor. + If None, assumes rank in the default pg. + + Returns (TorchShardedTensor): PyT ShardedTensor containing all passed shards. 
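+
+    Example (illustrative): a tensor with global shape (8, 12) sharded into (4, 6) local
+    chunks and saved with a `flattened_range` (type (3) above) is represented as a PyT
+    ShardedTensor with global shape (2, 2, 24): 8 // 4 = 2 and 12 // 6 = 2 chunks along
+    the original axes, plus a last axis of 4 * 6 = 24 elements partitioned according to
+    the flattened slices.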
+ + """ + if rank is None: + rank = torch.distributed.get_rank() + + some_sh_ten = sh_tens[0] + has_flattened_range = some_sh_ten.flattened_range is not None + is_flattened_range_1d = has_flattened_range and len(some_sh_ten.global_shape) == 1 + + for sh_ten in sh_tens: + assert (sh_ten.flattened_range is not None) == has_flattened_range, sh_tens + if not sh_ten.data.is_contiguous(): + sh_ten.data = sh_ten.data.contiguous() + + local_global_offsets = {} + + prepend_axis_num = sh_tens[0].prepend_axis_num + # Determine local shards according to tensor type (see docs) + if is_flattened_range_1d: + # Type (2) case: 1D flattened ShardedTensors + for sh_ten in sh_tens: + assert len(sh_ten.global_offset) == 1, sh_ten + assert sh_ten.prepend_axis_num == 0, sh_ten + local_global_offsets.setdefault(sh_ten.global_offset, []).append(sh_ten) + + global_shape = some_sh_ten.global_shape + offsets_shape = ( + some_sh_ten.local_shape + ) # local shape is not flattened, we need it for chunk offsets + + local_shards = [ + Shard.from_tensor_and_offsets( + sh_ten.data, + [ + sh_ten.global_offset[0] + sh_ten.flattened_range.start + ], # additional flattened offset + rank, + ) + for sh_ten in sh_tens + ] + + elif has_flattened_range: + # Type (3) case: N-D flattened ShardedTensors + for sh_ten in sh_tens: + local_global_offsets.setdefault(sh_ten.local_chunk_offset_in_global(), []).append( + sh_ten + ) + assert sh_ten.data.ndim == 1, sh_ten + sh_ten.data = sh_ten.data.view((1,) * len(sh_ten.global_shape) + (-1,)) + + # Global shape reformulation: + global_shape = nd_flattened_tensor_reformulated_global_shape(some_sh_ten) + offsets_shape = (1,) * len( + some_sh_ten.global_shape + ) # reformulated global shape has shape equal ti number of local chunks + + local_shards = [ + Shard.from_tensor_and_offsets( + sh_ten.data, + list( + sh_ten.local_chunk_offset_in_global() + (sh_ten.flattened_range.start,) + ), # additional flattened offset + rank, + ) + for sh_ten in sh_tens + ] + else: + # Type (1) case: non-flat regular ShardedTensors + for sh_ten in sh_tens: + local_global_offsets.setdefault(sh_ten.global_offset, []).append(sh_ten) + sh_ten.data = sh_ten.data.view( + (1,) * prepend_axis_num + sh_ten.local_shape + ) # adjust to prepended_axis_num + + global_shape = some_sh_ten.global_shape + offsets_shape = some_sh_ten.data.shape # includes prepended axes + + local_shards = [ + Shard.from_tensor_and_offsets( + sh_ten.data, list(sh_ten.global_offset), rank # simple case + ) + for sh_ten in sh_tens + ] + + # Create a ShardedTensor without invoking communication. 
Determine global shards + world_size = torch.distributed.get_world_size() + shard_metadata = [] + # NOTE: here we assume a regular grid of shards + for fragment_offsets in product(*map(range, some_sh_ten.axis_fragmentations)): + offset = tuple(map(lambda x: x[0] * x[1], zip(fragment_offsets, offsets_shape))) + if offset in local_global_offsets: + # local shard + placement = f"rank:{rank}/cuda" + for sh_ten in local_global_offsets[offset]: + if is_flattened_range_1d: + offset = (sh_ten.global_offset[0] + sh_ten.flattened_range.start,) + size = sh_ten.data.shape + elif has_flattened_range: + assert offset == sh_ten.local_chunk_offset_in_global() + # This is not an actual offset, but an offset of the whole shard + # This is needed for a PyT Dist internal integrity check + offset = sh_ten.local_chunk_offset_in_global() + (0,) + size = (1,) * len(offsets_shape) + global_shape[-1:] + else: + size = sh_ten.data.shape + shard_metadata.append(ShardMetadata(offset, size, placement)) + + else: + # pylint: disable=line-too-long + # for shards from other ranks we provide simplistic data - this information will be discarded + # during TorchShardedTensor._init_from_local_shards_and_global_metadata call. + # Due to a bug in PyT 24.05 container we must specify some concrete rank within a world size. + # The exact rank doesn't matter as long as it's different than my rank - hence (rank + 1) % WS. + placement = f"rank:{(rank + 1) % world_size}/cuda" + if has_flattened_range and not is_flattened_range_1d: + offset = offset + (0,) + size = (1,) * len(offsets_shape) + global_shape[-1:] + else: + size = offsets_shape + shard_metadata.append(ShardMetadata(offset, size, placement)) + + tensor = some_sh_ten.data + sharded_tensor_metadata = ShardedTensorMetadata( + shards_metadata=shard_metadata, + size=torch.Size(global_shape), + tensor_properties=TensorProperties( + dtype=tensor.dtype, + layout=tensor.layout, + requires_grad=tensor.requires_grad, + memory_format=torch.contiguous_format, + pin_memory=tensor.is_pinned(), + ), + ) + pyt_sh_ten = TorchShardedTensor._init_from_local_shards_and_global_metadata( + local_shards, sharded_tensor_metadata=sharded_tensor_metadata, process_group=None + ) + # Store MCore related data as PyTShardedTensor attribute. + # This won't be stored in the checkpoint, only for runtime purposes + pyt_sh_ten.mcore_sh_ten = sh_ten.without_data() + pyt_sh_ten.mcore_metadata = {} + if has_flattened_range and not is_flattened_range_1d: + pyt_sh_ten.mcore_metadata['nd_reformulated_orig_global_shape'] = sh_ten.global_shape + return pyt_sh_ten + + +def mcore_to_pyt_state_dict( + state_dict: Dict[str, List[ShardedBase]], + is_loading: bool = False, + init_device: torch.device = torch.device("cpu"), +) -> Dict[str, Union[TorchShardedTensor, io.BytesIO]]: + """Convert state dict with ShardedTensors and ShardedObjects + to state dict compatible with PyT Dist format. + + Operates in-place and returns the original state dict. + + Args: + state_dict (Dict[str, List[ShardedBase]]): flattened state dict, where values + are lists of either ShardedTensor or ShardedObjects. + is_loading (bool, optional): flag indicating if loading or saving. Defaults to False. + init_device (torch.device, optional): device to initialize potentially missing tensors + during loading. Defaults to 'cpu'. + + Returns (Dict[str, Union[TorchShardedTensor, io.BytesIO]]): original dictionary with values + converted either into PyT ShardedTensors or io.BytesIO. 
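+
+    Example (illustrative sketch; the key and tensor are made up):
+
+        flat_sd = {
+            'decoder.weight': [ShardedTensor.from_rank_offsets('decoder.weight', torch.ones(4, 8))],
+        }
+        pyt_sd = mcore_to_pyt_state_dict(flat_sd, is_loading=False)
+        # pyt_sd['decoder.weight'] is now a PyT ShardedTensor wrapping the local shard;
+        # ShardedObject values would instead be serialized into io.BytesIO buffers.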
+ + """ + rank = torch.distributed.get_rank() + pyt_state_dict = {} + + def _mcore_to_torch_sharded_tensor(sh_tens: List[ShardedTensor]) -> TorchShardedTensor: + """Build a PyT ShardedTensor from given shards. + + During loading: + - if data is None, initialize it with an empty tensor (will be used to copy the data into) + - if `allow_shape_mismatch` is True, the data is initialized with zeros + prior to loading (not all parts of the tensor will be read from the checkpoint) + """ + assert all(isinstance(sh_ten, ShardedTensor) for sh_ten in sh_tens), sh_tens + for sh_ten in sh_tens: + if sh_ten.data is None: + if is_loading: + sh_ten.init_data( + init_device, + init_fn=torch.zeros if sh_ten.allow_shape_mismatch else torch.empty, + ) + else: + raise CheckpointingException(f'`data` attr is None for {sh_ten}') + else: + sh_ten.data = sh_ten.data.detach() + if sh_ten.allow_shape_mismatch and is_loading: + sh_ten.data.zero_() + + torch_sh_ten = sharded_tensor_to_torch_sharded_tensor(sh_tens, rank) + torch_sh_ten.key = sh_tens[0].key + return torch_sh_ten + + def _mcore_to_torch_sharded_object(sh_objs: List[ShardedObject]) -> io.BytesIO: + """Build io.BytesIO from given sharded objects data.""" + assert all(isinstance(sh_obj, ShardedObject) for sh_obj in sh_objs), sh_objs + serialized_data = io.BytesIO() + torch.save([sh_obj.data for sh_obj in sh_objs], serialized_data) + return serialized_data + + for k, v in state_dict.items(): + if isinstance(v[0], ShardedTensor): + v = cast(List[ShardedTensor], v) + pyt_state_dict[k] = _mcore_to_torch_sharded_tensor(v) + else: + v = cast(List[ShardedObject], v) + pyt_state_dict[k] = _mcore_to_torch_sharded_object(v) + + return pyt_state_dict + + +def _unwrap_pyt_sharded_tensor(sh_ten: TorchShardedTensor) -> List[torch.Tensor]: + """Unwrap tensor from PyT ShardedTensor instance. + + If `prepend_axis_num` was non-zero (which is specific to MCore ShardedTensor) + then the tensor has additional singleton dimensions which should be squeezed. 
+ """ + mcore_sh_ten = sh_ten.mcore_sh_ten + ret_tensors = [] + for sh in sh_ten.local_shards(): + ten = sh.tensor + if mcore_sh_ten.flattened_range is not None: + assert ten.shape[:-1] == (1,) * (len(ten.shape) - 1), ten.shape + ten = ten.view(-1) + else: + for _ in range(mcore_sh_ten.prepend_axis_num): + ten = ten.squeeze(0) + ret_tensors.append(ten) + return ret_tensors + + +def _replace_state_dict_keys_with_sharded_keys( + sharded_state_dict: ShardedStateDict, keep_only_main_replica: bool = False +) -> Tuple[Dict[str, List[ShardedBase]], FLATTEN_MAPPING, Dict[str, List[str]]]: + """Group ShardedBase objects by keys and + return mappings required for recreating the original dict.""" + flat_sd, flat_mapping = flatten_state_dict(sharded_state_dict) + rename_mapping = defaultdict(list) + new_flat_sd = defaultdict(list) + for k, sh_base in flat_sd.items(): + assert isinstance(sh_base, ShardedBase), type(sh_base) + key = sh_base.unique_key if isinstance(sh_base, ShardedObject) else sh_base.key + if is_main_replica(sh_base.replica_id) or not keep_only_main_replica: + rename_mapping[key].append(k) + new_flat_sd[key].append(sh_base) + return new_flat_sd, flat_mapping, rename_mapping + + +def _replace_sharded_keys_with_state_dict_keys( + state_dict: Dict[str, List[Union[torch.Tensor, io.BytesIO]]], + flat_mapping: FLATTEN_MAPPING, + rename_mapping: Dict[str, List[str]], +): + """Inverse of _replace_state_dict_keys_with_sharded_keys.""" + recovered_sd = {} + for k, tensors in state_dict.items(): + assert len(tensors) == len(rename_mapping[k]) + for ten, recovered_k in zip(tensors, rename_mapping[k]): + recovered_sd[recovered_k] = ten + + return unflatten_state_dict(recovered_sd, flat_mapping) + + +def _restore_dict_types(x: Union[dict, list, Any], keys_template: Union[dict, list, Any]): + """Recursively update `x` keys, based on `keys_template`.""" + if isinstance(keys_template, dict): + assert isinstance(x, dict), type(x) + for k, v in keys_template.items(): + if not isinstance(k, str): + assert str(k) in x, (k, x.keys) + x[k] = x.pop(str(k)) + _restore_dict_types(x[k], v) + elif isinstance(keys_template, list): + assert isinstance(x, list), type(x) + for x_val, templ_val in zip(x, keys_template): + _restore_dict_types(x_val, templ_val) + + +@dataclass(frozen=True) +class MCoreSavePlan(SavePlan): + """SavePlan with MCore specific data.""" + + mcore_data: Dict[str, Dict[str, Any]] = None # Mcore related data about each tensor + + +class MCoreSavePlanner(DefaultSavePlanner): + """Differs with the default planner by saving BytesIO objects on all ranks. + + In the integration of MCore with PyT Distributed format, BytesIO objects + come from ShardedObjects, which should be treated as separate objects on each rank + (not common on all ranks). + + Also, the objects are already packed in io.BytesIO, so no need to redo it + in transform_object. + """ + + def __init__( + self, + *args, + dedup_replicated_tensors: Optional[bool] = None, + nd_flattened_global_shapes: Optional[Dict[str, Tuple[int, ...]]] = None, + **kwargs, + ) -> None: + # `dedup_replicated_tensors` was deprecated in 2.3; this check avoids warnings + # during saving. 
+ if PkgVersion(torch.__version__) <= PkgVersion("2.2"): + kwargs['dedup_replicated_tensors'] = dedup_replicated_tensors + super().__init__(*args, **kwargs) + self.nd_flattened_global_shapes = nd_flattened_global_shapes or {} + + def create_local_plan(self) -> SavePlan: + """Adds IOBytes write request on non-coordinator ranks.""" + + # NOTE: for PyT 2.4.0a0 we can't rely on `create_default_local_save_plan` because + # some alpha versions (specifically 2.4.0a0+f70bd71a48 in 24.06 NGC PyTorch container) + # add iobytes request only on coordinator ranks and some alpha versions + # (specifically 2.4.0a0+3bcc3cddb5 in 24.07 NGC PyTorch container) + # add those requests on all ranks. We inline a simplified version of this method below. + write_items = [] + for fqn, obj in self.state_dict.items(): + assert not isinstance( + obj, DTensor + ) # translation from MCore ShardedTensors shouldn't result in DTensors + # Create write requests for tensor and bytes values. + # For MCore, these should be already non-duplicates. + write_items += _create_write_items(fqn, obj) + + self.plan = MCoreSavePlan( + items=write_items, + planner_data=self.mappings, + mcore_data={ + k: sh_ten.mcore_metadata + for k, sh_ten in self.state_dict.items() + if isinstance(sh_ten, TorchShardedTensor) + }, + ) + return self.plan + + def create_global_plan(self, all_plans: List[MCoreSavePlan]) -> Tuple[List[SavePlan], Metadata]: + """Merges MCore data for all plans.""" + global_plan, metadata = super().create_global_plan(all_plans) + metadata.mcore_data = dict(ChainMap(*(plan.mcore_data for plan in all_plans))) + return global_plan, metadata + + def transform_object(self, write_item: WriteItem, object: Any): + """Make no transformations - bytes objects are already serialized.""" + return object + + +class MCoreLoadPlanner(DefaultLoadPlanner): + """Adds global shape validation to the default planner. + + If global shape validation can be ignored (shouldn't!), the default + load planner can be used. + """ + + def __init__( + self, *args, shapes_validation_sharded_tensors: Iterable[ShardedTensor] = (), **kwargs + ) -> None: + super().__init__(*args, **kwargs) + self.shapes_validation_sharded_tensors = shapes_validation_sharded_tensors + self._intermediate_read_item_and_target: Optional[Tuple[ReadItem, torch.Tensor]] = None + + def _validate_global_shapes(self, metadata, sharded_tensors): + for sh_ten in sharded_tensors: + if sh_ten.key not in metadata.state_dict_metadata: + raise KeyError( + f"{sh_ten.key} from model not in state dict:" + f" {sorted(metadata.state_dict_metadata.keys())}" + ) + loaded_shape = metadata.state_dict_metadata[sh_ten.key].size + if not is_nd_flattened_tensor(sh_ten): + expected_shape = sh_ten.global_shape + else: + expected_shape = nd_flattened_tensor_reformulated_global_shape(sh_ten) + if loaded_shape != expected_shape: + _msg = ( + f'Global shape mismatch for loaded ({loaded_shape})' + f' and expected ({expected_shape}) tensor' + f' for key {sh_ten.key}' + ) + raise CheckpointingException(_msg) + + def create_local_plan(self) -> LoadPlan: + """Runs additional shapes validation.""" + self._validate_global_shapes(self.metadata, self.shapes_validation_sharded_tensors) + return super().create_local_plan() + + def resolve_tensor(self, read_item: ReadItem): + """Override to add FP8 support. + + Narrowing the Float8Tensor can create incontiguous tensors and there are + no `copy` kernels for such cases. This method creates a contiguous FP8 + tensors so that the subsequent `copy_` in FileSystemReader succeeds. 
+ Note that this requires tracking the original tensor + (as `self._intermediate_read_item_and_target` attribute) + and restoring it in `commit_tensor` method. + """ + target_tensor = super().resolve_tensor(read_item) + if ( + not target_tensor.is_contiguous() + and HAVE_TE + and isinstance(target_tensor, Float8Tensor) + ): + self._intermediate_read_item_and_target = (read_item, target_tensor) + target_tensor = Float8Tensor.make_like( + target_tensor, data=target_tensor._data.contiguous() + ) + return target_tensor + + def commit_tensor(self, read_item: ReadItem, tensor: torch.Tensor) -> None: + """Restores the original FP8 tensor saved in `resolve_tensor`.""" + if self._intermediate_read_item_and_target is not None: + interm_read_item, target_tensor = self._intermediate_read_item_and_target + assert ( + interm_read_item is read_item + ), '`commit_tensor` method should be called right after `resolve_tensor`' + target_tensor.copy_(tensor) + tensor = target_tensor + self._intermediate_read_item_and_target = None + return super().commit_tensor(read_item, tensor) + + +class TorchDistSaveShardedStrategy(AsyncSaveShardedStrategy): + """Async save strategy for the PyT Distributed format. + + The idea is to translate MCore ShardedTensors into PyT ShardedTensors + and use the async-adjusted torch.distributed.checkpoint saving mechanism + provided by the FileSystemWriterAsync writer. + """ + + def __init__( + self, + backend: str, + version: int, + keep_only_main_replica: bool = True, + thread_count: int = 2, + cached_metadata: bool = False, + ): + """Adds parameters specific to PyT Distributed format + Args: + backend (str): format backend string + version (int): format version + keep_only_main_replica (bool, optional): PyT Distributed has a mechanism + for deduplication, but replica_id aware deduplication is more coherent. + Default is True (recommended to keep it). + thread_count (int, optional): threads to use during saving. + Affects the number of files in the checkpoint (saving ranks * num_threads). + cached_metadata (bool, optional): Enables using cached global metadata to avoid + gathering local metadata every checkpointing invocation + """ + super().__init__(backend, version) + self.keep_only_main_replica = keep_only_main_replica + self.thread_count = thread_count + + # Cached SavePlans to skip plan in `save_state_dict_async_plan` + # cached outcome of `SavePlan.prepare_global_plan`, + # which aggregates local plans from all ranks + self.cached_central_plan: SavePlan = None + # cached outcome of `SavePlan.prepare_local_plan` describes how local state_dict is written + self.cached_local_plan: SavePlan = None + # Cached global metadata, only `coordinator` for dist-ckpt holds + # if central plans are consistent over iters + self.cached_global_metadata: Metadata = None + # This variable records if the ckpt structures are consistent + # so the following checkpoint savings reuse `cached_global_metadata` + self.validated_cache_reuse: bool = False + # The knob to enable cached metadata communication in saving + self.use_cached_ckpt_structure: bool = cached_metadata + + def async_save( + self, sharded_state_dict: ShardedStateDict, checkpoint_dir: Path + ) -> AsyncRequest: + """Translates MCore ShardedTensors to PyT ShardedTensors & saves in PyT Distributed format. 
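+
+        The returned AsyncRequest bundles the write function and arguments obtained from
+        `writer.get_save_function_and_args()` with a finalize callback that runs
+        `save_state_dict_async_finalize` followed by a barrier (see
+        `_get_save_and_finalize_callbacks` below); executing the write function
+        asynchronously and then calling the finalize callback is left to the caller.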
+
+        Args:
+            sharded_state_dict (ShardedStateDict): sharded state dict to save
+            checkpoint_dir (Path): checkpoint directory
+
+        Returns: AsyncRequest with the save function, its arguments and the finalize callback
+        """
+        # Translate the state dict
+        (sharded_state_dict, flat_mapping, rename_mapping) = (
+            _replace_state_dict_keys_with_sharded_keys(
+                sharded_state_dict, self.keep_only_main_replica
+            )
+        )
+        pyt_state_dict = mcore_to_pyt_state_dict(sharded_state_dict, False)
+        # Use PyT saving mechanism
+        writer = FileSystemWriterAsync(checkpoint_dir, thread_count=self.thread_count)
+        # This should be set differently if we run in a smaller process group than the default
+        coordinator = 0
+        # Try twice to validate the generated `central_plan` is the same across iterations
+        # If so, reuse `cached_central_plan` and `cached_global_metadata`
+        # From the 3rd iteration, `save_state_dict_async_plan` will not generate `global_metadata`
+        # (return None) so `self.cached_global_metadata` is reused
+        args_cached_plans = None
+        if self.use_cached_ckpt_structure:
+            args_cached_plans = (
+                self.cached_central_plan,
+                self.cached_local_plan,
+                self.validated_cache_reuse,
+            )
+
+        (
+            save_state_dict_ret,
+            self.cached_central_plan,
+            self.cached_local_plan,
+            self.validated_cache_reuse,
+        ) = save_state_dict_async_plan(
+            pyt_state_dict,
+            writer,
+            None,
+            coordinator,
+            planner=MCoreSavePlanner(dedup_replicated_tensors=not self.keep_only_main_replica),
+            cached_ckpt_structure=args_cached_plans,
+        )
+        rank = torch.distributed.get_rank()
+        if self.use_cached_ckpt_structure:
+            if self.validated_cache_reuse:
+                logger.debug(f"rank: {rank}, cache validated")
+                if save_state_dict_ret[1]:  # when global_metadata is not cached
+                    self.cached_global_metadata = save_state_dict_ret[1]  # Cache Metadata
+                # Only Coordinator rank holds cached global_metadata
+                # (None is returned for global_metadata)
+                elif coordinator == rank:
+                    logger.debug(f"rank: {rank}, reuse metadata, {save_state_dict_ret[1]}")
+                    save_state_dict_ret = list(save_state_dict_ret)
+                    save_state_dict_ret[1] = self.cached_global_metadata
+
+        return self._get_save_and_finalize_callbacks(writer, save_state_dict_ret)
+
+    def _get_save_and_finalize_callbacks(self, writer, save_state_dict_ret) -> AsyncRequest:
+        save_fn_args = writer.get_save_function_and_args()
+        save_fn, save_args = save_fn_args
+
+        def finalize_fn():
+            save_state_dict_async_finalize(*save_state_dict_ret)
+            torch.distributed.barrier()
+
+        return AsyncRequest(save_fn, save_args, [finalize_fn])
+
+    def can_handle_sharded_objects(self):
+        return True
+
+
+def get_reformulation_metadata(
+    sharded_state_dict: ShardedStateDict, checkpoint_dir: Path
+) -> Dict[str, TensorReformulationMetadata]:
+    """Reads MCore data for N-D flattened tensors from checkpoint metadata during ckpt load.
+
+    Args:
+        sharded_state_dict (ShardedStateDict): sharded state dict to load
+        checkpoint_dir (Path): checkpoint directory
+
+    Returns:
+        Dict[str, TensorReformulationMetadata] - dictionary that maps keys of every
+        N-D flattened tensor from the sharded_state_dict to its original global shape
+        as stored in `mcore_data` in the checkpoint.
+ """ + ckpt_metadata = FileSystemReader(checkpoint_dir).read_metadata() + reformulation_metadata = {} + for sh_ten in nested_values(sharded_state_dict): + if not is_nd_flattened_tensor(sh_ten): + continue + try: + ckpt_global_shape = ckpt_metadata.mcore_data[sh_ten.key][ + 'nd_reformulated_orig_global_shape' + ] + except KeyError as e: + raise CheckpointingException( + f'Cannot find global shape metadata for N-D flattened tensor {sh_ten} ' + f'in checkpoint metadata: {ckpt_metadata.mcore_data}' + ) from e + + reformulation_metadata[sh_ten.key] = TensorReformulationMetadata( + ckpt_global_shape, ckpt_metadata.state_dict_metadata[sh_ten.key].size + ) + return reformulation_metadata + + +class TorchDistLoadShardedStrategy(LoadShardedStrategy): + """Basic load strategy for the PyT Distributed format.""" + + def load(self, sharded_state_dict: ShardedStateDict, checkpoint_dir: Path) -> StateDict: + """Translates MCore ShardedTensors to PyT ShardedTensors & loads from PyT Distributed fmt. + + Args: + sharded_state_dict (ShardedStateDict): sharded state dict with mapping + information to instruct loading + checkpoint_dir (Path): checkpoint directory + + Returns: loaded state dict + """ + # Apply N-D tensors resharding + sharded_state_dict, formulation_restore_data = apply_nd_flattened_tensors_reformulation( + sharded_state_dict, get_reformulation_metadata(sharded_state_dict, checkpoint_dir) + ) + + flexible_shape_sharded_tensors = [ + sh_ten + for sh_ten in nested_values(sharded_state_dict) + if isinstance(sh_ten, ShardedTensor) and not sh_ten.allow_shape_mismatch + ] + + orig_sharded_state_dict = sharded_state_dict + # MCore state dict to PyT Distributed compatible + (sharded_state_dict, flat_mapping, rename_mapping) = ( + _replace_state_dict_keys_with_sharded_keys(sharded_state_dict) + ) + pyt_state_dict = mcore_to_pyt_state_dict(sharded_state_dict, True) + # Load PyT Distributed format + checkpoint.load_state_dict( + pyt_state_dict, + FileSystemReader(checkpoint_dir), + planner=MCoreLoadPlanner( + shapes_validation_sharded_tensors=flexible_shape_sharded_tensors + ), + ) + pyt_state_dict = cast( + Dict[str, Union[TorchShardedTensor, List[io.BytesIO]]], pyt_state_dict + ) + # Unwrap ShardedTensors and return to original state dict + mcore_state_dict = { + k: v if not isinstance(v, TorchShardedTensor) else _unwrap_pyt_sharded_tensor(v) + for k, v in pyt_state_dict.items() + } + mcore_state_dict = _replace_sharded_keys_with_state_dict_keys( + mcore_state_dict, flat_mapping, rename_mapping + ) + _restore_dict_types(mcore_state_dict, orig_sharded_state_dict) + # Apply N-D tensors resharding postprocessing + mcore_state_dict = restore_nd_flattened_tensors_formulation( + mcore_state_dict, formulation_restore_data + ) + return mcore_state_dict + + def load_tensors_metadata(self, checkpoint_dir: Path, metadata: Metadata = None): + """Uses tensors metadata stored in the metadata file.""" + if metadata is None: + fs_reader = FileSystemReader(checkpoint_dir) + metadata = fs_reader.read_metadata() + + mcore_data = getattr(metadata, 'mcore_data', {}) + sharded_metadata = {} + for k, tp in metadata.state_dict_metadata.items(): + if not isinstance(tp, TensorStorageMetadata): + continue # load only tensors + + nd_orig_global_shape = mcore_data.get(k, {}).get('nd_reformulated_orig_global_shape') + if nd_orig_global_shape is None: + # Regular tensor + sharded_metadata[k] = ShardedTensor.from_rank_offsets( + k, torch.empty(tp.size, **tp.properties.__dict__, device='meta') + ).without_data() + else: + # N-D 
flattened tensor + unflat_ten = torch.empty( + nd_orig_global_shape, **tp.properties.__dict__, device='meta' + ) + flat_ten = unflat_ten.flatten() + sharded_metadata[k] = ShardedTensor.from_rank_offsets_flat( + k, + flat_ten, + unflat_ten.shape, + flattened_range=slice(0, unflat_ten.numel()), # whole slice + ).without_data() + + return sharded_metadata + + def load_sharded_metadata(self, checkpoint_dir: Path) -> ShardedStateDict: + """Uses tensors and objects metadata stored in the metadata file.""" + fs_reader = FileSystemReader(checkpoint_dir) + metadata = fs_reader.read_metadata() + + sharded_metadata = {} + for metadata_key, storage_metadata in metadata.state_dict_metadata.items(): + if not isinstance(storage_metadata, BytesStorageMetadata): + continue + sh_obj = ShardedObject.empty_from_unique_key(metadata_key) + sharded_metadata[sh_obj.unique_key] = sh_obj + + sharded_metadata.update(self.load_tensors_metadata(checkpoint_dir, metadata)) + return sharded_metadata + + def can_handle_sharded_objects(self): + return True + + def check_backend_compatibility(self, loaded_version): + pass # TODO + + def check_version_compatibility(self, loaded_version): + pass # TODO diff --git a/megatron/core/dist_checkpointing/strategies/two_stage.py b/megatron/core/dist_checkpointing/strategies/two_stage.py new file mode 100644 index 0000000000..72e60bc79b --- /dev/null +++ b/megatron/core/dist_checkpointing/strategies/two_stage.py @@ -0,0 +1,254 @@ +# Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. + +""" 2-stage checkpoint loading. """ +import os +import time +from collections import defaultdict +from dataclasses import dataclass +from functools import partial, wraps +from itertools import chain +from logging import DEBUG, INFO, StreamHandler, getLogger +from operator import attrgetter, itemgetter +from pathlib import Path +from typing import Iterable, List, NamedTuple, Optional, Tuple, Union + +import torch + +from ..dict_utils import dict_list_map_inplace, map_reduce, nested_values +from ..mapping import ShardedStateDict, ShardedTensor, StateDict +from .base import LoadShardedStrategy +from .tensorstore import TensorStoreLoadShardedStrategy, _load_from_array, open_ts_array +from .zarr import flatten_range, load_zarr_based_sharded_metadata + +_import_trigger = None + + +timers = defaultdict(list) + +logger = getLogger(__name__) + + +def timed(verbose=True): + def timed_dec(fn): + name = fn.__name__ + + @wraps(fn) + def wrapped(*args, **kwargs): + if verbose: + logger.debug(f'{name} init') + start = time.time() + ret = fn(*args, **kwargs) + took = time.time() - start + if verbose: + logger.debug(f'{name} took {took}s') + timers[name].append(took) + return ret + + return wrapped + + return timed_dec + + +@dataclass +class _ShardedTensorMetadata: + global_rank: int + sharded_tensor_no_data: ShardedTensor + dist_group_rank: Tuple[int] # id of distributed group + dist_group_ranks: Tuple[int] # id of distributed group + data_size: Optional[int] = None # bytes + + +def sharded_tensor_chunk_id(sharded_tensor: ShardedTensor): + return (sharded_tensor.key, sharded_tensor.global_offset) + + +class TwoStageDataParallelLoadShardedStrategy(LoadShardedStrategy): + """Loads one checkpoint replica from storage and broadcasts to other nodes. + + This strategy loads checkpoint from storage on minimal set of nodes + and distributes the checkpoint to other nodes with torch.distributed. + Loading is performed with tensorstore. + + Steps: + 0. (optional) create Gloo distributed groups + 1. 
Exchange ShardedTensors metadata between all nodes + 2. Align needed tensors within DP groups + 3. For each globally unique tensor: + 3.a) on one of the ranks load it from storage to CPU and move to CUDA + 3.b) allocate CUDA tensor on other ranks + 3.c) broadcast within DP group + 3.d) copy tensor content to the model param location + 3.e) free tensor buffers from a) and b) + + Notes: + 1. Loading and broadcasting is done sequentially to avoid both host and device OOMs + 2. There is a lot of overlap potential between all three steps done for each tensor: + 2.a) loading from storage to numpy + 2.b) moving CPU tensors to CUDA + 2.c) broadcast + """ + + def __init__(self, data_parallel_group, cpu_transfer=True): + super().__init__() + + self.cpu_transfer = cpu_transfer + self.data_parallel_group_orig = data_parallel_group + self.data_parallel_group = None if cpu_transfer else data_parallel_group + self.dp_group_ranks = tuple( + sorted(torch.distributed.get_process_group_ranks(data_parallel_group)) + ) + self.dp_group_rank = torch.distributed.get_rank(self.data_parallel_group_orig) + self.global_rank = torch.distributed.get_rank() + + def load(self, sharded_state_dict: ShardedStateDict, checkpoint_dir: Path): + self.maybe_init_gloo_group() + all_tensors_sorted = self._build_load_plan(sharded_state_dict) + self._exchange_loaded_tensors(all_tensors_sorted, sharded_state_dict, checkpoint_dir) + # TODO: fix hang in summarize_load_times + # self.summarize_load_times() + return sharded_state_dict + + def summarize_load_times(self): + torch.distributed.barrier() + logger.info('Checkpoint loading finished. Summary:') + # TODO: `timers` keys are not guaranteed to be the same across ranks which causes hangs + for key, times in sorted(timers.items()): + times_sum = sum(times) + max_times = torch.tensor([times_sum], device='cuda') + avg_times = torch.tensor([times_sum], device='cuda') + torch.distributed.all_reduce(max_times, op=torch.distributed.ReduceOp.MAX) + torch.distributed.all_reduce(avg_times, op=torch.distributed.ReduceOp.SUM) + avg_times /= torch.distributed.get_world_size() + if torch.distributed.get_rank() == 0: + logger.info(f'{key}: max {max_times[0]}, avg {avg_times[0]}') + + @timed(verbose=False) + def load_tensor_from_storage(self, checkpoint_dir, ten_meta: _ShardedTensorMetadata): + logger.debug(f'_load_from_array({ten_meta.sharded_tensor_no_data.key}) init') + ret = _load_from_array( + ten_meta.sharded_tensor_no_data, + checkpoint_dir, + load_directly_on_device=False, + apply_flattened_range=False, + ) + logger.debug(f'_load_from_array({ten_meta.sharded_tensor_no_data.key}) DONE') + return ret + + @timed() + def maybe_init_gloo_group(self): + if not self.cpu_transfer: + return + all_groups = [None] * torch.distributed.get_world_size() + torch.distributed.all_gather_object(all_groups, self.dp_group_ranks) + all_groups = set(tuple(sorted(gr)) for gr in all_groups) + for group_ranks in sorted(all_groups): + gloo_pg = torch.distributed.new_group(ranks=group_ranks, backend='gloo') + if self.global_rank in group_ranks: + self.data_parallel_group = gloo_pg + assert self.dp_group_rank == torch.distributed.get_rank(self.data_parallel_group) + + def check_backend_compatibility(self, loaded_version): + pass # TODO + + def check_version_compatibility(self, loaded_version): + pass # TODO + + @timed() + def _build_load_plan( + self, sharded_state_dict: ShardedStateDict + ) -> List[_ShardedTensorMetadata]: + local_meta = [ + _ShardedTensorMetadata( + self.global_rank, + sharded_ten.without_data(), + 
self.dp_group_rank, + self.dp_group_ranks, + ) + for sharded_ten in nested_values(sharded_state_dict) + ] + all_meta = [None] * torch.distributed.get_world_size(group=self.data_parallel_group) + torch.distributed.all_gather_object(all_meta, local_meta, group=self.data_parallel_group) + all_meta = list(chain.from_iterable(all_meta)) + all_tensors_sorted = self.deduplicate_chunks(all_meta) + return all_tensors_sorted + + @timed() + def deduplicate_chunks(self, ten_metas: List[_ShardedTensorMetadata]): + """Group tensors by chunk and then pick the tensor with the lowest rank. + + NOTE: with proper loading overlap, loading from randomized ranks + (instead of the smallest one) could be beneficial here. + """ + ten_metas = map_reduce( + ten_metas, + key_fn=lambda meta: sharded_tensor_chunk_id(meta.sharded_tensor_no_data), + reduce_fn=partial(min, key=attrgetter('dist_group_rank')), + ) + all_metas_sorted = list(map(itemgetter(1), sorted(ten_metas.items()))) + return all_metas_sorted + + @timed() + def _exchange_loaded_tensors( + self, ten_metas: List[_ShardedTensorMetadata], sharded_state_dict, checkpoint_dir + ): + logger.debug(f'_exchange_loaded_tensors, num ten_metas: {len(ten_metas)}') + for ten_meta in ten_metas: + + src_rank = torch.distributed.get_global_rank( + self.data_parallel_group, ten_meta.dist_group_rank + ) + + if self.dp_group_rank == ten_meta.dist_group_rank: + exchange_tensor = self.load_tensor_from_storage(checkpoint_dir, ten_meta) + if not self.cpu_transfer: + exchange_tensor = exchange_tensor.cuda() + else: + # TODO: for non-flattened ranges we could reuse the buffer from the start here + exchange_tensor = torch.empty( + ten_meta.sharded_tensor_no_data.local_shape, + device='cpu' if self.cpu_transfer else 'cuda', + dtype=ten_meta.sharded_tensor_no_data.dtype, + ) + + logger.debug( + f'exchange {ten_meta.sharded_tensor_no_data.key}, {exchange_tensor.shape}({exchange_tensor.numel()}), broadcast({src_rank} -> {self.dp_group_ranks})' + ) + torch.distributed.broadcast( + exchange_tensor, group=self.data_parallel_group, src=src_rank + ) + self._distribute_data_to_state_dict(ten_meta, exchange_tensor, sharded_state_dict) + logger.debug(f'exchange {ten_meta.sharded_tensor_no_data.key} done') + + # free buffer memory + exchange_tensor = None + + @timed(verbose=False) + def _distribute_data_to_state_dict( + self, + ten_meta: _ShardedTensorMetadata, + loaded_ten: torch.Tensor, + sharded_state_dict: ShardedStateDict, + ): + tensor_key = sharded_tensor_chunk_id(ten_meta.sharded_tensor_no_data) + + def _fill_in_data(t: Union[ShardedTensor, torch.Tensor]): + if not isinstance(t, ShardedTensor) or sharded_tensor_chunk_id(t) != tensor_key: + # already filled-in or key not matching + return t + sharded_tensor: ShardedTensor = t + x = loaded_ten + if sharded_tensor.flattened_range is not None: + x = flatten_range(sharded_tensor, x) + + # Reuse existing buffer + sharded_tensor.data.data.copy_(x) + return sharded_tensor.data + + dict_list_map_inplace(_fill_in_data, sharded_state_dict) + + def load_tensors_metadata(self, checkpoint_dir: Path): + def get_ts_shape_dtype(path): + arr = open_ts_array(path) + return arr.shape, arr.dtype.numpy_dtype + + return load_zarr_based_sharded_metadata(checkpoint_dir, get_ts_shape_dtype) diff --git a/megatron/core/dist_checkpointing/strategies/zarr.py b/megatron/core/dist_checkpointing/strategies/zarr.py new file mode 100644 index 0000000000..6a52b12257 --- /dev/null +++ b/megatron/core/dist_checkpointing/strategies/zarr.py @@ -0,0 +1,321 @@ +# Copyright (c) 
2022-2023, NVIDIA CORPORATION. All rights reserved. + +""" Strategies using Zarr as an underlying format. """ +import logging +import os +from functools import partial +from logging import getLogger +from pathlib import Path +from typing import Callable, List, Optional, Tuple + +import numpy as np +import torch +import zarr + +from ..core import CheckpointingException +from ..dict_utils import dict_list_map_inplace, nested_values +from ..mapping import ShardedStateDict, ShardedTensor, is_main_replica +from .base import ( + LoadShardedStrategy, + SaveShardedStrategy, + StrategyAction, + register_default_strategy, +) + +logger = logging.getLogger(__name__) + +numpy_to_torch_dtype_dict = { + np.dtype('bool'): torch.bool, + np.dtype('uint8'): torch.uint8, + np.dtype('int8'): torch.int8, + np.dtype('int16'): torch.int16, + np.dtype('int32'): torch.int32, + np.dtype('int64'): torch.int64, + np.dtype('float16'): torch.float16, + np.dtype('float32'): torch.float32, + np.dtype('float64'): torch.float64, + np.dtype('complex64'): torch.complex64, + np.dtype('complex128'): torch.complex128, +} + +torch_to_numpy_dtype_dict = {v: k for k, v in numpy_to_torch_dtype_dict.items()} + + +try: + # Register a bfloat16 type with this import + import tensorstore # pylint: disable=unused-import + + HAS_BFLOAT16 = True + numpy_to_torch_dtype_dict[np.dtype('bfloat16')] = torch.bfloat16 + torch_to_numpy_dtype_dict[torch.bfloat16] = np.dtype('bfloat16') +except ImportError: + HAS_BFLOAT16 = False + +logger = getLogger(__name__) + + +def register_default_zarr_strategies(): + """Register default strategies related to Zarr backend.""" + register_default_strategy( + StrategyAction.SAVE_SHARDED, 'zarr', 1, ZarrSaveShardedStrategy('zarr', 1) + ) + + +class ZarrSaveShardedStrategy(SaveShardedStrategy): + """Save strategy for Zarr backend.""" + + def __init__(self, backend: str, version: int): + super().__init__(backend, version) + logger.warning( + f'`zarr` distributed checkpoint backend is deprecated.' + ' Please switch to PyTorch Distributed format (`torch_dist`).' + ) + + def save(self, sharded_state_dict: ShardedStateDict, checkpoint_dir: Path): + sharded_tensors = list(nested_values(sharded_state_dict)) + arrays = _create_or_open_zarr_arrays(sharded_tensors, checkpoint_dir) + for ten, arr in zip(sharded_tensors, arrays): + _save_to_existing_array(ten, arr) + torch.distributed.barrier() + + +def _create_or_open_zarr_arrays( + sharded_tensors: List[ShardedTensor], checkpoint_dir: Path +) -> List[Optional[zarr.Array]]: + """Returns list of zarr arrays corresponding to given tensors. 
+ + For a sharded tensors that: + a) is main replica and represents the first chunk (all offsets 0), creates the Zarr array + b) is main replica but not the first chunk, + opens the arrays created in (a) (possibly by other process) + c) otherwise, sets the corresponding array to None since it won't be used + + Args: + sharded_tensors (List[ShardedTensor]): sharded tensors from a given rank + that will be saved to checkpoint + checkpoint_dir (Path): checkpoint in which the arrays will be created + """ + arrays = [] + for ten in sharded_tensors: + arr = _create_zarr_array(ten, checkpoint_dir) if _should_create_array(ten) else None + arrays.append(arr) + + torch.distributed.barrier() + # Open arrays created above by other processes + for arr_idx, ten in enumerate(sharded_tensors): + if arrays[arr_idx] is not None: + # array created by this process + assert _should_create_array(ten), ten + continue + if not is_main_replica(ten.replica_id): + # this array won't be needed for saving and can stay None + continue + open_kwargs = {} + if ten.flattened_range is not None: + open_kwargs['synchronizer'] = zarr.ProcessSynchronizer( + str(checkpoint_dir / f'{ten.key}.sync') + ) + arrays[arr_idx] = _open_zarr_array_verbose(checkpoint_dir / ten.key, 'r+', **open_kwargs) + return arrays + + +def _should_create_array(ten: ShardedTensor): + return ( + is_main_replica(ten.replica_id) + and set(ten.global_offset) == {0} + and (ten.flattened_range is None or ten.flattened_range.start == 0) + ) + + +def _save_to_existing_array(sharded_tensor: ShardedTensor, arr: Optional[zarr.Array]): + if not is_main_replica(sharded_tensor.replica_id): + return + assert arr is not None + x = sharded_tensor.data + x = x.detach().cpu() + torch.cuda.synchronize() + if x.dtype == torch.bfloat16: + x = x.float() + x = x.numpy() + x = x.astype('bfloat16') + else: + x = x.numpy() + + if sharded_tensor.flattened_range is None: + arr[sharded_tensor.global_slice()] = x + else: + arr.set_coordinate_selection(sharded_tensor.global_coordinates(), x) + + +def _create_zarr_array(sharded_tensor: ShardedTensor, checkpoint_dir: Path): + np_dtype = torch_to_numpy_dtype_dict[sharded_tensor.dtype] + try: + arr = zarr.create( + sharded_tensor.global_shape, + dtype=np_dtype, + store=checkpoint_dir / sharded_tensor.key, + chunks=sharded_tensor.max_allowed_chunks(), + compressor=None, + fill_value=None, + write_empty_chunks=True, + ) + logger.debug(f'Created a new Zarr array at {checkpoint_dir / sharded_tensor.key}') + except zarr.errors.ContainsArrayError as e: + raise CheckpointingException( + f'Array {checkpoint_dir / sharded_tensor.key} already exists' + ) from e + + if HAS_BFLOAT16 and np_dtype == np.dtype('bfloat16'): + arr._dtype = np_dtype + zarray = arr.store['.zarray'] + arr.store['.zarray'] = zarray.replace(b' exp_sh: + assert False, ( + f'Expected shape ({exp_sh}) smaller than actual ({x_sh})' + f' for {repr(expected_sharded_ten)}' + ) + else: + pad_args.extend((0, exp_sh - x_sh)) + # TODO: behavior control with envvar is for testing purposes only, remove it + if not int(os.environ.get('DIST_CKPT_PAD_REPLICATE', 0)): + return torch.nn.functional.pad(x, pad_args) + + # unsqueeze and squeeze to get shapes supported by cudnn + print(f'Replicating last row for {expected_sharded_ten.key}') + if x.dtype == torch.bfloat16: + return ( + torch.nn.functional.pad(x.float().unsqueeze(0), pad_args, mode='replicate') + .squeeze(0) + .bfloat16() + ) + return torch.nn.functional.pad(x.unsqueeze(0), pad_args, mode='replicate').squeeze(0) + + +def 
load_zarr_based_sharded_metadata( + checkpoint_dir: Path, get_shape_dtype_fn: Callable[[str], Tuple[Tuple[int], np.dtype]] +) -> ShardedStateDict: + """Load metadata of Zarr arrays. + + Args: + checkpoint_dir (str): checkpoint root directory + get_shape_dtype_fn (str -> ((int, ...), np.dtype)): a function returning + an array shape and dtype for a given Zarr array path + """ + sharded_state_dict = {} + for subdir in checkpoint_dir.iterdir(): + if not subdir.is_dir() or not (subdir / '.zarray').exists(): + continue + key = subdir.name + arr_shape, arr_dtype = get_shape_dtype_fn(str(subdir)) + + sharded_state_dict[key] = ShardedTensor( + key, + None, + numpy_to_torch_dtype_dict[arr_dtype], + arr_shape, + arr_shape, + tuple(0 for _ in arr_shape), + tuple(1 for _ in arr_shape), + ) + return sharded_state_dict diff --git a/megatron/core/dist_checkpointing/utils.py b/megatron/core/dist_checkpointing/utils.py new file mode 100644 index 0000000000..9186e4790a --- /dev/null +++ b/megatron/core/dist_checkpointing/utils.py @@ -0,0 +1,219 @@ +# Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. + +""" Helpers for manipulating sharded tensors and sharded state dicts. """ + +from typing import Dict, Optional, Tuple + +from .dict_utils import dict_list_map_inplace, extract_matching_values +from .mapping import ( + LocalNonpersistentObject, + ShardedBase, + ShardedObject, + ShardedStateDict, + ShardedTensor, + ShardedTensorFactory, + StateDict, +) + +# _ShardId uniquely identifies a ShardedTensor. This is a subset of ShardedTensor +# attributes: key (str), global_offset (tuple) and flattened_range (optional tuple) +_ShardId = Tuple[str, tuple, Optional[tuple]] + + +def _sharded_tensor_shard_id(sharded_tensor: ShardedTensor) -> _ShardId: + """Unique id of the sharded tensor data. + + Should yield the same value for same data replicated on different ranks. + + Args: + sharded_tensor (ShardedTensor): sharded tensor representing the data shard + + Returns (tuple): unique id of a data shard + """ + f_range = sharded_tensor.flattened_range + return ( + sharded_tensor.key, + sharded_tensor.global_offset, + None if f_range is None else (f_range.start, f_range.stop), + ) + + +def _sharded_object_id(sharded_object: ShardedObject) -> _ShardId: + """Unique id of the sharded object data. + + Should yield the same value for same data replicated on different ranks. + + Args: + sharded_object (ShardedObject): sharded object representing the data shard + + Returns (tuple): unique id of a data shard + """ + return (sharded_object.key, sharded_object.global_offset, sharded_object.global_shape) + + +def extract_sharded_tensors( + sharded_state_dict: ShardedStateDict, +) -> Tuple[ShardedStateDict, StateDict]: + """Extract a dict consisting of only ShardedTensor objects + from a given state dict with any objects. + + Args: + sharded_state_dict: state dict possibly containing ShardedTensor objects + + Returns: + Tuple[ShardedStateDict, StateDict]: tuple of: + - state dict with all ShardedTensor (keeping the original state dict structure) + - state dict with all objects other than ShardedTensor + (keeping the original state dict structure) + """ + return extract_matching_values(sharded_state_dict, lambda v: isinstance(v, ShardedTensor)) + + +def extract_sharded_tensors_and_factories( + sharded_state_dict: ShardedStateDict, +) -> Tuple[ShardedStateDict, StateDict]: + """Extract a dict consisting of only ShardedTensor and ShardedTensorFactory objects + from a given state dict with any objects. 
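+
+    Example (illustrative; the values are made up):
+
+        sharded_part, rest = extract_sharded_tensors_and_factories(
+            {'weight': some_sharded_tensor, 'iteration': 123}
+        )
+        # sharded_part == {'weight': some_sharded_tensor}, rest == {'iteration': 123}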
+ + Args: + sharded_state_dict: + state dict possibly containing ShardedTensor and ShardedTensorFactory objects + + Returns: + Tuple[ShardedStateDict, StateDict]: tuple of: + - state dict with all ShardedTensor and ShardedTensorFactory + (keeping the original state dict structure) + - state dict with all other objects (keeping the original state dict structure) + """ + return extract_matching_values( + sharded_state_dict, lambda v: isinstance(v, (ShardedTensor, ShardedTensorFactory)) + ) + + +def extract_sharded_tensors_or_nonpersistent( + sharded_state_dict: ShardedStateDict, +) -> Tuple[ShardedStateDict, StateDict]: + """Extract a dict consisting of only ShardedTensor, ShardedTensorFactory + and LocalNonpersistentObject objects from a given state dict with any objects. + + Args: + sharded_state_dict: state dict possibly containing ShardedTensor, ShardedTensorFactory + and LocalNonpersistentObject objects + + Returns: + Tuple[ShardedStateDict, StateDict]: tuple of: + - state dict with all ShardedTensor, ShardedTensorFactory and LocalNonpersistentObject + (keeping the original state dict structure) + - state dict with all other objects (keeping the original state dict structure) + """ + return extract_matching_values( + sharded_state_dict, + lambda v: isinstance(v, (ShardedTensor, LocalNonpersistentObject, ShardedTensorFactory)), + ) + + +def extract_sharded_base( + sharded_state_dict: ShardedStateDict, +) -> Tuple[ShardedStateDict, StateDict]: + """Extract a dict consisting of only ShardedBase from a given state dict with any objects. + + Args: + sharded_state_dict: state dict possibly containing ShardedBase objects + + Returns: + Tuple[ShardedStateDict, StateDict]: tuple of: + - state dict with all ShardedBase objects (keeping the original state dict structure) + - state dict with all other objects (keeping the original state dict structure) + """ + return extract_matching_values(sharded_state_dict, lambda v: isinstance(v, ShardedBase)) + + +def extract_nonpersistent( + sharded_state_dict: ShardedStateDict, +) -> Tuple[ShardedStateDict, StateDict]: + """Extract a dict consisting of only LocalNonpersistentObjects from a given state dict. + + Args: + sharded_state_dict: state dict possibly containing LocalNonpersistentObjects + + Returns: + Tuple[ShardedStateDict, StateDict]: tuple of: + - state dict with all LocalNonpersistentObjects + (keeping the original state dict structure) + - state dict with all other objects (keeping the original state dict structure) + """ + + return extract_matching_values( + sharded_state_dict, lambda v: isinstance(v, LocalNonpersistentObject) + ) + + +def add_prefix_for_sharding(sharded_state_dict: ShardedStateDict, prefix: str): + """Prepend a given prefix to all ShardedBase objects in a given state dict *in-place*. + + Args: + sharded_state_dict (ShardedStateDict): sharded state dict + prefix (str): prefix to be prepended + + Returns: + None: state dict is modified in-place + """ + + def add_prefix(t): + if isinstance(t, ShardedBase): + t.key = f'{prefix}{t.key}' + return t + + dict_list_map_inplace(add_prefix, sharded_state_dict) + + +def replace_prefix_for_sharding( + sharded_state_dict: ShardedStateDict, old_prefix: str, new_prefix: str +): + """Replaces the given prefix in *all* sharded keys in a given state dict. + + Errors out if some key does not begin with a given prefix. 
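+
+    Example (illustrative):
+
+        replace_prefix_for_sharding(sharded_state_dict, 'model.', 'module.')
+        # a ShardedTensor with key 'model.decoder.weight' is renamed in place
+        # to 'module.decoder.weight'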
+ + Args: + sharded_state_dict (ShardedStateDict): sharded state dict to replace keys in + old_prefix (str): prefix to be replaced in each key + new_prefix (str): new prefix + + Returns: + None: state dict is modified in place + """ + + def _replace_prefix(x): + if isinstance(x, (ShardedTensor, ShardedTensorFactory, ShardedObject)): + if not x.key.startswith(old_prefix): + raise ValueError(f'Expected {x.key} to begin with prefix {old_prefix}') + x.key = f'{new_prefix}{x.key[len(old_prefix):]}' # str.removeprefix in Python >= 3.9 + return x + + dict_list_map_inplace(_replace_prefix, sharded_state_dict) + + +def apply_prefix_mapping(sharded_state_dict: ShardedStateDict, prefix_map: Dict[str, str]): + """Replaces prefixes *only in keys matching* with one of prefixes in the map. + + Args: + sharded_state_dict (ShardedStateDict): sharded state dict to replace keys in + prefix_map (Dict[str, str]): + map of old->new prefixes. The first matching prefix for each key is used + + Returns: + None: state dict is modified in place + """ + + def _replace_prefixes(x): + if not isinstance(x, (ShardedTensor, ShardedTensorFactory, ShardedObject)): + return x + for old_prefix, new_prefix in prefix_map.items(): + if x.key.startswith(old_prefix): + x.key = ( + f'{new_prefix}{x.key[len(old_prefix):]}' # str.removeprefix in Python >= 3.9 + ) + break + return x + + dict_list_map_inplace(_replace_prefixes, sharded_state_dict) diff --git a/megatron/core/dist_checkpointing/validation.py b/megatron/core/dist_checkpointing/validation.py new file mode 100644 index 0000000000..cd11b82ed6 --- /dev/null +++ b/megatron/core/dist_checkpointing/validation.py @@ -0,0 +1,525 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +import logging +from collections import Counter, defaultdict +from enum import Enum +from pathlib import Path +from typing import TYPE_CHECKING, List, Optional, Set, Tuple, Union + +import numpy as np +import torch + +from megatron.core.dist_checkpointing import ShardedTensor +from megatron.core.dist_checkpointing.core import CheckpointingException, maybe_load_config +from megatron.core.dist_checkpointing.dict_utils import ( + extract_matching_values, + map_reduce, + nested_values, +) +from megatron.core.dist_checkpointing.mapping import ( + ShardedBase, + ShardedObject, + ShardedStateDict, + is_main_replica, +) +from megatron.core.dist_checkpointing.strategies.base import ( + LoadCommonStrategy, + LoadShardedStrategy, + SaveCommonStrategy, + SaveShardedStrategy, + StrategyAction, + get_default_strategy, +) + +if TYPE_CHECKING: + from megatron.core.dist_checkpointing.serialization import CkptShardedMetadata + +logger = logging.getLogger(__name__) + +# list of local saved/loaded ShardedBase objects +_LocalMetadata = List[Union[ShardedTensor, ShardedObject]] +# list of lists of global saved/loaded ShardedBase objects (each list element corresponds to global rank) +_GlobalMetadata = List[_LocalMetadata] + + +class StrictHandling(Enum): + """Determines handling of load mismatch (non-empty "unexpected" or "missing" keys). + + Different flags carry different implications on performance and behaviour and + are divided into two groups: + - *_UNEXPECTED + - *_ALL + The first group ignores missing keys (present in the checkpoint but missing + in the sharded state dict) which is created in order to avoid inter-rank + metadata exchange. 
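The first-matching-prefix semantics of apply_prefix_mapping, reduced to plain strings as an illustrative sketch (the real helper mutates ShardedTensor/ShardedObject keys in place rather than returning new strings):

def replace_first_matching_prefix(key: str, prefix_map: dict) -> str:
    # Only the first matching prefix is applied; unmatched keys pass through.
    for old, new in prefix_map.items():
        if key.startswith(old):
            return f'{new}{key[len(old):]}'  # str.removeprefix + concat on Python >= 3.9
    return key


prefix_map = {'model.': 'decoder.', 'optimizer.': 'optim.'}
print(replace_first_matching_prefix('model.layers.0.weight', prefix_map))
# decoder.layers.0.weight
print(replace_first_matching_prefix('rng_state', prefix_map))
# rng_state (unchanged)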
Note that the metadata exchange will happen anyway + with `load(..., validate_access_integrity=True)` flag in which case using the + `*_ALL` option is recommended as it provides a more thorough check with no + performance penalty wrt. `*_UNEXPECTED` group. + + All options except for the first one (`ASSUME_OK_UNEXPECTED`) require + extra disk access before the load in order to remove unexpected keys + from the sharded state dict requested to load. + """ + + # Relies on the underlying strategy to raise error on unexpected keys + ASSUME_OK_UNEXPECTED = 'assume_ok_unexpected' + # Logs (with WARNING level) "unexpected" keys. Missing keys are ignored. + # This is treated as a reasonable default for a "non-strict" load + LOG_UNEXPECTED = 'log_unexpected' + # Logs (with WARNING level) all mismatched keys. + LOG_ALL = 'log_all' + # Raise error on unexpected keys before load attempt. + # Gives cleaner error message than `ASSUME_OK_UNEXPECTED` but requires + # extra disk access. + RAISE_UNEXPECTED = 'raise_unexpected' + # Raise error on any mismatch. Similar to `RAISE_UNEXPECTED` but requires + # metadata exchange. + RAISE_ALL = 'raise_all' + # "Unexpected" mismatches are not reported, but returned by the `load` + # function along with the loaded state dict. Missing keys are ignored. + RETURN_UNEXPECTED = 'return_unexpected' + # All mismatches are returned along with the loaded state dict. + RETURN_ALL = 'return_all' + # Simply ignores mismatches (not recommended) + IGNORE_ALL = 'ignore_all' + + @staticmethod + def requires_explicit_ckpt_mismatch_check(val: 'StrictHandling') -> bool: + """Whether a given strict flag involves mismatch check against the checkpoint.""" + return val != StrictHandling.ASSUME_OK_UNEXPECTED + + @staticmethod + def requires_global_app_metadata(val: 'StrictHandling') -> bool: + """Whether a given strict option requires global metadata for validation.""" + return val in ( + StrictHandling.IGNORE_ALL, + StrictHandling.RAISE_ALL, + StrictHandling.RETURN_ALL, + StrictHandling.LOG_ALL, + ) + + @staticmethod + def requires_returning_mismatch_keys(val: 'StrictHandling') -> bool: + """Whether a given strict option results in extra return value from the `load` function.""" + return val in (StrictHandling.RETURN_UNEXPECTED, StrictHandling.RETURN_ALL) + + +def parse_strict_flag(strict: Union[str, StrictHandling]) -> StrictHandling: + """Parse user passed strict flag from a string to StrictHandling instance. + + Args: + strict (str, StrictHandling): strict flag to parse. If already an instance + of StrictHandling, this function is a noop. + + Returns: + StrictHandling: enum instance + """ + if isinstance(strict, StrictHandling): + return strict + try: + return StrictHandling(strict) + except (ValueError, TypeError) as e: + raise ValueError(f'Invalid strict flag: {e}') from e + + +def validate_integrity_and_strict_load( + sharded_state_dict: ShardedStateDict, + strict: StrictHandling, + validate_access_integrity: bool, + local_metadata: Optional[_LocalMetadata] = None, + global_metadata: Optional[_GlobalMetadata] = None, + ckpt_sharded_metadata: Optional['CkptShardedMetadata'] = None, +) -> Tuple[ShardedStateDict, Set[str], Set[str]]: + """Validates sharding integrity and potential mismatches with the checkpoint. + + `validate_access_integrity` controls sharding integrity check (orthogonal + to strictness checking) which verifies `sharded_state_dict` runtime completeness + (in isolation from the actual checkpoint). 
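Assuming megatron.core is importable, a short usage sketch of the strict-flag helpers defined above; the printed values follow directly from the enum definitions.

from megatron.core.dist_checkpointing.validation import StrictHandling, parse_strict_flag

strict = parse_strict_flag('log_unexpected')
assert strict is StrictHandling.LOG_UNEXPECTED

# LOG_UNEXPECTED needs a checkpoint mismatch check, but neither global metadata
# nor an extra return value from load().
print(StrictHandling.requires_explicit_ckpt_mismatch_check(strict))  # True
print(StrictHandling.requires_global_app_metadata(strict))           # False
print(StrictHandling.requires_returning_mismatch_keys(strict))       # False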
+ + `strict` flag controls handling of mismatches between the requested + sharded state dict to load and the actual checkpoint. See `StrictHandling` + docs for details regarding flag behavior and performance implications + (disk interactions or inter-rank communication). + + Args: + sharded_state_dict (ShardedStateDict): sharded state dict to verify. + strict (StrictHandling): flag determining how to handle sharded keys mismatch. + validate_access_integrity (bool): whether to perform sharding validation. + local_metadata (_LocalMetadata, optional): local sharded state dict metadata. + Defaults to None, in which case it's determined based on `sharded_state_dict`. + global_metadata (_GlobalMetadata, optional): global sharded state dict metadata + (exchanged between ranks). Defaults to None, in which case "missing" + keys are not determined. + ckpt_sharded_metadata (CkptShardedMetadata, optional): sharded metadata + from the checkpoint. Defaults to None, which only makes sense + for the `StrictHandling.ASSUME_OK_UNEXPECTED` strict value. + + Returns: + Tuple[ShardedStateDict, Set[str], Set[str]]: tuple of: sharded state dict + without unexpected keys, missing and unexpected keys. Missing keys are equal + on all ranks, unexpected keys might differ across ranks. Additionally, + missing keys might be erroneously empty (depending on `strict` value). + """ + missing_keys, unexpected_keys = [], [] + if StrictHandling.requires_explicit_ckpt_mismatch_check(strict): + if ckpt_sharded_metadata is None: + raise CheckpointingException( + 'Cannot verify checkpoint mismatch with ckpt_sharded_metadata=None.' + ) + if local_metadata is None: + local_metadata = [ + sh_base.without_data() for sh_base in nested_values(sharded_state_dict) + ] + # We don't want to check for missing keys even if we could + _skip_missing_keys = strict in ( + StrictHandling.ASSUME_OK_UNEXPECTED, + StrictHandling.LOG_UNEXPECTED, + StrictHandling.RAISE_UNEXPECTED, + StrictHandling.RETURN_UNEXPECTED, + ) + missing_keys, unexpected_keys = _determine_missing_and_unexpected_keys( + ckpt_sharded_metadata, local_metadata, None if _skip_missing_keys else global_metadata + ) + + sharded_state_dict = adjust_non_strict_load(sharded_state_dict, unexpected_keys) + + if strict == StrictHandling.IGNORE_ALL: + missing_keys, unexpected_keys = [], [] + elif strict in (StrictHandling.RAISE_UNEXPECTED, StrictHandling.RAISE_ALL): + maybe_report_missing_and_unexpected_keys(missing_keys, unexpected_keys, True) + elif strict in (StrictHandling.LOG_UNEXPECTED, StrictHandling.LOG_ALL): + maybe_report_missing_and_unexpected_keys(missing_keys, unexpected_keys, False) + + if validate_access_integrity: + if global_metadata is None: + raise CheckpointingException( + 'Cannot check sharding intergrity without global_metadata (None).' + ) + validate_sharding_integrity(global_metadata) + + return sharded_state_dict, missing_keys, unexpected_keys + + +def verify_checkpoint_and_load_strategy( + checkpoint_dir: str, + sharded_strategy: Union[LoadShardedStrategy, Tuple[str, int], None] = None, + common_strategy: Union[LoadCommonStrategy, Tuple[str, int], None] = None, +) -> Tuple[LoadShardedStrategy, LoadCommonStrategy]: + """Verifies if checkpoint metadata exists and matches given strategies. + + If no strategies are passed, they are determined based on the checkpoint metadata. 
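A condensed, illustrative summary (not the actual control flow) of how each strict mode is treated by validate_integrity_and_strict_load once mismatched keys are known:

from megatron.core.dist_checkpointing.validation import StrictHandling


def describe(strict: StrictHandling) -> str:
    # Rough behavioural summary for illustration only.
    if strict == StrictHandling.ASSUME_OK_UNEXPECTED:
        return 'no mismatch check; rely on the load strategy to fail on unexpected keys'
    if strict == StrictHandling.IGNORE_ALL:
        return 'drop unexpected keys and report nothing'
    if strict in (StrictHandling.RAISE_UNEXPECTED, StrictHandling.RAISE_ALL):
        return 'drop unexpected keys and raise CheckpointingException on mismatch'
    if strict in (StrictHandling.LOG_UNEXPECTED, StrictHandling.LOG_ALL):
        return 'drop unexpected keys and log mismatches with WARNING level'
    return 'drop unexpected keys and return mismatches alongside the state dict'


for mode in StrictHandling:
    print(f'{mode.value:22s} -> {describe(mode)}')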
+ + Args: + checkpoint_dir (str): checkpoint directory + sharded_strategy (LoadShardedStrategy, Tuple[str, int], optional): sharded load strategy to be verified + if compatible with the checkpoint content. If None, the default sharded load strategy + for the checkpoint backend will be returned. + common_strategy (LoadCommonStrategy, Tuple[str, int], optional): common load strategy to be verified + if compatible with the checkpoint content. If None, the default common load strategy + for the checkpoint backend will be returned. + """ + if not Path(checkpoint_dir).exists(): + raise CheckpointingException(f'Checkpoint directory {checkpoint_dir} does not exist') + + saved_config = maybe_load_config(checkpoint_dir) + if saved_config is None: + raise CheckpointingException(f'{checkpoint_dir} is not a distributed checkpoint') + + if sharded_strategy is None: + sharded_strategy = get_default_strategy( + StrategyAction.LOAD_SHARDED, + saved_config.sharded_backend, + saved_config.sharded_backend_version, + ) + elif isinstance(sharded_strategy, tuple): + sharded_strategy = get_default_strategy(StrategyAction.LOAD_SHARDED, *sharded_strategy) + + if common_strategy is None: + common_strategy = get_default_strategy( + StrategyAction.LOAD_COMMON, + saved_config.common_backend, + saved_config.common_backend_version, + ) + elif isinstance(common_strategy, tuple): + sharded_strategy = get_default_strategy(StrategyAction.LOAD_COMMON, *common_strategy) + + sharded_strategy.check_backend_compatibility(saved_config.sharded_backend) + sharded_strategy.check_version_compatibility(saved_config.sharded_backend_version) + common_strategy.check_backend_compatibility(saved_config.common_backend) + common_strategy.check_version_compatibility(saved_config.common_backend_version) + return sharded_strategy, common_strategy + + +def adjust_non_strict_load( + sharded_state_dict: ShardedStateDict, sharded_keys_to_remove: Set[str] +) -> ShardedStateDict: + """Adjusts sharded state dict removing keys not existing in the checkpoint. + + Args: + sharded_state_dict (ShardedStateDict): sharded state dict to modify + sharded_keys_to_remove (Set[str]): keys to remove from the state dict + + Returns: + ShardedStateDict: state dict without ShardedBase objects with specified keys + """ + + def is_unexpected_key(x: ShardedBase): + assert isinstance(x, ShardedBase), f'Unexpected type {type(x)}' + return x.key in sharded_keys_to_remove + + _, sharded_state_dict = extract_matching_values(sharded_state_dict, is_unexpected_key) + return sharded_state_dict + + +def _determine_missing_and_unexpected_keys( + ckpt_sharded_metadata: 'CkptShardedMetadata', + local_metadata: _LocalMetadata, + global_metadata: Optional[_GlobalMetadata] = None, +) -> Tuple[Set[str], Set[str]]: + """Determines load mismatches based on metadata. + + There is an asymmetry between "unexpected" and "missing" keys. + Unexpected keys can be determined based only on local metadata. + Missing keys must be based on global metadata, since other ranks might access + different keys than the current rank. + In consequence, the return value of this function is different on each rank: + "missing_keys" are equal, but "unexpected_keys" might differ across ranks. 
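The asymmetry described above, reduced to plain set arithmetic: unexpected keys need only this rank's requested keys, while missing keys need the union over all ranks. The key names here are invented for illustration.

ckpt_keys = {'embedding.weight', 'decoder.weight', 'decoder.bias'}

local_requested = {'embedding.weight', 'decoder.weight', 'rng_state'}  # this rank
all_ranks_requested = [
    {'embedding.weight', 'decoder.weight', 'rng_state'},  # rank 0
    {'embedding.weight', 'decoder.weight'},               # rank 1
]

unexpected_keys = local_requested - ckpt_keys        # may differ per rank
global_requested = set().union(*all_ranks_requested)
missing_keys = ckpt_keys - global_requested          # identical on every rank

print(unexpected_keys)  # {'rng_state'}
print(missing_keys)     # {'decoder.bias'}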
+ + Args: + ckpt_sharded_metadata (CkptShardedMetadata): sharded state dict (without data) + constructed based on the checkpoint content + local_metadata (_LocalMetadata): list of local ShardedBase objects + requested to be loaded by this rank + global_metadata (_GlobalMetadata, optional): list of global ShardedBase objects + requested to be loaded by all ranks. Defaults to None, in which case + returned "missing" keys are empty. + + Returns: + Tuple[Set[str], Set[str]]: missing and unexpected keys. Missing keys are equal + on all ranks, unexpected keys might differ across ranks. If passed + `global_metadata` is empty, returned missing keys are empty as well. + + """ + local_accessed_keys = set(sh_base.key for sh_base in local_metadata) + ckpt_keys = set(sh_base.key for sh_base in ckpt_sharded_metadata.values()) + unexpected_keys = local_accessed_keys - ckpt_keys + if global_metadata is not None: + global_accessed_keys = set( + sh_base.key for rank_metadata in global_metadata for sh_base in rank_metadata + ) + missing_keys = ckpt_keys - global_accessed_keys + else: + missing_keys = set() + + if missing_keys: + logger.debug(f'Dist ckpt load missing keys: {missing_keys}') + if unexpected_keys: + logger.debug(f'Dist ckpt load unexpected keys: {unexpected_keys}') + + return missing_keys, unexpected_keys + + +def maybe_report_missing_and_unexpected_keys( + missing_keys: Set[str], unexpected_keys: Set[str], raise_error: bool = True +) -> None: + """Raises or logs an error in case missing or unexpected keys are non-empty. + + Args: + missing_keys (Set[str]): missing keys in the state dict + unexpected_keys (Set[str]): unexpected keys in the state dict + raise_error: If True, raises error on mismatch. Otherwise, logs mismatch + with WARNING level. + + Returns: + None + + Raises: + CheckpointingException: if `raise_error` is True and at least one of + `missing_keys` or `unexpected_keys` are non-empty. + """ + if not missing_keys and not unexpected_keys: + return + missing_title_msg = ( + f'Some keys found in the checkpoint are missing in the provided sharded state dict. ' + ) + missing_body_msg = f'Missing keys (for all ranks): {missing_keys}. ' + unexpected_title_msg = f'Unexpected keys (not found in the checkpoint) encountered in the provided sharded state dict. ' + unexpected_body_msg = f'Unexpected keys (for this rank): {unexpected_keys}. ' + error_msg = '' + if missing_keys: + error_msg += missing_title_msg + if unexpected_keys: + error_msg += unexpected_title_msg + + error_msg += '\n' + if missing_keys: + error_msg += missing_body_msg + if unexpected_keys: + error_msg += unexpected_body_msg + + if raise_error: + raise CheckpointingException(error_msg) + else: + logger.warning(error_msg) + + +def validate_sharding_integrity(global_metadata: _GlobalMetadata) -> None: + """Validate if the ShardedTensors and ShardedObjects from multiple processes define correct sharding. + + Local ShardedTensors and ShardedObject metadata is exchanged with `torch.distributed.all_gather_object` + and then process with global rank 0 checks if main replicas of the shards: + - cover the whole global tensors + - don't overlap + + Args: + global_metadata (_GlobalMetadata): ShardedTensor and ShardedObject objects from all ranks. 
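A toy version of the coverage check performed below: count how many main replicas write each chunk of a tensor's global shard grid and require that every chunk is written exactly once. The shard layout here is invented for illustration.

import torch

# (rank, chunk offset in the global shard grid) for main replicas of one key.
main_replica_chunks = [(0, (0, 0)), (1, (0, 1)), (2, (1, 0)), (3, (1, 1))]

axis_fragmentations = (2, 2)  # the global tensor is split into a 2x2 grid of shards
shard_access_cnt = torch.zeros(axis_fragmentations, dtype=torch.int)
for _rank, chunk_offset in main_replica_chunks:
    shard_access_cnt[chunk_offset] += 1

# Holes (count 0) or overlaps (count > 1) would indicate an invalid access pattern.
assert torch.all(shard_access_cnt == 1), f'Invalid access pattern: {shard_access_cnt}'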
+ + Returns: + None + + Raises: + CheckpointingException for invalid access pattern + """ + if torch.distributed.get_rank() != 0: + return + + key_shardings = defaultdict(list) + for rank, rank_shardings in enumerate(global_metadata): + for sharding in rank_shardings: + key_shardings[sharding.key].append((rank, sharding)) + for key, shardings in key_shardings.items(): + if isinstance(shardings[0][1], ShardedObject): + _validate_objects_for_key(shardings) + else: + _validate_sharding_for_key(shardings) + + +def _validate_sharding_for_key(rank_sharding: List[Tuple[int, ShardedTensor]]): + some_rank_shard = rank_sharding[0][1] + global_shape = some_rank_shard.global_shape + local_shape = some_rank_shard.local_shape + dtype = some_rank_shard.dtype + has_flattened_range = some_rank_shard.flattened_range is not None + for rank, sharding in rank_sharding: + assert sharding.dtype == dtype, (sharding.dtype, dtype, some_rank_shard) + assert sharding.global_shape == global_shape, ( + sharding.global_shape, + global_shape, + some_rank_shard, + ) + assert sharding.local_shape == local_shape, ( + sharding.local_shape, + local_shape, + some_rank_shard, + ) + assert (sharding.flattened_range is not None) == has_flattened_range, ( + (sharding.flattened_range is not None), + has_flattened_range, + some_rank_shard, + ) + + shard_access_cnt = _compute_shards_access(rank_sharding) + if has_flattened_range: + map_reduce( + rank_sharding, + lambda x: x[1].global_offset, + lambda x: x[1], + _validate_sharding_for_key_flattened, + ) + else: + if not torch.all(shard_access_cnt == 1): + logger.error(f'Invalid access pattern for {rank_sharding[0][1]}: {shard_access_cnt}') + raise CheckpointingException(f'Invalid access pattern for {rank_sharding[0][1]}') + + +def _compute_shards_access(rank_sharding): + shard_access_cnt = torch.zeros( + rank_sharding[0][1].axis_fragmentations, dtype=torch.int, device='cpu' + ) + for rank, sharding in rank_sharding: + if is_main_replica(sharding.replica_id): + shard_access_cnt[sharding.local_chunk_offset_in_global()] += 1 + return shard_access_cnt + + +def _validate_sharding_for_key_flattened(tensors_by_shard): + all_slices = [] + local_shape = tensors_by_shard[0].local_shape + for sharding in tensors_by_shard: + assert sharding.local_shape == local_shape + sharding: ShardedTensor + if not is_main_replica(sharding.replica_id): + continue + + all_slices.append((sharding.flattened_range.start, sharding.flattened_range.stop)) + + starts, stops = map(np.asarray, zip(*sorted(all_slices))) + if ( + starts[0] != 0 + or stops[-1] != np.product(local_shape) + or not np.all(starts[1:] == stops[:-1]) + ): + logger.error( + f'Flattened ranges dont cover the whole shard {tensors_by_shard[0]}. Ranges: {(starts, stops)}' + ) + raise CheckpointingException( + f'Flattened ranges dont cover the whole shard {tensors_by_shard[0]}. 
Ranges: {(starts, stops)}' + ) + + +def _validate_objects_for_key(sharded_objects: List[ShardedObject]): + """Ensure uniqueness of saved objects.""" + unique_keys = [ + sh_obj.unique_key for _, sh_obj in sharded_objects if is_main_replica(sh_obj.replica_id) + ] + if len(unique_keys) != len(set(unique_keys)): + duplicates = {k: cnt for k, cnt in Counter(unique_keys).items() if cnt > 1} + logger.error(f'Duplicate ShardedObject keys and counts: {duplicates}') + raise CheckpointingException(f'Duplicate ShardedObject keys: {list(duplicates.keys())}') + expected_shard_num = np.prod(sharded_objects[0][1].global_shape) + if len(unique_keys) != expected_shard_num: + err_msg = f'Invalid access pattern: {expected_shard_num - len(unique_keys)} ShardedObject are missing.' + logger.error(f'{err_msg} Existing shards: {unique_keys}') + raise CheckpointingException(err_msg) + + +def determine_global_metadata( + sharded_state_dict: ShardedStateDict, +) -> Tuple[_LocalMetadata, _GlobalMetadata]: + """Exchanges local metadata with `all_gather_object` to determine global metadata. + + Args: + sharded_state_dict (ShardedStateDict): local sharded state dict + + Returns: + Tuple[_LocalMetadata, _GlobalMetadata]: local and global ShardedBase objects with stripped data + """ + local_metadata = [ten.without_data() for ten in nested_values(sharded_state_dict)] + global_metadata = [None] * torch.distributed.get_world_size() + torch.distributed.all_gather_object(global_metadata, local_metadata) + return local_metadata, global_metadata + + +def validate_sharded_objects_handling( + sharded_strategy: Union[SaveShardedStrategy, LoadShardedStrategy], + common_strategy: Union[SaveCommonStrategy, LoadCommonStrategy], +) -> None: + """Checks if either of the passed strategies can handle sharded objects. + + Args: + sharded_strategy (Union[SaveShardedStrategy, LoadShardedStrategy]): sharded strategy used for saving/loading + common_strategy (Union[SaveCommonStrategy, LoadCommonStrategy]): common strategy used for saving/loading + + Returns: + None + + Raises: + CheckpointingException: if both strategies can't handle ShardedObjects + """ + if ( + not sharded_strategy.can_handle_sharded_objects + and not common_strategy.can_handle_sharded_objects + ): + raise CheckpointingException( + f'Either sharded strategy or common strategy must implement ShardedObjects handling.' + f' Both {sharded_strategy} and {common_strategy} specify can_handle_sharded_objects=False' + ) diff --git a/megatron/core/distributed/__init__.py b/megatron/core/distributed/__init__.py new file mode 100644 index 0000000000..e43ae115ae --- /dev/null +++ b/megatron/core/distributed/__init__.py @@ -0,0 +1,10 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +from .distributed_data_parallel import DistributedDataParallel +from .distributed_data_parallel_config import DistributedDataParallelConfig +from .finalize_model_grads import finalize_model_grads + +# For backwards compatibility. ParamAndGradBuffer will be deprecated in future release. +# ParamAndGradBuffer (which is an alias of _ParamAndGradBuffer) is not intended to be +# consumed directly by external code. +from .param_and_grad_buffer import ParamAndGradBuffer diff --git a/megatron/core/distributed/distributed_data_parallel.py b/megatron/core/distributed/distributed_data_parallel.py new file mode 100644 index 0000000000..6e5bbd96d7 --- /dev/null +++ b/megatron/core/distributed/distributed_data_parallel.py @@ -0,0 +1,485 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
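A runnable sketch of the metadata exchange that determine_global_metadata in validation.py performs: each rank contributes its local, data-free shard descriptions and receives the per-rank lists back. Assumes an initialized process group, e.g. launched with torchrun --nproc_per_node=2; the payload strings are placeholders.

import torch

torch.distributed.init_process_group(backend='gloo')  # env:// rendezvous under torchrun
rank = torch.distributed.get_rank()

local_metadata = [f'param_{rank}.weight']  # stand-in for sh_base.without_data()
global_metadata = [None] * torch.distributed.get_world_size()
torch.distributed.all_gather_object(global_metadata, local_metadata)

if rank == 0:
    print(global_metadata)  # e.g. [['param_0.weight'], ['param_1.weight']]
torch.distributed.destroy_process_group()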
+ +import logging +from contextlib import contextmanager + +import torch + +from .. import parallel_state +from ..config_logger import has_config_logger_enabled, log_config_to_disk +from ..transformer.module import MegatronModule +from ..transformer.transformer_config import TransformerConfig +from ..utils import is_float8tensor, log_single_rank +from .distributed_data_parallel_config import DistributedDataParallelConfig +from .param_and_grad_buffer import _ParamAndGradBuffer, partition_buckets + +logger = logging.getLogger(__name__) + + +class DistributedDataParallel(MegatronModule): + """ + DDP wrapper which stores grads in contiguous buffers. Also has option of overlapping + communication with backprop computation by breaking up full model's gradients into smaller + buckets and running all-reduce / reduce-scatter on each bucket asynchronously. This class + also provides the option to do the gradient accumulation in a type other than the param type + (e.g., fp32 for a bf16 model). + + Args: + config: Transformer config object. + ddp_config: DistributedDataParallel config object. + module: Underlying model. + disable_bucketing: If true, force assign all parameters to a single bucket. If false, + use standard bucketing policy: assign parameters to smaller buckets and all-reduce + per bucket _if_ overlap_grad_reduce is True and pp_rank is 0. + + """ + + def __init__( + self, + config: TransformerConfig, + ddp_config: DistributedDataParallelConfig, + module: torch.nn.Module, + disable_bucketing: bool = False, + ): + super().__init__(config=config) + if has_config_logger_enabled(config): + log_config_to_disk(config, locals(), prefix=type(self).__name__) + + self.module = module + + # If bucket_size is not provided as an input, use sane default. + # If using very large dp_sizes, make buckets larger to ensure that chunks used in NCCL + # ring-reduce implementations are large enough to remain bandwidth-bound rather than + # latency-bound. + if ddp_config.bucket_size is None: + ddp_config.bucket_size = max( + 40000000, 1000000 * parallel_state.get_data_parallel_world_size() + ) + # Set bucket_size to infinity if overlap_grad_reduce is False. + if not ddp_config.overlap_grad_reduce: + ddp_config.bucket_size = None + + self.ddp_config = ddp_config + log_single_rank( + logger, + logging.INFO, + f'Setting up DistributedDataParallel with config {self.ddp_config}', + ) + + # Turn off bucketing if we are on a pipeline stage that is not the first (since + # data-parallel communication on these stages is not on the critical path), or if + # disable_bucketing is True (e.g., we might not want to break up model parameters + # into buckets for model chunks after the first in the interleaved schedule). + self.bucket_size = self.ddp_config.bucket_size + if parallel_state.get_pipeline_model_parallel_rank() > 0: + self.bucket_size = None + if disable_bucketing: + self.bucket_size = None + + self.param_to_bucket_group = {} + + # Group parameters by their gradient type. 
+ param_to_name = {} + dense_params = [] + expert_parallel_params = [] + self.params_with_grad = [] + for name, param in self.module.named_parameters(): + if not param.requires_grad: + continue + + # Track params with grad to enable direct setting + # of param.grad_added_to_main_grad + self.params_with_grad.append(param) + + param.grad_added_to_main_grad = False + param_to_name[param] = name + + if getattr(param, 'allreduce', True): + dense_params.append(param) + else: + expert_parallel_params.append(param) + + def _allocate_buffers_for_parameters( + input_params, data_parallel_group, gradient_scaling_factor + ): + param_and_grad_dtype_to_params = {} + param_and_grad_dtype_to_offsets = {} + param_and_grad_dtype_to_indices = {} + + # Group parameters by their gradient type. + for param in input_params: + assert param.requires_grad + + param_dtype = param.dtype + if is_float8tensor(param): + # Currently TE's Float8Tensor is a wrapper of torch.Tensor. It has a "fake" + # dtype (usually a higher precision dtype such as bfloat16), but its actual + # data is stored in the form of a torch uint8 tensor within the Float8Tensor's + # ".data" attribute. Therefore, when creating the param buffer for fp8 params, + # it is necessary to use torch.uint8, not the "fake" dtype got from + # "param.dtype". + param_dtype = torch.uint8 + grad_dtype = torch.float if self.ddp_config.grad_reduce_in_fp32 else param.dtype + + params = param_and_grad_dtype_to_params.get((param_dtype, grad_dtype), []) + params.append(param) + param_and_grad_dtype_to_params[(param_dtype, grad_dtype)] = params + + # Get the index of each param among the params with same dtype, if a param is fp8, + # use its "fake" high precision dtype to find which params have same dtype with it. + # For example: + # Case 1: + # params = [p1(bf16), p2(bf16), p3(bf16), p4(bf16)] + # param_and_grad_dtype_to_indices = { + # (torch.bfloat16, torch.float32): [0, 1, 2, 3], + # } + # Case 2: + # params = [p1(bf16), p2(fp8), p3(fp8), p4(bf16)] + # param_and_grad_dtype_to_indices = { + # (torch.bfloat16, torch.float32): [0, 3], + # (torch.uint8, torch.float32): [1, 2], + # } + # We need these indices to load a non-native-fp8 checkpoint in native-fp8 mode. + offset = param_and_grad_dtype_to_offsets.get((param.dtype, grad_dtype), 0) + param_and_grad_dtype_to_offsets[(param.dtype, grad_dtype)] = offset + 1 + indices = param_and_grad_dtype_to_indices.get((param_dtype, grad_dtype), []) + indices.append(offset) + param_and_grad_dtype_to_indices[(param_dtype, grad_dtype)] = indices + + if not config.calculate_per_token_loss: + target_gradient_scaling_factor = 1.0 / parallel_state.get_data_parallel_world_size( + with_context_parallel=True + ) + if self.ddp_config.average_in_collective: + # Collective is averaging gradients in collective with data_parallel_group. + assert ( + gradient_scaling_factor + / torch.distributed.get_world_size(group=data_parallel_group) + == target_gradient_scaling_factor + ) + else: + assert gradient_scaling_factor == target_gradient_scaling_factor + + # Allocate the grad buffers and map the grads. 
+ buffers = [] + for (param_dtype, grad_dtype), params in param_and_grad_dtype_to_params.items(): + buffers.append( + _ParamAndGradBuffer( + self.ddp_config, + param_dtype, + grad_dtype, + params, + data_parallel_group, + self.bucket_size, + param_to_name, + gradient_scaling_factor, + param_and_grad_dtype_to_indices[(param_dtype, grad_dtype)], + ) + ) + + # In some scenarios, we want to put buckets from different buffers into a group so that + # their communication can be aggregated. For example, when there are both fp8 buffers + # and bf16 buffers in the model and vpp is enabled, each model chunk will have an fp8 + # bucket and a bf16 bucket, which doubles the number of communication kernels, and + # because of the use of CUDA_DEVICE_MAX_CONNECTIONS=1, having multiple back-to-back + # communications will prevent the overlap of the communication kernels with computation + # kernels. + # If bucketing is explicitly disabled, then put all buckets in a buffer into a single + # bucket group. + bucket_groups = partition_buckets(buffers, force_single_bucket_group=disable_bucketing) + + # Set `next_param_gather_bucket_group` for different bucket groups by iterating through + # buckets in reverse order (since all-gathers happen in reverse order of buckets). + if self.ddp_config.use_distributed_optimizer and self.ddp_config.overlap_param_gather: + num_bucket_groups = len(bucket_groups) + for i in range(1, num_bucket_groups): + bucket_groups[num_bucket_groups - i].next_param_gather_bucket_group = ( + bucket_groups[num_bucket_groups - i - 1] + ) + + # Create map from param to bucket group, used in pre_hook. + for bucket_group in bucket_groups: + for bucket in bucket_group.buckets: + for param in bucket.params_list: + self.param_to_bucket_group[param] = bucket_group + + return buffers, bucket_groups + + if config.calculate_per_token_loss: + gradient_scaling_factor = 1.0 + expert_gradient_scaling_factor = 1.0 + else: + if self.ddp_config.average_in_collective: + gradient_scaling_factor = 1.0 + expert_gradient_scaling_factor = ( + 1.0 / parallel_state.get_expert_model_parallel_world_size() + ) + else: + data_parallel_world_size = parallel_state.get_data_parallel_world_size( + with_context_parallel=True + ) + gradient_scaling_factor = 1.0 / data_parallel_world_size + expert_gradient_scaling_factor = 1.0 / data_parallel_world_size + + # Allocate the param+grad buffers for dense params' grads. + self.buffers, self.bucket_groups = _allocate_buffers_for_parameters( + dense_params, + parallel_state.get_data_parallel_group(with_context_parallel=True), + gradient_scaling_factor=gradient_scaling_factor, + ) + + # Allocate separate param+grad buffers for expert parallel params' grads. + self.expert_parallel_buffers, self.expert_parallel_bucket_groups = ( + _allocate_buffers_for_parameters( + expert_parallel_params, + parallel_state.get_data_modulo_expert_parallel_group(with_context_parallel=True), + gradient_scaling_factor=expert_gradient_scaling_factor, + ) + ) + + # Delete references to weight_tensor if they exist since we don't want two parameter copies + # if we re-mapped parameters (which happens when we use the distributed optimizer). + # This is a temporary workaround around a TE bug that is fixed with + # https://github.com/NVIDIA/TransformerEngine/pull/719. + if self.ddp_config.use_distributed_optimizer: + + @torch.no_grad() + def unmap_weight_tensor(m): + if hasattr(m, 'weight_tensor'): + m.weight_tensor = None + + self.module.apply(unmap_weight_tensor) + + # Register backward hook. 
+ # Accumulation function for the gradients need to be stored so they + # don't go out of scope. + self.grad_accs = [] + for param in self.module.parameters(): + if param.requires_grad: + # Expand so we get access to grad_fn. + param_tmp = param.expand_as(param) + # Get the gradient accumulator function. + grad_acc = param_tmp.grad_fn.next_functions[0][0] + grad_acc.register_hook(self._make_backward_post_hook(param)) + self.grad_accs.append(grad_acc) + + self.use_forward_hook = ( + self.ddp_config.use_distributed_optimizer and self.ddp_config.overlap_param_gather + ) + self.remove_forward_pre_hook_handles = {} + if self.use_forward_hook: + self.enable_forward_pre_hook() + self.overlap_param_gather_with_optimizer_step = False + + def enable_forward_pre_hook(self): + """ + Enable forward pre-hooks needed for param all-gather overlap with forward compute. + """ + assert self.use_forward_hook + assert len(self.remove_forward_pre_hook_handles) == 0 + # Register forward pre-hook for all sub-modules. + for module in self.module.modules(): + self.remove_forward_pre_hook_handles[module] = module.register_forward_pre_hook( + self._make_forward_pre_hook() + ) + + def disable_forward_pre_hook(self): + """ + Disable forward pre-hooks needed for param all-gather overlap with forward compute. + """ + assert self.use_forward_hook + # De-register forward pre-hook for all sub-modules. + for module in self.module.modules(): + assert self.remove_forward_pre_hook_handles[module] is not None + self.remove_forward_pre_hook_handles[module].remove() + del self.remove_forward_pre_hook_handles[module] + assert len(self.remove_forward_pre_hook_handles) == 0 + + # Force synchronize parameters. + self.start_param_sync(force_sync=True) + + def forward(self, *inputs, **kwargs): + """ + Calls the wrapped module's forward() method. + """ + return self.module(*inputs, **kwargs) + + def _make_forward_pre_hook(self): + """ + Create a forward pre-hook to wait on all-gather handles when necessary (i.e., + when a module uses a parameter in a bucket with a still incomplete all-gather). + """ + + def hook(module, *unused): + assert ( + self.use_forward_hook + ), "Should use pre-hook only when overlap_param_gather is True" + + # Make sure all parameters in this module have been all-gathered as necessary. + for param in module.parameters(recurse=False): + # Skip parameters without an associated buffer (such parameters have a + # .requires_grad field equal to False). + if param not in self.param_to_bucket_group: + continue + assert param.requires_grad + + # If aligning param all-gather across pipeline stages, all-gather is dispatched + # by start_param_sync calls in core/pipeline_parallelism/schedules.py. + # If overlapping param all-gather with optimizer step, then all-gather has + # already been dispatched in optimizer step. + skip_next_bucket_dispatch = ( + self.ddp_config.align_param_gather + or self.overlap_param_gather_with_optimizer_step + ) + self.param_to_bucket_group[param].finish_param_sync( + skip_next_bucket_dispatch=skip_next_bucket_dispatch + ) + + return hook + + def _make_backward_post_hook(self, param: torch.nn.Parameter): + """ + Creates a backward post-hook to dispatch an all-reduce / reduce-scatter when + ready (i.e., when all grads in a bucket have been computed in all microbatches + in a batch). 
+ """ + + def hook(*unused): + if param in self.param_to_bucket_group: + assert param.requires_grad + if self.ddp_config.overlap_grad_reduce: + assert ( + param.grad is not None + ), 'param.grad being None is not safe when overlap_grad_reduce is True' + if param.grad is not None and ( + not param.grad_added_to_main_grad or getattr(param, 'zero_out_wgrad', False) + ): + param.main_grad.add_(param.grad.data) + param.grad = None + + if self.ddp_config.overlap_grad_reduce: + self.param_to_bucket_group[param].register_grad_ready(param) + + return hook + + @contextmanager + def no_sync(self): + """ + Context manager that turns off gradient synchronization. + """ + for bucket_group in self.bucket_groups + self.expert_parallel_bucket_groups: + bucket_group.is_last_microbatch = False + try: + yield + finally: + for bucket_group in self.bucket_groups + self.expert_parallel_bucket_groups: + bucket_group.is_last_microbatch = True + + def start_param_sync(self, *unused, force_sync: bool = False, force_dispatch: bool = False): + """ + Initiates param sync (all-gather) communication operations for all model parameters. + + By default, when overlap_param_gather is set to True, dispatches asynchronous communication + calls; when overlap_param_gather is set to False, calls synchronous communication + ops. Can override this default behavior using flags below. + + Args: + force_sync (bool, optional): force synchronous collective regardless of + other settings. + force_dispatch (bool, optional): force dispatch regardless of other settings. + """ + if not force_sync: + # If overlapping param AG with optimizer step, AG should not be dispatched again + # in forward_backward_step. + if self.overlap_param_gather_with_optimizer_step and not force_dispatch: + return + + for bucket_group in self.bucket_groups + self.expert_parallel_bucket_groups: + bucket_group.start_param_sync(force_sync=force_sync) + + def start_grad_sync(self, *unused): + """ + Initiates grad sync (all-reduce or reduce-scatter) communication operations + for all model gradients. + + When overlap_grad_reduce is set to True, dispatches asynchronous communication + calls. When overlap_grad_reduce is set to False, calls synchronous + communication ops. + """ + for bucket_group in self.bucket_groups + self.expert_parallel_bucket_groups: + bucket_group.start_grad_sync() + + def finish_grad_sync(self): + """ + Finishes grad sync (all-reduce or reduce-scatter) communication operations + for all model gradients. + + When overlap_grad_reduce is set to True, waits for asynchronous communication + calls to complete. When overlap_grad_reduce is set to False, calls synchronous + communication ops. + """ + for bucket_group in self.bucket_groups + self.expert_parallel_bucket_groups: + bucket_group.finish_grad_sync() + + def scale_gradients(self, scaling_factor: float): + """Scale all gradients inside the buffers by `scaling_factor`.""" + for buffer in self.buffers + self.expert_parallel_buffers: + buffer.scale_gradients(scaling_factor) + + def zero_grad_buffer(self): + """ + Zeros out all grad buffers. Needs to be called at the beginning of each + training iteration. + """ + for param in self.params_with_grad: + param.grad_added_to_main_grad = False + for buffer in self.buffers + self.expert_parallel_buffers: + buffer.reset() + for bucket_group in self.bucket_groups + self.expert_parallel_bucket_groups: + bucket_group.reset() + + def broadcast_params(self): + """ + Syncs parameters across all DP ranks. 
+ """ + for param in self.module.parameters(): + is_expert_parallel = not getattr(param, 'allreduce', True) + + if is_expert_parallel: + data_parallel_group = parallel_state.get_data_modulo_expert_parallel_group( + with_context_parallel=True + ) + else: + data_parallel_group = parallel_state.get_data_parallel_group( + with_context_parallel=True + ) + torch.distributed.broadcast( + param.data, + src=torch.distributed.get_global_rank(data_parallel_group, 0), + group=data_parallel_group, + ) + + def state_dict(self, prefix='', keep_vars=False): + """ + Returns a dictionary containing references to the whole state of the + wrapped module. + + Both parameters and persistent buffers (e.g. running averages) are included. + Keys are corresponding parameter and buffer names. Parameters and buffers + set to None are not included. + """ + return self.module.state_dict(prefix=prefix, keep_vars=keep_vars) + + def state_dict_for_save_checkpoint(self, prefix='', keep_vars=False): + """ + Returns wrapped module's state_dict for checkpoint saving. + """ + return self.module.state_dict_for_save_checkpoint(prefix=prefix, keep_vars=keep_vars) + + def load_state_dict(self, state_dict, strict=True): + """ + Copies parameters and buffers from state_dict into the wrapped module and its + descendants. If strict is True, then the keys of state_dict must exactly match + the keys returned by this module’s state_dict() function. + """ + self.module.load_state_dict(state_dict, strict=strict) diff --git a/megatron/core/distributed/distributed_data_parallel_config.py b/megatron/core/distributed/distributed_data_parallel_config.py new file mode 100644 index 0000000000..14068ea367 --- /dev/null +++ b/megatron/core/distributed/distributed_data_parallel_config.py @@ -0,0 +1,44 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +from dataclasses import dataclass +from typing import Optional + + +@dataclass +class DistributedDataParallelConfig: + """Configuration for DistributedDataParallel.""" + + grad_reduce_in_fp32: bool = False + """If true, reduce grads in fp32.""" + + overlap_grad_reduce: bool = False + """If true, overlap grad all-reduce / reduce-scatter with backward compute.""" + + overlap_param_gather: bool = False + """If true, overlap param all-gather with forward compute.""" + + align_param_gather: bool = False + """If true, all PP stages will launch param all-gathers simultaneously. Otherwise, each + PP stage will independently launch as needed. + """ + + use_distributed_optimizer: bool = False + """If true, issue reduce-scatter collectives to aggregate gradients and clean up + originally allocated model parameters, otherwise issue all-reduce collectives. + """ + + check_for_nan_in_grad: bool = False + """ If true, check for NaNs in gradients _before_ communication collective.""" + + bucket_size: Optional[int] = None + """Maximum number of parameters in each bucket. 
If unspecified, MCore uses a default + value of max(40000000, 1000000 * dp_size) parameters (larger DP sizes need larger + buckets to ensure collectives do not become latency-bound).""" + + average_in_collective: bool = False + """If true, compute average in collective directly, as opposed to dividing by the + dp_size first and then computing sum in the collective.""" + + fp8_param_gather: bool = False + """If true, keep the compute param in fp8 (do not use any other intermediate dtype) and + perform the param all-gather in fp8.""" diff --git a/megatron/core/distributed/finalize_model_grads.py b/megatron/core/distributed/finalize_model_grads.py new file mode 100644 index 0000000000..ff5046afa5 --- /dev/null +++ b/megatron/core/distributed/finalize_model_grads.py @@ -0,0 +1,162 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +from typing import List, Optional + +import torch +from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors + +from .. import parallel_state +from ..transformer.transformer_config import TransformerConfig +from ..utils import get_attr_wrapped_model, get_model_config + + +def _allreduce_word_embedding_grads(model: List[torch.nn.Module], config: TransformerConfig): + """ + All-reduce word embedding grads. + + Reduce grads across first and last stages to ensure that word_embeddings parameters stay in + sync. + """ + + if ( + parallel_state.is_rank_in_embedding_group(ignore_virtual=True) + and torch.distributed.get_world_size(parallel_state.get_embedding_group()) > 1 + ): + if parallel_state.is_pipeline_first_stage(ignore_virtual=True): + model_module = model[0] + elif parallel_state.is_pipeline_last_stage(ignore_virtual=True): + model_module = model[-1] + else: # We do not support an interleaved schedule for models with encoders yet. + model_module = model[0] + + model_module = get_attr_wrapped_model(model_module, 'pre_process', return_model_obj=True) + if model_module.share_embeddings_and_output_weights: + weight = model_module.shared_embedding_or_output_weight() + grad = weight.main_grad + torch.distributed.all_reduce(grad, group=parallel_state.get_embedding_group()) + + +def _allreduce_position_embedding_grads(model: List[torch.nn.Module], config: TransformerConfig): + """ + All-reduce position_embeddings grad across encoder and decoder stages to ensure that position + embeddings parameters stay in sync. + """ + if ( + parallel_state.is_rank_in_position_embedding_group() + and torch.distributed.get_world_size(parallel_state.get_position_embedding_group()) > 1 + ): + if parallel_state.is_pipeline_first_stage(ignore_virtual=True): + model_module = model[0] + elif parallel_state.is_pipeline_last_stage(ignore_virtual=True): + model_module = model[-1] + else: # We do not support an interleaved schedule for models with encoders yet. + model_module = model[0] + + model_module = get_attr_wrapped_model(model_module, 'pre_process', return_model_obj=True) + assert hasattr(model_module, 'position_embeddings') + grad = model_module.position_embeddings.weight.main_grad + torch.distributed.all_reduce(grad, group=parallel_state.get_position_embedding_group()) + + +def _allreduce_embedding_grads(model: List[torch.nn.Module], config: TransformerConfig): + """ + All-reduce both word and position embeddings. 
+ """ + _allreduce_word_embedding_grads(model, config) + _allreduce_position_embedding_grads(model, config) + + +def _allreduce_layernorm_grads(model: List[torch.nn.Module], config: TransformerConfig): + """ + All-reduce layernorm grads (for sequence parallelism). + """ + + # All-reduce layernorm parameters across model parallel nodes + # when sequence parallelism is used + if parallel_state.get_tensor_model_parallel_world_size() > 1 and ( + config.sequence_parallel or config.qk_layernorm + ): + grads = [] + for model_chunk in model: + for name, param in get_attr_wrapped_model(model_chunk, 'named_parameters')(): + if ( + param.requires_grad + and getattr(param, 'sequence_parallel', False) + or 'q_layernorm' in name + or 'k_layernorm' in name + ): + grad = param.main_grad + grads.append(grad.data) + if grads: + coalesced = _flatten_dense_tensors(grads) + torch.distributed.all_reduce( + coalesced, group=parallel_state.get_tensor_model_parallel_group() + ) + for buf, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)): + buf.copy_(synced) + + +def finalize_model_grads(model: List[torch.nn.Module], num_tokens: Optional[torch.Tensor] = None): + """ + All-reduce all model grads across DP replicas, layernorm grads for sequence parallelism, + embedding grads across first and last pipeline stages (if not tied), + scale gradients by `num_tokens`. + """ + + config = get_model_config(model[0]) + + # All-reduce / reduce-scatter across DP replicas. + if config.timers is not None: + config.timers('all-grads-sync', log_level=1).start(barrier=config.barrier_with_L1_time) + for model_chunk in model: + model_chunk.finish_grad_sync() + if config.timers is not None: + config.timers('all-grads-sync').stop() + + # All-reduce layer-norm grads (for sequence parallelism). + if config.timers is not None: + config.timers('layernorm-grads-all-reduce', log_level=1).start( + barrier=config.barrier_with_L1_time + ) + _allreduce_layernorm_grads(model, config) + if config.timers is not None: + config.timers('layernorm-grads-all-reduce').stop() + + # All-reduce embedding grads (for pipeline parallelism). + if config.timers is not None: + config.timers('embedding-grads-all-reduce', log_level=1).start( + barrier=config.barrier_with_L1_time + ) + _allreduce_embedding_grads(model, config) + if config.timers is not None: + config.timers('embedding-grads-all-reduce').stop() + + # normalize gradients for per-token loss normalization. + # if we are using by the number of tokens, then we use that as a divisor. this number + # will be the total number of non-padded tokens in the global batch. + if num_tokens is not None: + + # the number of tokens is only present on the last stage, so broadcast it + # to the other ranks in the pipeline parallel group. + last_rank = parallel_state.get_pipeline_model_parallel_last_rank() + pp_group = parallel_state.get_pipeline_model_parallel_group() + + if not isinstance(last_rank, list): + assert not isinstance(last_rank, list) + last_rank = [last_rank] + assert not isinstance(pp_group, list) + pp_group = [pp_group] + + # need to do a broadcast for every pp group, even though num_tokens should be the same. + num_tokens_list = [] + for lr, group in zip(last_rank, pp_group): + torch.distributed.broadcast(num_tokens, src=lr, group=group) + num_tokens_list.append(torch.clone(num_tokens)) + assert all(x.item() == num_tokens_list[0] for x in num_tokens_list) + + # all-reduce across DP ranks. 
+ torch.distributed.all_reduce(num_tokens, group=parallel_state.get_data_parallel_group()) + for model_chunk in model: + if num_tokens > 0: + scaling = 1.0 / num_tokens + model_chunk.scale_gradients(scaling) diff --git a/megatron/core/distributed/param_and_grad_buffer.py b/megatron/core/distributed/param_and_grad_buffer.py new file mode 100644 index 0000000000..351ff9e0bf --- /dev/null +++ b/megatron/core/distributed/param_and_grad_buffer.py @@ -0,0 +1,769 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +import logging +import math +import os +import warnings +from enum import Enum +from typing import Dict, List, Optional + +import torch +from torch.distributed import _coalescing_manager + +from ..utils import is_float8tensor, log_on_each_pipeline_stage +from .distributed_data_parallel_config import DistributedDataParallelConfig + +logger = logging.getLogger(__name__) + + +class BufferType(Enum): + """ + Enumeration for buffer type. + """ + + PARAM = 1 + GRAD = 2 + + +def shard_buffer(buffer: torch.Tensor, data_parallel_world_size: int): + """ + Shard buffer into data_parallel_world_size chunks of equal size. + """ + assert buffer.numel() % data_parallel_world_size == 0 + shard_size = buffer.numel() // data_parallel_world_size + sharded_buffer = [ + buffer[(r * shard_size) : ((r + 1) * shard_size)] for r in range(data_parallel_world_size) + ] + return sharded_buffer + + +class _ParamAndGradBucket: + """ + Bucket to keep track of a subset of the model's parameters and gradients. + + Args: + params: List of parameters whose gradients are collated in this bucket. + param_data: View in ParamAndGradBuffer.param_data that this bucket is responsible for. + grad_data: View in ParamAndGradBuffer.grad_data that this bucket is responsible for. + offset: Offset of this bucket's view in the larger ParamAndGradBuffer. + numel_unpadded: Number of unpadded elements in bucket. + gradient_scaling_factor: This factor is utilized to scale gradients prior to their + communication. Its application is twofold: it facilitates the averaging of gradients + and the scaling of gradients in the context of the Mixture of Experts (MoE) model. + bucket_id: Index of bucket in buffer. + """ + + def __init__( + self, + params: List[torch.nn.Parameter], + param_data: Optional[torch.Tensor], + grad_data: torch.Tensor, + offset: int, + numel_unpadded: int, + gradient_scaling_factor: float, + bucket_id: int, + ): + self.params_list = params + self.params = set(params) + # Make sure there are no duplicate params. + assert len(self.params_list) == len(self.params) + self.param_data = param_data + self.grad_data = grad_data + # The distributed optimizer needs to keep track of this bucket's offset + # within the full grad_buffer. + self.offset = offset + self.numel_unpadded = numel_unpadded + self.gradient_scaling_factor = gradient_scaling_factor + self.bucket_id = bucket_id + + +class _ParamAndGradBucketGroup: + """ + Put multiple buckets into a group so that their communications can be aggregated together. + Provides functionality to register when params in the bucket group have grads ready to be + synced; an asynchronous communication call is automatically launched when _all_ params in + the bucket group have grads ready. + + Args: + buckets: A list of buckets. + ddp_config: DistributedDataParallel config object. + data_parallel_group: Data-parallel process group. + data_parallel_world_size: World size using the data-parallel group group. 
+ """ + + def __init__( + self, + buckets: List[_ParamAndGradBucket], + ddp_config: DistributedDataParallelConfig, + data_parallel_group: torch.distributed.ProcessGroup, + data_parallel_world_size: int, + ): + self.buckets = buckets + self.ddp_config = ddp_config + self.data_parallel_group = data_parallel_group + self.data_parallel_world_size = data_parallel_world_size + self.data_parallel_rank = torch.distributed.get_rank(group=data_parallel_group) + + # State for bookkeeping: params is the set of parameters this bucket group is + # responsible for, params_with_grad is the set of parameters with grads + # available. When overlap_grad_reduce is True, communication (all-reduce + # or reduce-scatter) is issued when params_with_grad equals params. + self.param_to_bucket = {} + self.params = set() + for bucket in self.buckets: + for param in bucket.params_list: + self.param_to_bucket[param] = bucket + self.params.add(param) + + self.next_param_gather_bucket_group = None + + self.reset() + self.param_gather_handle = None + self.param_gather_dispatched = False + self.grad_reduce_handle = None + + def reset(self): + """ + Reset metadata in bucket group in preparation for the next iteration of training. + """ + self.params_with_grad = set() + self.is_last_microbatch = True + + def check_for_nan_in_grad(self): + """ + Make sure norm of grads in bucket are not NaN prior to data-parallel + all-reduce / reduce-scatter. + """ + global_rank = torch.distributed.get_rank() + norm_is_nan = self.buckets[0].grad_data.norm(p=2).isnan() + for i in range(1, len(self.buckets)): + norm_is_nan.logical_or_(self.buckets[i].grad_data.norm(p=2).isnan()) + assert not norm_is_nan, ( + f'Rank {global_rank}: found NaN in local grad norm in ' + f'backward pass before data-parallel communication collective. ' + f'Device: {torch.cuda.current_device()}, node: {os.uname()[1]}' + ) + + def start_param_sync(self, force_sync: bool = False): + """ + Initiates all necessary param all-gathers for this bucket. + + When ddp_config.overlap_param_gather is set to True, dispatches an asynchronous + communication call (unless force_sync is True). When ddp_config.overlap_param_gather + is set to False, makes synchronous call. + + Args: + force_sync (bool, optional): force synchronous collective regardless of + other settings if true. + """ + assert self.ddp_config.use_distributed_optimizer + + if force_sync: + if self.param_gather_handle is not None: + self.param_gather_handle.wait() + self.param_gather_handle = None + return + else: + assert self.param_gather_handle is None + + async_op = self.ddp_config.overlap_param_gather and not force_sync + # Coalesce communication kernels across buckets in the bucket group. + with _coalescing_manager(self.data_parallel_group, async_ops=async_op) as cm: + for bucket in self.buckets: + local_data_view = shard_buffer(bucket.param_data, self.data_parallel_world_size)[ + self.data_parallel_rank + ] + torch.distributed._all_gather_base( + bucket.param_data, + local_data_view, + group=self.data_parallel_group, + async_op=async_op, + ) + if async_op: + self.param_gather_handle = cm + else: + # When using `_coalescing_manager`, even if a synchronous op (async_op=False) is used, + # `cm` is not None, which is different from when `_coalescing_manager` is not used in + # which case the torch.distributed._all_gather_base() will return None. In order to + # maintain consistency with prior code, we need to manually set communication handle to + # None. 
+ self.param_gather_handle = None + self.param_gather_dispatched = True + + def finish_param_sync(self, skip_next_bucket_dispatch: bool = False): + """ + Finishes param sync communication operation for this bucket. Dispatches + next bucket's param sync if available, unless skip_next_bucket_dispatch + is True. + + When ddp_config.overlap_param_gather is set to True, waits for asynchronous + communication call to complete (and dispatches one if one is not already + outstanding). Throws assertion error if ddp_config.overlap_param_gather is set to + False. + + Args: + skip_next_bucket_dispatch (bool, optional): if true, dispatch next + bucket's communication if available. + """ + assert self.ddp_config.use_distributed_optimizer + assert self.ddp_config.overlap_param_gather + + # If current bucket's param AG has not been dispatched, dispatch it now (e.g., first + # AG bucket in first model chunk if ddp_config.align_param_gather is False). + if not self.param_gather_dispatched: + self.start_param_sync() + + if self.param_gather_handle is not None: + self.param_gather_handle.wait() + self.param_gather_handle = None + # Dispatch next bucket's asynchronous param AG. + if self.next_param_gather_bucket_group is not None and not skip_next_bucket_dispatch: + self.next_param_gather_bucket_group.start_param_sync() + + def start_grad_sync(self): + """ + Initiates grad sync (all-reduce or reduce-scatter) communication operations + for all buckets in the bucket group. + + When ddp_config.overlap_grad_reduce is set to True, dispatches an asynchronous + communication call. When ddp_config.overlap_grad_reduce is set to False, makes + synchronous call. + """ + assert ( + self.grad_reduce_handle is None + ), 'Should not have multiple communication calls outstanding at once' + + if self.ddp_config.check_for_nan_in_grad: + self.check_for_nan_in_grad() + + # gradient_scaling_factor already takes into account whether we are computing + # an average or sum in the data-parallel collective. + for bucket in self.buckets: + if bucket.gradient_scaling_factor != 1.0: + bucket.grad_data *= bucket.gradient_scaling_factor + + # Decide reduce_op. + reduce_op = torch.distributed.ReduceOp.SUM + if self.ddp_config.average_in_collective: + reduce_op = torch.distributed.ReduceOp.AVG + + # Use async communications only when overlap_grad_reduce is True. + async_op = self.ddp_config.overlap_grad_reduce + # Coalesce communication kernels across buckets in the bucket group. + with _coalescing_manager(self.data_parallel_group, async_ops=async_op) as cm: + for bucket in self.buckets: + if self.ddp_config.use_distributed_optimizer: + local_data_view = shard_buffer(bucket.grad_data, self.data_parallel_world_size)[ + self.data_parallel_rank + ] + torch.distributed._reduce_scatter_base( + local_data_view, + bucket.grad_data, + op=reduce_op, + group=self.data_parallel_group, + async_op=async_op, + ) + else: + torch.distributed.all_reduce( + bucket.grad_data, + op=reduce_op, + group=self.data_parallel_group, + async_op=async_op, + ) + if async_op: + self.grad_reduce_handle = cm + else: + # When using `_coalescing_manager`, even if a synchronous op (async_op=False) is used, + # `cm` is not None, which is different from when `_coalescing_manager` is not used in + # which case the torch.distributed._reduce_scatter_base() will return None. In order to + # maintain consistency with prior code, we need to manually set communication handle to + # None. 
+ self.grad_reduce_handle = None + + def finish_grad_sync(self): + """ + Finishes grad sync (all-reduce or reduce-scatter) communication operations + for all buckets in the bucket group. + + When ddp_config.overlap_grad_reduce is set to True, waits for asynchronous + communication call to complete. When ddp_config.overlap_grad_reduce is set to False, + makes synchronous call. + """ + # If overlap_grad_reduce is False, start (and finish) synchronous communication call here. + self.param_gather_dispatched = False + if not self.ddp_config.overlap_grad_reduce: + self.start_grad_sync() + return + assert self.grad_reduce_handle is not None, ( + f'Communication call has not been issued for this bucket ' + f'({len(self.params_with_grad)}/{len(self.params)} params have grad available)' + ) + self.grad_reduce_handle.wait() + self.grad_reduce_handle = None + + def register_grad_ready(self, param: torch.nn.Parameter): + """ + Registers grads for the passed-in param to be "ready" for grad sync. + + When the number of microbatches is greater than 1, we only want to register + grads as ready when processing the last microbatch and ddp_config.overlap_grad_reduce + is True. + """ + assert ( + self.ddp_config.overlap_grad_reduce + ), 'register_grad_ready() should only be called when overlap_grad_reduce is True' + if self.is_last_microbatch: + assert param in self.param_to_bucket, 'Param is not in the bucket group' + assert param not in self.params_with_grad, 'Cannot set grad twice' + self.params_with_grad.add(param) + # If all params in bucket group have grads available, issue communication call. + if len(self.params_with_grad) == len(self.params): + self.start_grad_sync() + + +class _ParamAndGradBuffer: + """ + Groups parameters and gradients into a contiguous buffer, and then breaks the buffer into + buckets with roughly `bucket_size` parameters each. + + Args: + ddp_config: DistributedDataParallel config object. + param_dtype: Type of param tensor. + grad_dtype: Type of grad tensor. + params: List of parameters whose parameters and gradients are collated in the underlying + tensor. + data_parallel_group: Data-parallel process group. + bucket_size: The rough size of each bucket in terms of number of parameters. + param_to_name: Mapping from `torch.nn.Parameter` to name (for logging purposes). + gradient_scaling_factor: This factor is utilized to scale gradients prior to their + communication. Its application is twofold: it facilitates the averaging of gradients + and the scaling of gradients in the context of the Mixture of Experts (MoE) model. + param_indices: The index of each param among the params with same dtype, if a param is fp8, + use its "fake" high precision dtype to determine which params have same dtype with it. + These indices are needed when loading a non-native-fp8 checkpoint in native-fp8 mode. + """ + + def __init__( + self, + ddp_config: DistributedDataParallelConfig, + param_dtype: torch.dtype, + grad_dtype: torch.dtype, + params: List[torch.nn.Parameter], + data_parallel_group: torch.distributed.ProcessGroup, + bucket_size: int, + param_to_name: Dict[torch.nn.Parameter, str], + gradient_scaling_factor: float, + param_indices: List[int], + ): + self.ddp_config = ddp_config + self.params = params + self.param_indices = param_indices + + # Check that params are unique. + unique_params = set() + for param in params: + assert param not in unique_params + unique_params.add(param) + del unique_params + + # Store attributes that will be needed later. 
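+ # (Note, added for clarity: gradient_scaling_factor is applied per bucket in
+ # start_grad_sync(); a typical value is 1 / data_parallel_world_size when the collective
+ # sums rather than averages gradients.)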
+ self.param_dtype = param_dtype + self.grad_dtype = grad_dtype + self.data_parallel_group = data_parallel_group + self.data_parallel_world_size = torch.distributed.get_world_size( + group=self.data_parallel_group + ) + self.gradient_scaling_factor = gradient_scaling_factor + + # Data structures to store underlying buckets and relevant indexing data. + self.buckets = [] + self.param_to_bucket = {} # Param -> bucket mapping. + self.param_index_map = {} # Param -> location in buffer mapping (used in dist. optimizer). + + def _pad(number_to_be_padded: int, divisor: int) -> int: + return int(math.ceil(number_to_be_padded / divisor) * divisor) + + def _pad_end_of_bucket_if_needed(bucket_end_index: int) -> int: + """ + Pads end index of bucket if using distributed optimizer (to ensure uniform sharding). + """ + if self.ddp_config.use_distributed_optimizer: + # Workaround for TE bug causing cuBLAS to pick an incompatible algorithm. + # This also helps cuBLAS pick more efficient algorithms for GEMMs. + # We now ensure that all buckets start at a memory address that is 256-byte + # aligned (128 values since params and grads use >= 16-bit precision). + return _pad(bucket_end_index, math.lcm(self.data_parallel_world_size, 128)) + return bucket_end_index + + def _pad_start_of_param_if_needed(param_start_index: int) -> int: + """ + Pads start index of param if using distributed optimizer (to ensure "good" alignment). + """ + if self.ddp_config.use_distributed_optimizer: + # Ensure that params start at 128-byte aligned addresses (64 values + # since params are >= 16-bit precision). + return _pad(param_start_index, 64) + return param_start_index + + # First, figure out how many elements should be in the underlying buffer storage. + # Note that if we need to split the buffer into smaller buckets, each of these + # might need to be padded as well (if using the distributed optimizer). + param_start_index = 0 + bucket_start_index = param_start_index + bucket_params = set() + self.bucket_indices = [] + per_bucket_numel_unpadded = [] + bucket_id = 0 + + def _update_bucket_metadata(param_end_index: int) -> int: + """ + Record metadata for the bucket starting at bucket_start_index and ending with the + passed-in param_end_index. Returns the bucket's end_index. + """ + nonlocal bucket_start_index, bucket_params, bucket_id + per_bucket_numel_unpadded.append(param_end_index - bucket_start_index) + bucket_end_index = _pad_end_of_bucket_if_needed(param_end_index) + + # Record metadata of new bucket. + self.bucket_indices.append((bucket_start_index, bucket_end_index)) + bucket_start_index = bucket_end_index + + # Prepare for next bucket. + bucket_params = set() + bucket_id += 1 + + # Return the potentially padded bucket_end_index. + return bucket_end_index + + def _does_param_require_new_bucket(param): + """ + Split shared embedding parameters into separate bucket if using distributed + optimizer that makes use of reduce-scatters instead of all-reduces. + This ensures that the first and last pipeline stage partition optimizer state + for the shared embedding parameters the same way across DP replicas, allowing + the DP reduce-scatter to be before the embedding all-reduce. + """ + return ( + getattr(param, "shared_embedding", False) + and self.ddp_config.use_distributed_optimizer + ) + + for param in params[::-1]: + # Iterate through parameters in reverse order to roughly follow backprop order. 
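+ # Illustrative example of the padding helpers above (hypothetical sizes): with
+ # data_parallel_world_size=8 and use_distributed_optimizer=True, a bucket ending at
+ # element 1000 is padded up to a multiple of lcm(8, 128) = 128, i.e. 1024, and a param
+ # that would start at element 70 is shifted to the next multiple of 64, i.e. 128.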
+ + this_numel = param.data.nelement() + param_start_index = _pad_start_of_param_if_needed(param_start_index) + + # Create bucket with collected parameters if current param needs its own bucket. + if _does_param_require_new_bucket(param): + # We are creating a bucket for the already accumulated parameters, whose params + # end at the current param_start_index. + if self.ddp_config.use_distributed_optimizer: + # Make sure new bucket is appropriately padded. + if param_start_index % self.data_parallel_world_size != 0: + param_start_index = _pad_end_of_bucket_if_needed(param_start_index) + if len(bucket_params) > 0: + bucket_end_index = _update_bucket_metadata(param_start_index) + + param_end_index = param_start_index + this_numel + self.param_index_map[param] = (param_start_index, param_end_index, bucket_id) + bucket_params.add(param) + + # If we have enough elements already or the current param is part of the shared + # embedding layer and needs a separate bucket, form a new bucket. + if ( + bucket_size is not None and (param_end_index - bucket_start_index) >= bucket_size + ) or _does_param_require_new_bucket(param): + bucket_end_index = _update_bucket_metadata(param_end_index) + param_start_index = bucket_end_index + else: + param_start_index = param_end_index + + # Add remaining params to a new bucket. + if len(bucket_params) > 0: + bucket_end_index = _update_bucket_metadata(param_end_index) + + # Next, create underlying storage for buffer (with numel elements that includes + # padding as necessary). + self.numel = bucket_end_index + self.numel_unpadded = sum(per_bucket_numel_unpadded) + assert self.numel_unpadded <= self.numel + if self.ddp_config.use_distributed_optimizer: + assert self.numel % self.data_parallel_world_size == 0 + else: + assert self.numel == self.numel_unpadded + + self.param_data = None + # Only re-map param tensors if using distributed optimizer. + if self.ddp_config.use_distributed_optimizer: + self.param_data = torch.zeros( + self.numel, + dtype=self.param_dtype, + device=torch.cuda.current_device(), + requires_grad=False, + ) + self.grad_data = torch.zeros( + self.numel, + dtype=self.grad_dtype, + device=torch.cuda.current_device(), + requires_grad=False, + ) + + # Finally, map param.data and param.main_grad fields to buffers. + bucket_params = [] + bucket_start_index = 0 + cur_bucket_id = 0 + for param in params[::-1]: + param_start_index, param_end_index, bucket_id = self.param_index_map[param] + + # Assign param.data to appropriate segment of self.param_data. + if self.param_data is not None: + old_param_data = param.data + new_param_data = self._get( + param.data.shape, param_start_index, buffer_type=BufferType.PARAM + ) + if is_float8tensor(param): + param._data = new_param_data + else: + param.data = new_param_data + assert old_param_data._base is None + # Copy tensor values (from initialization or checkpoint). 
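+ # (Note, added for clarity: param.data is now a view into the shared buffer, so this copy
+ # preserves the original values before the standalone tensor is released below.)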
+ param.data.detach().copy_(old_param_data) + del old_param_data + + param.main_grad = self._get( + param.data.shape, param_start_index, buffer_type=BufferType.GRAD + ) + if bucket_id != cur_bucket_id: + bucket_end_index = _pad_end_of_bucket_if_needed(param_start_index) + self.buckets.append( + self._new_bucket( + bucket_params=bucket_params, + start_index=bucket_start_index, + end_index=bucket_end_index, + numel_unpadded=per_bucket_numel_unpadded[cur_bucket_id], + bucket_id=cur_bucket_id, + ) + ) + bucket_start_index = bucket_end_index + bucket_params = [] + assert cur_bucket_id + 1 == len(self.buckets) + assert bucket_id == cur_bucket_id + 1 + cur_bucket_id = bucket_id + bucket_params.append(param) + + # Add remaining params to a new bucket. + if len(bucket_params) > 0: + bucket_end_index = _pad_end_of_bucket_if_needed(param_end_index) + self.buckets.append( + self._new_bucket( + bucket_params=bucket_params, + start_index=bucket_start_index, + end_index=bucket_end_index, + numel_unpadded=per_bucket_numel_unpadded[cur_bucket_id], + bucket_id=cur_bucket_id, + ) + ) + + # Log buckets for all PP stages. + log_strs = [] + log_strs.append( + f'Number of buckets for gradient all-reduce / reduce-scatter: {len(self.buckets)}' + ) + for index, bucket in enumerate(self.buckets): + numel = 0 + for param in bucket.params: + numel += param.data.nelement() + log_strs.append(f'Params for bucket {index+1} ({numel} elements):') + for param in bucket.params: + log_strs.append(f'\t{param_to_name[param]}') + log_on_each_pipeline_stage(logger, logging.INFO, '\n'.join(log_strs)) + + def scale_gradients(self, scaling_factor: float) -> None: + """Scale the gradient data by `scaling_factor`.""" + self.grad_data *= scaling_factor + + def _get(self, shape: torch.Size, start_index: int, buffer_type: BufferType) -> torch.Tensor: + """ + Return a tensor with the input `shape` as a view into the 1-D data starting at + `start_index`. + """ + end_index = start_index + shape.numel() + assert end_index <= self.numel, 'Requested tensor is out of buffer range' + if buffer_type == BufferType.PARAM: + assert self.param_data is not None + buffer_tensor = self.param_data[start_index:end_index] + elif buffer_type == BufferType.GRAD: + buffer_tensor = self.grad_data[start_index:end_index] + else: + raise Exception("Illegal buffer type provided to GradBuffer._get() function") + buffer_tensor = buffer_tensor.view(shape) + return buffer_tensor + + def _new_bucket( + self, + bucket_params: List[torch.nn.Parameter], + start_index: int, + end_index: int, + numel_unpadded: int, + bucket_id: int, + ) -> _ParamAndGradBucket: + """ + Helper function that creates a new bucket. Also updates param->bucket mapping. + """ + + # Assert that indices are correctly padded (if needed), and that bucket + # position is same as originally computed. + if self.ddp_config.use_distributed_optimizer: + assert start_index % self.data_parallel_world_size == 0 + assert end_index % self.data_parallel_world_size == 0 + assert (start_index, end_index) == self.bucket_indices[bucket_id] + + # Get appropriate view into global ParamAndGradBuffer. 
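+ # (Note, added for clarity: bucket views alias the same storage as the per-param views
+ # created above, so zeroing self.grad_data in reset() also zeroes every param.main_grad.)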
+ bucketed_param_data = None + if self.param_data is not None: + bucketed_param_data = self._get( + torch.Size([end_index - start_index]), start_index, buffer_type=BufferType.PARAM + ) + bucketed_grad_data = self._get( + torch.Size([end_index - start_index]), start_index, buffer_type=BufferType.GRAD + ) + bucket = _ParamAndGradBucket( + params=bucket_params, + param_data=bucketed_param_data, + grad_data=bucketed_grad_data, + offset=start_index, + numel_unpadded=numel_unpadded, + gradient_scaling_factor=self.gradient_scaling_factor, + bucket_id=bucket_id, + ) + for bucket_param in bucket_params: + assert bucket_param not in self.param_to_bucket + self.param_to_bucket[bucket_param] = bucket + + return bucket + + def reset(self): + """ + Zero out the underlying grad_buffer. + """ + self.grad_data.zero_() + + +def partition_buckets( + buffers: List[_ParamAndGradBuffer], force_single_bucket_group: bool = False +) -> List[_ParamAndGradBucketGroup]: + """ + Automatically regroup the buckets of input buffers and return a list of bucket groups. + + In some scenarios, we need to put buckets from different buffers into a group so that their + communication can be aggregated. + + For example, when there are both fp8 weights and bf16 biases in the model and virtual + pipeline parallelism is enabled, each model chunk will have an fp8 bucket and a bf16 bucket, + which doubles the number of communication kernels, and because of the use of + CUDA_DEVICE_MAX_CONNECTIONS=1, having multiple back-to-back communications will prevent the + overlap of communication kernels with computation kernels. + + The grouping strategy is: + 1. If force_single_bucket_group is True, put all buckets across all buffers into a single + bucket group. + 2. If force_single_bucket_group is False, when there is no fp8 buffer in the input buffers, + let each bucket group have only one bucket. + 3. If force_single_bucket_group is False, when using fp8 params, merge all non-fp8 buckets + into the last fp8 bucket group. + - Since the non-fp8 parameters (typically the biases of various layers) are relatively + small, they are likely to be grouped into a single non-fp8 bucket. + - The fp8 buckets start from the end of the model, i.e., the first bucket corresponds to + the end of the model, while the last bucket corresponds to the beginning. + - If we combine the non-fp8 bucket with the first fp8 bucket, we cannot initiate the + reduce-scatter to synchronize gradients after the backward pass at the end of the model + has completed. This is because we need to wait for the non-fp8 params from the beginning + layers to obtain their gradients. + - Combining the non-fp8 bucket with the last fp8 bucket can help avoid this issue. + + Args: + buffers (list): list of input buffers. + single_bucket_group_per_buffer (bool, optional): force group all buckets in each buffer + into a single bucket group. + """ + + if len(buffers) == 0: + return [] + + dtype_to_buffer_map = {} + for buffer in buffers: + dtype = buffer.param_dtype + # Make sure that the param_dtype of any two buffers is different. + assert dtype not in dtype_to_buffer_map + dtype_to_buffer_map[dtype] = buffer + + # Case 1: Put all buckets into a single bucket group if force_single_bucket_group is True. 
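+ # Illustrative walk-through (hypothetical buffers): given an fp8 param buffer with buckets
+ # [A, B] and a bf16 bias buffer with bucket [C], Case 3 produces the bucket groups [A] and
+ # [B, C]; with force_single_bucket_group=True, Case 1 produces the single group [A, B, C].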
+ if force_single_bucket_group: + buckets = [] + ddp_config = buffers[0].ddp_config + data_parallel_group = buffers[0].data_parallel_group + data_parallel_world_size = buffers[0].data_parallel_world_size + for buffer in buffers: + assert ddp_config == buffer.ddp_config + assert data_parallel_group == buffer.data_parallel_group + assert data_parallel_world_size == buffer.data_parallel_world_size + buckets.extend(buffer.buckets) + + bucket_group = _ParamAndGradBucketGroup( + buckets, ddp_config, data_parallel_group, data_parallel_world_size + ) + return [bucket_group] + + if torch.uint8 not in dtype_to_buffer_map: + # Case 2: When there is no fp8 buffer in the input buffers, let each bucket group have + # only one bucket. + bucket_groups = [] + for buffer in buffers: + for bucket in buffer.buckets: + bucket_groups.append( + _ParamAndGradBucketGroup( + [bucket], + buffer.ddp_config, + buffer.data_parallel_group, + buffer.data_parallel_world_size, + ) + ) + return bucket_groups + else: + # Case 3: When using fp8 params, merge all non-fp8 buckets into the last fp8 bucket group. + non_fp8_buckets = [] + for buffer in buffers: + if buffer.param_dtype != torch.uint8: + for bucket in buffer.buckets: + non_fp8_buckets.append(bucket) + + bucket_groups = [] + fp8_buffer = dtype_to_buffer_map[torch.uint8] + for bucket in fp8_buffer.buckets: + if len(bucket_groups) == len(fp8_buffer.buckets) - 1: + # The last bucket group. + group_buckets = [bucket] + non_fp8_buckets + else: + # The first N-1 bucket groups. + group_buckets = [bucket] + bucket_groups.append( + _ParamAndGradBucketGroup( + group_buckets, + buffer.ddp_config, + buffer.data_parallel_group, + buffer.data_parallel_world_size, + ) + ) + return bucket_groups + + +# For backwards compatibility. ParamAndGradBuffer will be deprecated in future release. +# _ParamAndGradBuffer is not intended to be consumed directly by external code. +class ParamAndGradBuffer(_ParamAndGradBuffer): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + warnings.warn( + "`ParamAndGradBuffer` will be deprecated in a future release, and is not " + "intended to be used by external code." + ) diff --git a/megatron/core/enums.py b/megatron/core/enums.py index cf1452b23e..46e7d3b766 100644 --- a/megatron/core/enums.py +++ b/megatron/core/enums.py @@ -2,6 +2,7 @@ import enum + class ModelType(enum.Enum): encoder_or_decoder = 1 encoder_and_decoder = 2 diff --git a/megatron/core/export/__init__.py b/megatron/core/export/__init__.py new file mode 100644 index 0000000000..f8011007a5 --- /dev/null +++ b/megatron/core/export/__init__.py @@ -0,0 +1 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. diff --git a/megatron/core/export/data_type.py b/megatron/core/export/data_type.py new file mode 100644 index 0000000000..38fbdea8f6 --- /dev/null +++ b/megatron/core/export/data_type.py @@ -0,0 +1,5 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +from enum import Enum + +DataType = Enum('DataType', ["bfloat16", "float16", "float32"]) diff --git a/megatron/core/export/export_config.py b/megatron/core/export/export_config.py new file mode 100644 index 0000000000..2cc1e208be --- /dev/null +++ b/megatron/core/export/export_config.py @@ -0,0 +1,19 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
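+ # Example usage (hypothetical values): ExportConfig(inference_tp_size=2, inference_pp_size=1)
+ # describes a 2-way tensor-parallel, single-pipeline-stage export.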
+ +from dataclasses import dataclass + + +@dataclass +class ExportConfig: + """Base configuration for Megatron Core Export + + These parameters control the export setting for trtllm + """ + + inference_tp_size: int = 1 + + inference_pp_size: int = 1 + + use_parallel_embedding: bool = False + + use_embedding_sharing: bool = False diff --git a/megatron/core/export/model_type.py b/megatron/core/export/model_type.py new file mode 100644 index 0000000000..6a33d6440e --- /dev/null +++ b/megatron/core/export/model_type.py @@ -0,0 +1,7 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +from enum import Enum + +ModelType = Enum( + 'ModelType', ["gpt", "gptnext", "llama", "falcon", "starcoder", "mixtral", "gemma"] +) diff --git a/megatron/core/export/trtllm/__init__.py b/megatron/core/export/trtllm/__init__.py new file mode 100644 index 0000000000..f8011007a5 --- /dev/null +++ b/megatron/core/export/trtllm/__init__.py @@ -0,0 +1 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. diff --git a/megatron/core/export/trtllm/engine_builder/__init__.py b/megatron/core/export/trtllm/engine_builder/__init__.py new file mode 100644 index 0000000000..f8011007a5 --- /dev/null +++ b/megatron/core/export/trtllm/engine_builder/__init__.py @@ -0,0 +1 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. diff --git a/megatron/core/export/trtllm/engine_builder/trtllm_engine_builder.py b/megatron/core/export/trtllm/engine_builder/trtllm_engine_builder.py new file mode 100644 index 0000000000..e729fec410 --- /dev/null +++ b/megatron/core/export/trtllm/engine_builder/trtllm_engine_builder.py @@ -0,0 +1,148 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +import tensorrt_llm +from tensorrt_llm._common import check_max_num_tokens +from tensorrt_llm.builder import BuildConfig +from tensorrt_llm.commands.build import build as build_trtllm +from tensorrt_llm.logger import logger +from tensorrt_llm.lora_manager import LoraConfig +from tensorrt_llm.models.modeling_utils import optimize_model, preprocess_weights +from tensorrt_llm.plugin import PluginConfig + + +class TRTLLMEngineBuilder: + """A utility class to build TRTLLM engine""" + + @staticmethod + def build_and_save_engine( + engine_dir: str, + trtllm_model_weights: dict, + trtllm_model_config, + max_input_len: int = 1024, + max_output_len: int = 1024, + max_batch_size: int = 4, + lora_ckpt_list=None, + use_lora_plugin=None, + max_lora_rank: int = 64, + lora_target_modules=None, + max_prompt_embedding_table_size: int = 0, + paged_kv_cache: bool = True, + remove_input_padding: bool = True, + paged_context_fmha: bool = False, + use_refit: bool = False, + max_num_tokens: int = None, + max_seq_len: int = None, + opt_num_tokens: int = None, + max_beam_width: int = 1, + tokens_per_block: int = 128, + multiple_profiles: bool = False, + gpt_attention_plugin: str = "auto", + gemm_plugin: str = "auto", + ): + """Method to build the TRTLLM Engine + + This method uses the TRTLLMEngineBuilder to build and save the engine to engine dir + + Args: + engine_dir (str): The file path to save the engine + trtllm_model_weights (dict): The TRTLLM converted model weights dict + trtllm_model_config : The TRTLLM Config + max_input_len (int, optional): Max input length. Defaults to 1024. + max_output_len (int, optional): Max output length. Defaults to 1024. + max_batch_size (int, optional): Max batch size. Defaults to 4. + model_type (ModelType, optional): ModelType enum. Defaults to ModelType.gpt. 
+ lora_ckpt_list (_type_, optional): Lora checkpoint list. Defaults to None. + use_lora_plugin (_type_, optional): Use lora plugin. Defaults to None. + max_lora_rank (int, optional): Max lora rank. Defaults to 64. + lora_target_modules (_type_, optional): Lora target modules. Defaults to None. + max_prompt_embedding_table_size (int, optional): Defaults to 0. + paged_kv_cache (bool, optional): Use Paged KV cache. Defaults to True. + remove_input_padding (bool, optional): Remove input padding. Defaults to True. + paged_context_fmha (bool, optional): Paged context fmha. Defaults to False. + use_refit (bool, optional): Use refit. Defaults to False. + max_num_tokens (int, optional): Max num of tokens. Defaults to None. + max_seq_len (int, optional): Max seq length. Defaults to None. + opt_num_tokens (int, optional): Opt number of tokens. Defaults to None. + max_beam_width (int, optional): Max beam width. Defaults to 1. + tokens_per_block (int, optional): Nmber of tokens per block. Defaults to 128. + multiple_profiles (bool, optional): Use multiple profiles. Defaults to False. + gpt_attention_plugin (str, optional): Gpt attention plugin to use. Defaults to "auto". + gemm_plugin (str, optional): Gemma plugin to use. Defaults to "auto". + """ + architecture = ( + "LLaMAForCausalLM" + if trtllm_model_config.architecture == "LlamaForCausalLM" + else trtllm_model_config.architecture + ) + try: + model_cls = getattr(tensorrt_llm.models, architecture) + except: + raise AttributeError(f"Could not find TRTLLM model for architecture: {architecture}!") + + logger.set_level("info") + plugin_config = PluginConfig() + plugin_config.gpt_attention_plugin = gpt_attention_plugin + plugin_config.gemm_plugin = gemm_plugin + if paged_kv_cache: + plugin_config.enable_paged_kv_cache(tokens_per_block=tokens_per_block) + else: + plugin_config.paged_kv_cache = False + plugin_config.remove_input_padding = remove_input_padding + plugin_config.use_paged_context_fmha = paged_context_fmha + plugin_config.multiple_profiles = multiple_profiles + + if max_seq_len is None: + max_seq_len = max_input_len + max_output_len + + max_num_tokens, opt_num_tokens = check_max_num_tokens( + max_num_tokens=max_num_tokens, + opt_num_tokens=opt_num_tokens, + max_seq_len=max_seq_len, + max_batch_size=max_batch_size, + max_input_len=max_input_len, + max_beam_width=max_beam_width, + remove_input_padding=remove_input_padding, + enable_context_fmha=plugin_config.context_fmha, + tokens_per_block=tokens_per_block, + multiple_profiles=multiple_profiles, + ) + + build_dict = { + 'max_input_len': max_input_len, + 'max_output_len': max_output_len, + 'max_batch_size': max_batch_size, + 'max_beam_width': max_beam_width, + 'max_seq_len': max_seq_len, + 'max_num_tokens': max_num_tokens, + 'opt_num_tokens': opt_num_tokens, + 'max_prompt_embedding_table_size': max_prompt_embedding_table_size, + 'gather_context_logits': False, + 'gather_generation_logits': False, + 'strongly_typed': False, + 'builder_opt': None, + 'use_refit': use_refit, + 'multiple_profiles': multiple_profiles, + } + build_config = BuildConfig.from_dict(build_dict, plugin_config=plugin_config) + + if use_lora_plugin is not None: + # build_config.plugin_config.set_lora_plugin(use_lora_plugin) + # build_config.plugin_config._lora_plugin = use_lora_plugin + lora_config = LoraConfig( + lora_dir=lora_ckpt_list, + lora_ckpt_source='nemo', # TODO : NEED TO SEE HOW TO HANDLE THIS FOR MCORE + max_lora_rank=max_lora_rank, + lora_target_modules=lora_target_modules, + ) + build_config.lora_config = 
lora_config + + model = model_cls.from_config(trtllm_model_config) + model = optimize_model( + model, + use_parallel_embedding=trtllm_model_config.use_parallel_embedding, + share_embedding_table=trtllm_model_config.share_embedding_table, + ) + preprocess_weights(trtllm_model_weights, trtllm_model_config) + model.load(trtllm_model_weights) + engine = build_trtllm(model, build_config) + engine.save(engine_dir) diff --git a/megatron/core/export/trtllm/model_to_trllm_mapping/__init__.py b/megatron/core/export/trtllm/model_to_trllm_mapping/__init__.py new file mode 100644 index 0000000000..f8011007a5 --- /dev/null +++ b/megatron/core/export/trtllm/model_to_trllm_mapping/__init__.py @@ -0,0 +1 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. diff --git a/megatron/core/export/trtllm/model_to_trllm_mapping/default_conversion_dict.py b/megatron/core/export/trtllm/model_to_trllm_mapping/default_conversion_dict.py new file mode 100644 index 0000000000..cad9315034 --- /dev/null +++ b/megatron/core/export/trtllm/model_to_trllm_mapping/default_conversion_dict.py @@ -0,0 +1,18 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +from megatron.core.export.model_type import ModelType +from megatron.core.export.trtllm.model_to_trllm_mapping.falcon_model import FALCON_DICT +from megatron.core.export.trtllm.model_to_trllm_mapping.gemma_model import GEMMA_DICT +from megatron.core.export.trtllm.model_to_trllm_mapping.gpt_model import GPT_DICT +from megatron.core.export.trtllm.model_to_trllm_mapping.gpt_next_model import GPT_NEXT_DICT +from megatron.core.export.trtllm.model_to_trllm_mapping.llama_model import LLAMA_DICT +from megatron.core.export.trtllm.model_to_trllm_mapping.starcoder_model import STARCODER_DICT + +DEFAULT_CONVERSION_DICT = { + ModelType.llama: LLAMA_DICT, + ModelType.falcon: FALCON_DICT, + ModelType.gemma: GEMMA_DICT, + ModelType.starcoder: STARCODER_DICT, + ModelType.gpt: GPT_DICT, + ModelType.gptnext: GPT_NEXT_DICT, +} diff --git a/megatron/core/export/trtllm/model_to_trllm_mapping/falcon_model.py b/megatron/core/export/trtllm/model_to_trllm_mapping/falcon_model.py new file mode 100644 index 0000000000..d1469d02ba --- /dev/null +++ b/megatron/core/export/trtllm/model_to_trllm_mapping/falcon_model.py @@ -0,0 +1,26 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
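+ # Note: keys are Megatron layer names with the layer index removed, so a single entry such as
+ # 'decoder.layers.input_layernorm.weight' covers decoder.layers.<N>.input_layernorm.weight for
+ # every layer N (see TRTLLMLayers.return_layer_name_and_number).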
+ +from megatron.core.export.trtllm.trtllm_layers import TRTLLMLayers + +# pylint: disable=line-too-long +FALCON_DICT = { + # INPUT + 'embedding.word_embeddings.weight': TRTLLMLayers.vocab_embedding, + 'embedding.position_embeddings.weight': TRTLLMLayers.position_embedding, + # ATTENTION + 'decoder.layers.input_layernorm.weight': TRTLLMLayers.input_layernorm_weight, + 'decoder.layers.input_layernorm.bias': TRTLLMLayers.input_layernorm_bias, + 'decoder.layers.self_attention.linear_qkv.weight': TRTLLMLayers.attention_qkv_weight, + 'decoder.layers.self_attention.linear_qkv.layer_norm_weight': TRTLLMLayers.input_layernorm_weight, + 'decoder.layers.self_attention.linear_proj.weight': TRTLLMLayers.attention_dense_weight, + # MLP + 'decoder.layers.pre_mlp_layernorm.weight': TRTLLMLayers.post_layernorm_weight, + 'decoder.layers.pre_mlp_layernorm.bias': TRTLLMLayers.post_layernorm_bias, + 'decoder.layers.mlp.linear_fc1.weight': TRTLLMLayers.mlp_fc_weight, + 'decoder.layers.mlp.linear_fc2.weight': TRTLLMLayers.mlp_projection_weight, + # FINAL LAYER NORM + 'decoder.final_layernorm.weight': TRTLLMLayers.final_layernorm_weight, + 'decoder.final_layernorm.bias': TRTLLMLayers.final_layernorm_bias, + # OUTPUT LAYER + 'output_layer.weight': TRTLLMLayers.lm_head, +} diff --git a/megatron/core/export/trtllm/model_to_trllm_mapping/gemma_model.py b/megatron/core/export/trtllm/model_to_trllm_mapping/gemma_model.py new file mode 100644 index 0000000000..47a0211706 --- /dev/null +++ b/megatron/core/export/trtllm/model_to_trllm_mapping/gemma_model.py @@ -0,0 +1,21 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +from megatron.core.export.trtllm.trtllm_layers import TRTLLMLayers + +# pylint: disable=line-too-long +GEMMA_DICT = { + # INPUT + 'embedding.word_embeddings.weight': TRTLLMLayers.vocab_embedding, + # ATTENTION + 'decoder.layers.self_attention.linear_qkv.weight': TRTLLMLayers.attention_qkv_weight, + 'decoder.layers.self_attention.linear_qkv.layer_norm_weight': TRTLLMLayers.input_layernorm_weight, + 'decoder.layers.self_attention.linear_proj.weight': TRTLLMLayers.attention_dense_weight, + # MLP + 'decoder.layers.mlp.linear_fc1.weight': TRTLLMLayers.mlp_fc_weight, + 'decoder.layers.mlp.linear_fc2.weight': TRTLLMLayers.mlp_projection_weight, + 'decoder.layers.mlp.linear_fc1.layer_norm_weight': TRTLLMLayers.post_layernorm_weight, + # FINAL LAYER NORM + 'decoder.final_layernorm.weight': TRTLLMLayers.final_layernorm_weight, + # OUTPUT LAYER + 'output_layer.weight': TRTLLMLayers.lm_head, +} diff --git a/megatron/core/export/trtllm/model_to_trllm_mapping/gpt_model.py b/megatron/core/export/trtllm/model_to_trllm_mapping/gpt_model.py new file mode 100644 index 0000000000..eda27600c6 --- /dev/null +++ b/megatron/core/export/trtllm/model_to_trllm_mapping/gpt_model.py @@ -0,0 +1,28 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+ +from megatron.core.export.trtllm.trtllm_layers import TRTLLMLayers + +GPT_DICT = { + # INPUT + 'embedding.word_embeddings.weight': TRTLLMLayers.vocab_embedding, + 'embedding.position_embeddings.weight': TRTLLMLayers.position_embedding, + # ATTENTION + 'decoder.layers.input_layernorm.weight': TRTLLMLayers.input_layernorm_weight, + 'decoder.layers.input_layernorm.bias': TRTLLMLayers.input_layernorm_bias, + 'decoder.layers.self_attention.linear_qkv.weight': TRTLLMLayers.attention_qkv_weight, + 'decoder.layers.self_attention.linear_qkv.bias': TRTLLMLayers.attention_qkv_bias, + 'decoder.layers.self_attention.linear_proj.weight': TRTLLMLayers.attention_dense_weight, + 'decoder.layers.self_attention.linear_proj.bias': TRTLLMLayers.attention_dense_bias, + # MLP + 'decoder.layers.pre_mlp_layernorm.weight': TRTLLMLayers.post_layernorm_weight, + 'decoder.layers.pre_mlp_layernorm.bias': TRTLLMLayers.post_layernorm_bias, + 'decoder.layers.mlp.linear_fc1.weight': TRTLLMLayers.mlp_fc_weight, + 'decoder.layers.mlp.linear_fc1.bias': TRTLLMLayers.mlp_fc_bias, + 'decoder.layers.mlp.linear_fc2.weight': TRTLLMLayers.mlp_projection_weight, + 'decoder.layers.mlp.linear_fc2.bias': TRTLLMLayers.mlp_projection_bias, + # FINAL LAYER NORM + 'decoder.final_layernorm.weight': TRTLLMLayers.final_layernorm_weight, + 'decoder.final_layernorm.bias': TRTLLMLayers.final_layernorm_bias, + # OUTPUT LAYER + 'output_layer.weight': TRTLLMLayers.lm_head, +} diff --git a/megatron/core/export/trtllm/model_to_trllm_mapping/gpt_next_model.py b/megatron/core/export/trtllm/model_to_trllm_mapping/gpt_next_model.py new file mode 100644 index 0000000000..ac5f84ef1b --- /dev/null +++ b/megatron/core/export/trtllm/model_to_trllm_mapping/gpt_next_model.py @@ -0,0 +1,24 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +from megatron.core.export.trtllm.trtllm_layers import TRTLLMLayers + +# pylint: disable=line-too-long +GPT_NEXT_DICT = { + # INPUT + 'embedding.word_embeddings.weight': TRTLLMLayers.vocab_embedding, + # ATTENTION + 'decoder.layers.self_attention.linear_proj.weight': TRTLLMLayers.attention_dense_weight, + 'decoder.layers.self_attention.linear_qkv.layer_norm_weight': TRTLLMLayers.input_layernorm_weight, + 'decoder.layers.self_attention.linear_qkv.layer_norm_bias': TRTLLMLayers.input_layernorm_bias, + 'decoder.layers.self_attention.linear_qkv.weight': TRTLLMLayers.attention_qkv_weight, + # MLP + 'decoder.layers.mlp.linear_fc1.layer_norm_weight': TRTLLMLayers.post_layernorm_weight, + 'decoder.layers.mlp.linear_fc1.layer_norm_bias': TRTLLMLayers.post_layernorm_bias, + 'decoder.layers.mlp.linear_fc1.weight': TRTLLMLayers.mlp_fc_weight, + 'decoder.layers.mlp.linear_fc2.weight': TRTLLMLayers.mlp_projection_weight, + # FINAL LAYER NORM + 'decoder.final_layernorm.weight': TRTLLMLayers.final_layernorm_weight, + 'decoder.final_layernorm.bias': TRTLLMLayers.final_layernorm_bias, + # OUTPUT LAYER + 'output_layer.weight': TRTLLMLayers.lm_head, +} diff --git a/megatron/core/export/trtllm/model_to_trllm_mapping/llama_model.py b/megatron/core/export/trtllm/model_to_trllm_mapping/llama_model.py new file mode 100644 index 0000000000..5fd2067081 --- /dev/null +++ b/megatron/core/export/trtllm/model_to_trllm_mapping/llama_model.py @@ -0,0 +1,22 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+ +from megatron.core.export.trtllm.trtllm_layers import TRTLLMLayers + +# pylint: disable=line-too-long +LLAMA_DICT = { + # INPUT + 'embedding.word_embeddings.weight': TRTLLMLayers.vocab_embedding, + 'embedding.position_embeddings.weight': TRTLLMLayers.position_embedding, + # ATTENTION + 'decoder.layers.self_attention.linear_qkv.weight': TRTLLMLayers.attention_qkv_weight, + 'decoder.layers.self_attention.linear_qkv.layer_norm_weight': TRTLLMLayers.input_layernorm_weight, + 'decoder.layers.self_attention.linear_proj.weight': TRTLLMLayers.attention_dense_weight, + # MLP + 'decoder.layers.mlp.linear_fc1.weight': TRTLLMLayers.mlp_fc_weight, + 'decoder.layers.mlp.linear_fc2.weight': TRTLLMLayers.mlp_projection_weight, + 'decoder.layers.mlp.linear_fc1.layer_norm_weight': TRTLLMLayers.post_layernorm_weight, + # FINAL LAYER NORM + 'decoder.final_layernorm.weight': TRTLLMLayers.final_layernorm_weight, + # OUTPUT LAYER + 'output_layer.weight': TRTLLMLayers.lm_head, +} diff --git a/megatron/core/export/trtllm/model_to_trllm_mapping/starcoder_model.py b/megatron/core/export/trtllm/model_to_trllm_mapping/starcoder_model.py new file mode 100644 index 0000000000..dce61d26c5 --- /dev/null +++ b/megatron/core/export/trtllm/model_to_trllm_mapping/starcoder_model.py @@ -0,0 +1,30 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +from megatron.core.export.trtllm.trtllm_layers import TRTLLMLayers + +# pylint: disable=line-too-long +STARCODER_DICT = { + # INPUT + 'embedding.word_embeddings.weight': TRTLLMLayers.vocab_embedding, + # ATTENTION + 'decoder.layers.input_layernorm.weight': TRTLLMLayers.input_layernorm_weight, + 'decoder.layers.input_layernorm.bias': TRTLLMLayers.input_layernorm_bias, + 'decoder.layers.self_attention.linear_qkv.weight': TRTLLMLayers.attention_qkv_weight, + 'decoder.layers.self_attention.linear_qkv.bias': TRTLLMLayers.attention_qkv_bias, + 'decoder.layers.self_attention.linear_qkv.layer_norm_weight': TRTLLMLayers.input_layernorm_weight, + 'decoder.layers.self_attention.linear_qkv.layer_norm_bias': TRTLLMLayers.input_layernorm_bias, + 'decoder.layers.self_attention.linear_proj.weight': TRTLLMLayers.attention_dense_weight, + 'decoder.layers.self_attention.linear_proj.bias': TRTLLMLayers.attention_dense_bias, + # MLP + 'decoder.layers.mlp.linear_fc1.weight': TRTLLMLayers.mlp_fc_weight, + 'decoder.layers.mlp.linear_fc1.bias': TRTLLMLayers.mlp_fc_bias, + 'decoder.layers.mlp.linear_fc2.weight': TRTLLMLayers.mlp_projection_weight, + 'decoder.layers.mlp.linear_fc2.bias': TRTLLMLayers.mlp_projection_bias, + 'decoder.layers.mlp.linear_fc1.layer_norm_weight': TRTLLMLayers.post_layernorm_weight, + 'decoder.layers.mlp.linear_fc1.layer_norm_bias': TRTLLMLayers.post_layernorm_bias, + # FINAL LAYER NORM + 'decoder.final_layernorm.weight': TRTLLMLayers.final_layernorm_weight, + 'decoder.final_layernorm.bias': TRTLLMLayers.final_layernorm_bias, + # OUTPUT LAYER + 'output_layer.weight': TRTLLMLayers.lm_head, +} diff --git a/megatron/core/export/trtllm/trt_model_config.py b/megatron/core/export/trtllm/trt_model_config.py new file mode 100644 index 0000000000..2ed09398c2 --- /dev/null +++ b/megatron/core/export/trtllm/trt_model_config.py @@ -0,0 +1,15 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
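+ # Maps the export ModelType enum to the TRTLLM model config class (PretrainedConfig)
+ # instantiated by TRTLLMHelper._get_trtllm_config().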
+ +import tensorrt_llm + +from megatron.core.export.model_type import ModelType + +TRT_MODEL_CONFIG = { + ModelType.gpt: tensorrt_llm.models.gpt.config.GPTConfig, + ModelType.gptnext: tensorrt_llm.models.gpt.config.GPTConfig, + ModelType.starcoder: tensorrt_llm.models.gpt.config.GPTConfig, + ModelType.mixtral: tensorrt_llm.models.llama.config.LLaMAConfig, + ModelType.llama: tensorrt_llm.models.llama.config.LLaMAConfig, + ModelType.gemma: tensorrt_llm.models.GemmaConfig, + ModelType.falcon: tensorrt_llm.models.falcon.config.FalconConfig, +} diff --git a/megatron/core/export/trtllm/trt_model_type.py b/megatron/core/export/trtllm/trt_model_type.py new file mode 100644 index 0000000000..f45ff1786e --- /dev/null +++ b/megatron/core/export/trtllm/trt_model_type.py @@ -0,0 +1,13 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +from megatron.core.export.model_type import ModelType + +TRT_MODEL_TYPE_STRING = { + ModelType.gpt: 'GPTForCausalLM', + ModelType.gptnext: 'GPTForCausalLM', + ModelType.starcoder: 'GPTForCausalLM', + ModelType.mixtral: 'LlamaForCausalLM', + ModelType.llama: 'LlamaForCausalLM', + ModelType.gemma: 'GemmaForCausalLM', + ModelType.falcon: 'FalconForCausalLM', +} diff --git a/megatron/core/export/trtllm/trtllm_helper.py b/megatron/core/export/trtllm/trtllm_helper.py new file mode 100644 index 0000000000..d8bef18b33 --- /dev/null +++ b/megatron/core/export/trtllm/trtllm_helper.py @@ -0,0 +1,461 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +import tensorrt_llm +from tensorrt_llm.functional import non_gated_version +from tensorrt_llm.layers import MoeConfig + +from megatron.core.export.data_type import DataType +from megatron.core.export.export_config import ExportConfig +from megatron.core.export.model_type import ModelType +from megatron.core.export.trtllm.engine_builder.trtllm_engine_builder import TRTLLMEngineBuilder +from megatron.core.export.trtllm.model_to_trllm_mapping.default_conversion_dict import ( + DEFAULT_CONVERSION_DICT, +) +from megatron.core.export.trtllm.trt_model_config import TRT_MODEL_CONFIG +from megatron.core.export.trtllm.trt_model_type import TRT_MODEL_TYPE_STRING + +# pylint: disable=line-too-long +from megatron.core.export.trtllm.trtllm_weights_converter.distributed_trtllm_model_weights_converter import ( + DistributedTRTLLMModelWeightsConverter, +) +from megatron.core.export.trtllm.trtllm_weights_converter.single_device_trtllm_model_weights_converter import ( + SingleDeviceTRTLLMModelWeightsConverter, +) +from megatron.core.transformer.transformer_config import TransformerConfig + + +class TRTLLMHelper: + """TRTLLM Helper class to convert export and build TRTLLM model.""" + + def __init__( + self, + transformer_config: TransformerConfig, + model_type: ModelType, + trtllm_conversion_dict: dict = {}, + position_embedding_type: str = 'learned_absolute', + max_position_embeddings: int = None, + rotary_percentage: int = 1.0, + rotary_base: int = 10000, + moe_tp_mode: int = 2, + multi_query_mode: bool = False, + activation: str = "gelu", + seq_len_interpolation_factor: float = None, + moe_renorm_mode=None, + share_embeddings_and_output_weights=False, + ): + """Constructor for the TRTLLMHelper + + There are two public API's supported by this helper. + a) get_trtllm_pretrained_config_and_model_weights + b) build_and_save_engine + + Args: + transformer_config (TransformerConfig): The transformer config + model_type (ModelType): The type of the input model. 
Enum (megatron.core.export.model_type.ModelType)
+ trtllm_conversion_dict (dict, optional): A conversion dictionary that maps your model layer names to the trtllm equivalent layer names. Sample dictionaries are given in megatron/core/export/trtllm/model_to_trllm_mapping. NOTE: Ignore layer numbers in the model layer names. (e.g.) decoder.layers.0.attention_qkv.weight becomes decoder.layers.attention_qkv.weight in the mapping dictionary. Defaults to {}.
+ position_embedding_type (str, optional): The position embedding type. Defaults to 'learned_absolute'.
+ max_position_embeddings (int, optional): Max position embeddings value. Defaults to None.
+ rotary_percentage (int, optional): The rotary percentage if using rope embedding. Defaults to 1.0.
+ rotary_base (int, optional): The rotary base (theta value) if using rope embeddings. Defaults to 10000.
+ moe_tp_mode (int, optional): The TRTLLM MoE tensor-parallel mode. Defaults to 2.
+ multi_query_mode (bool, optional): Defaults to False.
+ activation (str, optional): Defaults to "gelu".
+ seq_len_interpolation_factor (float, optional): The sequence length interpolation factor if using rope embeddings. Defaults to None.
+ moe_renorm_mode (optional): Renormalization mode if using mixture of experts. Defaults to None.
+ share_embeddings_and_output_weights (bool, optional): True if input and output layers share weights. Defaults to False.
+ """
+
+ self.transformer_config = transformer_config
+ self.model_type = model_type
+ self.trtllm_conversion_dict = DEFAULT_CONVERSION_DICT[model_type]
+ self.trtllm_conversion_dict.update(trtllm_conversion_dict)
+ assert position_embedding_type in [
+ 'learned_absolute',
+ 'rope',
+ ], f"Position embedding type should be one of learned_absolute, rope. You entered {position_embedding_type}"
+ self.position_embedding_type = position_embedding_type
+ self.max_position_embeddings = max_position_embeddings
+ self.rotary_percentage = rotary_percentage
+ self.rotary_base = rotary_base
+ self.moe_tp_mode = moe_tp_mode
+ self.multi_query_mode = multi_query_mode
+ self.activation = activation
+ self.seq_len_interpolation_factor = seq_len_interpolation_factor
+ self.moe_renorm_mode = moe_renorm_mode
+ self.share_embeddings_and_output_weights = share_embeddings_and_output_weights
+
+ def _get_trtllm_config(
+ self,
+ export_config: ExportConfig,
+ world_size: int,
+ gpus_per_node: int,
+ vocab_size_padded: int,
+ dtype: DataType,
+ ):
+ """Get TRTLLM Config
+
+ Returns the appropriate TRTLLM PretrainedConfig used by TRTLLM for building the engine.
+
+ Args:
+ export_config (ExportConfig): The export config that defines inference tp, pp size etc.
+ world_size (int): The number of gpus (Mostly TP * PP) + gpus_per_node (int): Num gpus per node + vocab_size_padded (int): Padded vocab size + dtype (DataType): The datatype or model precision + + Returns: + GPTConfig or the LLamaConfig or the PretrainedConfig constructed from your model config + """ + hidden_act = self.activation + hidden_act = ( + hidden_act.split("-")[-1] + if self.transformer_config.num_moe_experts + else non_gated_version(hidden_act) + ) + + config = { + 'architecture': TRT_MODEL_TYPE_STRING[self.model_type], + 'dtype': dtype.name, + 'num_hidden_layers': self.transformer_config.num_layers, + 'num_attention_heads': self.transformer_config.num_attention_heads, + 'num_key_value_heads': ( + self.transformer_config.num_query_groups + if self.transformer_config.num_query_groups + else self.transformer_config.num_attention_heads + ), + 'head_size': self.transformer_config.kv_channels, + 'hidden_size': self.transformer_config.hidden_size, + 'intermediate_size': self.transformer_config.ffn_hidden_size, + 'norm_epsilon': self.transformer_config.layernorm_epsilon, + 'vocab_size': vocab_size_padded, + 'position_embedding_type': ( + "rope_gpt_neox" if self.position_embedding_type == "rope" else "learned_absolute" + ), + 'max_position_embeddings': self.max_position_embeddings, + 'hidden_act': hidden_act, + 'use_parallel_embedding': export_config.use_parallel_embedding, + 'embedding_sharding_dim': 0, + 'share_embedding_table': export_config.use_embedding_sharing, + 'quantization': {'quant_algo': None, 'kv_cache_quant_algo': None}, + 'bias': self.transformer_config.add_bias_linear, + 'apply_query_key_layer_scaling': False, + 'rotary_pct': self.rotary_percentage, + 'rotary_base': self.rotary_base, + 'moe_num_experts': ( + 0 + if self.transformer_config.moe_router_topk == 0 + else (self.transformer_config.num_moe_experts or 1) + ), + 'moe_top_k': self.transformer_config.moe_router_topk, + 'moe_normalization_mode': self.moe_renorm_mode + or MoeConfig.ExpertScaleNormalizationMode.RENORMALIZE, + 'moe_tp_mode': self.moe_tp_mode, + 'logits_dtype': 'float32', + 'world_size': world_size, + 'tp_size': export_config.inference_tp_size, + 'pp_size': export_config.inference_pp_size, + 'gpus_per_node': gpus_per_node, + } + + if self.model_type == ModelType.falcon: + config["new_decoder_architecture"] = ( + False if self.transformer_config.num_layers == 32 else True + ) + config["parallel_attention"] = True + + if self.seq_len_interpolation_factor is not None: + config["rotary_scaling"] = { + "type": "linear", + "factor": float(self.seq_len_interpolation_factor), + } + + config_cls = TRT_MODEL_CONFIG[self.model_type] + return config_cls(**config) + + # pylint: disable=line-too-long + def get_trtllm_pretrained_config_and_model_weights( + self, + model_state_dict, + dtype: DataType, + export_config: ExportConfig = None, + on_device_distributed_conversion: bool = False, + vocab_size: int = None, + gpus_per_node: int = None, + state_dict_split_by_layer_numbers: bool = True, + ): + """Get TRTLLM Config and Converted Model Weights + + This function returns the trtllm model weights as a list. + There are two modes for conversion. The default is to use a single device cpu/gpu for conversion. + NOTE: For faster performance, if your entire model will fit in memory, pre transfer the model state dict to cuda device and then call this function. + For on device conversion it returns weights which will be used on the device itself. 
+ Same thing happens with the pretrained config.
+
+ Args:
+ model_state_dict (dict): The input model state dictionary (the entire model state loaded on CPU when on_device_distributed_conversion is False, or the model state dict of each GPU in the case of on-device conversion).
+ export_config (ExportConfig): The export config used to define inference tp size, pp size etc. Used only when on device conversion is set to False.
+ dtype (DataType): The data type of model precision.
+ on_device_distributed_conversion (bool, optional): Convert on gpus in a distributed setting. This assumes that the model state dict is sharded according to the required inference model parallelism and that each gpu gets its part of the model state dict. Defaults to False.
+ vocab_size (int, optional): The vocabulary size. Defaults to None.
+ gpus_per_node (int, optional): The number of gpus per node. Used for on device conversion.
+ state_dict_split_by_layer_numbers (bool, optional): Are the model layers split by layer numbers in the state dict. For example: mlp.fc1.weight can be represented as mlp.fc1.weight of shape [num_layers, hidden_dim, ffn_hidden_dim], or as mlp.fc1.layers.0.weight of shape [hidden_dim, ffn_hidden_dim], then mlp.fc1.layers.1.weight ... for all layers. If you use representation 2, set this to True. Defaults to True.
+
+ Returns:
+ Two lists: the trtllm converted model weights (either on device, or one entry per gpu) and the corresponding trtllm model configs.
+ """
+ if on_device_distributed_conversion:
+ assert vocab_size is not None, "Need to pass in vocab_size for on device conversion"
+ assert self.model_type in [
+ ModelType.gpt,
+ ModelType.gptnext,
+ ModelType.llama,
+ ], "On device conversion only supported for model types gpt, gptnext and llama"
+ assert export_config is None, (
+ "Export config is inferred based on the parallel state. If you want to set inference tp 2, then load the model with this TP2 setting and just pass in the model state dict."
+ )
+ assert (
+ gpus_per_node is not None
+ ), "Need to pass in gpus_per_node for on device conversion"
+ trtllm_model_weights_on_device, trtllm_model_config = (
+ self._get_trtllm_pretrained_config_and_model_weights_in_distributed_setting(
+ model_state_dict, dtype, vocab_size, gpus_per_node
+ )
+ )
+ return [trtllm_model_weights_on_device], [trtllm_model_config]
+
+ else:
+ assert not (
+ self.share_embeddings_and_output_weights and not export_config.use_embedding_sharing
+ ), "Found share_embeddings_and_output_weights is True in the model. So set export_config.use_embedding_sharing to True"
+ assert (
+ vocab_size is None
+ ), "Vocab size is inferred from the input layer for cpu conversion. So leave it as None"
+ trtllm_model_weights_list, trtllm_model_config_list = (
+ self._get_trtllm_pretrained_config_and_model_weights_list_on_single_device(
+ export_config,
+ model_state_dict,
+ dtype,
+ gpus_per_node,
+ state_dict_split_by_layer_numbers,
+ )
+ )
+
+ return trtllm_model_weights_list, trtllm_model_config_list
+
+ def _get_trtllm_pretrained_config_and_model_weights_in_distributed_setting(
+ self, model_state_dict: dict, dtype: DataType, vocab_size: int, gpus_per_node: int
+ ):
+ """Get the TRTLLM Pretrained config and model weights list in a distributed setting
+
+ This function assumes the model state dict is distributed according to model parallelism.
+ Each device gets its own model state dict.
+
+ Args:
+ model_state_dict (dict): The model state dictionary (All collected on cpu) + dtype (DataType): The data type or model precision + vocab_size (int): Tokenizer vocab size + gpus_per_node (int): The number of gpus per node + + Returns: + Two lists . List of trtllm converted model weights and trtllm model configs (One for each gpu). + """ + + distributed_trtllm_model_weights_converter = DistributedTRTLLMModelWeightsConverter( + transformer_config=self.transformer_config, + dtype=dtype, + multi_query_mode=self.multi_query_mode, + activation=self.activation, + ) + distributed_trtllm_model_weights_converter.convert( + model_state_dict=model_state_dict, + trtllm_conversion_dict=self.trtllm_conversion_dict, + tokenizer_vocab_size=vocab_size, + ) + + export_config = ExportConfig( + inference_pp_size=distributed_trtllm_model_weights_converter.inference_pp_size, + inference_tp_size=distributed_trtllm_model_weights_converter.inference_tp_size, + use_parallel_embedding=True, + use_embedding_sharing=self.share_embeddings_and_output_weights, + ) + + world_size = export_config.inference_tp_size * export_config.inference_pp_size + + trtllm_model_config = self._get_trtllm_config( + export_config=export_config, + world_size=world_size, + gpus_per_node=gpus_per_node, + vocab_size_padded=vocab_size, + dtype=dtype, + ) + + model_parallel_rank = ( + distributed_trtllm_model_weights_converter.pp_rank + * distributed_trtllm_model_weights_converter.inference_tp_size + + distributed_trtllm_model_weights_converter.tp_rank + ) + + trtllm_model_config.mapping = tensorrt_llm.Mapping( + world_size=world_size, + rank=model_parallel_rank, + tp_size=export_config.inference_tp_size, + pp_size=export_config.inference_pp_size, + ) + + return distributed_trtllm_model_weights_converter.trtllm_model_weights, trtllm_model_config + + def _get_trtllm_pretrained_config_and_model_weights_list_on_single_device( + self, + export_config: ExportConfig, + model_state_dict: dict, + dtype: DataType, + gpus_per_node=None, + state_dict_split_by_layer_numbers=True, + ): + """Get the TRTLLM Pretrained config and model weights list (one per gpu rank) on single device (CPU/GPU) + + This function assumes the entire model state dict is present in CPU or on one GPU + + Args: + export_config (ExportConfig): The export config to set inference tp, pp size etc. + model_state_dict (dict): The model state dictionary (All collected on cpu) + dtype (DataType): The data type or model precision + gpus_per_node (int, optional): Number of gpus per node + state_dict_split_by_layer_numbers (bool, optional): Are the model layers split by layer numbers in state dict. For example : mlp.fc1.weight can be represented like mlp.fc1.weight of shape [num_layers, hidden_dim, ffn_hidden_dim]} or it can be like mlp.fc1.layers.0.weight of shape [hidden_dim, ffn_hidden_dim], then mlp.fc1.layers.1.weight ... for all layers. If you use represenation 2 set this to True. Defaults to True + + Returns: + Two lists . List of trtllm converted model weights and trtllm model configs (One for each gpu). 
+ """ + trtllm_model_configs_list = [] + trtllm_model_weights_list = [] + + single_device_trtllm_model_weights_converter = SingleDeviceTRTLLMModelWeightsConverter( + export_config=export_config, + transformer_config=self.transformer_config, + dtype=dtype, + activation=self.activation, + multi_query_mode=self.multi_query_mode, + ) + # Convert the input model state dict to trtllm model weights dictionary + single_device_trtllm_model_weights_converter.convert( + model_state_dict=model_state_dict, + trtllm_conversion_dict=self.trtllm_conversion_dict, + state_dict_split_by_layer_numbers=state_dict_split_by_layer_numbers, + ) + + vocab_size_padded = single_device_trtllm_model_weights_converter.get_padded_vocab_size() + world_size = export_config.inference_tp_size * export_config.inference_pp_size + gpus_per_node = gpus_per_node or export_config.inference_tp_size + + for gpu_rank in range(world_size): + mapping = tensorrt_llm.Mapping( + world_size=world_size, + rank=gpu_rank, + tp_size=export_config.inference_tp_size, + pp_size=export_config.inference_pp_size, + ) + + # Important to create a new instance everytime so that the list elements have differnt rank values in the mapping object + trtllm_model_config = self._get_trtllm_config( + export_config=export_config, + world_size=world_size, + gpus_per_node=gpus_per_node, + vocab_size_padded=vocab_size_padded, + dtype=dtype, + ) + trtllm_model_config.mapping = mapping + trtllm_model_configs_list.append(trtllm_model_config) + + # Get the model weights for each rank and append it to the trtllm_model_weights_list + trtllm_model_weights_per_gpu = ( + single_device_trtllm_model_weights_converter.get_local_model_weights_per_gpu( + mapping, trtllm_model_config + ) + ) + trtllm_model_weights_list.append(trtllm_model_weights_per_gpu) + + return trtllm_model_weights_list, trtllm_model_configs_list + + def build_and_save_engine( + self, + engine_dir: str, + trtllm_model_weights: dict, + trtllm_model_config, + max_input_len: int = 1024, + max_output_len: int = 1024, + max_batch_size: int = 4, + lora_ckpt_list=None, + use_lora_plugin=None, + max_lora_rank: int = 64, + lora_target_modules=None, + max_prompt_embedding_table_size: int = 0, + paged_kv_cache: bool = True, + remove_input_padding: bool = True, + paged_context_fmha: bool = False, + use_refit: bool = False, + max_num_tokens: int = None, + max_seq_len: int = None, + opt_num_tokens: int = None, + max_beam_width: int = 1, + tokens_per_block: int = 128, + multiple_profiles: bool = False, + gpt_attention_plugin: str = "auto", + gemm_plugin: str = "auto", + ): + """Method to build the TRTLLM Engine + + This method uses the TRTLLMEngineBuilder to build and save the engine to engine dir + + Args: + engine_dir (str): The file path to save the engine + trtllm_model_weights (dict): The TRTLLM converted model weights dict + trtllm_model_config : The TRTLLM Config + max_input_len (int, optional): Max input length. Defaults to 1024. + max_output_len (int, optional): Max output length. Defaults to 1024. + max_batch_size (int, optional): Max batch size. Defaults to 4. + lora_ckpt_list (_type_, optional): Lora checkpoint list. Defaults to None. + use_lora_plugin (_type_, optional): Use lora plugin. Defaults to None. + max_lora_rank (int, optional): Max lora rank. Defaults to 64. + lora_target_modules (_type_, optional): Lora target modules. Defaults to None. + max_prompt_embedding_table_size (int, optional): Max size of prompt embedding table. Defaults to 0. + paged_kv_cache (bool, optional): Use Paged KV cache. 
Defaults to True. + remove_input_padding (bool, optional): Remove input padding. Defaults to True. + paged_context_fmha (bool, optional): Paged context fmha. Defaults to False. + use_refit (bool, optional): Use refit. Defaults to False. + max_num_tokens (int, optional): Max num of tokens. Defaults to None. + max_seq_len (int, optional): Max seq length. Defaults to None. + opt_num_tokens (int, optional): Opt number of tokens. Defaults to None. + max_beam_width (int, optional): Max beam width. Defaults to 1. + tokens_per_block (int, optional): Nmber of tokens per block. Defaults to 128. + multiple_profiles (bool, optional): Use multiple profiles. Defaults to False. + gpt_attention_plugin (str, optional): Gpt attention plugin to use. Defaults to "auto". + gemm_plugin (str, optional): Gemma plugin to use. Defaults to "auto". + """ + + TRTLLMEngineBuilder.build_and_save_engine( + engine_dir, + trtllm_model_weights, + trtllm_model_config, + max_input_len, + max_output_len, + max_batch_size, + lora_ckpt_list, + use_lora_plugin, + max_lora_rank, + lora_target_modules, + max_prompt_embedding_table_size, + paged_kv_cache, + remove_input_padding, + paged_context_fmha, + use_refit, + max_num_tokens, + max_seq_len, + opt_num_tokens, + max_beam_width, + tokens_per_block, + multiple_profiles, + gpt_attention_plugin, + gemm_plugin, + ) diff --git a/megatron/core/export/trtllm/trtllm_layers.py b/megatron/core/export/trtllm/trtllm_layers.py new file mode 100644 index 0000000000..0cf805dcb6 --- /dev/null +++ b/megatron/core/export/trtllm/trtllm_layers.py @@ -0,0 +1,157 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +import re +from enum import Enum +from typing import Tuple + + +class TRTLLMLayers(Enum): + """TRTLLM Layer names + + This Enum will be used to map input model layer names to TRTLLM Layer names + """ + + # ONE TIME LAYERS (NOT ASSOCIATED TO TRANSFORMER BLOCK) + # Input layers + position_embedding = 'transformer.position_embedding.weight' + vocab_embedding = 'transformer.vocab_embedding.weight' + lm_head = 'lm_head.weight' + + # Output layers + final_layernorm_weight = 'transformer.ln_f.weight' + final_layernorm_bias = 'transformer.ln_f.bias' + + # TRANSFORMER LAYERS + # Attention block related layers + input_layernorm_weight = 'transformer.layers.input_layernorm.weight' + input_layernorm_bias = 'transformer.layers.input_layernorm.bias' + attention_qkv_weight = 'transformer.layers.attention.qkv.weight' + attention_qkv_bias = 'transformer.layers.attention.qkv.bias' + attention_dense_weight = 'transformer.layers.attention.dense.weight' + attention_dense_bias = 'transformer.layers.attention.dense.bias' + + # mlp layers + mlp_fc_weight = 'transformer.layers.mlp.fc.weight' + mlp_fc_bias = 'transformer.layers.mlp.fc.bias' + post_layernorm_weight = 'transformer.layers.post_layernorm.weight' + post_layernorm_bias = 'transformer.layers.post_layernorm.bias' + mlp_projection_weight = 'transformer.layers.mlp.proj.weight' + mlp_projection_bias = 'transformer.layers.mlp.proj.bias' + + # mixture of expert layers + mlp_router_weight = 'transformer.layers.mlp.router.weight' + mlp_fc_weight_mixture_of_experts = 'transformer.layers.mlp.fc.weight.expert' + mlp_projection_weight_mixture_of_experts = 'transformer.layers.mlp.proj.weight.expert' + + @staticmethod + def return_layer_name_and_number(layer_name: str) -> Tuple[str, int]: + """Helper function to return layer name and number + Given an input layer e.g decoder.layers.2.self_attention.linear_qkv.weight, + this function returns 
decoder.layers.self_attention.linear_qkv.weight and layer number 2. +        In case no layer number is present, it returns None for the layer number +        Args: +            layer_name (str): The input layer name + +        Returns: +            Tuple[str, int]: The layer name and layer number (the layer number may be None) +        """ +        # Use regular expression to find the number specifically after 'layers.' +        match = re.search(r'(?<=layers\.)\d+(?=\.)', layer_name) +        if match: +            # Extract the number and remove it from the layer name +            number = match.group(0) +            layer_name_without_number = re.sub(r'\.{}\.'.format(number), '.', layer_name) +            return layer_name_without_number, int(number) +        else: +            # Return the original name if no number is found +            return layer_name, None + +    # pylint: disable=line-too-long +    @staticmethod +    def rename_input_layer_names_to_trtllm_layer_names( +        model_state_dict: dict, +        trtllm_conversion_dict: dict, +        state_dict_split_by_layer_numbers: bool = True, +    ) -> dict: +        """Helper function to rename model layer names to TRTLLM Layer names + +        We go through each layer (key) in the model state dict, +        and map it to the equivalent TRTLLMLayer name (megatron/core/export/trtllm/trtllm). +        If we have a layer number associated with the layer, we extract it out, +        map the original layer name to the equivalent trtllm layer name, and add the layer number back. +        CPU conversion will pass in a model state dict without layer numbers +        (i.e. decoder.layers.mlp.linear_fc1.weight of shape [num_layers, hidden_dim, 4 * hidden_dim]). +        GPU conversion will pass a model state dict with each layer separated +        (i.e. decoder.layers.2.mlp.linear_fc1.weight of shape [hidden_dim, 4 * hidden_dim]). + +        Args: +            model_state_dict (dict): The original model state dict +            trtllm_conversion_dict (dict): The conversion dictionary mapping input model layer names to trtllm layer names +            state_dict_split_by_layer_numbers (bool, optional): Whether the model layers are split by layer number in the state dict. For example: mlp.fc1.weight can be represented as mlp.fc1.weight of shape [num_layers, hidden_dim, ffn_hidden_dim], or as mlp.fc1.layers.0.weight of shape [hidden_dim, ffn_hidden_dim], then mlp.fc1.layers.1.weight ... for all layers. If you use representation 2, set this to True. Defaults to True + +        Raises: +            ValueError: If a key cannot be mapped to a trtllm key, or if not all model layers are mapped to equivalent trtllm keys + +        Returns: +            dict: The model state dict with the keys (i.e. original model layer names) replaced by trtllm layer names +        """ +        for original_model_layer_name in list(model_state_dict.keys()): +            if "_extra_state" in original_model_layer_name: +                del model_state_dict[original_model_layer_name] +                continue + +            original_layer_name_without_number, layer_number = ( +                TRTLLMLayers.return_layer_name_and_number(original_model_layer_name) +            ) +            if 'layers' in original_layer_name_without_number and state_dict_split_by_layer_numbers: +                assert ( +                    layer_number is not None +                ), f"Layer number is None for {original_model_layer_name} and state_dict_split_by_layer_numbers is set to True. Consider setting it to False" + +            if original_layer_name_without_number not in trtllm_conversion_dict: +                raise ValueError( +                    f'Unable to rename key {original_layer_name_without_number}. Provide an appropriate mapping in the trtllm_conversion_dict when you initialize TRTLLMHelper' +                ) + +            trtllm_layer = trtllm_conversion_dict[original_layer_name_without_number] +            assert isinstance( +                trtllm_layer, TRTLLMLayers +            ), f"{trtllm_layer} is not supported for conversion.
Please use one of the TRTLLMLayerNames we provided in megatron/core/export/trtllm/trtllm_layer_names" + +            value = model_state_dict.pop(original_model_layer_name) + +            if layer_number is not None: +                trtllm_layer_name_with_number = re.sub( +                    r'(?<=layers\.)', f'{layer_number}.', trtllm_layer.value +                ) +                model_state_dict[trtllm_layer_name_with_number] = value +            else: +                model_state_dict[trtllm_layer.value] = value + +        return model_state_dict + + +# These layers are not associated with the transformer block. +# So they don't have a layer number (i.e. they are independent of the number of layers in the model) +NON_TRANSFORMER_LAYERS_NAMES = [ +    TRTLLMLayers.vocab_embedding.value, +    TRTLLMLayers.position_embedding.value, +    TRTLLMLayers.lm_head.value, +    TRTLLMLayers.final_layernorm_weight.value, +    TRTLLMLayers.final_layernorm_bias.value, +] + + +def get_layer_name_without_prefix(layer: TRTLLMLayers) -> str: +    """Get TRTLLMLayer name without prefix + +    Given a layer, e.g. TRTLLMLayers.attention_qkv_weight, it returns 'attention.qkv.weight' + +    Args: +        layer (TRTLLMLayers): The TRTLLMLayer + +    Returns: +        str: The TRTLLMLayers suffix (i.e. removing transformer.layers. from the layer name) +    """ +    layer_name_without_prefix = layer.value.replace("transformer.layers.", "") +    return layer_name_without_prefix diff --git a/megatron/core/export/trtllm/trtllm_weights_converter/__init__.py b/megatron/core/export/trtllm/trtllm_weights_converter/__init__.py new file mode 100644 index 0000000000..f8011007a5 --- /dev/null +++ b/megatron/core/export/trtllm/trtllm_weights_converter/__init__.py @@ -0,0 +1 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. diff --git a/megatron/core/export/trtllm/trtllm_weights_converter/distributed_trtllm_model_weights_converter.py b/megatron/core/export/trtllm/trtllm_weights_converter/distributed_trtllm_model_weights_converter.py new file mode 100644 index 0000000000..035e23a16c --- /dev/null +++ b/megatron/core/export/trtllm/trtllm_weights_converter/distributed_trtllm_model_weights_converter.py @@ -0,0 +1,258 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +import torch +from tqdm import tqdm + +from megatron.core import parallel_state +from megatron.core.export.data_type import DataType +from megatron.core.export.trtllm.trtllm_layers import NON_TRANSFORMER_LAYERS_NAMES, TRTLLMLayers +from megatron.core.export.trtllm.trtllm_layers import get_layer_name_without_prefix as suffix +from megatron.core.tensor_parallel.utils import VocabUtility +from megatron.core.transformer.transformer_config import TransformerConfig + + +def str_dtype_to_torch(dtype: DataType): +    """Get torch datatype from input datatype""" +    from tensorrt_llm._utils import str_dtype_to_torch + +    return str_dtype_to_torch(dtype.name) + + +# pylint: disable=line-too-long +class DistributedTRTLLMModelWeightsConverter: +    """The TRTLLM Converter class used for GPU (on device) conversion + +    This class is used to convert models that are already sharded and resident on GPUs. It assumes the model is sharded the way you want to export it (i.e. to export with tp2pp2, load the model with tp2pp2 and pass in the corresponding state dictionaries). +    """ + +    def __init__( +        self, +        transformer_config: TransformerConfig, +        dtype: DataType, +        multi_query_mode: bool = False, +        activation: str = "gelu", +    ): +        """Constructor for the DistributedTRTLLMModelWeightsConverter class + +        This class is responsible for converting the model weights to TRTLLM equivalent weights.
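+
+        A rough usage sketch (the `transformer_config`, `dtype`, `trtllm_conversion_dict` and
+        `tokenizer` objects here are assumptions used only for illustration):
+
+            converter = DistributedTRTLLMModelWeightsConverter(
+                transformer_config=transformer_config, dtype=dtype
+            )
+            converter.convert(
+                model_state_dict=model.state_dict(),
+                trtllm_conversion_dict=trtllm_conversion_dict,
+                tokenizer_vocab_size=tokenizer.vocab_size,
+            )
+            local_weights = converter.trtllm_model_weights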
+ + Args: + transformer_config (TransformerConfig): The transformer config + dtype (DataType): The data type or model precision + multi_query_mode (bool, optional): Defaults to False. + activation (str, optional): Defaults to "gelu". + """ + self.transformer_config = transformer_config + self.trtllm_model_weights = {} + self.storage_type = str_dtype_to_torch(dtype) + self.activation = activation + num_kv_heads = self.transformer_config.num_query_groups + if num_kv_heads == 0: + if multi_query_mode: + num_kv_heads = 1 + else: + num_kv_heads = self.transformer_config.num_attention_heads + self.num_kv_heads = num_kv_heads + + self.inference_pp_size = parallel_state.get_pipeline_model_parallel_world_size() + self.inference_tp_size = parallel_state.get_tensor_model_parallel_world_size() + self.tp_rank = parallel_state.get_tensor_model_parallel_rank() + self.pp_rank = parallel_state.get_pipeline_model_parallel_rank() + self.tp_group = parallel_state.get_tensor_model_parallel_group() + vp_size = parallel_state.get_virtual_pipeline_model_parallel_world_size() + + assert ( + vp_size is None or vp_size == 1 + ), "Virtual parallelism is not supported in GPU Converter. Gather the VP chunks and use PP config." + + def _add_to_trtllm_model_weights(self, val: torch.Tensor, layer_name: str): + assert torch.is_tensor(val), f"Expected a tensor for {layer_name} but got {type(val)}" + val = val.to(self.storage_type) + val = val.detach().contiguous() + if val.ndim >= 2: + val = torch.transpose(val.reshape(val.shape[0], -1), 0, 1) + if layer_name not in self.trtllm_model_weights: + self.trtllm_model_weights[layer_name] = torch.empty( + val.size(), dtype=val.dtype, layout=val.layout, device="cpu", pin_memory=True + ) + self.trtllm_model_weights[layer_name] = val + + def _convert_transformer_layer(self, layer_name: str, val: torch.Tensor): + """Convert Transformer layers to TRTLLM weights + + Transformer layers referes to layers within the transformber block. They have a layer number associated with them. 
Depending on the layer we either directly save it to trtllm_model_weights, or split it across some dimension and save the splits + + Args: + model_state_dict (dict): The input model state dictionary (All collected on CPU) + layer (TRTLLMLayerNames): The TRTLLM Layer that we want to change + """ + if val.ndim == 2: + val = val.T + + if ( + layer_name.endswith(suffix(TRTLLMLayers.input_layernorm_weight)) + or layer_name.endswith(suffix(TRTLLMLayers.input_layernorm_bias)) + or layer_name.endswith(suffix(TRTLLMLayers.post_layernorm_weight)) + or layer_name.endswith(suffix(TRTLLMLayers.post_layernorm_bias)) + or layer_name.endswith(suffix(TRTLLMLayers.attention_dense_bias)) + or layer_name.endswith(suffix(TRTLLMLayers.attention_dense_bias)) + or layer_name.endswith(suffix(TRTLLMLayers.mlp_projection_bias)) + or layer_name.endswith(suffix(TRTLLMLayers.mlp_router_weight)) + or layer_name.endswith(suffix(TRTLLMLayers.attention_dense_weight)) + or layer_name.endswith(suffix(TRTLLMLayers.mlp_projection_weight)) + ): + # Same as layernorm1p in NeMo + if ( + self.transformer_config.layernorm_zero_centered_gamma + and self.transformer_config.normalization == "LayerNorm" + and 'layernorm.weight' in layer_name + ): + val = val + 1.0 + + self._add_to_trtllm_model_weights(val=val, layer_name=layer_name) + + elif layer_name.endswith(suffix(TRTLLMLayers.mlp_fc_weight)) or layer_name.endswith( + suffix(TRTLLMLayers.mlp_fc_bias) + ): + + split_gated_activation = self.activation in [ + "swiglu", + "geglu", + "fast-swiglu", + "fast-geglu", + ] + if split_gated_activation: + vals, gates = [[n] for n in torch.chunk(val, 2, axis=-1)] + gate_layer_name = layer_name.replace("fc", "gate") + self._add_to_trtllm_model_weights(val=gates[0], layer_name=gate_layer_name) + val = vals[0] + + self._add_to_trtllm_model_weights(val=val, layer_name=layer_name) + + elif layer_name.endswith(suffix(TRTLLMLayers.attention_qkv_bias)): + qkv_hidden_dim = val.shape[0] + size_per_head = ( + qkv_hidden_dim + // (self.transformer_config.num_attention_heads + 2 * self.num_kv_heads) + * self.inference_tp_size + ) + q_num = self.transformer_config.num_attention_heads // self.num_kv_heads + + # We first concat all sub weights per tp rank together. 
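+            # On this rank the bias covers num_kv_heads / inference_tp_size query groups, and each
+            # group packs q_num query heads followed by one key head and one value head (hence the
+            # q_num + 2 middle dimension below). Splitting on that dimension and flattening
+            # re-orders the bias into the q | k | v layout TRTLLM expects.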
+ val = val.reshape(self.num_kv_heads // self.inference_tp_size, q_num + 2, size_per_head) + qkv = torch.split(val, [q_num, 1, 1], dim=1) + split_vals = torch.concatenate( + [qkv[0].reshape(-1), qkv[1].reshape(-1), qkv[2].reshape(-1)], dim=0 + ) + self._add_to_trtllm_model_weights(val=split_vals, layer_name=layer_name) + + # TODO : Should add a atten layer dimension "qkvqkv, qqkkvv etc to see how to reshape here" + elif layer_name.endswith(suffix(TRTLLMLayers.attention_qkv_weight)): + hidden_dim = val.shape[0] + size_per_head = self.transformer_config.kv_channels + if size_per_head is None: + size_per_head = hidden_dim // self.transformer_config.num_attention_heads + q_num = self.transformer_config.num_attention_heads // self.num_kv_heads + + val = val.reshape( + hidden_dim, self.num_kv_heads // self.inference_tp_size, q_num + 2, size_per_head + ) + qkv = torch.split(val, [q_num, 1, 1], dim=2) + split_vals = torch.concatenate( + [ + qkv[0].reshape(hidden_dim, -1), + qkv[1].reshape(hidden_dim, -1), + qkv[2].reshape(hidden_dim, -1), + ], + dim=1, + ) + self._add_to_trtllm_model_weights(val=split_vals, layer_name=layer_name) + + else: + raise ValueError(f"{layer_name} cannot be handled by GPU converter") + + def _convert_non_transformer_layer(self, model_state_dict: dict, layer_name: str): + """Convert Non Transformer layers to TRTLLM weights + + Non transformer layers referes to layers that occur only once in the model (e.g Embedding , final output layer etc. ) They dont have any layer number associated with them. We remove this layer from the original state dict and cast it to storage type and convert to numpy and add it to trtllm_model_weights + + Args: + model_state_dict (dict): The input model state dictionary (All collected on CPU) + layer (TRTLLMLayerNames): The TRTLLM Layer that we want to change + """ + if layer_name in model_state_dict: + val = model_state_dict.pop(layer_name) + self._add_to_trtllm_model_weights(val=val, layer_name=layer_name) + + # ----------------Convert Embeddings---------------- + def _get_remove_vocab_padding(self, layer_name, model_state_dict, tokenizer_vocab_size): + val = model_state_dict.get(layer_name, None) + if val is None: + return None + + if self.inference_tp_size > 1: # Gather padded tensor chunks + vocab_size_padded = val.shape[0] * self.inference_tp_size + vocab_start_index, vocab_end_index = VocabUtility.vocab_range_from_global_vocab_size( + vocab_size_padded, self.tp_rank, self.inference_tp_size + ) + dim_size = list(val.size()) + dim_size[0] = vocab_size_padded + gathered_val = torch.zeros( + dim_size, dtype=val.dtype, device=torch.cuda.current_device() + ) + gathered_val[vocab_start_index:vocab_end_index] = val + torch.distributed.all_reduce(gathered_val, group=self.tp_group) + val = gathered_val + unpadded = val[:tokenizer_vocab_size] + if self.inference_tp_size > 1: # Split gathered val for val parallel embedding + vocab_start_index, vocab_end_index = VocabUtility.vocab_range_from_global_vocab_size( + tokenizer_vocab_size, self.tp_rank, self.inference_tp_size + ) + unpadded = unpadded[vocab_start_index:vocab_end_index] + return unpadded.T # TRTLLM expects (vocab_size, hidden_size) so need extra transpose + + @torch.no_grad() + def convert( + self, model_state_dict: dict, trtllm_conversion_dict: dict, tokenizer_vocab_size: int + ): + """Convert model weights to trtllm model weights + + This method goes through each layer in the model state dict and converts to equivalent trtllm model weights. 
It also handles splitting across TP dimension , expert split etc. + + Args: + model_state_dict (dict): The full model state dict (all on CPU) + trtllm_conversion_dict (dict): The conversion dictionary used to convert model layer names to trtllm layer names + tokenizer_vocab_size (int): The vocab size of the tokenizer + """ + + # First step is to convert input model layer names to equivalent trtllm layer names + model_state_dict = TRTLLMLayers.rename_input_layer_names_to_trtllm_layer_names( + model_state_dict=model_state_dict, trtllm_conversion_dict=trtllm_conversion_dict + ) + + # Convert the non transformer layers + for layer_name in NON_TRANSFORMER_LAYERS_NAMES: + if ( + layer_name in TRTLLMLayers.vocab_embedding.value + or layer_name in TRTLLMLayers.lm_head.value + ): + # For embedding layers alone we do some pre processing + embed_val = self._get_remove_vocab_padding( + layer_name, model_state_dict, tokenizer_vocab_size + ) + model_state_dict[layer_name] = embed_val + # TODO : Check if this handling of position embedding is right. + if layer_name == TRTLLMLayers.position_embedding.value: + position_embedding = model_state_dict[layer_name] + req_position_embedding = position_embedding.chunk(self.inference_tp_size)[ + self.tp_rank + ] + model_state_dict[layer_name] = req_position_embedding.T + self._convert_non_transformer_layer( + model_state_dict=model_state_dict, layer_name=layer_name + ) + + for layer_name, value in tqdm( + model_state_dict.items(), desc="Converting to TRTLLM Weights" + ): + self._convert_transformer_layer(layer_name, value) diff --git a/megatron/core/export/trtllm/trtllm_weights_converter/single_device_trtllm_model_weights_converter.py b/megatron/core/export/trtllm/trtllm_weights_converter/single_device_trtllm_model_weights_converter.py new file mode 100644 index 0000000000..c7a98972d2 --- /dev/null +++ b/megatron/core/export/trtllm/trtllm_weights_converter/single_device_trtllm_model_weights_converter.py @@ -0,0 +1,437 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
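+
+# This module converts a full (single device) model state dict into TRTLLM weights and splits
+# them per inference tensor/pipeline parallel rank; see SingleDeviceTRTLLMModelWeightsConverter.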
+ +import re + +import torch +from tqdm import tqdm + +from megatron.core.export.data_type import DataType +from megatron.core.export.export_config import ExportConfig +from megatron.core.export.trtllm.trtllm_layers import NON_TRANSFORMER_LAYERS_NAMES, TRTLLMLayers +from megatron.core.export.trtllm.trtllm_layers import get_layer_name_without_prefix as suffix +from megatron.core.transformer.transformer_config import TransformerConfig + + +# pylint: disable=line-too-long +# TODO: Writing TRT imports this way so that it can be mocked in the test_trtllm_cpu_converter.py unit test +# TODO: Figure out how to patch it directly from the trtllm library +def pad_vocab_size(vocab_size: int, tp_size: int): + """Pad vocab size based on inference size""" + from tensorrt_llm._utils import pad_vocab_size + + return pad_vocab_size(vocab_size, tp_size) + + +def str_dtype_to_torch(dtype: DataType): + """Get torch datatype from input datatype""" + from tensorrt_llm._utils import str_dtype_to_torch + + return str_dtype_to_torch(dtype.name) + + +class SingleDeviceTRTLLMModelWeightsConverter: + """Class to convert Model weights to TRTLLM weights on CPU""" + + def __init__( + self, + export_config: ExportConfig, + transformer_config: TransformerConfig, + dtype: DataType, + multi_query_mode: bool = False, + activation: str = "gelu", + ): + """Constructor for the TRTLLMModelWeightsConverterCPU class + + This class is responsible to convert the model weights to TRTLLM equivalent weights and also split them for each GPU rank and return as a list. + + Args: + export_config (ExportConfig): The export config with inference tp size, pp size etc. + transformer_config (TransformerConfig): The transformer config + dtype (DataType): The data type or model precision + multi_query_mode (bool, optional): Defaults to False. + activation (str, optional): Defaults to "gelu". + """ + self.export_config = export_config + self.transformer_config = transformer_config + self.trtllm_model_weights = {} + self.storage_type = str_dtype_to_torch(dtype) + self.activation = activation + num_kv_heads = self.transformer_config.num_query_groups + if num_kv_heads == 0: + if multi_query_mode: + num_kv_heads = 1 + else: + num_kv_heads = self.transformer_config.num_attention_heads + self.num_kv_heads = num_kv_heads + + def _convert_non_transformer_layer(self, model_state_dict: dict, layer_name: str): + """Convert Non Transformer layers to TRTLLM weights + + Non transformer layers referes to layers that occur only once in the model (e.g Embedding , final output layer etc. ) They dont have any layer number associated with them. We remove this layer from the original state dict and cast it to storage type and convert to numpy and add it to trtllm_model_weights + + Args: + model_state_dict (dict): The input model state dictionary (All collected on CPU) + layer_name (str): The TRTLLM Layer name that we want to convert + """ + if layer_name in model_state_dict: + val = model_state_dict.pop(layer_name) + val = val.to(self.storage_type).detach().contiguous() + self.trtllm_model_weights[layer_name] = val + + def _convert_transformer_layer(self, layer_name: str, val: torch.Tensor): + """Convert Transformer layers to TRTLLM weights + + Transformer layers referes to layers within the transformber block. They have a layer number associated with them. 
Depending on the layer we either directly save it to trtllm_model_weights, or split it across some dimension and save the splits + + Args: + model_state_dict (dict): The input model state dictionary (All collected on CPU) + layer (TRTLLMLayerNames): The TRTLLM Layer that we want to change + """ + + def _add_to_trtllm_model_weights(val: torch.Tensor, layer_name: str, split_type=None): + """Add the input weight to trtllm_model_weights + + Depending on split (Expert split/Tensor split/None) we split the input data and add accordingly + + Args: + val (torch.Tensor): The model weight to be added + layer_name (str): The TRTLLMlayername as a string + split_type (str, optional): The split type. Defaults to None. + """ + if split_type == 'expert_split': + for split_num, split_val in enumerate(val): + self.trtllm_model_weights[f'{layer_name}.{split_num}.bin'] = ( + split_val.to(self.storage_type).detach().contiguous() + ) + elif split_type == 'tensor_split': + for split_num, split_val in enumerate(val): + if split_val.ndim >= 2: + split_val = torch.transpose(split_val.reshape(split_val.shape[0], -1), 1, 0) + + self.trtllm_model_weights[f'{layer_name}.{split_num}.bin'] = ( + split_val.to(self.storage_type).detach().contiguous() + ) + else: + if val.ndim >= 2: + val = torch.transpose(val.reshape(val.shape[0], -1), 1, 0) + self.trtllm_model_weights[layer_name] = ( + val.to(self.storage_type).detach().contiguous() + ) + + if val.ndim == 2: + val = val.T + + if ( + layer_name.endswith(suffix(TRTLLMLayers.input_layernorm_weight)) + or layer_name.endswith(suffix(TRTLLMLayers.input_layernorm_bias)) + or layer_name.endswith(suffix(TRTLLMLayers.post_layernorm_weight)) + or layer_name.endswith(suffix(TRTLLMLayers.post_layernorm_bias)) + or layer_name.endswith(suffix(TRTLLMLayers.attention_dense_bias)) + or layer_name.endswith(suffix(TRTLLMLayers.attention_dense_bias)) + or layer_name.endswith(suffix(TRTLLMLayers.mlp_projection_bias)) + or layer_name.endswith(suffix(TRTLLMLayers.mlp_router_weight)) + ): + # Same as layernorm1p in NeMo + if ( + self.transformer_config.layernorm_zero_centered_gamma + and self.transformer_config.normalization == "LayerNorm" + and 'layernorm.weight' in layer_name + ): + val = val + 1.0 + + _add_to_trtllm_model_weights(val=val, layer_name=layer_name, split_type=None) + + elif layer_name.endswith( + suffix(TRTLLMLayers.attention_dense_weight) + ) or layer_name.endswith(suffix(TRTLLMLayers.mlp_projection_weight)): + split_vals = torch.chunk(val, self.export_config.inference_tp_size, axis=0) + _add_to_trtllm_model_weights( + val=split_vals, layer_name=layer_name, split_type='tensor_split' + ) + + elif layer_name.endswith(suffix(TRTLLMLayers.mlp_fc_weight)) or layer_name.endswith( + suffix(TRTLLMLayers.mlp_fc_bias) + ): + split_gated_activation = self.activation in [ + "swiglu", + "geglu", + "fast-swiglu", + "fast-geglu", + ] + if split_gated_activation: + val, gate = torch.chunk(val, 2, axis=-1) + gate_layer_name = layer_name.replace("fc", "gate") + split_vals = torch.chunk(gate, self.export_config.inference_tp_size, axis=-1) + _add_to_trtllm_model_weights( + val=split_vals, layer_name=gate_layer_name, split_type='tensor_split' + ) + + split_vals = torch.chunk(val, self.export_config.inference_tp_size, axis=-1) + _add_to_trtllm_model_weights( + val=split_vals, layer_name=layer_name, split_type='tensor_split' + ) + + elif layer_name.endswith(suffix(TRTLLMLayers.attention_qkv_bias)): + qkv_hidden_dim = val.shape[0] + size_per_head = qkv_hidden_dim // ( + 
self.transformer_config.num_attention_heads + 2 * self.num_kv_heads + ) + q_num = self.transformer_config.num_attention_heads // self.num_kv_heads + + # We first concat all sub weights per tp rank together. + val = val.reshape(self.num_kv_heads, q_num + 2, size_per_head) + + qkv = torch.split(val, [q_num, 1, 1], dim=1) + q_split = torch.chunk(qkv[0], self.export_config.inference_tp_size, axis=0) + k_split = torch.chunk(qkv[1], self.export_config.inference_tp_size, axis=0) + v_split = torch.chunk(qkv[2], self.export_config.inference_tp_size, axis=0) + + # Concatenate Q, K, and V together + split_vals = [ + torch.concatenate( + [q_split[i].reshape(-1), k_split[i].reshape(-1), v_split[i].reshape(-1)], dim=0 + ) + for i in range(self.export_config.inference_tp_size) + ] + _add_to_trtllm_model_weights( + val=split_vals, layer_name=layer_name, split_type='tensor_split' + ) + + # TODO : Should add a atten layer dimension "qkvqkv, qqkkvv etc to see how to reshape here" + elif layer_name.endswith(suffix(TRTLLMLayers.attention_qkv_weight)): + hidden_dim = val.shape[0] + size_per_head = self.transformer_config.kv_channels + if size_per_head is None: + size_per_head = hidden_dim // self.transformer_config.num_attention_heads + q_num = self.transformer_config.num_attention_heads // self.num_kv_heads + + # When the merge factor exceeds 1, the 'vals' list will have multiple entries. + # Depending on the format, 'vals' can look like either [QQQQ..KV, QQQQ..KV, ...](for GQA) or [QKV, QKV, ...](for MHA). + # We first concat all sub weights per tp rank together. + val = val.reshape(hidden_dim, self.num_kv_heads, q_num + 2, size_per_head) + + # Split the QKV to separate variables. + qkv = torch.split(val, [q_num, 1, 1], dim=2) + + query_groups_shape = qkv[0].shape + if len(query_groups_shape) > 1: + if (query_groups_shape[1] % self.export_config.inference_tp_size) != 0: + raise Exception( + "Number of query groups of the models is {0}. 
Please select tensor parallelism size " + "that can split the number of query groups to equal number of query matrices in the " + "each GPU.".format(query_groups_shape[1]) + ) + + q_split = torch.chunk(qkv[0], self.export_config.inference_tp_size, axis=1) + k_split = torch.chunk(qkv[1], self.export_config.inference_tp_size, axis=1) + v_split = torch.chunk(qkv[2], self.export_config.inference_tp_size, axis=1) + + # Concatenate Q, K, and V together + split_vals = [ + torch.concatenate( + [ + q_split[i].reshape(hidden_dim, -1), + k_split[i].reshape(hidden_dim, -1), + v_split[i].reshape(hidden_dim, -1), + ], + dim=1, + ) + for i in range(self.export_config.inference_tp_size) + ] + _add_to_trtllm_model_weights( + val=split_vals, layer_name=layer_name, split_type='tensor_split' + ) + + elif layer_name.endswith(suffix(TRTLLMLayers.mlp_fc_weight_mixture_of_experts)): + w1, w3 = torch.chunk(val, 2, axis=1) + # w1 splits + split_w1s = torch.chunk(w1, self.export_config.inference_tp_size, axis=1) + # w3 splits + split_w3s = torch.chunk(w3, self.export_config.inference_tp_size, axis=1) + + split_vals = [torch.concatenate(item, dim=1) for item in zip(split_w3s, split_w1s)] + layer_name = layer_name.replace(".expert", "") # Remove suffix .expert from key + _add_to_trtllm_model_weights( + val=split_vals, layer_name=layer_name, split_type='expert_split' + ) + + elif layer_name.endswith(suffix(TRTLLMLayers.mlp_projection_weight_mixture_of_experts)): + split_vals = torch.chunk(val, self.export_config.inference_tp_size, axis=-1) + layer_name = layer_name.replace(".expert", "") # Remove suffix .expert from key + _add_to_trtllm_model_weights( + val=split_vals, layer_name=layer_name, split_type='expert_split' + ) + else: + raise ValueError(f"{layer_name} cannot be handled by converter") + + @torch.no_grad() + def convert( + self, model_state_dict: dict, trtllm_conversion_dict, state_dict_split_by_layer_numbers=True + ): + """Convert model weights to trtllm model weights + + This method goes through each layer in the model state dict and converts to equivalent trtllm model weights. It also handles splitting across TP dimension , expert split etc. + + Args: + model_state_dict (dict): The full model state dict (all on CPU) + trtllm_conversion_dict (dict): The conversion dictionary used to convert model layer names to trtllm layer names + state_dict_split_by_layer_numbers (bool, optional): Are the model layers split by layer numbers in state dict. For example : mlp.fc1.weight can be represented like mlp.fc1.weight of shape [num_layers, hidden_dim, ffn_hidden_dim]} or it can be like mlp.fc1.layers.0.weight of shape [hidden_dim, ffn_hidden_dim], then mlp.fc1.layers.1.weight ... for all layers. If you use represenation 2 set this to True. 
Defaults to True + """ + + # First step is to convert input model layer names to equivalent trtllm layer names + model_state_dict = TRTLLMLayers.rename_input_layer_names_to_trtllm_layer_names( + model_state_dict=model_state_dict, + trtllm_conversion_dict=trtllm_conversion_dict, + state_dict_split_by_layer_numbers=state_dict_split_by_layer_numbers, + ) + + # Convert the non transformer layers + for layer_name in NON_TRANSFORMER_LAYERS_NAMES: + # For vocab embedding layer alone we pad the weights to be divisible by inference tp size + if ( + layer_name == TRTLLMLayers.vocab_embedding.value + and self.export_config.use_parallel_embedding + ): + val = model_state_dict[TRTLLMLayers.vocab_embedding.value] + vocab_size = val.shape[0] + if vocab_size % self.export_config.inference_tp_size != 0: + vocab_size_padded = pad_vocab_size( + vocab_size, self.export_config.inference_tp_size + ) + pad_width = vocab_size_padded - vocab_size + val = torch.nn.functional.pad(val, (0, 0, 0, pad_width), value=0) + model_state_dict[layer_name] = val + + self._convert_non_transformer_layer( + model_state_dict=model_state_dict, layer_name=layer_name + ) + + transformer_layers_dict = {} + # Convert the transformer layers + if state_dict_split_by_layer_numbers: + # Already model dict is split by layer numbers + transformer_layers_dict = model_state_dict + else: + # Here we split the model state dict into individual layers + for layer_name in list(model_state_dict.keys()): + value = model_state_dict.pop(layer_name) + for layer_number in range(self.transformer_config.num_layers): + # e.g transformer.layers.mlp.fc.bias => transformer.layers.2.mlp.fc.bias + layer_name_with_layer_number = re.sub( + r'(?<=layers\.)', f'{layer_number}.', layer_name + ) + transformer_layers_dict[layer_name_with_layer_number] = value[layer_number] + + for layer_name, value in tqdm( + transformer_layers_dict.items(), desc="Converting to TRTLLM Weights" + ): + self._convert_transformer_layer(layer_name, value) + + def get_padded_vocab_size(self) -> int: + """Return the paded vocab size + + We extract the lm head and vocab embedding and use that to determine padded_vocab_size + + Returns: + int: Padded vocab size + """ + lm_head_weight = self.trtllm_model_weights.get(TRTLLMLayers.lm_head.value, None) + vocab_size = self.trtllm_model_weights[TRTLLMLayers.vocab_embedding.value].shape[0] + vocab_size_padded = ( + vocab_size + if lm_head_weight is None + else pad_vocab_size(vocab_size, self.export_config.inference_tp_size) + ) + return vocab_size_padded + + def get_local_model_weights_per_gpu(self, mapping, trtllm_model_config: dict): + """Get the trtllm model weights split per gpu + + Given the trtllm mapping information (tp, pp rank etc) we split the model weights in a list, with each element of the list corresponding to the weights of each gpu rank + + Args: + mapping : The trtllm mapping information + trtllm_model_config (dict): The trtllm model config + """ + + def _split(torch_tensor, tp_size, idx, dim=0): + """Splits the np tensor v on dim and return the idx's slice.""" + if tp_size == 1: + return torch_tensor + if len(torch_tensor.shape) == 1: + return torch.chunk(torch_tensor, tp_size)[idx].contiguous() + else: + return torch.chunk(torch_tensor, tp_size, axis=dim)[idx].contiguous() + + pp_layer_range = mapping.pp_layers(self.transformer_config.num_layers) + + trtllm_model_weights_per_gpu = {} + for layer_name, value in self.trtllm_model_weights.items(): + if layer_name in NON_TRANSFORMER_LAYERS_NAMES: + continue + + # Happens in the case of 
TP split or expert split + if layer_name.endswith(".bin"): + if layer_name.endswith(f"{mapping.tp_rank}.bin"): + layer_name = layer_name.replace(f".{mapping.tp_rank}.bin", "") + else: + continue + + layer_num = int(layer_name.split(".")[2]) + if layer_num in pp_layer_range: + layer_name = layer_name.replace( + f"layers.{layer_num}", f"layers.{layer_num - pp_layer_range[0]}" + ) + else: + continue + if ( + hasattr(trtllm_model_config, 'new_decoder_architecture') + and trtllm_model_config.new_decoder_architecture + and "post_layernorm" in layer_name + ): + layer_name = layer_name.replace("post_layernorm", "mlp_layernorm") + + trtllm_model_weights_per_gpu[layer_name] = value + + if mapping.is_first_pp_rank(): + embedding_weight = ( + _split( + self.trtllm_model_weights[TRTLLMLayers.vocab_embedding.value], + mapping.tp_size, + mapping.tp_rank, + ) + if self.export_config.use_parallel_embedding + else self.trtllm_model_weights[TRTLLMLayers.vocab_embedding.value] + ) + + trtllm_model_weights_per_gpu[TRTLLMLayers.vocab_embedding.value] = embedding_weight + + pos_embedding_weight = self.trtllm_model_weights.get( + TRTLLMLayers.position_embedding.value + ) + if pos_embedding_weight is not None: + if self.export_config.use_parallel_embedding: + pos_embedding_weight = _split( + pos_embedding_weight, mapping.tp_size, mapping.tp_rank + ) + + trtllm_model_weights_per_gpu[TRTLLMLayers.position_embedding.value] = ( + pos_embedding_weight + ) + + if mapping.is_last_pp_rank(): + lm_head_weight = self.trtllm_model_weights.get(TRTLLMLayers.lm_head.value, None) + if lm_head_weight is not None: + trtllm_model_weights_per_gpu[TRTLLMLayers.lm_head.value] = _split( + lm_head_weight, mapping.tp_size, mapping.tp_rank + ) + + trtllm_model_weights_per_gpu[TRTLLMLayers.final_layernorm_weight.value] = ( + self.trtllm_model_weights[TRTLLMLayers.final_layernorm_weight.value] + ) + + ln_f_bias = self.trtllm_model_weights.get(TRTLLMLayers.final_layernorm_bias.value) + if ln_f_bias is not None: + trtllm_model_weights_per_gpu[TRTLLMLayers.final_layernorm_bias.value] = ln_f_bias + + return trtllm_model_weights_per_gpu diff --git a/megatron/mpu/tests/__init__.py b/megatron/core/extensions/__init__.py similarity index 100% rename from megatron/mpu/tests/__init__.py rename to megatron/core/extensions/__init__.py diff --git a/megatron/core/extensions/transformer_engine.py b/megatron/core/extensions/transformer_engine.py new file mode 100644 index 0000000000..bf5159c759 --- /dev/null +++ b/megatron/core/extensions/transformer_engine.py @@ -0,0 +1,1087 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
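+
+# Megatron-Core wrappers around Transformer Engine (TE) modules: norms, column/row parallel
+# linear layers, dot-product attention and (for newer TE versions) grouped linears. The wrappers
+# adapt TE's arguments to Megatron's configs and gate features behind TE version checks.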
+ +import dataclasses +import os +import warnings +from typing import Callable + +import torch +import transformer_engine as te +from packaging.version import Version as PkgVersion +from torch import Tensor +from torch.nn.parameter import Parameter + +from megatron.core import ModelParallelConfig, parallel_state +from megatron.core.dist_checkpointing.utils import replace_prefix_for_sharding +from megatron.core.packed_seq_params import PackedSeqParams +from megatron.core.parallel_state import ( + get_context_parallel_global_ranks, + get_context_parallel_group, + get_tensor_and_expert_parallel_world_size, + get_tensor_model_parallel_group, + get_tensor_model_parallel_rank, + get_tensor_model_parallel_world_size, +) +from megatron.core.tensor_parallel import get_cuda_rng_tracker, get_expert_parallel_rng_tracker_name +from megatron.core.tensor_parallel.layers import ( + _initialize_affine_weight_cpu, + set_tensor_model_parallel_attributes, +) +from megatron.core.tensor_parallel.utils import divide +from megatron.core.transformer.enums import AttnMaskType +from megatron.core.transformer.transformer_config import TransformerConfig +from megatron.core.transformer.utils import make_sharded_tensors_for_checkpoint +from megatron.core.utils import get_te_version, is_te_min_version + + +def _get_extra_te_kwargs(config: TransformerConfig): + extra_transformer_engine_kwargs = {"params_dtype": config.params_dtype} + + if is_te_min_version("0.12.0"): + if config.use_cpu_initialization: + extra_transformer_engine_kwargs["device"] = 'cpu' + else: + extra_transformer_engine_kwargs["device"] = torch.cuda.current_device() + return extra_transformer_engine_kwargs + + +def condition_init_method(config, init_method): + """Condition TE init_method on config.perform_initialization.""" + return init_method if config.perform_initialization else (lambda w: None) + + +class TENorm: + """ + A conditional wrapper to initialize an instance of Transformer-Engine's + `LayerNorm` or `RMSNorm` based on input + """ + + # TODO should we ditch normalization config and just use spec to choose LayerNorm vs RMSNorm? + def __new__(cls, config: TransformerConfig, hidden_size: int, eps: float = 1e-5): + if config.normalization == "LayerNorm": + instance = te.pytorch.LayerNorm( + hidden_size=hidden_size, + eps=eps, + sequence_parallel=config.sequence_parallel, + zero_centered_gamma=config.layernorm_zero_centered_gamma, + **_get_extra_te_kwargs(config), + ) + elif config.normalization == "RMSNorm": + assert hasattr( + te.pytorch, "RMSNorm" + ), "Transformer-Engine >= v0.11 required to use this feature" + instance = te.pytorch.RMSNorm( + hidden_size=hidden_size, + eps=eps, + sequence_parallel=config.sequence_parallel, + zero_centered_gamma=config.layernorm_zero_centered_gamma, + **_get_extra_te_kwargs(config), + ) + else: + raise Exception('Only LayerNorm and RMSNorm are curently supported') + + return instance + + +class TELinear(te.pytorch.Linear): + """ + Wrapper for the Transformer-Engine's `Linear` layer. + + Note that if Megatron's parallel_state has not been initialized + yet, the tp_group passed to TE will be None and must be set later + via set_tensor_parallel_group(). 
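+
+    An illustrative construction sketch (the `config` object and inputs are assumptions):
+
+        linear = TELinear(
+            input_size=1024,
+            output_size=4096,
+            parallel_mode="column",
+            config=config,
+            init_method=torch.nn.init.xavier_normal_,
+            bias=True,
+            skip_bias_add=False,
+            skip_weight_param_allocation=False,
+        )
+        out, bias = linear(x)  # forward always returns an (output, bias-or-None) pair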
+ """ + + def __init__( + self, + input_size: int, + output_size: int, + *, + parallel_mode: str, + config: ModelParallelConfig, + init_method: Callable, + bias: bool, + skip_bias_add: bool, + skip_weight_param_allocation: bool, + tp_comm_buffer_name: str = None, + is_expert: bool = False, + ): + self.config = config + + # TE returns a zero length Tensor when bias=False and + # return_bias=True, but we prefer None. So in that case we + # tell TE to not return the bias, and return None + # ourselves. This way our forward always returns two values + # and we don't have to deal with the zero length Tensor. + self.te_return_bias = skip_bias_add and bias + self.is_first_microbatch = True + self.disable_parameter_transpose_cache = self.config.disable_parameter_transpose_cache + if skip_weight_param_allocation: + raise ValueError( + 'Transformer Engine linear layers do not support skip_weight_param_allocation' + ) + + extra_kwargs = _get_extra_te_kwargs(config) + + if is_te_min_version("0.8.0"): + if self.config.tp_comm_overlap: + if is_te_min_version("1.5.0"): + # Use old overlap flags if they were supplied instead + extra_kwargs["ub_overlap_ag"] = ( + self.config.tp_comm_overlap_ag + if hasattr(self.config, "tp_comm_overlap_ag") + else self.config.tp_comm_split_ag or self.config.tp_comm_atomic_ag + ) + extra_kwargs["ub_overlap_rs"] = ( + self.config.tp_comm_overlap_rs + if hasattr(self.config, "tp_comm_overlap_rs") + else self.config.tp_comm_split_rs or self.config.tp_comm_atomic_rs + ) + # Disable ub overlap for experts. + if is_expert: + extra_kwargs["ub_overlap_ag"] = False + extra_kwargs["ub_overlap_rs"] = False + else: + extra_kwargs["ub_split_ag"] = self.config.tp_comm_split_ag + extra_kwargs["ub_atomic_gemm_ag"] = self.config.tp_comm_atomic_ag + extra_kwargs["ub_split_rs"] = self.config.tp_comm_split_rs + extra_kwargs["ub_atomic_gemm_rs"] = self.config.tp_comm_atomic_rs + # Disable ub overlap for experts. + if is_expert: + extra_kwargs["ub_split_ag"] = False + extra_kwargs["ub_atomic_gemm_ag"] = False + extra_kwargs["ub_split_rs"] = False + extra_kwargs["ub_atomic_gemm_rs"] = False + if is_te_min_version("1.0.0", check_equality=False): + assert ( + tp_comm_buffer_name is not None + ), "Buffer name should be set to configure communication overlap settings" + extra_kwargs["ub_name"] = tp_comm_buffer_name + + self.expert_parallel = self.config.expert_model_parallel_size > 1 + if is_expert and self.expert_parallel: + rng_tracker_name = get_expert_parallel_rng_tracker_name() + else: + rng_tracker_name = None + if is_te_min_version("1.7.0"): + extra_kwargs["rng_tracker_name"] = rng_tracker_name + + # Disable communications in TE when using SP or EP by making TE agnostic of model parallel. 
+ tp_size = self.config.tensor_model_parallel_size + tp_group = get_tensor_model_parallel_group(check_initialized=False) + if is_expert and (self.config.sequence_parallel or self.expert_parallel): + if self.config.moe_extended_tp: + tp_size = get_tensor_and_expert_parallel_world_size() + if parallel_mode == "column": + output_size = divide(output_size, tp_size) + elif parallel_mode == "row": + input_size = divide(input_size, tp_size) + parallel_mode = None + tp_size = 1 + tp_group = None + + super().__init__( + in_features=input_size, + out_features=output_size, + sequence_parallel=self.config.sequence_parallel, + fuse_wgrad_accumulation=self.config.gradient_accumulation_fusion, + tp_group=tp_group, + tp_size=tp_size, + get_rng_state_tracker=( + get_cuda_rng_tracker if get_cuda_rng_tracker().is_initialized() else None + ), + init_method=condition_init_method(config, init_method), + bias=bias, + return_bias=self.te_return_bias, + parallel_mode=parallel_mode, + **extra_kwargs, + ) + + for param in self.parameters(): + setattr(param, 'allreduce', not (is_expert and self.expert_parallel)) + + def forward(self, x): + """Forward.""" + _is_first_microbatch = ( + None if self.disable_parameter_transpose_cache else self.is_first_microbatch + ) + out = super().forward(x, is_first_microbatch=_is_first_microbatch) + self.is_first_microbatch = False + + # TE only returns a tuple when return_bias is True, otherwise + # it returns a single Tensor, we always want to return two + # values regardless of the arguments. + if self.te_return_bias: + return out + return out, None + + +class TELayerNormColumnParallelLinear(te.pytorch.LayerNormLinear): + """ + Wrapper for the Transformer-Engine's `LayerNormLinear` layer that combines + layernorm and linear layers + """ + + def __init__( + self, + input_size: int, + output_size: int, + *, + config: TransformerConfig, + init_method: Callable, + gather_output: bool, + bias: bool, + skip_bias_add: bool, + is_expert: bool, + skip_weight_param_allocation: bool = False, + tp_comm_buffer_name: str = None, + ): + self.config = config + + if gather_output: + raise ValueError('Transformer Engine linear layers do not support gather_output = True') + + if is_expert: + raise ValueError('Transformer Engine linear layers do not yet support MoE') + + if skip_weight_param_allocation: + raise ValueError( + 'Transformer Engine linear layers do not support skip_weight_param_allocation' + ) + + # TE returns a zero length Tensor when bias=False and + # return_bias=True, but we prefer None. So in that case we + # tell TE to not return the bias, and return None + # ourselves. This way our forward always returns two values + # and we don't have to deal with the zero length Tensor. + self.te_return_bias = skip_bias_add and bias + self.is_first_microbatch = True + self.disable_parameter_transpose_cache = self.config.disable_parameter_transpose_cache + extra_kwargs = _get_extra_te_kwargs(config) + + # Only Transformer-Engine version >= 0.11.0 supports `RMSNorm` + if is_te_min_version("0.11.0"): + extra_kwargs["normalization"] = self.config.normalization + elif self.config.normalization != "LayerNorm": + te_version = get_te_version() + raise ValueError( + f"Transformer Engine v{te_version} does not support {self.config.normalization}." 
+ ) + + if is_te_min_version("0.8.0"): + if self.config.tp_comm_overlap: + extra_kwargs["ub_bulk_wgrad"] = self.config.tp_comm_bulk_wgrad + extra_kwargs["ub_bulk_dgrad"] = self.config.tp_comm_bulk_dgrad + if is_te_min_version("1.5.0", check_equality=False): + # Use old overlap flags if they were supplied instead + extra_kwargs["ub_overlap_ag"] = ( + self.config.tp_comm_overlap_ag + if hasattr(self.config, "tp_comm_overlap_ag") + else self.config.tp_comm_split_ag or self.config.tp_comm_atomic_ag + ) + if is_te_min_version("1.6.0.dev0", check_equality=False): + extra_kwargs["ub_overlap_rs_dgrad"] = ( + self.config.tp_comm_overlap_rs_dgrad + if hasattr(self.config, "tp_comm_overlap_rs_dgrad") + else False + ) + if tp_comm_buffer_name == 'qkv' and self.config.tp_comm_overlap_disable_qkv: + extra_kwargs["ub_overlap_ag"] = False + extra_kwargs["ub_overlap_rs_dgrad"] = False + + if tp_comm_buffer_name == 'fc1' and self.config.tp_comm_overlap_disable_fc1: + extra_kwargs["ub_overlap_ag"] = False + extra_kwargs["ub_overlap_rs_dgrad"] = False + else: + extra_kwargs["ub_atomic_gemm_ag"] = self.config.tp_comm_atomic_ag + extra_kwargs["ub_split_ag"] = self.config.tp_comm_split_ag + if is_te_min_version("1.0.0", check_equality=False): + assert ( + tp_comm_buffer_name is not None + ), "Buffer name should be set to configure communication overlap settings" + extra_kwargs["ub_name"] = tp_comm_buffer_name + + super().__init__( + in_features=input_size, + out_features=output_size, + eps=self.config.layernorm_epsilon, + sequence_parallel=self.config.sequence_parallel, + fuse_wgrad_accumulation=self.config.gradient_accumulation_fusion, + tp_group=get_tensor_model_parallel_group(check_initialized=False), + tp_size=self.config.tensor_model_parallel_size, + get_rng_state_tracker=( + get_cuda_rng_tracker if get_cuda_rng_tracker().is_initialized() else None + ), + init_method=( + condition_init_method(config, init_method) + if not config.use_cpu_initialization + else lambda w: None + ), + bias=bias, + return_bias=self.te_return_bias, + parallel_mode="column", + return_layernorm_output=False, + zero_centered_gamma=self.config.layernorm_zero_centered_gamma, + **extra_kwargs, + ) + + world_size = get_tensor_model_parallel_world_size() + rank = get_tensor_model_parallel_rank() + + if config.use_cpu_initialization: + output_size_per_partition = divide(output_size, world_size) + _ = _initialize_affine_weight_cpu( + self.weight, + output_size, + input_size, + output_size_per_partition, + 0, + init_method, + stride=1, + return_master_weight=False, + rank=rank, + world_size=world_size, + skip_set_tensor_parallel_attributes=True, + ) + if bias: + self.bias = Parameter( + torch.empty(output_size_per_partition, dtype=config.params_dtype) + ) + set_tensor_model_parallel_attributes(self.bias, True, 0, 1) + with torch.no_grad(): + self.bias.zero_() + setattr(self.bias, 'allreduce', True) + + def forward(self, x): + """Forward.""" + _is_first_microbatch = ( + None if self.disable_parameter_transpose_cache else self.is_first_microbatch + ) + out = super().forward(x, is_first_microbatch=_is_first_microbatch) + self.is_first_microbatch = False + + # TE only returns a tuple when return_bias is True, otherwise + # it returns a single Tensor, we always want to return two + # values regardless of the arguments. 
+ if self.te_return_bias: + return out + return out, None + + def sharded_state_dict(self, prefix='', sharded_offsets=(), metadata=None): + """Sharding along axis 0, bias sharded""" + state_dict = self.state_dict(prefix='', keep_vars=True) + return make_sharded_tensors_for_checkpoint( + state_dict, prefix, {'weight': 0, 'bias': 0}, sharded_offsets + ) + + +class TEColumnParallelLinear(TELinear): + """ + Wrapper for the Transformer-Engine's `Linear` layer but specialized similar + to megatron's `ColumnParallelLinear` layer. + """ + + def __init__( + self, + input_size: int, + output_size: int, + *, + config: ModelParallelConfig, + init_method: Callable, + gather_output: bool, + bias: bool, + skip_bias_add: bool, + is_expert: bool, + skip_weight_param_allocation: bool = False, + tp_comm_buffer_name: str = None, + ): + if gather_output: + raise ValueError('Transformer Engine linear layers do not support gather_output = True') + + super().__init__( + input_size=input_size, + output_size=output_size, + parallel_mode="column", + config=config, + init_method=( + condition_init_method(config, init_method) + if not config.use_cpu_initialization + else lambda w: None + ), + bias=bias, + skip_bias_add=skip_bias_add, + is_expert=is_expert, + skip_weight_param_allocation=skip_weight_param_allocation, + tp_comm_buffer_name=tp_comm_buffer_name, + ) + + world_size = get_tensor_model_parallel_world_size() + rank = get_tensor_model_parallel_rank() + if config.use_cpu_initialization: + output_size_per_partition = divide(output_size, world_size) + _ = _initialize_affine_weight_cpu( + self.weight, + output_size, + input_size, + output_size_per_partition, + 0, + init_method, + stride=1, + return_master_weight=False, + rank=rank, + world_size=world_size, + skip_set_tensor_parallel_attributes=True, + ) + if bias: + self.bias = Parameter( + torch.empty(output_size_per_partition, dtype=config.params_dtype) + ) + set_tensor_model_parallel_attributes(self.bias, True, 0, 1) + with torch.no_grad(): + self.bias.zero_() + setattr(self.bias, 'allreduce', True) + + def sharded_state_dict(self, prefix='', sharded_offsets=(), metadata=None): + """Sharding along axis 0, bias sharded""" + state_dict = self.state_dict(prefix='', keep_vars=True) + return make_sharded_tensors_for_checkpoint( + state_dict, prefix, {'weight': 0, 'bias': 0}, sharded_offsets + ) + + +class TERowParallelLinear(TELinear): + """ + Wrapper for the Transformer-Engine's `Linear` layer but specialized similar + to megatron's `RowParallelLinear` layer. 
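+
+    The input is expected to already be partitioned along its last dimension
+    (`input_is_parallel=True` is required), and `sharded_state_dict` shards only the weight
+    (along axis 1) while the bias stays unsharded.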
+ """ + + def __init__( + self, + input_size: int, + output_size: int, + *, + config: ModelParallelConfig, + init_method: Callable, + bias: bool, + input_is_parallel: bool, + skip_bias_add: bool, + is_expert: bool, + tp_comm_buffer_name: str = None, + ): + if not input_is_parallel: + raise ValueError( + "Transformer Engine linear layers do not support input_is_parallel = False" + ) + + super().__init__( + input_size=input_size, + output_size=output_size, + parallel_mode="row", + config=config, + init_method=( + condition_init_method(config, init_method) + if not config.use_cpu_initialization + else lambda w: None + ), + bias=bias, + skip_bias_add=skip_bias_add, + skip_weight_param_allocation=False, # We don't currently use this for row parallel layers # pylint: disable=line-too-long + is_expert=is_expert, + tp_comm_buffer_name=tp_comm_buffer_name, + ) + world_size = get_tensor_model_parallel_world_size() + rank = get_tensor_model_parallel_rank() + if config.use_cpu_initialization: + input_size_per_partition = divide(input_size, world_size) + self.master_weight = _initialize_affine_weight_cpu( + self.weight, + output_size, + input_size, + input_size_per_partition, + 1, + init_method, + stride=1, + return_master_weight=False, + params_dtype=config.params_dtype, + rank=rank, + world_size=world_size, + skip_set_tensor_parallel_attributes=True, + ) + if bias: + self.bias = Parameter(torch.empty(output_size, dtype=config.params_dtype)) + # Always initialize bias to zero. + with torch.no_grad(): + self.bias.zero_() + setattr(self.bias, 'allreduce', True) + setattr(self.bias, 'sequence_parallel', config.sequence_parallel) + + def sharded_state_dict(self, prefix='', sharded_offsets=(), metadata=None): + """Sharding along axis 1, bias not sharded""" + state_dict = self.state_dict(prefix='', keep_vars=True) + return make_sharded_tensors_for_checkpoint( + state_dict, prefix, {'weight': 1}, sharded_offsets + ) + + +class TEDotProductAttention(te.pytorch.DotProductAttention): + """ + Wrapper for the Transformer-Engine's `DotProductAttention` layer that also + has "flash attention" enabled. + + Note that if Megatron's parallel_state has not been initialized yet, the + tp_group and cp_group passed to TE will be None and must be set later + via set_tensor_parallel_group() and set_context_parallel_group(). + """ + + cp_stream: torch.cuda.Stream = None + + def __init__( + self, + config: TransformerConfig, + layer_number: int, + attn_mask_type: AttnMaskType, + attention_type: str, + attention_dropout: float = None, + softmax_scale: float = None, + k_channels: int = None, + v_channels: int = None, + ): + self.config = config + self.te_forward_mask_type = False + self.qkv_format: str = 'sbhd' + + if self.config.apply_query_key_layer_scaling != bool( + int(os.getenv('NVTE_APPLY_QK_LAYER_SCALING', '0')) + ): + raise ValueError( + f"apply_query_key_layer_scaling is {self.config.apply_query_key_layer_scaling} " + f"but environment variable NVTE_APPLY_QK_LAYER_SCALING is " + f"{os.getenv('NVTE_APPLY_QK_LAYER_SCALING')}. Transformer Engine does not support " + f"setting query key layer scaling via argument, so these two must match." + ) + + extra_kwargs = {} + if is_te_min_version("0.11.0"): + extra_kwargs["num_gqa_groups"] = self.config.num_query_groups + elif self.config.num_query_groups != self.config.num_attention_heads: + raise ValueError( + f"Transformer Engine v{get_te_version()} does not support Grouped Query Attention, " + f"use a newer version of Transformer Engine. 
" + f"(num_query_groups ({self.config.num_query_groups}) != " + f"num_attention_heads ({self.config.num_attention_heads}))" + ) + + if is_te_min_version("0.10.0"): + extra_kwargs["attention_type"] = attention_type + # older version don't need attention_type + + if is_te_min_version("0.12.0", check_equality=False): + self.te_forward_mask_type = True + + # Only Transformer-Engine version >= 1.0.0 supports context parallelism + if is_te_min_version("1.0.0"): + if getattr(TEDotProductAttention, "cp_stream") is None: + TEDotProductAttention.cp_stream = torch.cuda.Stream() + extra_kwargs["cp_group"] = get_context_parallel_group(check_initialized=False) + extra_kwargs["cp_global_ranks"] = get_context_parallel_global_ranks( + check_initialized=False + ) + extra_kwargs["cp_stream"] = TEDotProductAttention.cp_stream + else: + assert ( + self.config.context_parallel_size == 1 + ), "Only Transformer-Engine version >= 1.0.0 supports context parallelism!" + + if self.config.deterministic_mode: + if int(os.getenv("NVTE_ALLOW_NONDETERMINISTIC_ALGO", "1")) != 0: + raise RuntimeError( + "deterministic_mode is on and we are using DotProductAttention from " + "Transformer Engine, but NVTE_ALLOW_NONDETERMINISTIC_ALGO is not 0. " + f"Currently set to: {os.getenv('NVTE_ALLOW_NONDETERMINISTIC_ALGO', 'not set')}." + ) + + if config.window_size is not None: + # Check version + assert is_te_min_version("1.2.0"), ( + f"Transformer-Engine v{get_te_version()} must be >= 1.2.0 to support" + "sliding window attention." + ) + extra_kwargs['window_size'] = config.window_size + + if is_te_min_version("1.10.0"): + # TE 1.10.0 introduces the ability to set the different k and v channels + kv_channels = ( + (k_channels, v_channels) + if k_channels is not None and v_channels is not None + else self.config.kv_channels + ) + extra_kwargs['softmax_scale'] = softmax_scale + else: + kv_channels = self.config.kv_channels + + super().__init__( + num_attention_heads=self.config.num_attention_heads, + kv_channels=kv_channels, + attention_dropout=( + self.config.attention_dropout if attention_dropout is None else attention_dropout + ), + attn_mask_type=attn_mask_type.name, + sequence_parallel=self.config.sequence_parallel, + tp_size=self.config.tensor_model_parallel_size, + get_rng_state_tracker=( + get_cuda_rng_tracker if get_cuda_rng_tracker().is_initialized() else None + ), + tp_group=get_tensor_model_parallel_group(check_initialized=False), + layer_number=layer_number, + **extra_kwargs, + ) + + def forward( + self, + query: Tensor, + key: Tensor, + value: Tensor, + attention_mask: Tensor, + attn_mask_type: AttnMaskType, + packed_seq_params: PackedSeqParams = None, + ): + """Forward.""" + packed_seq_kwargs = ( + dataclasses.asdict(packed_seq_params) if packed_seq_params is not None else {} + ) + # overwrite self.qkv_format depending on self.config.apply_rope_fusion, which can be set + # after init + if self.config.apply_rope_fusion and is_te_min_version("0.13.0", check_equality=False): + self.qkv_format = 'bshd' + + qkv_format = packed_seq_kwargs.get('qkv_format', self.qkv_format) + + if get_te_version() < PkgVersion("1.3.0"): + # TE 1.3.0 introduces precomputing max_seqlen to remove unnecessary kernels and D2H + # copies (#555) + # These two arguments did not exist prior to 1.3.0 + packed_seq_kwargs.pop("max_seqlen_q", None) + packed_seq_kwargs.pop("max_seqlen_kv", None) + + if get_te_version() < PkgVersion("1.10.0"): + # TE 1.8.0 introduces cu_seqlens_padded which is the cu_seqlens with paddings counted + # in each individual 
sequence in THD format dataset + # These two arguments did not exist prior to 1.8.0.Full support added in 1.10.0 (#1012) + packed_seq_kwargs.pop("cu_seqlens_q_padded", None) + packed_seq_kwargs.pop("cu_seqlens_kv_padded", None) + + if self.config.apply_rope_fusion and qkv_format == 'bshd': + query, key, value = [x.transpose(0, 1).contiguous() for x in (query, key, value)] + # In PyTorch, the following two tensors are in fact the same: + # Tensor with shape (1, S, H, D) and stride (S*H*D, H*D, D, 1) + # Tensor with shape (1, S, H, D) and stride (H*D, H*D, D, 1) + # Stride for a dimension that is 1 has no meaning, so tensors created two different ways + # can have same shape but different strides. + # We unify them to the first one to pass the stride check in TE + if value.shape == key.shape and value.shape[0] == 1 and value.stride() != key.stride(): + value = value.as_strided(value.shape, key.stride()) + + if self.te_forward_mask_type: + if qkv_format == 'thd' and is_te_min_version("1.7.0"): + # thd format uses flash attention with cuDNN kernel which requires is_padding=True, + # so the only acceptable mask types are `padding_causal` and `padding`. These do not + # necessarily indicate there are padded tokens in the sequence. + if attn_mask_type == AttnMaskType.causal: + attn_mask_type = AttnMaskType.padding_causal + elif attn_mask_type == AttnMaskType.no_mask: + attn_mask_type = AttnMaskType.padding + core_attn_out = super().forward( + query, + key, + value, + attention_mask, + attn_mask_type=attn_mask_type.name, + **packed_seq_kwargs, + ) + else: + core_attn_out = super().forward(query, key, value, attention_mask, **packed_seq_kwargs) + + if self.config.apply_rope_fusion and qkv_format == 'bshd': + return core_attn_out.transpose(0, 1) + else: + return core_attn_out + + +if is_te_min_version("1.9.0.dev0"): + + class TEGroupedLinear(te.pytorch.GroupedLinear): + """ + Wrapper for the Transformer-Engine's `GroupedLinear` layer. + + Note that if Megatron's parallel_state has not been initialized + yet, the tp_group passed to TE will be None and must be set later + via set_tensor_parallel_group(). + """ + + def __init__( + self, + num_gemms: int, + input_size: int, + output_size: int, + *, + parallel_mode: str, + config: ModelParallelConfig, + init_method: Callable, + bias: bool, + skip_bias_add: bool, + is_expert: bool = False, + tp_comm_buffer_name: str = None, + ): + self.config = config + + # TE returns a zero length Tensor when bias=False and + # return_bias=True, but we prefer None. So in that case we + # tell TE to not return the bias, and return None + # ourselves. This way our forward always returns two values + # and we don't have to deal with the zero length Tensor. + self.te_return_bias = skip_bias_add and bias + self.is_first_microbatch = True + self.disable_parameter_transpose_cache = self.config.disable_parameter_transpose_cache + + extra_kwargs = _get_extra_te_kwargs(config) + extra_kwargs["ub_name"] = tp_comm_buffer_name + + self.expert_parallel = self.config.expert_model_parallel_size > 1 + if self.expert_parallel: + extra_kwargs["rng_tracker_name"] = get_expert_parallel_rng_tracker_name() + + # For MoE models, the comms between TP and EP group is explicitly handled by + # MoE token dispatcher. So we disable comms by making TE agnostic of model parallel. 
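+            # Illustrative example (assumed sizes, not executed): with
+            # tensor_model_parallel_size=2 and a column-parallel expert GEMM of
+            # output_size=8192, the branch below shrinks the local output_size to 4096 and
+            # hands TE parallel_mode=None, tp_size=1, tp_group=None, so TE runs a plain
+            # local GEMM and the MoE token dispatcher performs all communication.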
+ self.explicit_expert_comm = is_expert and ( + config.tensor_model_parallel_size > 1 or self.expert_parallel + ) + tp_group = get_tensor_model_parallel_group(check_initialized=False) + if self.explicit_expert_comm and config.moe_extended_tp: + tp_size = parallel_state.get_tensor_and_expert_parallel_world_size() + else: + tp_size = parallel_state.get_tensor_model_parallel_world_size() + if self.explicit_expert_comm: + if parallel_mode == "column": + output_size = divide(output_size, tp_size) + elif parallel_mode == "row": + input_size = divide(input_size, tp_size) + parallel_mode = None + tp_size = 1 + tp_group = None + + super().__init__( + num_gemms=num_gemms, + in_features=input_size, + out_features=output_size, + sequence_parallel=self.config.sequence_parallel, + fuse_wgrad_accumulation=self.config.gradient_accumulation_fusion, + tp_group=tp_group, + tp_size=tp_size, + get_rng_state_tracker=( + get_cuda_rng_tracker if get_cuda_rng_tracker().is_initialized() else None + ), + init_method=condition_init_method(config, init_method), + bias=bias, + return_bias=self.te_return_bias, + parallel_mode=parallel_mode, + **extra_kwargs, + ) + + for param in self.parameters(): + setattr(param, 'allreduce', not (is_expert and self.expert_parallel)) + + def forward(self, x, m_splits): + """Forward.""" + _is_first_microbatch = ( + None if self.disable_parameter_transpose_cache else self.is_first_microbatch + ) + out = super().forward(x, m_splits, is_first_microbatch=_is_first_microbatch) + self.is_first_microbatch = False + + # TE only returns a tuple when return_bias is True, otherwise + # it returns a single Tensor, we always want to return two + # values regardless of the arguments. + if self.te_return_bias: + return out + return out, None + + def _sharded_state_dict_grouped( + self, tp_axis_map, prefix='', sharded_offsets=(), metadata=None + ): + """ + prefix should be module_name to make keys identical to sequetial ones. + """ + sharded_state_dict = {} + full_state_dict = self.state_dict(prefix='', keep_vars=True) + num_global_experts = ( + parallel_state.get_expert_model_parallel_world_size() * self.num_gemms + ) + local_expert_indices_offset = ( + parallel_state.get_expert_model_parallel_rank() * self.num_gemms + ) + ep_axis = len(sharded_offsets) + for gemm_idx in range(self.num_gemms): + state_dict = { + f'{gemm_idx}.weight': full_state_dict[f'weight{gemm_idx}'], + f'{gemm_idx}._extra_state': full_state_dict['_extra_state'], + } + if self.use_bias: + state_dict[f'{gemm_idx}.bias'] = full_state_dict[f'bias{gemm_idx}'] + sub_sd = make_sharded_tensors_for_checkpoint( + state_dict, + '', + tp_axis_map, + ( + *sharded_offsets, + (ep_axis, local_expert_indices_offset + gemm_idx, num_global_experts), + ), + ) + # Remove expert layers indexing from sharded keys + replace_prefix_for_sharding(sub_sd, f'{gemm_idx}.', prefix) + sharded_state_dict.update( + { + f'{prefix}weight{gemm_idx}': sub_sd[f'{gemm_idx}.weight'], + # TODO: TE's GroupedLinear only has one _extra_state for all experts. + # We need sharding or build/merge fn to handle _extra_state correctly. 
+ f'{prefix}_extra_state{"" if gemm_idx == 0 else gemm_idx}': sub_sd[ + f'{gemm_idx}._extra_state' + ], + } + ) + if self.use_bias: + sharded_state_dict[f'{prefix}bias{gemm_idx}'] = sub_sd[f'{gemm_idx}.bias'] + # Adjust replica ids - replication along DP modulo EP + for k, sh_ten in sharded_state_dict.items(): + replica_id = sh_ten.replica_id + assert ( + len(replica_id) == 3 + ), f'Expected replica_id for {k} to be in (PP, TP, DP) format, got: {replica_id}' + sh_ten.replica_id = ( + *replica_id[:2], + parallel_state.get_data_modulo_expert_parallel_rank(), + ) + return sharded_state_dict + + class TEColumnParallelGroupedLinear(TEGroupedLinear): + """ + Wrapper for the Transformer-Engine's `GroupedLinear` layer but specialized + to column-parallel style. + """ + + def __init__( + self, + num_gemms: int, + input_size: int, + output_size: int, + *, + config: ModelParallelConfig, + init_method: Callable, + bias: bool, + skip_bias_add: bool, + is_expert: bool, + tp_comm_buffer_name: str = None, + ): + + super().__init__( + num_gemms=num_gemms, + input_size=input_size, + output_size=output_size, + parallel_mode="column", + config=config, + init_method=condition_init_method(config, init_method), + bias=bias, + skip_bias_add=skip_bias_add, + is_expert=is_expert, + tp_comm_buffer_name=tp_comm_buffer_name, + ) + + def sharded_state_dict(self, prefix='', sharded_offsets=(), metadata=None): + """ + For each gemm, sharding along axis 0, bias sharded. + Assume sharded_offsets[-1] is the expert parallel offset. + """ + tp_axis_map = {} + for gemm_idx in range(self.num_gemms): + tp_axis_map.update({f'{gemm_idx}.weight': 0, f'{gemm_idx}.bias': 0}) + return super()._sharded_state_dict_grouped( + tp_axis_map, prefix, sharded_offsets, metadata + ) + + class TERowParallelGroupedLinear(TEGroupedLinear): + """ + Wrapper for the Transformer-Engine's `GroupedLinear` layer but specialized + to row-parallel style. + """ + + def __init__( + self, + num_gemms: int, + input_size: int, + output_size: int, + *, + config: ModelParallelConfig, + init_method: Callable, + bias: bool, + skip_bias_add: bool, + is_expert: bool, + tp_comm_buffer_name: str = None, + ): + + super().__init__( + num_gemms=num_gemms, + input_size=input_size, + output_size=output_size, + parallel_mode="row", + config=config, + init_method=condition_init_method(config, init_method), + bias=bias, + skip_bias_add=skip_bias_add, + is_expert=is_expert, + tp_comm_buffer_name=tp_comm_buffer_name, + ) + + def sharded_state_dict(self, prefix='', sharded_offsets=(), metadata=None): + """ + For each gemm, sharding along axis 1, bias not sharded. + Assume sharded_offsets[-1] is the expert parallel offset. + """ + tp_axis_map = {f'{gemm_idx}.weight': 1 for gemm_idx in range(self.num_gemms)} + return super()._sharded_state_dict_grouped( + tp_axis_map, prefix, sharded_offsets, metadata + ) + +else: + + TEGroupedLinear = None + TEColumnParallelGroupedLinear = None + TERowParallelGroupedLinear = None + + +class TEDelayedScaling(te.common.recipe.DelayedScaling): + """ + Wrapper for the Transformer-Engine's `DelayedScaling` layer. 
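+
+    Usage note (illustrative): instances of this class are FP8 recipes; they are typically
+    passed to `transformer_engine.pytorch.fp8_autocast(..., fp8_recipe=...)` rather than
+    called directly.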
+ """ + + def __init__( + self, + config: ModelParallelConfig, + fp8_format: int, + override_linear_precision: tuple = (False, False, False), + ): + extra_kwargs = _get_extra_te_kwargs(config) + if is_te_min_version("1.6.0.dev0"): + extra_kwargs["fp8_dpa"] = config.fp8_dot_product_attention + extra_kwargs["fp8_mha"] = config.fp8_multi_head_attention + if get_te_version() < PkgVersion("1.8.0"): + extra_kwargs["interval"] = config.fp8_interval + elif config.fp8_interval != 1: + warnings.warn("fp8_interval is deprecated and ignored from Transformer-Engine v1.8.0.") + + super().__init__( + margin=config.fp8_margin, + fp8_format=fp8_format, + amax_compute_algo=config.fp8_amax_compute_algo, + amax_history_len=config.fp8_amax_history_len, + override_linear_precision=override_linear_precision, + **extra_kwargs, + ) + + +class TECudaRNGStatesTracker(te.pytorch.distributed.CudaRNGStatesTracker): + """Wraps TransformerEngine's CudaRNGStatesTracker so that it is + interchangeable with Megatron's RNG tracker""" + + def is_initialized(self): + """Checks if the internal RNG state has been set wirth set_states().""" + return self._is_initialized + + def reset(self): + """Reset the internal RNG state.""" + super().reset() + self._is_initialized = False + + def set_states(self, states): + """Set the internal RNG state.""" + super().set_states(states) + self._is_initialized = True + + def add(self, name, seed): + """Track the rng state.""" + super().add(name, seed) + self._is_initialized = True + + +def te_checkpoint( + forward_func, + distribute_saved_activations, + get_rng_state_tracker, + tp_group, + hidden_states, + attention_mask, + context, + context_mask, + rotary_pos_emb, +): + """Checkpointing with Transformer-Engine.""" + from transformer_engine.pytorch.distributed import checkpoint + + if is_te_min_version("1.5.0"): + return checkpoint( + forward_func, + hidden_states, + attention_mask, + context, + context_mask, + rotary_pos_emb, + distribute_saved_activations=distribute_saved_activations, + get_rng_state_tracker=get_rng_state_tracker, + tp_group=tp_group, + ) + else: + return checkpoint( + forward_func, + distribute_saved_activations, + get_rng_state_tracker, + tp_group, + hidden_states, + attention_mask, + context, + context_mask, + rotary_pos_emb, + ) + + +try: + + from transformer_engine.pytorch.attention import _SplitAlongDim + + SplitAlongDim = _SplitAlongDim.apply + +except ImportError: + + SplitAlongDim = None + +try: + + from transformer_engine.pytorch.cpu_offload import ( + get_cpu_offload_context as _get_cpu_offload_context, + ) + + def get_cpu_offload_context( + enabled, num_layers, model_layers, activation_offloading, weight_offloading + ): + """Get CPU offload context and sync function.""" + if is_te_min_version("1.10.0.dev0"): + context, sync_func = _get_cpu_offload_context( + enabled, num_layers, model_layers, activation_offloading, weight_offloading + ) + else: + context, sync_func = _get_cpu_offload_context( + enabled, num_layers, activation_offloading, weight_offloading + ) + + return context, sync_func + +except ImportError: + + get_cpu_offload_context = None diff --git a/tests/pipeline_parallel/__init__.py b/megatron/core/fusions/__init__.py similarity index 100% rename from tests/pipeline_parallel/__init__.py rename to megatron/core/fusions/__init__.py diff --git a/megatron/core/fusions/fused_bias_dropout.py b/megatron/core/fusions/fused_bias_dropout.py new file mode 100644 index 0000000000..c7fa8419a0 --- /dev/null +++ b/megatron/core/fusions/fused_bias_dropout.py @@ -0,0 
+1,73 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +from typing import Optional, Tuple + +import torch + +from megatron.core.jit import jit_fuser + + +def _bias_dropout_add_func(x_with_bias, residual, prob, training): + # type: (Tuple[Tensor, Optional[Tensor]], Tensor, float, bool) -> Tensor + # NOTE: Previously, the argument `bias` used to be passed as + # `bias.expand_as(residual)` when the `bias_dropout_func` is called from the + # transformer layer but broadcasting should automatically take care of that. + # Also, looking at broadcasting semantics, `expand_as` and broadcasting + # seem to be identical performance-wise (both just change the view). + + x, bias = x_with_bias # unpack + + # If we want to train mixed precision, then the output of this function + # should be half precision. However, in AMP O1, the input (residual) is + # in fp32, and it will up-cast the result to fp32, causing pipeline parallel + # GPU communication to hang. Therefore, we need to cast residual to the same + # dtype as x. + residual = residual if residual.dtype == x.dtype else residual.to(x.dtype) + + # The Dropout operation, Residual Addition and the tensor returning can be + # done generically outside the if statement, but that stops fusing of Bias + # Addition-Dropout-Residual Addition operation. So doing it together inside + # the conditional branch to improve performance + if bias is not None: + x = x + bias + out = torch.nn.functional.dropout(x, p=prob, training=training) + out = residual + out + return out + else: + out = torch.nn.functional.dropout(x, p=prob, training=training) + out = residual + out + return out + + +def bias_dropout_add_unfused(training): + def _bias_dropout_add(x_with_bias, residual, prob): + return _bias_dropout_add_func(x_with_bias, residual, prob, training) + + return _bias_dropout_add + + +@jit_fuser +def bias_dropout_add_fused_train( + x_with_bias: Tuple[torch.Tensor, Optional[torch.Tensor]], residual: torch.Tensor, prob: float +) -> torch.Tensor: + return _bias_dropout_add_func(x_with_bias, residual, prob, True) + + +@jit_fuser +def bias_dropout_add_fused_inference( + x_with_bias: Tuple[torch.Tensor, Optional[torch.Tensor]], residual: torch.Tensor, prob: float +) -> torch.Tensor: + return _bias_dropout_add_func(x_with_bias, residual, prob, False) + + +def get_bias_dropout_add(training, fused): + if fused: + # jit scripting for a nn.module (with dropout) is not + # triggering the fusion kernel. For now, we use two + # different nn.functional routines to account for varying + # dropout semantics during training and inference phases. + if training: + return bias_dropout_add_fused_train + else: + return bias_dropout_add_fused_inference + else: + return bias_dropout_add_unfused(training) diff --git a/megatron/core/fusions/fused_bias_geglu.py b/megatron/core/fusions/fused_bias_geglu.py new file mode 100644 index 0000000000..70ef348828 --- /dev/null +++ b/megatron/core/fusions/fused_bias_geglu.py @@ -0,0 +1,85 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
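+# Reference (unfused) formulation of the fused kernels below, useful for sanity checks:
+#   y_1, y_2 = y.chunk(2, dim=-1)
+#   geglu(y) ~ torch.nn.functional.gelu(y_1, approximate='tanh') * y_2
+# (illustrative equivalence only; the @jit_fuser versions below are what the model calls)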
+ +import torch + +from megatron.core.jit import jit_fuser + +###### BIAS GELU FUSION/ NO AUTOGRAD ################ +# 1/sqrt(2*pi)-> 0.3989423 +# 1/sqrt(2) -> 0.70710678 +# sqrt(2/pi) -> 0.79788456 +# this function is tanh approximation of gelu +# actual gelu is: +# x * 0.5 * (1.0 + torch.erf(x * 0.70710678)) + + +@jit_fuser +def geglu(y): + y_1, y_2 = torch.chunk(y, 2, -1) + return (y_1 * 0.5 * (1.0 + torch.tanh(0.79788456 * y_1 * (1 + 0.044715 * y_1 * y_1)))) * y_2 + + +@jit_fuser +def bias_geglu(bias, y): + y = y + bias + return geglu(y) + + +# gradient of tanh approximation of gelu +# gradient of actual gelu is: +# 0.5 * (1. + torch.erf(x * 0.70710678)) + 0.3989423 * x * torch.exp(-0.5 * x * x) +@jit_fuser +def geglu_back(g, y): + y_1, y_2 = torch.chunk(y, 2, -1) + tanh_out = torch.tanh(0.79788456 * y_1 * (1 + 0.044715 * y_1 * y_1)) + # sqrt(2/pi) * 3 * 0.044715 -> 0.1070322243 + ff = 0.5 * y_1 * ((1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * y_1 * y_1)) + 0.5 * ( + 1 + tanh_out + ) + return torch.cat(((g * y_2) * ff, g * (y_1 * 0.5 * (1.0 + tanh_out))), -1) + + +@jit_fuser +def bias_geglu_back(g, y, bias): + y = y + bias + return geglu_back(g, y) + + +class BiasGeGLUFunction(torch.autograd.Function): + @staticmethod + # bias is an optional argument + def forward(ctx, input, bias): + ctx.save_for_backward(input, bias) + return bias_geglu(input, bias) + + @staticmethod + def backward(ctx, grad_output): + input, bias = ctx.saved_tensors + tmp = bias_geglu_back(grad_output, input, bias) + return tmp, tmp + + +class GeGLUFunction(torch.autograd.Function): + @staticmethod + # bias is an optional argument + def forward(ctx, input): + ctx.save_for_backward(input) + return geglu(input) + + @staticmethod + def backward(ctx, grad_output): + input = ctx.saved_tensors + tmp = geglu_back(grad_output, input[0]) + return tmp + + +def bias_geglu_impl(input, bias): + ori_shape = input.shape + assert len(ori_shape) in [2, 3] + input = input.view(-1, ori_shape[-1]) + if bias is not None: + output = BiasGeGLUFunction.apply(input, bias) + else: + output = GeGLUFunction.apply(input) + + return output if len(ori_shape) == 2 else output.view(ori_shape[0], ori_shape[1], -1) diff --git a/megatron/core/fusions/fused_bias_gelu.py b/megatron/core/fusions/fused_bias_gelu.py new file mode 100644 index 0000000000..8cc90f6174 --- /dev/null +++ b/megatron/core/fusions/fused_bias_gelu.py @@ -0,0 +1,55 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +import torch + +from megatron.core.jit import jit_fuser + +# BIAS GELU FUSION/ NO AUTOGRAD ################ +# 1/sqrt(2*pi)-> 0.3989423 +# 1/sqrt(2) -> 0.70710678 +# sqrt(2/pi) -> 0.79788456 +# this function is tanh approximation of gelu +# actual gelu is: +# x * 0.5 * (1.0 + torch.erf(x * 0.70710678)) + + +@jit_fuser +def bias_gelu(bias, y): + x = bias + y + return x * 0.5 * (1.0 + torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x))) + + +# gradient of tanh approximation of gelu +# gradient of actual gelu is: +# 0.5 * (1. 
+ torch.erf(x * 0.70710678)) + 0.3989423 * x * torch.exp(-0.5 * x * x) +@jit_fuser +def bias_gelu_back(g, bias, y): + x = bias + y + tanh_out = torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x)) + # sqrt(2/pi) * 3 * 0.044715 -> 0.1070322243 + ff = 0.5 * x * ((1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)) + 0.5 * ( + 1 + tanh_out + ) + return ff * g + + +class GeLUFunction(torch.autograd.Function): + @staticmethod + # bias is an optional argument + def forward(ctx, input, bias): + ctx.save_for_backward(input, bias) + return bias_gelu(bias, input) + + @staticmethod + def backward(ctx, grad_output): + input, bias = ctx.saved_tensors + tmp = bias_gelu_back(grad_output, bias, input) + return tmp, tmp + + # This is required to make Sphinx happy :-( + @classmethod + def apply(cls, *args, **kwargs): + return super().apply(*args, **kwargs) + + +bias_gelu_impl = GeLUFunction.apply diff --git a/megatron/core/fusions/fused_bias_swiglu.py b/megatron/core/fusions/fused_bias_swiglu.py new file mode 100644 index 0000000000..fd3ac3ec6f --- /dev/null +++ b/megatron/core/fusions/fused_bias_swiglu.py @@ -0,0 +1,89 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +import torch +import torch.nn.functional as F + +from megatron.core.jit import jit_fuser + +###### BIAS SWIGLU FUSION/ NO AUTOGRAD ################ + + +@jit_fuser +def swiglu(y): + y_1, y_2 = torch.chunk(y, 2, -1) + return F.silu(y_1) * y_2 + + +@jit_fuser +def bias_swiglu(y, bias): + y = y + bias + return swiglu(y) + + +# gradient of tanh approximation of gelu +# gradient of actual gelu is: +# 0.5 * (1. + torch.erf(x * 0.70710678)) + 0.3989423 * x * torch.exp(-0.5 * x * x) +@jit_fuser +def swiglu_back(g, y): + y_1, y_2 = torch.chunk(y, 2, -1) + return torch.cat( + (g * torch.sigmoid(y_1) * (1 + y_1 * (1 - torch.sigmoid(y_1))) * y_2, g * F.silu(y_1)), -1 + ) + + +@jit_fuser +def bias_swiglu_back(g, y, bias): + y = y + bias + return swiglu_back(g, y) + + +class BiasSwiGLUFunction(torch.autograd.Function): + @staticmethod + # bias is an optional argument + def forward(ctx, input, bias, fp8_input_store): + input_for_backward = input.to(torch.float8_e4m3fn) if fp8_input_store else input + ctx.save_for_backward(input_for_backward, bias) + ctx.ori_input_dtype = input.dtype + ctx.fp8_input_store = fp8_input_store + return bias_swiglu(input, bias) + + @staticmethod + def backward(ctx, grad_output): + input, bias = ctx.saved_tensors + input = input.to(ctx.ori_input_dtype) if ctx.fp8_input_store else input + tmp = bias_swiglu_back(grad_output, input, bias) + return tmp, tmp, None + + +class SwiGLUFunction(torch.autograd.Function): + @staticmethod + # bias is an optional argument + def forward(ctx, input, fp8_input_store): + input_for_backward = input.to(torch.float8_e4m3fn) if fp8_input_store else input + ctx.save_for_backward(input_for_backward) + ctx.ori_input_dtype = input.dtype + ctx.fp8_input_store = fp8_input_store + return swiglu(input) + + @staticmethod + def backward(ctx, grad_output): + input = ctx.saved_tensors[0] + input = input.to(ctx.ori_input_dtype) if ctx.fp8_input_store else input + tmp = swiglu_back(grad_output, input) + return tmp, None + + +def bias_swiglu_impl(input, bias, fp8_input_store=False): + ori_shape = input.shape + assert len(ori_shape) in [2, 3] + input = input.view(-1, ori_shape[-1]) + if bias is not None: + output = BiasSwiGLUFunction.apply(input, bias, fp8_input_store) + else: + output = SwiGLUFunction.apply(input, fp8_input_store) + + return output if len(ori_shape) == 2 else 
output.view(ori_shape[0], ori_shape[1], -1) + + +# bias_swiglu_impl = BiasSwiGLUFunction.apply +# swiglu_impl = SwiGLUFunction.apply diff --git a/megatron/core/fusions/fused_cross_entropy.py b/megatron/core/fusions/fused_cross_entropy.py new file mode 100644 index 0000000000..909cc403cf --- /dev/null +++ b/megatron/core/fusions/fused_cross_entropy.py @@ -0,0 +1,143 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +from typing import Tuple + +import torch + +from megatron.core.jit import jit_fuser +from megatron.core.parallel_state import ( + get_tensor_model_parallel_group, + get_tensor_model_parallel_rank, + get_tensor_model_parallel_world_size, +) +from megatron.core.tensor_parallel.cross_entropy import VocabParallelCrossEntropy +from megatron.core.tensor_parallel.utils import VocabUtility + + +@jit_fuser +def calculate_logits_max(vocab_parallel_logits: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + + vocab_parallel_logits, logits_max = VocabParallelCrossEntropy.calculate_logits_max( + vocab_parallel_logits + ) + + return vocab_parallel_logits, logits_max + + +@jit_fuser +def calculate_predicted_logits( + vocab_parallel_logits: torch.Tensor, + target: torch.Tensor, + logits_max: torch.Tensor, + vocab_start_index: int, + vocab_end_index: int, +) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + + (target_mask, masked_target_1d, predicted_logits, sum_exp_logits, exp_logits) = ( + VocabParallelCrossEntropy.calculate_predicted_logits( + vocab_parallel_logits, target, logits_max, vocab_start_index, vocab_end_index + ) + ) + + predicted_logits_sum_exp_logits = torch.cat((predicted_logits, sum_exp_logits)) + + return target_mask, masked_target_1d, predicted_logits_sum_exp_logits, exp_logits + + +@jit_fuser +def calculate_cross_entropy_loss( + exp_logits: torch.Tensor, predicted_logits_sum_exp_logits: torch.Tensor +) -> Tuple[torch.Tensor, torch.Tensor]: + + split_val = predicted_logits_sum_exp_logits.size()[0] // 2 + predicted_logits, sum_exp_logits = torch.split(predicted_logits_sum_exp_logits, split_val) + + exp_logits, loss = VocabParallelCrossEntropy.calculate_cross_entropy_loss( + exp_logits, predicted_logits, sum_exp_logits + ) + + return exp_logits, loss + + +@jit_fuser +def calculate_gradients( + softmax: torch.Tensor, + grad_output: torch.Tensor, + target_mask: torch.Tensor, + masked_target_1d: torch.Tensor, +) -> torch.Tensor: + + (grad_2d, arange_1d, softmax_update, grad_input) = ( + VocabParallelCrossEntropy.prepare_gradient_calculation_operands(softmax, target_mask) + ) + + grad_input = VocabParallelCrossEntropy.calculate_gradients( + grad_2d, arange_1d, masked_target_1d, softmax_update, grad_input, grad_output + ) + + grad_input = grad_input.to(torch.bfloat16) + + return grad_input + + +class _VocabParallelCrossEntropy(torch.autograd.Function): + @staticmethod + def forward(ctx, vocab_parallel_logits, target): + + vocab_parallel_logits, logits_max = calculate_logits_max(vocab_parallel_logits) + torch.distributed.all_reduce( + logits_max, op=torch.distributed.ReduceOp.MAX, group=get_tensor_model_parallel_group() + ) + + # Get the partition's vocab indices + get_vocab_range = VocabUtility.vocab_range_from_per_partition_vocab_size + partition_vocab_size = vocab_parallel_logits.size()[-1] + rank = get_tensor_model_parallel_rank() + world_size = get_tensor_model_parallel_world_size() + vocab_start_index, vocab_end_index = get_vocab_range(partition_vocab_size, rank, world_size) + + (target_mask, masked_target_1d, 
predicted_logits_sum_exp_logits, exp_logits) = ( + calculate_predicted_logits( + vocab_parallel_logits, target, logits_max, vocab_start_index, vocab_end_index + ) + ) + + # All reduce is needed to get the chunks from other GPUs. + # In the fused case, tensors are batches to invoke a single + # AllReduce call + torch.distributed.all_reduce( + predicted_logits_sum_exp_logits, + op=torch.distributed.ReduceOp.SUM, + group=get_tensor_model_parallel_group(), + ) + + exp_logits, loss = calculate_cross_entropy_loss(exp_logits, predicted_logits_sum_exp_logits) + + # Store softmax, target-mask and masked-target for backward pass. + ctx.save_for_backward(exp_logits, target_mask, masked_target_1d) + + return loss + + @staticmethod + def backward(ctx, grad_output): + + # Retreive tensors from the forward path. + softmax, target_mask, masked_target_1d = ctx.saved_tensors + + grad_input = calculate_gradients(softmax, grad_output, target_mask, masked_target_1d) + + return grad_input, None + + +def fused_vocab_parallel_cross_entropy(vocab_parallel_logits, target): + """ + Performs cross entropy loss when logits are split across tensor parallel ranks + + Args: + vocab_parallel_logits: logits split across tensor parallel ranks + dimension is [sequence_length, batch_size, hidden_size] + + target: correct vocab ids of dimseion [sequence_length, micro_batch_size] + + """ + return _VocabParallelCrossEntropy.apply(vocab_parallel_logits, target) diff --git a/megatron/core/fusions/fused_layer_norm.py b/megatron/core/fusions/fused_layer_norm.py new file mode 100644 index 0000000000..d02ae7aa4d --- /dev/null +++ b/megatron/core/fusions/fused_layer_norm.py @@ -0,0 +1,169 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +import importlib +import inspect +import numbers + +import torch +from torch import Tensor +from torch.nn import init +from torch.nn.parameter import Parameter + +from megatron.core.transformer import TransformerConfig +from megatron.core.utils import make_viewless_tensor + +try: + from apex.contrib.layer_norm.layer_norm import FastLayerNormFN + + HAVE_PERSIST_LAYER_NORM = True +except ImportError: + HAVE_PERSIST_LAYER_NORM = False + +try: + from apex.normalization.fused_layer_norm import FusedLayerNormAffineFunction + + HAVE_FUSED_LAYER_NORM = True +except ImportError: + HAVE_FUSED_LAYER_NORM = False + + +class FusedLayerNorm(torch.nn.Module): + """Layer Norm, fused into a single CUDA kernel. + + Args: + hidden_size (int): Transformer hidden dimension. + + eps (float): Epsilon added to denominator, for numerical stability. + + persist_layer_norm (bool): Use persistent fused layer norm kernel. + This kernel supports only a set of hidden sizes. Please + check persist_ln_hidden_sizes if your hidden size is supported. + + zero_centered_gamma (bool): Adjust LayerNorm weights such that they are + centered around zero. This improves numerical stability. + + config (TransformerConfig): Transformer config. Include to match custom + layer norm interfaces. + + normalization (str): Normalization type, used for Transformer Engine. + Must equal 'LayerNorm' here. 
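+
+    Example (illustrative, assuming an already-populated TransformerConfig)::
+
+        norm = FusedLayerNorm(config=transformer_config, hidden_size=1024)
+        output = norm(hidden_states)  # hidden_states: [sq, b, 1024]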
+ """ + + def __init__( + self, + config: TransformerConfig, + hidden_size: int, + eps: float = 1e-5, + persist_layer_norm: bool = True, + zero_centered_gamma: bool = False, + normalization: str = "LayerNorm", # included to match TE interface + ): + super().__init__() + + self.config = config + + self.zero_centered_gamma = self.config.layernorm_zero_centered_gamma + assert ( + self.config.normalization == "LayerNorm" + ), f'({self.config.normalization}) is not supported in FusedLayerNorm' + + # List of hiddens sizes supported in the persistent layer norm kernel + # If the hidden size is not supported, fall back to the non-persistent + # kernel. + persist_ln_hidden_sizes = [ + 1024, + 1536, + 2048, + 2304, + 3072, + 3840, + 4096, + 5120, + 6144, + 8192, + 10240, + 12288, + 12800, + 15360, + 16384, + 18432, + 20480, + 24576, + 25600, + 30720, + 32768, + 40960, + 49152, + 65536, + ] + persist_layer_norm = self.config.persist_layer_norm + if hidden_size not in persist_ln_hidden_sizes or not HAVE_PERSIST_LAYER_NORM: + persist_layer_norm = False + + if not persist_layer_norm and not HAVE_FUSED_LAYER_NORM: + # TODO: Add pytorch only layer norm + raise ValueError(f'Apex must be installed to use FusedLayerNorm.') + + if isinstance(hidden_size, numbers.Integral): + hidden_size = (hidden_size,) + self.hidden_size = torch.Size(hidden_size) + self.eps = eps + # Parameters need to be initialized with torch.empty rather than torch.Tensor for correct device placement with nemo2. + self.weight = Parameter(torch.empty(*hidden_size)) + self.bias = Parameter(torch.empty(*hidden_size)) + self.reset_parameters() + self.persist_layer_norm = persist_layer_norm + self.sequence_parallel = self.config.sequence_parallel + + # set sequence parallelism flag on weight and bias parameters + setattr(self.weight, 'sequence_parallel', self.sequence_parallel) + setattr(self.bias, 'sequence_parallel', self.sequence_parallel) + + def reset_parameters(self): + + if self.zero_centered_gamma: + init.zeros_(self.weight) + init.zeros_(self.bias) + else: + init.ones_(self.weight) + init.zeros_(self.bias) + + def forward(self, input: Tensor) -> Tensor: + + weight = self.weight + 1 if self.zero_centered_gamma else self.weight + + if self.persist_layer_norm: + if 'memory_efficient' in inspect.getfullargspec(FastLayerNormFN.forward).args: + output = FastLayerNormFN.apply( + input, weight, self.bias, self.eps, self.config.memory_efficient_layer_norm + ) + else: + output = FastLayerNormFN.apply(input, weight, self.bias, self.eps) + + # Apex's fast layer norm function outputs a 'view' tensor (i.e., has + # a populated '_base' field). This will result in schedule.py's + # deallocate_output_tensor() throwing an error, so a viewless tensor is + # created to prevent this. 
+ output = make_viewless_tensor( + inp=output, requires_grad=input.requires_grad, keep_graph=True + ) + + else: + if ( + 'memory_efficient' + in inspect.getfullargspec(FusedLayerNormAffineFunction.forward).args + ): + return FusedLayerNormAffineFunction.apply( + input, + weight, + self.bias, + self.hidden_size, + self.eps, + self.config.memory_efficient_layer_norm, + ) + else: + return FusedLayerNormAffineFunction.apply( + input, weight, self.bias, self.hidden_size, self.eps + ) + + return output diff --git a/megatron/model/fused_softmax.py b/megatron/core/fusions/fused_softmax.py similarity index 81% rename from megatron/model/fused_softmax.py rename to megatron/core/fusions/fused_softmax.py index ed29262acd..c7bfbb768b 100644 --- a/megatron/model/fused_softmax.py +++ b/megatron/core/fusions/fused_softmax.py @@ -1,9 +1,12 @@ # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +from typing import Optional import torch import torch.nn as nn -from megatron.model.enums import AttnMaskType + +from megatron.core.transformer.enums import AttnMaskType +from megatron.core.transformer.utils import get_default_causal_mask class ScaledUpperTriangMaskedSoftmax(torch.autograd.Function): @@ -19,9 +22,7 @@ def forward(ctx, inputs, scale): import scaled_upper_triang_masked_softmax_cuda scale_t = torch.tensor([scale]) - softmax_results = scaled_upper_triang_masked_softmax_cuda.forward( - inputs, scale_t[0] - ) + softmax_results = scaled_upper_triang_masked_softmax_cuda.forward(inputs, scale_t[0]) ctx.save_for_backward(softmax_results, scale_t) return softmax_results @@ -62,9 +63,7 @@ def backward(ctx, output_grads): softmax_results, scale_t = ctx.saved_tensors - input_grads = scaled_masked_softmax_cuda.backward( - output_grads, softmax_results, scale_t[0] - ) + input_grads = scaled_masked_softmax_cuda.backward(output_grads, softmax_results, scale_t[0]) return input_grads, None, None @@ -81,9 +80,7 @@ def forward(ctx, inputs, scale): scale_t = torch.tensor([scale]) - softmax_results = scaled_softmax_cuda.forward( - inputs, scale_t[0] - ) + softmax_results = scaled_softmax_cuda.forward(inputs, scale_t[0]) ctx.save_for_backward(softmax_results, scale_t) return softmax_results @@ -93,9 +90,7 @@ def backward(ctx, output_grads): softmax_results, scale_t = ctx.saved_tensors - input_grads = scaled_softmax_cuda.backward( - output_grads, softmax_results, scale_t[0] - ) + input_grads = scaled_softmax_cuda.backward(output_grads, softmax_results, scale_t[0]) return input_grads, None, None @@ -103,7 +98,7 @@ class FusedScaleMaskSoftmax(nn.Module): """ fused operation: scaling + mask + softmax - Arguments: + Args: input_in_fp16: flag to indicate if input in fp16 data format. input_in_bf16: flag to indicate if input in bf16 data format. attn_mask_type: attention mask type (pad or causal) @@ -136,11 +131,14 @@ def __init__( self.softmax_in_fp32 = softmax_in_fp32 self.scale = scale - assert ( - self.scale is None or softmax_in_fp32 - ), "softmax should be in fp32 when scaled" + assert self.scale is None or softmax_in_fp32, "softmax should be in fp32 when scaled" + + def forward(self, input: torch.Tensor, mask: Optional[torch.Tensor]): + """Forward pass of softmax with masked input. - def forward(self, input, mask): + In case attn_mask_type is causal the mask is generated and None can be passed. + A user-defined mask is only needed when attn_mask_type is not causal. 
+ """ # [b, np, sq, sk] assert input.dim() == 4 @@ -157,7 +155,7 @@ def is_kernel_available(self, mask, b, np, sq, sk): and self.input_in_float16 # input must be fp16 and 16 < sk <= 4096 # sk must be 16 ~ 2048 and sq % 4 == 0 # sq must be divisor of 4 - and sk % 4 == 0 # sk must be divisor of 4 + and sk % 4 == 0 # sk must be divisor of 4 and attn_batches % 4 == 0 # np * b must be divisor of 4 ): if 0 <= sk <= 4096: @@ -195,6 +193,15 @@ def forward_torch_softmax(self, input, mask): if self.scale is not None: input = input * self.scale + + # Generate causal mask if not given + sq, sk = input.size(2), input.size(3) + if self.attn_mask_type == AttnMaskType.causal and mask is None and sq > 1: + # If sq == 1 then either KV cache is used or one-element context is passed + # so keeping mask=None in this case; subsequent code should handle it + assert sq == sk, "causal mask is only for self attention" + mask = get_default_causal_mask(sq) + mask_output = self.mask_func(input, mask) if mask is not None else input probs = torch.nn.Softmax(dim=-1)(mask_output) diff --git a/megatron/core/inference/__init__.py b/megatron/core/inference/__init__.py new file mode 100644 index 0000000000..f8011007a5 --- /dev/null +++ b/megatron/core/inference/__init__.py @@ -0,0 +1 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. diff --git a/megatron/core/inference/ammo_support/__init__.py b/megatron/core/inference/ammo_support/__init__.py new file mode 100644 index 0000000000..12be50cefe --- /dev/null +++ b/megatron/core/inference/ammo_support/__init__.py @@ -0,0 +1,8 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +import warnings + +warnings.warn( + "The 'megatron.core.inference.ammo_support' module is deprecated and will be removed in a future release. " + "Please use megatron.core.inference.modelopt_support instead", + DeprecationWarning, +) diff --git a/megatron/core/inference/ammo_support/gpt/model_specs.py b/megatron/core/inference/ammo_support/gpt/model_specs.py new file mode 100644 index 0000000000..ba3bd9fa0f --- /dev/null +++ b/megatron/core/inference/ammo_support/gpt/model_specs.py @@ -0,0 +1,2 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +from megatron.core.inference.modelopt_support.gpt.model_specs import get_gpt_layer_modelopt_spec diff --git a/megatron/core/inference/ammo_support/gpt/state_dict_hooks.py b/megatron/core/inference/ammo_support/gpt/state_dict_hooks.py new file mode 100644 index 0000000000..8532366222 --- /dev/null +++ b/megatron/core/inference/ammo_support/gpt/state_dict_hooks.py @@ -0,0 +1,5 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +from megatron.core.inference.modelopt_support.gpt.state_dict_hooks import ( + mcore_gpt_load_legacy_state_dict_pre_hook, + mcore_gpt_load_te_state_dict_pre_hook, +) diff --git a/megatron/core/inference/common_inference_params.py b/megatron/core/inference/common_inference_params.py new file mode 100644 index 0000000000..22353088f8 --- /dev/null +++ b/megatron/core/inference/common_inference_params.py @@ -0,0 +1,29 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
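+# Illustrative settings for the dataclass defined below (not executed here):
+#   greedy decoding -> CommonInferenceParams(top_k=1, num_tokens_to_generate=64)
+#   top-p sampling  -> CommonInferenceParams(temperature=0.8, top_p=0.9, return_log_probs=True)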
+from dataclasses import dataclass + + +@dataclass +class CommonInferenceParams: + """Inference parameters sent along with the prompts + + For an explanation of these parameters refer to this blog https://ivibudh.medium.com/a-guide-to-controlling-llm-model-output-exploring-top-k-top-p-and-temperature-parameters-ed6a31313910 + """ + + temperature: float = 1.0 + top_k: int = 0 + top_p: float = 0.0 + return_log_probs: bool = False + num_tokens_to_generate: int = 30 + + def add_attributes(self, attribute_value_pair: dict): + """Utility to add more attributes to inference params + + Use this method to pass in a custom dictonary to add more inference parameter attributes to the instance you created. Use as follows + c = CommonInferenceParams + c.add_attributes({'min_length':4, 'eod_id':153}) + + Args: + attribute_value_pair (dict): A dictionary containing attributes as the key names and their values as the values. + """ + for key, value in attribute_value_pair.items(): + setattr(self, key, value) diff --git a/megatron/core/inference/communication_utils.py b/megatron/core/inference/communication_utils.py new file mode 100644 index 0000000000..0c23a583de --- /dev/null +++ b/megatron/core/inference/communication_utils.py @@ -0,0 +1,50 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +import torch + +from megatron.core import parallel_state + + +def _is_cuda(tensor): + """Check if a tensor is not none and is cuda.""" + assert tensor is not None + assert tensor.is_cuda + + +def broadcast_from_last_pipeline_stage(size, dtype, tensor=None): + """Broadcast a tensor from last pipeline stage to all ranks.""" + + if parallel_state.is_pipeline_last_stage(): + _is_cuda(tensor) + assert tensor.is_contiguous() + else: + tensor = torch.empty(size, dtype=dtype, device=torch.cuda.current_device()) + # Get the group and corresponding source rank. + src = parallel_state.get_pipeline_model_parallel_last_rank() + group = parallel_state.get_pipeline_model_parallel_group() + torch.distributed.broadcast(tensor, src, group) + return tensor + + +def recv_from_prev_pipeline_rank_(recv_buffer=None): + """Receive from previous pipeline stage and update the + input buffer inplace.""" + recv_prev_op = torch.distributed.P2POp( + torch.distributed.irecv, recv_buffer, parallel_state.get_pipeline_model_parallel_prev_rank() + ) + reqs = torch.distributed.batch_isend_irecv([recv_prev_op]) + for req in reqs: + req.wait() + # To protect against race condition when using batch_isend_irecv(). + torch.cuda.synchronize() + + +def send_to_next_pipeline_rank(tensor=None): + """Send output to the next pipeline stage.""" + send_next_op = torch.distributed.P2POp( + torch.distributed.isend, tensor, parallel_state.get_pipeline_model_parallel_next_rank() + ) + reqs = torch.distributed.batch_isend_irecv([send_next_op]) + for req in reqs: + req.wait() + # To protect against race condition when using batch_isend_irecv(). + torch.cuda.synchronize() diff --git a/megatron/core/inference/engines/__init__.py b/megatron/core/inference/engines/__init__.py new file mode 100644 index 0000000000..f8011007a5 --- /dev/null +++ b/megatron/core/inference/engines/__init__.py @@ -0,0 +1 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. diff --git a/megatron/core/inference/engines/abstract_engine.py b/megatron/core/inference/engines/abstract_engine.py new file mode 100644 index 0000000000..6893f6a905 --- /dev/null +++ b/megatron/core/inference/engines/abstract_engine.py @@ -0,0 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
All rights reserved. +from abc import ABC, abstractmethod +from typing import List + + +class AbstractEngine(ABC): + @staticmethod + @abstractmethod + def generate(self) -> dict: + """The abstract backend's generate function. + + To define a new backend, implement this and return the outputs as a dictionary. + + Returns: + dict: The output dictionary containing keys for `input_prompt`, `generated_text`, `generated_tokens`. + """ + pass diff --git a/megatron/core/inference/engines/mcore_engine.py b/megatron/core/inference/engines/mcore_engine.py new file mode 100644 index 0000000000..fe8160228b --- /dev/null +++ b/megatron/core/inference/engines/mcore_engine.py @@ -0,0 +1,113 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +from typing import Dict, List + +import torch + +from megatron.core.inference.common_inference_params import CommonInferenceParams +from megatron.core.inference.engines.abstract_engine import AbstractEngine +from megatron.core.inference.inference_request import InferenceRequest +from megatron.core.inference.scheduler import Scheduler +from megatron.core.inference.text_generation_controllers.simple_text_generation_controller import ( + SimpleTextGenerationController, +) + + +class MCoreEngine(AbstractEngine): + """The Megatron core backend constructor + + This is the backend that does a simple forward pass on the model. + Supports any model that is callable (Accepts the inputs and outputs the tensor) + + Args: + text_generation_controller (SimpleTextGenerationController): A text generation + controller that will be used to define how to preprocess prompts, generate + outputs and detokenizer the output tokens. + max_batch_size : The maxinum number of requests to process at once + random_seed (int, optional): Use a random seed if you want deterministic + results. Defaults to None. + """ + + def __init__( + self, + text_generation_controller: SimpleTextGenerationController, + max_batch_size, + random_seed: int = None, + ): + self.text_generation_controller = text_generation_controller + self.random_seed = random_seed + self.scheduler = Scheduler(max_batch_size=max_batch_size) + + def generate( + self, + prompts: List[str], + add_BOS: bool = False, + encoder_prompts: List[str] = None, + common_inference_params: CommonInferenceParams = None, + ) -> dict: + """The megatron core inference backend generate function + + This backend returns the output generations as a dictionary. 
+ It returns the prompt tokens along with the generated tokens, the prompt + plus the generated string and the output log probabilities if requested + + Args: + prompts (List[str]): All the prompts as a list of strings + add_BOS (bool): Whether to add BOS token to beginning of prompts + encoder_prompts (List[dict]): All the encoder prompts as a list of strings + common_inference_params (CommonInferenceParams): The inference parameters + + Returns: + List[InferenceRequest]: The output is list of inference requests containing the + generated tokens, texts and log probs if required + """ + # TODO :M core- get rng state tracker + if self.random_seed: + torch.random.manual_seed(self.random_seed) + + for i in range(len(prompts)): + prompt = prompts[i] + encoder_prompt = encoder_prompts[i] if encoder_prompts is not None else None + prompt_tokens = self.text_generation_controller.tokenize_prompt(prompt, add_BOS) + + self.scheduler.add_request( + prompt=prompt, + prompt_tokens=prompt_tokens, + encoder_prompt=encoder_prompt, + inference_parameters=common_inference_params, + ) + + self.run_engine() + + result: List[InferenceRequest] = self.scheduler.completed_request_pool.values() + return result + + def run_engine(self): + """Main functionality to run inference + + Runs the engine until there are no requests in the queue. + + Args: + dynamic_generation (bool, optional): Set this to True, if you want + to enable dynamic batching. Mainly used with an inference server. + Defaults to False. + """ + while self.scheduler.have_requests_pending(): + active_requests: Dict[int, InferenceRequest] = self.scheduler.active_request_pool.copy() + result_dict: Dict[int, InferenceRequest] = ( + self.text_generation_controller.generate_all_output_tokens_static_batch( + active_requests + ) + ) + + self.scheduler.update_requests_pools(result_dict=result_dict) + + # TODO: Later for dynamic batching we will do something like this + """ + if dynamic_batching: + result_dict: Dict[ + int, InferenceRequest + ] = self.text_generation_controller.generate_output_tokens_one_step_dynamic_batch( + active_requests + ) + self.scheduler.update_requests_pools(result_dict=result_dict) + """ diff --git a/megatron/core/inference/inference_request.py b/megatron/core/inference/inference_request.py new file mode 100644 index 0000000000..4825dfd366 --- /dev/null +++ b/megatron/core/inference/inference_request.py @@ -0,0 +1,39 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
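+# A request moves through the Status values defined below, roughly:
+#   WAITING_IN_QUEUE -> ACTIVE_AND_GENERATING_TOKENS -> COMPLETED
+# (illustrative summary; the scheduler owns the actual transitions between its request pools)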
+from dataclasses import dataclass +from enum import Enum +from typing import List + +import torch + +from megatron.core.inference.common_inference_params import CommonInferenceParams + + +# class syntax +class Status(Enum): + """Enum for status""" + + WAITING_IN_QUEUE = 1 + ACTIVE_AND_GENERATING_TOKENS = 2 + ACTIVE_BUT_NOT_GENERATING_TOKENS = 3 + COMPLETED = 4 + + +@dataclass +class InferenceRequest: + """Class for one inference request + + Containing relevant data for an inference request + + """ + + request_id: str + prompt: str + inference_parameters: CommonInferenceParams + prompt_tokens: List[int] + arrival_time: float + status: Status + encoder_prompt: str = None + generated_text: str = None + generated_tokens: torch.Tensor = None + generated_log_probs: torch.Tensor = None + generated_length: int = 0 diff --git a/megatron/core/inference/model_inference_wrappers/__init__.py b/megatron/core/inference/model_inference_wrappers/__init__.py new file mode 100644 index 0000000000..f8011007a5 --- /dev/null +++ b/megatron/core/inference/model_inference_wrappers/__init__.py @@ -0,0 +1 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. diff --git a/megatron/core/inference/model_inference_wrappers/abstract_model_inference_wrapper.py b/megatron/core/inference/model_inference_wrappers/abstract_model_inference_wrapper.py new file mode 100644 index 0000000000..b7f58efcfe --- /dev/null +++ b/megatron/core/inference/model_inference_wrappers/abstract_model_inference_wrapper.py @@ -0,0 +1,234 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +import abc +import math +from argparse import Namespace +from typing import Iterable, List, Union + +import torch + +from megatron.core import parallel_state, tensor_parallel +from megatron.core.inference.common_inference_params import CommonInferenceParams +from megatron.core.inference.communication_utils import ( + recv_from_prev_pipeline_rank_, + send_to_next_pipeline_rank, +) +from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import ( + InferenceWrapperConfig, +) +from megatron.core.inference_params import InferenceParams +from megatron.core.models.gpt.gpt_model import GPTModel + + +class AbstractModelInferenceWrapper(abc.ABC): + def __init__( + self, + model: Union['LegacyGPTModel', GPTModel], + inference_wrapper_config: InferenceWrapperConfig, + ): + """Constructor for the model inference wrapper + + The wrapper prepares the model for inference, provides the required input data and runs the forward pass. + + Args: + model (Union[GPTModel, LegacyGPTModel]): The actual GPT model (MCore or MLM) + args (Namespace): The commadline arguments that were passed + """ + assert not isinstance( + model, Iterable + ), 'interleaving schedule is not supported for inference' + self.model = model + self.inference_wrapper_config = inference_wrapper_config + self.pipeline_communication_dtype = ( + torch.float + if self.inference_wrapper_config.fp32_residual_connection + else self.inference_wrapper_config.params_dtype + ) + + def prep_model_for_inference(self, prompts_tokens: torch.Tensor): + """A utility function for preparing model for inference + + The function gets called once before the auto regressive inference loop. It puts the model in eval mode , and gets some model and inference data parameters. Extend this to build position ids ,attention mask etc, so that required slices can be extracted during the forward pass. 
+ + Args: + prompts_tokens (torch.Tensor): A tensor of shape [batch_size, max_seq_len] + + """ + self.model.eval() + + # For TP only model both is_pp_first_stage and _is_pp_last_stage returns True + self.model_is_pipeline_parallel = not ( + parallel_state.is_pipeline_first_stage() and parallel_state.is_pipeline_last_stage() + ) + self.prompts_tokens = prompts_tokens + batch_size, max_sequence_length = self.prompts_tokens.shape + self.inference_params = InferenceParams(batch_size, max_sequence_length) + + @abc.abstractmethod + def get_batch_for_context_window(self) -> List: + """Returns the input data for inference + + This function gets called iteratively in the inference loop . It can be used to extract relevant input from the prompt tokens, attention mask etc. required for each step in inference. + + """ + pass + + def forward_pass_without_pipeline_parallel(self, inference_input: List) -> torch.Tensor: + """Utility to carry out simple forward pass for TP or no model parallel models + + Runs a very simple forward pass for model. Used in the case of models without any parallelism or only tensor parallelism. + + Args: + inference_input (List): A list containg the inputs for the gpt model [tokens, position ids, attention mask] + + Returns: + torch.Tensor: The output logits of shape [batch_size, seq_len, padded_vocab_size] + """ + tokens, position_ids, attention_mask = inference_input + logits = self.model( + tokens, position_ids, attention_mask, inference_params=self.inference_params + ) + logits = tensor_parallel.gather_from_tensor_model_parallel_region(logits) + self.inference_params.sequence_len_offset += tokens.size(1) + + return logits + + def _allocate_recv_buffer(self, batch_size, seq_len): + """Receive happens between the layers with size [seq_len, batch_size, hidden_size].""" + recv_size = (seq_len, batch_size, self.inference_wrapper_config.hidden_size) + return torch.empty( + recv_size, dtype=self.pipeline_communication_dtype, device=torch.cuda.current_device() + ) + + def forward_pass_with_pipeline_parallel_small_input_batch( + self, inference_input: List + ) -> torch.Tensor: + """Utility to carry out forward pass for PP models with very small inputs + + If a model is pipeline parallel, yet, the input global batch is very small, we compute a foward pass on the entire global batch, rather than splitting it up into micro batches and doing something more complex as in the forward_pass_with_pipeline_parallel_large_input_batch method + + Args: + inference_input (List): A list containg the inputs for the gpt model [tokens, position ids, attention mask] + + Returns: + torch.Tensor: The output logits of shape [batch_size, seq_len, padded_vocab_size] + """ + tokens, position_ids, attention_mask = inference_input + batch_size, seq_len = tokens.shape + recv_buffer = None + if not parallel_state.is_pipeline_first_stage(): + recv_buffer = self._allocate_recv_buffer(batch_size, seq_len) + recv_from_prev_pipeline_rank_(recv_buffer) + + self.model.set_input_tensor(recv_buffer) + output_tensor = self.model( + tokens, position_ids, attention_mask, inference_params=self.inference_params + ) + + if not parallel_state.is_pipeline_last_stage(): + send_to_next_pipeline_rank(output_tensor.type(dtype=self.pipeline_communication_dtype)) + + self.inference_params.sequence_len_offset += seq_len + + logits = None + if parallel_state.is_pipeline_last_stage(): + logits = output_tensor + logits = tensor_parallel.gather_from_tensor_model_parallel_region(logits) + + return logits + + def 
forward_pass_with_pipeline_parallel_large_input_batch( + self, inference_input: List + ) -> torch.Tensor: + """Utility to carry out forward pass PP models. + + Runs the forward pass for models which are pipeline parallel. This is more complex than forward_pass_with_pipeline_parallel_small_input_batch coz this splits the global batch into small micro batches and runs them through the model. + + Args: + inference_input (List): A list containg the inputs for the gpt model [tokens, position ids, attention mask] + + Returns: + torch.Tensor: The output logits of shape [batch_size, seq_len, padded_vocab_size] + """ + tokens, position_ids, attention_mask = inference_input + micro_batch_size = max( + 1, + self.inference_wrapper_config.inference_batch_times_seqlen_threshold // tokens.size(1), + ) + batch_size, seq_len = tokens.shape + # Round up to account for the last partial micro batch if present + num_micro_batches = math.ceil(batch_size / micro_batch_size) + + logits = None + # Preallocate memory for output logits. + if parallel_state.is_pipeline_last_stage(): + logits = torch.empty( + (batch_size, seq_len, self.inference_wrapper_config.padded_vocab_size), + dtype=torch.float32, + device=torch.cuda.current_device(), + ) + + recv_buffer = None + if not parallel_state.is_pipeline_first_stage(): + recv_buffer = self._allocate_recv_buffer(micro_batch_size, seq_len) + for micro_batch_index in range(num_micro_batches): + start = micro_batch_index * micro_batch_size + end = min(start + micro_batch_size, batch_size) + tokens2use = tokens[start:end, ...] + position_ids2use = position_ids[start:end, ...] + current_micro_batch_size = end - start + + # Need to change recv buffer shape for the last partial microbatch (if exists) + if current_micro_batch_size != micro_batch_size: + recv_buffer = self._allocate_recv_buffer(current_micro_batch_size, seq_len) + + if not parallel_state.is_pipeline_first_stage(): + recv_from_prev_pipeline_rank_(recv_buffer) + + self.model.set_input_tensor(recv_buffer) + output_tensor = self.model( + tokens2use, position_ids2use, attention_mask, inference_params=self.inference_params + ) + + if not parallel_state.is_pipeline_last_stage(): + send_to_next_pipeline_rank(output_tensor) + + self.inference_params.batch_size_offset += current_micro_batch_size + + if parallel_state.is_pipeline_last_stage(): + output_tensor = tensor_parallel.gather_from_tensor_model_parallel_region( + output_tensor + ) + logits[start:end, ...] = output_tensor + + # Once done with all micro batches, we reset batch size offset and seq len offset + self.inference_params.sequence_len_offset += seq_len + self.inference_params.batch_size_offset = 0 + + # NOTE: Only returns the logits on the last pipeline stage + return logits + + def run_one_forward_step(self, inference_input: List) -> torch.Tensor: + """The forward pass of the model for inference + + Appropriate utility is called for the forward pass depending on the type of model parallelism used + + Args: + inference_input (List): A list containg the inputs for the gpt model [tokens, position ids, attention mask] + + Returns: + torch.Tensor: The output logits of shape [batch_size, seq_len, padded_vocab_size]. The logits are returned only in the last pipeline stage for PP models. 
+ """ + if self.model_is_pipeline_parallel: + tokens = inference_input[0] + current_batch_size, seq_len = tokens.shape + # If input batch is large, we need to split into micro batches and run the forward pass + if ( + current_batch_size * seq_len + > self.inference_wrapper_config.inference_batch_times_seqlen_threshold + ): + return self.forward_pass_with_pipeline_parallel_large_input_batch(inference_input) + else: + # If input batch is very small we can do a simple forward pass on the entire global batch + return self.forward_pass_with_pipeline_parallel_small_input_batch(inference_input) + else: + return self.forward_pass_without_pipeline_parallel(inference_input) diff --git a/megatron/core/inference/model_inference_wrappers/gpt/__init__.py b/megatron/core/inference/model_inference_wrappers/gpt/__init__.py new file mode 100644 index 0000000000..f8011007a5 --- /dev/null +++ b/megatron/core/inference/model_inference_wrappers/gpt/__init__.py @@ -0,0 +1 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. diff --git a/megatron/core/inference/model_inference_wrappers/gpt/gpt_inference_wrapper.py b/megatron/core/inference/model_inference_wrappers/gpt/gpt_inference_wrapper.py new file mode 100644 index 0000000000..87b1d2df77 --- /dev/null +++ b/megatron/core/inference/model_inference_wrappers/gpt/gpt_inference_wrapper.py @@ -0,0 +1,85 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +from argparse import Namespace +from typing import List, Tuple + +import torch + +from megatron.core.inference.model_inference_wrappers.abstract_model_inference_wrapper import ( + AbstractModelInferenceWrapper, +) +from megatron.core.models.gpt import GPTModel + + +class GPTInferenceWrapper(AbstractModelInferenceWrapper): + def __init__(self, model: GPTModel, args: Namespace): + """Constructor for the model inference wrapper + + The wrapper prepares the model for inference, provides the required input data, and runs the forward pass + + Args: + model (GPTModel): The GPT model (MCore or legacy) + args (Namespace): The command line arguments that were passed + """ + super().__init__(model, args) + + def prep_model_for_inference(self, prompts_tokens: torch.Tensor): + """A utility function for preparing model for inference + + This function is called before the forward pass. It puts the model in eval mode, builds position ids, and creates attention masks so that required slices can be extracted during the forward pass. 
+ + Args: + prompts_tokens (torch.Tensor): A tensor of shape [batch_size, max_seq_len] + """ + + super().prep_model_for_inference(prompts_tokens=prompts_tokens) + self.attention_mask, self.position_ids = self._build_attention_mask_and_position_ids( + prompts_tokens + ) + + def _build_attention_mask_and_position_ids( + self, prompts_tokens: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor]: + """Builds the full attention mask and position ids for the input tokens + + Args: + prompts_tokens (torch.Tensor): A tensor of shape [batch_size, max_seq_len] + + Returns: + Tuple[torch.Tensor, torch.Tensor]: The attention mask of shape [1, 1, max_seq_len, max_seq_len] and position ids of shape [batch_size, max_seq_len] + """ + seq_length = prompts_tokens.size(1) + attention_mask = torch.tril( + torch.ones((1, seq_length, seq_length), device=prompts_tokens.device) + ).view(1, 1, seq_length, seq_length) + # Convert to boolean + attention_mask = attention_mask < 0.5 + + position_ids = ( + torch.arange(seq_length, dtype=torch.long, device=prompts_tokens.device) + .unsqueeze(0) + .expand_as(prompts_tokens) + ) + + return attention_mask, position_ids + + def get_batch_for_context_window( + self, context_start_position: int, context_end_position: int + ) -> List: + """Returns the inference data given context window + + This function gets called iteratively in a loop . Given the start and end context positions , it extracts the appropriate data. + + Args: + context_start_position (int): Start of the context window. During the first inference step it is mostly 0 + context_end_position (int): End of the context window. During the last inference step it will mostly be the max generated sequence length. + + Returns: + List: A list of inputs that will be used by your model in the forward step + """ + tokens2use = self.prompts_tokens[:, context_start_position:context_end_position] + positions2use = self.position_ids[:, context_start_position:context_end_position] + attention_mask2use = self.attention_mask[ + ..., context_start_position:context_end_position, :context_end_position + ] + data_at_step_idx = [tokens2use, positions2use, attention_mask2use] + return data_at_step_idx diff --git a/megatron/core/inference/model_inference_wrappers/inference_wrapper_config.py b/megatron/core/inference/model_inference_wrappers/inference_wrapper_config.py new file mode 100644 index 0000000000..e22550e7e3 --- /dev/null +++ b/megatron/core/inference/model_inference_wrappers/inference_wrapper_config.py @@ -0,0 +1,40 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +from dataclasses import dataclass + +import torch + + +@dataclass +class InferenceWrapperConfig: + """Config for the model inference wrapper + + NOTE : All the arguments here are obtained from arguments.py file + """ + + hidden_size: int + """Receive happens between the layers during PP with size [seq_len, batch_size, hidden_size]""" + + params_dtype: torch.dtype + """Can be torch.float or torch.half if --fp16 is used, or torch.bfloat16 if --bf16 is used""" + + inference_batch_times_seqlen_threshold: int + """if batch-size times sequence-length is smaller than this threshold then we will not use pipelining, otherwise we will.""" + + padded_vocab_size: int + """The final padded vocab size (Padded to make it divisible by --make-vocab-size-divisible-by value)""" + + fp32_residual_connection: bool = False + """Move residual connections to fp32. 
Obtained from arguments.py""" + + def add_attributes(self, attribute_value_pair: dict): + """Utility to add more attributes to inference params + + Use this method to pass in a custom dictonary to add more config to the instance you created. Use as follows + c = InferenceWrapperConfig + c.add_attributes({'precision':'fp32'}) + + Args: + attribute_value_pair (dict): A dictionary containing attributes as the key names and their values as the values. + """ + for key, value in attribute_value_pair.items(): + setattr(self, key, value) diff --git a/megatron/core/inference/model_inference_wrappers/t5/__init__.py b/megatron/core/inference/model_inference_wrappers/t5/__init__.py new file mode 100644 index 0000000000..f8011007a5 --- /dev/null +++ b/megatron/core/inference/model_inference_wrappers/t5/__init__.py @@ -0,0 +1 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. diff --git a/megatron/core/inference/model_inference_wrappers/t5/t5_inference_wrapper.py b/megatron/core/inference/model_inference_wrappers/t5/t5_inference_wrapper.py new file mode 100644 index 0000000000..10e1da4812 --- /dev/null +++ b/megatron/core/inference/model_inference_wrappers/t5/t5_inference_wrapper.py @@ -0,0 +1,205 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +from argparse import Namespace +from collections import deque +from typing import Any, List, Tuple + +import numpy +import torch + +from megatron.core import tensor_parallel +from megatron.core.datasets.t5_dataset import T5MaskedWordPieceDataset +from megatron.core.inference.model_inference_wrappers.abstract_model_inference_wrapper import ( + AbstractModelInferenceWrapper, +) +from megatron.core.models.T5 import T5Model + + +class T5InferenceWrapper(AbstractModelInferenceWrapper): + """Constructor for the model inference wrapper + + The wrapper prepares the model for inference, provides the required input + data, and runs the forward pass + + Args: + model (T5Model): The T5 model (MCore or legacy) + args (Namespace): The command line arguments that were passed + """ + + def __init__(self, model: T5Model, args: Namespace): + super().__init__(model, args) + + def prep_model_for_inference( + self, prompts_tokens: torch.Tensor, encoder_prompts: List[str] = None, tokenizer: Any = None + ): + """A utility function for preparing model for inference + + This function is called before the forward pass. It puts the model in eval mode, builds + position ids, and creates attention masks so that required slices can be extracted during + the forward pass. 
+ + Args: + prompts_tokens (torch.Tensor): A tensor of shape [batch_size, max_sequence_length] + encoder_prompts (dict): List of string of encoder input prompts + tokenizer (_type_): Tokenizer used for tokenizing and detokenizing text + """ + + super().prep_model_for_inference(prompts_tokens=prompts_tokens) + + encoder_prompts_tokens_list = [ + self.tokenize_encoder_prompt(encoder_prompt, tokenizer) + for encoder_prompt in encoder_prompts + ] + self.batch_encoder_prompts_tokens = self.pad_encoder_prompts_tokens( + encoder_prompts_tokens_list, self.model.max_sequence_length, tokenizer + ) + + # create batch mask for encoder_prompt (self.batch_input_tokens) and + # decoder_input (self.prompts_tokens), similar to megatron/core/datasets/t5_dataset.py + decoder_prompts_tokens = self.prompts_tokens.cpu().numpy() + encoder_prompts_tokens = self.batch_encoder_prompts_tokens.cpu().numpy() + self.batch_mask_encoder = [] + self.batch_mask_decoder = [] + self.batch_mask_encoder_decoder = [] + for i in range(len(self.prompts_tokens)): + self.batch_mask_encoder.append( + T5MaskedWordPieceDataset._make_attention_mask( + encoder_prompts_tokens[i], encoder_prompts_tokens[i] + ) + ) + self.batch_mask_decoder.append( + T5MaskedWordPieceDataset._make_attention_mask( + decoder_prompts_tokens[i], decoder_prompts_tokens[i] + ) + * T5MaskedWordPieceDataset._make_history_mask(decoder_prompts_tokens[i]) + ) + self.batch_mask_encoder_decoder.append( + T5MaskedWordPieceDataset._make_attention_mask( + decoder_prompts_tokens[i], encoder_prompts_tokens[i] + ) + ) + self.batch_mask_encoder = torch.tensor(numpy.array(self.batch_mask_encoder)).cuda() + self.batch_mask_decoder = torch.tensor(numpy.array(self.batch_mask_decoder)).cuda() + self.batch_mask_encoder_decoder = torch.tensor( + numpy.array(self.batch_mask_encoder_decoder) + ).cuda() + self.batch_mask_encoder = self.batch_mask_encoder < 0.5 + self.batch_mask_decoder = self.batch_mask_decoder < 0.5 + self.batch_mask_encoder_decoder = self.batch_mask_encoder_decoder < 0.5 + + def tokenize_encoder_prompt( + self, encoder_prompt: str, tokenizer + ) -> Tuple[torch.Tensor, torch.Tensor]: + """Utility to tokenize the encoder_prompt + + Args: + encoder_prompt (str): The encoder_prompt + tokenizer (_type_): Tokenizer used for tokenizing and detokenizing string + + Returns: + torch.Tensor: Returns the tokenized prompt + """ + + # if there is the word "" in prompt, replacing it with special_additional_token, + # similar to processing step in megatron/core/datasets/t5_dataset.py + divided_encoder_prompt_list = encoder_prompt.split("") + masks_count = len(divided_encoder_prompt_list) - 1 + sentinels = deque(tokenizer.additional_special_tokens_ids) + + encoder_prompt_tokens = [] + for divided_encoder_prompt in divided_encoder_prompt_list: + divided_encoder_prompt_tokens = tokenizer.tokenize(divided_encoder_prompt) + encoder_prompt_tokens.extend(divided_encoder_prompt_tokens) + if masks_count > 0: + sentinel = sentinels.popleft() + encoder_prompt_tokens.extend([sentinel]) + + return encoder_prompt_tokens + + def pad_encoder_prompts_tokens( + self, encoder_prompts_tokens_list: List[List[int]], max_sequence_length: int, tokenizer + ) -> torch.Tensor: + """Method to pad input prompts + + Given a list of prompts, pad them all to uniform length + + Args: + encoder_prompts_tokens_list (List[List[int]]): A list containing the + encoder_input_tokens + max_sequence_length (int): Maximum of the length of the encoder inputs tokens + tokenizer (_type_): Tokenizer used for tokenizing and 
detokenizing text + + Returns: + torch.Tensor: A torch tensor of shape [bs, max_sequence_length] + """ + + for encoder_prompt_tokens in encoder_prompts_tokens_list: + padding_size = max_sequence_length - len(encoder_prompt_tokens) + encoder_prompt_tokens.extend([tokenizer.pad] * padding_size) + + return torch.tensor(encoder_prompts_tokens_list).cuda() + + def get_batch_for_context_window( + self, context_start_position: int, context_end_position: int + ) -> List: + """Returns the inference data given context window + + This function gets called iteratively in a loop . Given the start and end context + positions , it extracts the appropriate data. + + Args: + context_start_position (int): Start of the context window. During + the first inference step it is mostly 0 + context_end_position (int): End of the context window. During the + last inference step it will mostly be the max generated sequence length. + + Returns: + List: A list of inputs that will be used by your model in the forward step + """ + + # rerun encoder every step + # T5 inference not yet support kv_cache + encoder_tokens2use = self.batch_encoder_prompts_tokens + decoder_tokens2use = self.prompts_tokens[:, :context_end_position] + encoder_mask2use = self.batch_mask_encoder + decoder_mask2use = self.batch_mask_decoder[:, :context_end_position, :context_end_position] + encoder_decoder_mask2use = self.batch_mask_encoder_decoder[:, :context_end_position, :] + data_at_step_idx = [ + encoder_tokens2use, + decoder_tokens2use, + encoder_mask2use, + decoder_mask2use, + encoder_decoder_mask2use, + ] + + return data_at_step_idx + + def forward_pass_without_pipeline_parallel(self, inference_input: List) -> torch.Tensor: + """Utility to carry out simple forward pass for TP or no model parallel models + + Runs a very simple forward pass for model. Used in the case of models without + any parallelism or only tensor parallelism. + + Args: + inference_input (List): A list containg the inputs for the gpt + model [tokens, position ids, attention mask] + + Returns: + torch.Tensor: The output logits of shape [batch_size, seq_len, padded_vocab_size] + """ + [encoder_tokens, decoder_tokens, encoder_mask, decoder_mask, encoder_decoder_mask] = ( + inference_input + ) + tokens = decoder_tokens + + # T5 inference not yet support kv_cache + logits = self.model( + encoder_tokens, + decoder_tokens, + encoder_mask, + decoder_mask, + encoder_decoder_mask, + inference_params=None, + ) + logits = tensor_parallel.gather_from_tensor_model_parallel_region(logits) + + return logits diff --git a/megatron/core/inference/modelopt_support/__init__.py b/megatron/core/inference/modelopt_support/__init__.py new file mode 100644 index 0000000000..f8eb8f3d9f --- /dev/null +++ b/megatron/core/inference/modelopt_support/__init__.py @@ -0,0 +1,8 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +"""Integrations with NVIDIA TensorRT Model Optimizer (referred as ModelOpt). + +ModelOpt is a library comprising state-of-the-art model optimization techniques including quantization and sparsity to +compress model for efficient inference on NVIDIA GPUs. ModelOpt is integrated with Megatron-core to provide a seamless +experience for users to optimize their Megatron-core models for inference. More details on ModelOpt including +installation and usage can be found at https://github.com/NVIDIA/TensorRT-Model-Optimizer. 
+""" diff --git a/megatron/core/inference/modelopt_support/gpt/__init__.py b/megatron/core/inference/modelopt_support/gpt/__init__.py new file mode 100644 index 0000000000..f8011007a5 --- /dev/null +++ b/megatron/core/inference/modelopt_support/gpt/__init__.py @@ -0,0 +1 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. diff --git a/megatron/core/inference/modelopt_support/gpt/model_specs.py b/megatron/core/inference/modelopt_support/gpt/model_specs.py new file mode 100644 index 0000000000..ba1ab8993d --- /dev/null +++ b/megatron/core/inference/modelopt_support/gpt/model_specs.py @@ -0,0 +1,57 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +from megatron.core.extensions.transformer_engine import TEDotProductAttention, TENorm +from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add +from megatron.core.tensor_parallel.layers import ColumnParallelLinear, RowParallelLinear +from megatron.core.transformer.attention import SelfAttention, SelfAttentionSubmodules +from megatron.core.transformer.enums import AttnMaskType +from megatron.core.transformer.identity_op import IdentityOp +from megatron.core.transformer.mlp import MLP, MLPSubmodules +from megatron.core.transformer.spec_utils import ModuleSpec +from megatron.core.transformer.transformer_layer import TransformerLayer, TransformerLayerSubmodules + + +# Use this spec for ModelOpt PTQ and TensorRT-LLM export +def get_gpt_layer_modelopt_spec( + remap_te_layernorm: bool = False, qk_layernorm: bool = False +) -> ModuleSpec: + """Mix the native spec with TENorm. + + This is essentially the native local spec except for the layernorm implementation + is using TENorm from Transformer-Engine. The issue is that FusedLayerNorm from apex + has stopped supporting RMSNorm needed by llama. + """ + sharded_state_dict_keys_map = {} + if remap_te_layernorm: + sharded_state_dict_keys_map = { + 'input_layernorm.': 'self_attention.linear_qkv.layer_norm_', + 'pre_mlp_layernorm.': 'mlp.linear_fc1.layer_norm_', + } + return ModuleSpec( + module=TransformerLayer, + submodules=TransformerLayerSubmodules( + input_layernorm=TENorm, + self_attention=ModuleSpec( + module=SelfAttention, + params={"attn_mask_type": AttnMaskType.causal}, + submodules=SelfAttentionSubmodules( + linear_qkv=ColumnParallelLinear, + core_attention=TEDotProductAttention, + linear_proj=RowParallelLinear, + q_layernorm=TENorm if qk_layernorm else IdentityOp, + k_layernorm=TENorm if qk_layernorm else IdentityOp, + ), + ), + self_attn_bda=get_bias_dropout_add, + pre_mlp_layernorm=TENorm, + mlp=ModuleSpec( + module=MLP, + submodules=MLPSubmodules( + linear_fc1=ColumnParallelLinear, linear_fc2=RowParallelLinear + ), + ), + mlp_bda=get_bias_dropout_add, + # Map TE-layernorm-fusion keys back + sharded_state_dict_keys_map=sharded_state_dict_keys_map, + ), + ) diff --git a/megatron/core/inference/modelopt_support/gpt/state_dict_hooks.py b/megatron/core/inference/modelopt_support/gpt/state_dict_hooks.py new file mode 100644 index 0000000000..15c3527c94 --- /dev/null +++ b/megatron/core/inference/modelopt_support/gpt/state_dict_hooks.py @@ -0,0 +1,133 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +from logging import getLogger + +import torch + +logger = getLogger(__name__) + + +def mcore_gpt_load_legacy_state_dict_pre_hook( + state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs +): + """Register a pre-hook to fix the state_dict key difference. 
+ + This prehook is used when trying to load the legacy Megatron-LM GPTModel into its + megatron/core variant that uses native ParallelLinear and Transformer-Engine Norm. + Only this particular spec supports post-training quantization and TensorRT-LLM + config export through `nvidia-modelopt` package. + + Args: + state_dict: state dictionary + prefix: module name prefix + local_metadata: local metatdata + strict: whether is in strict mode + missing_keys: missing state dict keys + unexpected_keys: unexpected state dict keys + error_msgs: error messages + """ + if "modelopt_state" in state_dict: + state_dict.pop("modelopt_state") + + if "language_model" in state_dict: + language_model_state_dict = state_dict.pop("language_model") + if "embedding" in language_model_state_dict: + if "word_embeddings" in language_model_state_dict["embedding"]: + for key, param in language_model_state_dict["embedding"]["word_embeddings"].items(): + state_dict.update({"embedding.word_embeddings." + key: param}) + if "position_embeddings" in language_model_state_dict["embedding"]: + for key, param in language_model_state_dict["embedding"][ + "position_embeddings" + ].items(): + state_dict.update({"embedding.position_embeddings." + key: param}) + if "transformer" in language_model_state_dict: + for key, param in language_model_state_dict["transformer"].items(): + state_dict.update({"decoder." + key: param}) + else: + for key, param in language_model_state_dict["encoder"].items(): + state_dict.update({"decoder." + key: param}) + if "output_layer" in language_model_state_dict: + for key, param in language_model_state_dict["output_layer"].items(): + state_dict.update({"output_layer." + key: param}) + + if torch.distributed.get_rank() == 0: + logger.info("ModelOptGPTModel {}".format(state_dict.keys())) + + module_name_rewrite_list = [ + ("input_norm", "input_layernorm"), + (".attention.query_key_value", ".self_attention.linear_qkv"), + (".attention.dense", ".self_attention.linear_proj"), + ("self_attention.query_key_value", "self_attention.linear_qkv"), + ("self_attention.dense", "self_attention.linear_proj"), + ("post_attention_layernorm", "pre_mlp_layernorm"), + ("post_attention_norm", "pre_mlp_layernorm"), + ("dense_h_to_4h", "linear_fc1"), + ("dense_4h_to_h", "linear_fc2"), + ("final_norm", "final_layernorm"), + ] + + key_rewrite_list = [] + + for key, _ in state_dict.items(): + for old_name, new_name in module_name_rewrite_list: + if old_name in key: + key_rewrite_list += [(key, key.replace(old_name, new_name))] + + for old_key, new_key in key_rewrite_list: + if torch.distributed.get_rank() == 0: + logger.info("replace {} with {}".format(old_key, new_key)) + state_dict[new_key] = state_dict[old_key] + state_dict.pop(old_key) + + +def mcore_gpt_load_te_state_dict_pre_hook( + state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs +): + """Register a pre-hook to fix the state_dict key difference of. + + This prehook is used when trying to load the megatron/core GPTModel that uses a + fused Transformer-Engine ParallelLinear into the variant that uses native ParallelLinear + and Transformer-Engine Norm (effectively to restore the fusion). + Only this particular spec supports post-training quantization and TensorRT-LLM + config export through `nvidia-modelopt` package. 
+ + Args: + state_dict: state dictionary + prefix: module name prefix + local_metadata: local metatdata + strict: whether is in strict mode + missing_keys: missing state dict keys + unexpected_keys: unexpected state dict keys + error_msgs: error messages + """ + if "modelopt_state" in state_dict: + state_dict.pop("modelopt_state") + + key_with_te_extra_state_to_pop = [] + + for key, _ in state_dict.items(): + if "_extra_state" in key: + key_with_te_extra_state_to_pop += [key] + + for key in key_with_te_extra_state_to_pop: + state_dict.pop(key) + + module_name_rewrite_list = [ + ("self_attention.linear_qkv.layer_norm_weight", "input_layernorm.weight"), + ("self_attention.linear_qkv.layer_norm_bias", "input_layernorm.bias"), + ("mlp.linear_fc1.layer_norm_weight", "pre_mlp_layernorm.weight"), + ("mlp.linear_fc1.layer_norm_bias", "pre_mlp_layernorm.bias"), + ] + + key_rewrite_list = [] + + for key, _ in state_dict.items(): + for old_name, new_name in module_name_rewrite_list: + if old_name in key: + key_rewrite_list += [(key, key.replace(old_name, new_name))] + + for old_key, new_key in key_rewrite_list: + if torch.distributed.get_rank() == 0: + logger.info("replace {} with {}".format(old_key, new_key)) + state_dict[new_key] = state_dict[old_key] + state_dict.pop(old_key) diff --git a/megatron/core/inference/scheduler.py b/megatron/core/inference/scheduler.py new file mode 100644 index 0000000000..00ab81b4ab --- /dev/null +++ b/megatron/core/inference/scheduler.py @@ -0,0 +1,127 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +import time +import typing +from collections import OrderedDict +from typing import Dict + +import torch + +from megatron.core.inference.common_inference_params import CommonInferenceParams +from megatron.core.inference.inference_request import InferenceRequest, Status +from megatron.core.inference.utils import Counter + + +class Scheduler: + """Scheduler for handling requests to inference engine + + This class is responsible for handing of all the incomign requests + + Args: + max_batch_size (int): The max batch size that we can pass to the + inference engine at a time. + """ + + def __init__(self, max_batch_size: int): + self.max_batch_size = max_batch_size + self.active_request_pool: Dict[int, InferenceRequest] = OrderedDict() + self.waiting_request_pool: Dict[int, InferenceRequest] = OrderedDict() + self.completed_request_pool: Dict[int, InferenceRequest] = OrderedDict() + self.request_counter = Counter() + + def add_request( + self, + prompt: str, + prompt_tokens: torch.Tensor, + encoder_prompt: str = None, + inference_parameters: CommonInferenceParams = None, + arrival_time: float = None, + ): + """Add an incoming request + + This method will add the request to either the active pool or the waiting pool + depending on the batch size. + + Args: + prompt (str): Input prompt string + prompt_tokens (torch.Tensor): A torch tensor having the input prompts tokenized + encoder_prompt (str): Encoder input string + inference_parameters (CommonInferenceParams): The inference parameters + arrival_time (float, optional): The incoming request time. Defaults to None. 
+        """
+        request_id = str(next(self.request_counter))
+
+        if arrival_time is None:
+            arrival_time = time.time()
+
+        status = (
+            Status.ACTIVE_BUT_NOT_GENERATING_TOKENS
+            if len(self.active_request_pool) < self.max_batch_size
+            else Status.WAITING_IN_QUEUE
+        )
+
+        inference_request = InferenceRequest(
+            request_id=request_id,
+            prompt=prompt,
+            inference_parameters=inference_parameters,
+            arrival_time=arrival_time,
+            prompt_tokens=prompt_tokens,
+            status=status,
+            encoder_prompt=encoder_prompt,
+        )
+
+        if status == Status.ACTIVE_BUT_NOT_GENERATING_TOKENS:
+            self.active_request_pool[request_id] = inference_request
+        else:
+            self.waiting_request_pool[request_id] = inference_request
+
+    def have_requests_pending(self) -> bool:
+        """Method to check if there are requests pending
+
+        This method returns False only when there are no active or waiting requests.
+        """
+        num_requests_pending = len(self.active_request_pool) + len(self.waiting_request_pool)
+        return num_requests_pending > 0
+
+    def add_earliest_waiting_request_to_active_pool(self):
+        """Utility to add a waiting request to the active pool
+
+        This method will add the earliest request (FIFO) that is in the waiting request
+        pool to the active request pool.
+        """
+        assert (
+            len(self.active_request_pool) < self.max_batch_size
+        ), "Active request pool is already full. Can't add any more requests"
+        if len(self.waiting_request_pool) > 0:
+            (earliest_waiting_request_request_id, earliest_waiting_request) = (
+                self.waiting_request_pool.popitem(last=False)
+            )
+            earliest_waiting_request.status = Status.ACTIVE_BUT_NOT_GENERATING_TOKENS
+            self.active_request_pool[earliest_waiting_request_request_id] = earliest_waiting_request
+
+    def update_requests_pools(self, result_dict: typing.OrderedDict[int, InferenceRequest] = None):
+        """Update request pool status
+
+        This method fills up the active request pool from the waiting request pool if the
+        active pool has fewer than max batch size elements.
+        If provided with a result dict, it will also move the completed requests into the
+        completed request pool before adding waiting requests to the active pool.
+
+        Args:
+            result_dict (typing.OrderedDict[int, InferenceRequest], optional): The result returned
+                by the engine. A dictionary with keys as the request ids, and values as the
+                requests. Defaults to None
+        """
+        for result_request_id in list(result_dict.keys()):
+            active_request = self.active_request_pool[result_request_id]
+
+            # If a request has completed, put it into the completed request pool.
+            if active_request.status == Status.COMPLETED:
+                completed_request = self.active_request_pool.pop(result_request_id)
+                self.completed_request_pool[result_request_id] = completed_request
+
+        # If the active request pool is not full, add waiting requests in FIFO order
+        while (
+            len(self.active_request_pool) < self.max_batch_size
+            and len(self.waiting_request_pool) > 0
+        ):
+            self.add_earliest_waiting_request_to_active_pool()
diff --git a/megatron/core/inference/text_generation_controllers/__init__.py b/megatron/core/inference/text_generation_controllers/__init__.py
new file mode 100644
index 0000000000..f8011007a5
--- /dev/null
+++ b/megatron/core/inference/text_generation_controllers/__init__.py
@@ -0,0 +1 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
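As a brief orientation for the Scheduler introduced above, here is a minimal, hypothetical driver loop; the `controller` object, the pre-tokenized `prompt_tokens` tensor, and the chosen batch size are illustrative assumptions, not part of this patch.

from megatron.core.inference.common_inference_params import CommonInferenceParams
from megatron.core.inference.scheduler import Scheduler

scheduler = Scheduler(max_batch_size=8)

# `prompt_tokens` is assumed to be a tensor produced by the tokenizer for this prompt.
scheduler.add_request(
    prompt="Hello world",
    prompt_tokens=prompt_tokens,
    inference_parameters=CommonInferenceParams(num_tokens_to_generate=64),
)

# Requests beyond max_batch_size wait in FIFO order until active slots free up.
while scheduler.have_requests_pending():
    # `controller` is assumed to be a text generation controller such as the
    # SimpleTextGenerationController added later in this patch.
    result = controller.generate_all_output_tokens_static_batch(scheduler.active_request_pool)
    scheduler.update_requests_pools(result_dict=result)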
diff --git a/megatron/core/inference/text_generation_controllers/encoder_decoder_text_generation_controller.py b/megatron/core/inference/text_generation_controllers/encoder_decoder_text_generation_controller.py
new file mode 100644
index 0000000000..61beff0211
--- /dev/null
+++ b/megatron/core/inference/text_generation_controllers/encoder_decoder_text_generation_controller.py
@@ -0,0 +1,35 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+from typing import OrderedDict
+
+import torch
+
+from megatron.core.inference.inference_request import InferenceRequest
+from megatron.core.inference.text_generation_controllers.simple_text_generation_controller import (
+    SimpleTextGenerationController,
+)
+
+
+class EncoderDecoderTextGenerationController(SimpleTextGenerationController):
+    """The text generation controller for encoder-decoder architectures
+
+    This class inherits from SimpleTextGenerationController, adding features
+    related to the encoder input (encoder_prompt)
+
+    """
+
+    def prep_model_for_inference(
+        self, prompts_tokens: torch.Tensor, active_requests: OrderedDict[int, InferenceRequest]
+    ):
+        """Prepares the batch for inference, using the respective wrapper's prep_model_for_inference method
+
+        Args:
+            prompts_tokens (torch.Tensor): A tensor of shape [batch_size, max_sequence_length]
+            active_requests (OrderedDict[int, InferenceRequest]): The input active requests
+        """
+        encoder_prompts = list(
+            map(lambda request: request.encoder_prompt, active_requests.values())
+        )
+
+        self.inference_wrapped_model.prep_model_for_inference(
+            prompts_tokens=prompts_tokens, encoder_prompts=encoder_prompts, tokenizer=self.tokenizer
+        )
diff --git a/megatron/core/inference/text_generation_controllers/simple_text_generation_controller.py b/megatron/core/inference/text_generation_controllers/simple_text_generation_controller.py
new file mode 100644
index 0000000000..0667af8373
--- /dev/null
+++ b/megatron/core/inference/text_generation_controllers/simple_text_generation_controller.py
@@ -0,0 +1,400 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
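# Editorial sketch (not part of this patch): how the encoder-decoder controller above is
# expected to be wired up. A T5 model is wrapped with T5InferenceWrapper and the wrapper,
# together with a tokenizer, is handed to EncoderDecoderTextGenerationController. The
# `t5_model`, `args`, and `tokenizer` objects are assumed to exist already.
from megatron.core.inference.model_inference_wrappers.t5.t5_inference_wrapper import (
    T5InferenceWrapper,
)
from megatron.core.inference.text_generation_controllers.encoder_decoder_text_generation_controller import (
    EncoderDecoderTextGenerationController,
)

wrapped_t5 = T5InferenceWrapper(t5_model, args)
controller = EncoderDecoderTextGenerationController(
    inference_wrapped_model=wrapped_t5, tokenizer=tokenizer
)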
+from typing import List, OrderedDict, Tuple + +import torch +import torch.nn.functional as F + +from megatron.core import parallel_state +from megatron.core.inference.common_inference_params import CommonInferenceParams +from megatron.core.inference.communication_utils import broadcast_from_last_pipeline_stage +from megatron.core.inference.inference_request import InferenceRequest, Status +from megatron.core.inference.model_inference_wrappers.abstract_model_inference_wrapper import ( + AbstractModelInferenceWrapper, +) + + +class SimpleTextGenerationController: + """The basic text generation controller + + This class is responsible for tokenizing the input , running the inference, sampling + and also detokenizing the output + + Args: + inference_wrapped_model (AbstractModelInferenceWrapper): A model that + is wrapped using the specs given in the abstract_model_inference_wrapper.py + tokenizer (_type_): Tokenizer used for tokenizing and detokenizing the prompts + """ + + def __init__(self, inference_wrapped_model: AbstractModelInferenceWrapper, tokenizer): + self.inference_wrapped_model = inference_wrapped_model + self.tokenizer = tokenizer + + # For models without pipeline parallelism, is_first_stage and is_last_stage returns True + self.model_is_pipeline_parallel = not ( + parallel_state.is_pipeline_first_stage() and parallel_state.is_pipeline_last_stage() + ) + + def tokenize_prompt( + self, prompt: str, add_BOS: bool = False + ) -> Tuple[torch.Tensor, torch.Tensor]: + """Utility to tokenize the input prompts + + Args: + prompt (str): The input prompt + + Returns: + torch.Tensor: Returns the tokenized prompt + """ + prompt_tokens = self.tokenizer.tokenize(prompt) + + if add_BOS: + prompt_tokens = [self.tokenizer.bos] + prompt_tokens + + return prompt_tokens + + def detokenize_generations(self, prompt_tokens_with_generated_tokens: torch.Tensor) -> str: + """Detokenize the output generations + + Args: + prompt_tokens_with_generated_tokens (torch.Tensor): The input prompt + tokens plus the generated tokens + + Returns: + str: The detokenized output + """ + tokens = prompt_tokens_with_generated_tokens.cpu().numpy().tolist() + return self.tokenizer.detokenize(tokens) + + def sample_from_logits( + self, + last_token_logits: torch.Tensor, + common_inference_params: CommonInferenceParams, + vocab_size: int = None, + ) -> torch.Tensor: + """Samples the logits to generate outputs + + Given the logits of the last token, this function samples it + according to the parameters defined in common_inference_params + and returns the samples + + Args: + last_token_logits (torch.Tensor): The last token logits. A tensor of + size [batch_size, vocab_size] + common_inference_params (CommonInferenceParams): The paramters to use + for inference + vocab_size (int): Obtained from the tokenizer. 
Defaults to None + + Returns: + torch.Tensor: 1D tensor of the sampled logits with [batch_size] elements + """ + + top_p = common_inference_params.top_p + top_k = common_inference_params.top_k + temperature = common_inference_params.temperature + + assert not (top_k > 0 and top_p > 0), 'Cannot have top-p and top-k both greater than zero' + assert top_p <= 1.0, 'top-p should be in (0,1]' + + def modify_logits_for_top_k_filtering(logits, top_k): + """Set the logits for none top-k values to -inf.""" + filter_ = logits < torch.topk(logits, top_k)[0][..., -1, None] + logits.masked_fill_(filter_, float('-Inf')) + + def modify_logits_for_top_p_filtering(logits, top_p): + """Set the logits for none top-p values to -inf.""" + # First sort and calculate cumulative sum of probabilities. + sorted_logits, sorted_indices = torch.sort(logits, descending=True) + cumulative_probs = sorted_logits.softmax(dim=-1).cumsum(dim=-1) + + # Filteration based on the cumulative sum. + filter_ = cumulative_probs > top_p + # This shift by 1 is weird and I cannot justify it. This existed + # in the original implementation: + # https://github.com/ari-holtzman/degen/blob/master/gen.py + # and I guess it is needed so keeping it for now. + filter_[:, 1:] = filter_[:, :-1].clone() + # Make sure we at least have one token to select from. + filter_[..., 0] = 0 + + # Fill in the filtered part + filter_ = filter_.scatter(1, sorted_indices, filter_) + logits.masked_fill_(filter_, float('-Inf')) + + # Greedy sampling + if top_k == 1: + sampled_logits = torch.argmax(last_token_logits, dim=-1) + else: + last_token_logits = last_token_logits.clone() + if temperature != 1.0: + last_token_logits.div_(temperature) + + if top_k > 1: + assert top_k <= last_token_logits.size(1), 'top-k is larger than logit size.' + if vocab_size: + assert top_k < vocab_size, 'top-k is larger than vocab size.' + modify_logits_for_top_k_filtering(last_token_logits, top_k) + + elif top_p > 0.0: + modify_logits_for_top_p_filtering(last_token_logits, top_p) + + # After filtering, we need to recalculate the distribution. + probabilities = last_token_logits.softmax(dim=-1) + sampled_logits = torch.multinomial(probabilities, num_samples=1).view(-1) + + # If vocab size is provided, make sure the samples are in in the range [0, vocab-size). + if vocab_size: + sampled_logits = torch.clamp(sampled_logits, min=0, max=(vocab_size - 1)) + return sampled_logits + + def update_generation_status( + self, + updated_prompts_tokens: torch.Tensor, + generation_started: torch.Tensor, + current_context_end_position: int, + is_generation_done_tensor: torch.Tensor, + generated_sequence_lengths: torch.Tensor, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """Checks which prompts have reached an end condition + + We check which prompts have reached an end condition and set the corresponding + flags of the is_generation_done_tensor to True. The generated sequence lengths + increase as we keep generating, until that prompts hits an end condition. The + generation_started tensor determines which prompts have started generating. + + Args: + updated_prompts_tokens (torch.Tensor): The prompts tokens updated with the latest + generated tokens. A tensor of shape [batch_size, max_seq_len] + (i.e max_seq_len = max_prompt_len + tokens_to_generate) + generation_started (torch.Tensor): A boolean tensor of shape [batch_size]. True + indicates the prompt at that index has started generating tokens. 
+ current_context_end_position (int): An integer indicating which position to + extract from the prompts tokens to get the latest generated tokens. + is_generation_done_tensor (torch.Tensor): A boolean tensor of shape [batch_size]. + True indicates the prompt at that index has reached end condition. + generated_sequence_lengths (torch.Tensor): A int tensor of shape [batch_size]. + Each value represents the generated sequence lengths for that prompt. + + Returns: + Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: Returns the boolean + is_generation_done_tensor and the generated_sequence_lengths after updating it + """ + latest_samples = updated_prompts_tokens[:, current_context_end_position] + # Make sure we are checking eod criterion only for prompts that have started generating + # (i.e) We only look at the generated tokenns and not the input tokens. + reached_eod = (latest_samples == self.tokenizer.eod) & generation_started + is_generation_done_tensor = is_generation_done_tensor | reached_eod + # We increment generated sequence lengths when that prompt has not hit the + # EOD and generation has started + generated_sequence_lengths += ~is_generation_done_tensor & generation_started + + return is_generation_done_tensor, generated_sequence_lengths + + def pad_input_prompt_tokens( + self, + batch_prompt_tokens_list: List[List[int]], + max_prompt_length_in_batch: int, + num_tokens_to_generate: int, + ) -> torch.Tensor: + """Method to pad input prompts + + Given a list of prompts, pad them all to uniform length + + Args: + batch_prompt_tokens_list (List[List[int]]): A list containing the prompt tokens + max_prompt_length_in_batch (int): Maximum of the length of the input prompt tokens + num_tokens_togenerate (int): The number of tokens to generate for each prompt + + Returns: + torch.Tensor: A torch tensor of shape [bs, max_seq_len] (i.e) + max_seq_len = max_prompt_length_in_batch + num_tokens_to_generate, + with extra indices for each tensor padded with mask id. + """ + max_seq_len = max_prompt_length_in_batch + num_tokens_to_generate + + for prompt_tokens in batch_prompt_tokens_list: + padding_size = max_seq_len - len(prompt_tokens) + prompt_tokens.extend([self.tokenizer.eod] * padding_size) + + return torch.tensor(batch_prompt_tokens_list).cuda() + + def generate_output_tokens_dynamic_batch( + self, active_requests: OrderedDict[int, InferenceRequest] + ) -> OrderedDict[int, InferenceRequest]: + """Utility to generate the output tokens and probabilities for the prompts + + This utility generates the output tokens for a dynamic batch. It will run one forward step + at a time, and pass control back to the engine, which will update the request pool and call + this method again. + + Args: + active_requests (OrderedDict[int, InferenceRequest]): The input active requests. + + Returns: + OrderedDict[int, InferenceRequest]: The result for each of the incoming requests + after running one forward step. + """ + raise Exception("Not implemented yet") + + def generate_all_output_tokens_static_batch( + self, active_requests: OrderedDict[int, InferenceRequest] + ) -> OrderedDict[int, InferenceRequest]: + """Utility to generate the all the output tokens and probabilities for the prompts . + + This utility generates the output tokens for a static batch. 
It runs the forward steps till + all prompts complete generation, updates the status of these requests to completed, adds + the generated result and returns these requests + + Args: + active_requests (OrderedDict[int, InferenceRequest]): The input active requests. + + Returns: + OrderedDict[int, InferenceRequest]: The result for each of the incoming requests + """ + batch_prompt_tokens_list = list( + map(lambda request: request.prompt_tokens, active_requests.values()) + ) + prompt_lengths_in_batch = torch.tensor( + [len(prompt_tokens) for prompt_tokens in batch_prompt_tokens_list] + ).cuda() + max_prompt_length_in_batch = max(prompt_lengths_in_batch) + min_prompt_length_in_batch = min(prompt_lengths_in_batch) + + # For batch inference the inference params are the same for all request + common_inference_params: CommonInferenceParams = list(active_requests.values())[ + 0 + ].inference_parameters + + # max_seq_len = max_prompt_length_in_batch + num_tokens_to_generate + batch_prompt_tokens = self.pad_input_prompt_tokens( + batch_prompt_tokens_list, + max_prompt_length_in_batch=max_prompt_length_in_batch, + num_tokens_to_generate=common_inference_params.num_tokens_to_generate, + ) + batch_size, max_sequence_length = batch_prompt_tokens.shape + + # Pre allocate log probs tensor + output_log_probs = None + if common_inference_params.return_log_probs: + output_log_probs = torch.empty( + (batch_size, max_sequence_length - 1), dtype=torch.float32 + ).cuda() + + # An array to check which of the prompts have reached end of generation condition + is_generation_done_tensor = torch.zeros(batch_size, dtype=torch.bool).cuda() + + # An array to act as a counter to keep track of generated sequence lengths + generated_sequence_lengths = torch.zeros(batch_size).cuda() + + with torch.no_grad(): + + self.prep_model_for_inference( + prompts_tokens=batch_prompt_tokens, active_requests=active_requests + ) + + context_start_position = 0 + # Pick the context window that we need to pass through the network. + for context_end_position in range(min_prompt_length_in_batch, max_sequence_length): + + inference_input = self.inference_wrapped_model.get_batch_for_context_window( + context_start_position, context_end_position + ) + + # Returns the final logits of shape [batch_size, context_length, vocab_size] + # Note: This is returned in all TP ranks or last PP stage in PP models + logits = self.inference_wrapped_model.run_one_forward_step(inference_input) + if self.model_is_pipeline_parallel: + context_length = context_end_position - context_start_position + logits = broadcast_from_last_pipeline_stage( + [batch_size, context_length, self.tokenizer.vocab_size], + dtype=torch.float32, + tensor=logits, + ) + + # Indicates which of the input prompts have started generating tokens. 
+ # A 1D boolean tensor with [batch_size] elements (i.e) The shortest + # prompts will start generating first and so on + generation_started = prompt_lengths_in_batch <= context_end_position + last_token_logits = logits[:, -1, :] + sampled_logits = self.sample_from_logits( + last_token_logits, common_inference_params, self.tokenizer.vocab_size + ) + + # Substitute the sampled logits only for only the prompts that + # have started generating tokens + batch_prompt_tokens[generation_started, context_end_position] = sampled_logits[ + generation_started + ] + + if common_inference_params.return_log_probs: + log_probs = F.log_softmax(logits, dim=2) + indices = torch.unsqueeze( + batch_prompt_tokens[ + :, (context_start_position + 1) : (context_end_position + 1) + ], + 2, + ) + # Get the log probabilities for only the prompt tokens + output_log_probs[:, context_start_position:context_end_position] = torch.gather( + log_probs, 2, indices + ).squeeze(2) + + context_start_position = context_end_position + + # Check end of generation status for each tensor + # and update generated sequence lengths + (is_generation_done_tensor, generated_sequence_lengths) = ( + self.update_generation_status( + updated_prompts_tokens=batch_prompt_tokens, + generation_started=generation_started, + current_context_end_position=context_end_position, + is_generation_done_tensor=is_generation_done_tensor, + generated_sequence_lengths=generated_sequence_lengths, + ) + ) + # Boolean flag indicating if all prompts are finished + all_prompts_done = torch.all(is_generation_done_tensor) + if all_prompts_done: + break + + # Include all the generated tokens + batch_prompt_tokens_with_generations = batch_prompt_tokens[:, : (context_end_position + 1)] + if common_inference_params.return_log_probs: + output_log_probs = output_log_probs[:, :context_end_position] + + generated_sequence_lengths[ + generated_sequence_lengths > common_inference_params.num_tokens_to_generate + ] = common_inference_params.num_tokens_to_generate + + for idx, request in enumerate(active_requests.values()): + input_prompt_length = int(prompt_lengths_in_batch[idx]) + # Shorter prompts might have generated more than required tokens. 
So we trim them down + required_sequence_length = int( + min(generated_sequence_lengths[idx], common_inference_params.num_tokens_to_generate) + ) + # Extract only the generated tokens + required_result_tokens = batch_prompt_tokens_with_generations[ + idx, input_prompt_length : (input_prompt_length + required_sequence_length) + ] + + request.generated_length = required_sequence_length + request.generated_tokens = required_result_tokens + request.generated_log_probs = ( + None + if output_log_probs is None + else output_log_probs[idx, input_prompt_length:required_sequence_length] + ) + request.status = Status.COMPLETED + request.generated_text = self.detokenize_generations(required_result_tokens) + + return active_requests + + def prep_model_for_inference( + self, prompts_tokens: torch.Tensor, active_requests: OrderedDict[int, InferenceRequest] + ): + """Preparing batch for inference, using respective wrapper's prep_model_for_inference method + + Args: + prompts_tokens (torch.Tensor): A tensor of shape [batch_size, max_sequence_length] + active_requests (OrderedDict[int, InferenceRequest]): The input active requests + """ + self.inference_wrapped_model.prep_model_for_inference(prompts_tokens=prompts_tokens) diff --git a/megatron/core/inference/utils.py b/megatron/core/inference/utils.py new file mode 100644 index 0000000000..bdb1021ef5 --- /dev/null +++ b/megatron/core/inference/utils.py @@ -0,0 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +class Counter: + """A simple counter class + + This class is responsible for assigning request ids to incoming requests + """ + + def __init__(self, start: int = 0) -> None: + self.counter = start + + def __next__(self) -> int: + i = self.counter + self.counter += 1 + return i + + def reset(self) -> None: + self.counter = 0 diff --git a/megatron/core/inference_params.py b/megatron/core/inference_params.py new file mode 100644 index 0000000000..0db49e3115 --- /dev/null +++ b/megatron/core/inference_params.py @@ -0,0 +1,31 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+class InferenceParams:
+    """Inference parameters that are passed to the main model in order
+    to efficiently calculate and store the context during inference."""
+
+    def __init__(self, max_batch_size, max_sequence_length):
+        self.max_sequence_length = max_sequence_length
+        self.max_batch_size = max_batch_size
+        self.sequence_len_offset = 0
+        self.batch_size_offset = 0
+        self.key_value_memory_dict = {}
+
+    def swap_key_value_dict(self, batch_idx):
+        "swap between batches"
+        if len(self.key_value_memory_dict) == 0:
+            raise ValueError("should not swap when dict is empty")
+
+        for layer_number in self.key_value_memory_dict.keys():
+            inference_key_memory, inference_value_memory = self.key_value_memory_dict[layer_number]
+            assert (
+                len(batch_idx) == inference_key_memory.shape[1]
+            )  # make sure batch size is the same
+            new_inference_key_memory = inference_key_memory[:, batch_idx]
+            new_inference_value_memory = inference_value_memory[:, batch_idx]
+            self.key_value_memory_dict[layer_number] = (
+                new_inference_key_memory,
+                new_inference_value_memory,
+            )
+
+    def __str__(self):
+        return f"InferenceParams(max_seq_len = {self.max_sequence_length}, max_batch_size = {self.max_batch_size}, sequence_len_offset = {self.sequence_len_offset}, batch_size_offset = {self.batch_size_offset}, key_value_memory_dict = {self.key_value_memory_dict.keys()})"
diff --git a/megatron/core/jit.py b/megatron/core/jit.py
new file mode 100644
index 0000000000..8bb18d393c
--- /dev/null
+++ b/megatron/core/jit.py
@@ -0,0 +1,11 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+
+import torch
+
+TORCH_MAJOR = int(torch.__version__.split(".")[0])
+TORCH_MINOR = int(torch.__version__.split(".")[1])
+
+jit_fuser = torch.jit.script
+# nvFuser is deprecated in PyTorch JIT starting from 2.2
+if (TORCH_MAJOR > 2) or (TORCH_MAJOR == 2 and TORCH_MINOR >= 2):
+    jit_fuser = torch.compile
diff --git a/megatron/core/model_parallel_config.py b/megatron/core/model_parallel_config.py
new file mode 100644
index 0000000000..f2751673e4
--- /dev/null
+++ b/megatron/core/model_parallel_config.py
@@ -0,0 +1,344 @@
+# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
+
+from dataclasses import dataclass
+from typing import Callable, ContextManager, Optional
+
+import torch
+
+
+@dataclass
+class ModelParallelConfig:
+    """Base configuration for Megatron Core
+
+    The initialization function has an argument for each parameter.
+    """
+
+    ###################
+    # Model parallelism
+    ###################
+    tensor_model_parallel_size: int = 1
+    """Intra-layer model parallelism. Splits tensors across GPU ranks."""
+
+    pipeline_model_parallel_size: int = 1
+    """Inter-layer model parallelism. Splits transformer layers across GPU ranks."""
+
+    virtual_pipeline_model_parallel_size: Optional[int] = None
+    """Interleaved pipeline parallelism is used to improve performance by reducing the pipeline
+    bubble. Considers a transformer block as a list of smaller transformer (virtual) blocks.
+    The number of virtual blocks per pipeline model parallel rank is the virtual model parallel
+    size. See Efficient Large-Scale Language Model Training on GPU Clusters Using Megatron-LM:
+    arxiv.org/pdf/2104.04473.pdf for more details.
+    """
+
+    sequence_parallel: bool = False
+    """Makes tensor parallelism more memory efficient for LLMs (20B+) by parallelizing layer norms
+    and dropout sequentially. See Reducing Activation Recomputation in Large Transformer Models
+    (https://arxiv.org/abs/2205.05198) for more details.
+ """ + + context_parallel_size: int = 1 + """Splits network input along sequence dimension across GPU ranks.""" + + expert_model_parallel_size: int = 1 + """Distributes Moe Experts across sub data parallel dimension.""" + + moe_extended_tp: bool = False + """Alternative parallelization strategy for expert parallelism. Instead of distributing experts + across expert_model_parallel_size, each expert is sharded along extendended tensor parallel + domain (tensor_model_paralle_size * expert_model_parallel_size). It avoids the load balancing + problem with MOE training. + """ + + ################### + # Initialization + ################### + perform_initialization: bool = True + """If true, weights are initialized. This option can be useful when you know you are going to + load values from a checkpoint. + """ + + use_cpu_initialization: bool = False + """When set to False, we initialize the weights directly on the GPU. CPU initialization is the + same regardless of tensor model parallelism, but GPU initialization is not. Transferring + weights from CPU to GPU can take a significant amount of time for large models. + """ + + ################### + # Training + ################### + fp16: bool = False + """If true, train with fp16 mixed precision training.""" + + bf16: bool = False + """If true, train with bf16 mixed precision training.""" + + params_dtype: torch.dtype = torch.float32 + """dtype used when intializing the weights.""" + + timers: Callable = None + """Timers object to call for various timing functions. See megatron.core.timers.Timers""" + + finalize_model_grads_func: Callable = None + """Function that finalizes gradients on all workers. Could include ensuring that grads are + all-reduced across data parallelism, pipeline parallelism, and sequence parallelism + dimensions. + """ + + grad_scale_func: Callable = None + """If using loss scaling, this function should take the loss and return the scaled loss. If + None, no function is called on the loss. + """ + + no_sync_func: Callable = None + """Function that creates a context that suppresses asynchronous data-parallel communication. If + the model is an instance of core.distributed.DistributedDataParallel, the default is to use + core.distributed.DistributedDataParallel.no_sync. + """ + + grad_sync_func: Callable = None + """Function that launches asynchronous gradient reductions (e.g. distributed optimizer gradient + reduce-scatters). The function should take one argument: an iterable of parameters whose + gradients are to be synchronized. + """ + + param_sync_func: Callable = None + """Function that launches asynchronous parameter synchronizations (e.g. distributed optimizer + parameter all-gathers). The function should take one argument: an iterable of parameters to + be synchronized. + """ + + deterministic_mode: bool = False + """If true, code that has deterministic execution will be chosen. This usually + means slower execution, but is good for debugging and testing. Defaults to False.""" + + enable_autocast: bool = False + """If true runs the forward step function inside torch.autocast context.""" + + autocast_dtype: torch.dtype = None + """dtype to pass to torch.amp.autocast when enabled. If None, is set to pipeline_dtype.""" + + num_microbatches_with_partial_activation_checkpoints: Optional[int] = None + """If int, set the number of microbatches where not all of the layers will be checkpointed and + recomputed. 
The rest of the microbatches within the window of maximum outstanding + microbatches will recompute all layers (either full recompute or selective recompute). If + None, the checkpoint and recompute will be left up to the forward_step function. + + """ + + ################### + # Optimizations + ################### + gradient_accumulation_fusion: bool = False + """If true, fuses weight gradient accumulation to GEMMs. Requires the custom CUDA extension + fused_weight_gradient_mlp_cuda module. To use gradient_accumulation_fusion you must install + APEX with --cpp_ext and --cuda_ext. For example: "pip install --global-option=\"--cpp_ext\" + --global-option=\"--cuda_ext\" ". Note that the extension requires CUDA>=11. Otherwise, you + must turn off gradient accumulation fusion. + """ + + async_tensor_model_parallel_allreduce: bool = False + """NOTE: Deprecated. This flag is ignored.""" + + use_te_rng_tracker: bool = False + """If true, uses RNG state tracker in TransformerEngine if exists. + """ + + tp_comm_overlap: bool = False + """If true, allows overlapping of Linear layer execution with tensor parallel communication + collectives like AllGather/ReduceScatter. Overlapping is done for the linear layers wherever + possible during the forward and the backward pass. + """ + + tp_comm_bulk_wgrad: bool = True + """If true, allows All-Gather overlap with Bprop activation gradient GEMM. Don't care if + tp_comm_overlap is False. + """ + + tp_comm_bulk_dgrad: bool = True + """If true, allows Reduce-Scatter overlap with Bprop weight gradient GEMM. Don't care if + tp_comm_overlap is False. + """ + + tp_comm_overlap_ag: bool = True + """If true, allows All-Gather overlap with GEMM by pipelining the GEMM and All-Gather. + Don't care if tp_comm_overlap is False. + """ + + tp_comm_overlap_rs: bool = True + """If true, allows Reduce-Scatter overlap with GEMM by pipelining the GEMM and Reduce-Scatter. + Don't care if tp_comm_overlap is False. + """ + + tp_comm_overlap_rs_dgrad: bool = False + """If true, allows Reduce-Scatter overlap with DGRAD GEMM by pipelining the + GEMM and Reduce-Scatter splits. Don't care if tp_comm_overlap is False. + """ + + tp_comm_split_ag: bool = True + """Deprecated from TransformerEngine v1.6.0. + If true, allows All-Gather overlap with Fprop GEMM by pipelining the GEMM and All-Gather + splits. Don't care if tp_comm_overlap is False. + """ + + tp_comm_atomic_ag: bool = False + """Deprecated from TransformerEngine v1.6.0. + If true, allows All-Gather overlap with Fprop GEMM by pipelining the GEMM and All-Gather + both done atomically. Don't care if tp_comm_overlap is False. + """ + + tp_comm_split_rs: bool = True + """Deprecated from TransformerEngine v1.6.0. + If true, allows Reduce-Scatter overlap with Fprop GEMM by pipelining the GEMM and + Reduce-Scatter splits. Don't care if tp_comm_overlap is False. + """ + + tp_comm_atomic_rs: bool = False + """Deprecated from TransformerEngine v1.6.0. + If true, allows Reduce-Scatter overlap with Fprop GEMM by pipelining the GEMM and + Reduce-Scatter both done atomically. Don't care if tp_comm_overlap is False. + """ + + cross_entropy_loss_fusion: bool = False + """If this is enabled, the fused cross entropy implementation would be used. + Defaults to False. 
+ """ + + tp_comm_overlap_disable_qkv: bool = False + """ + If true, the AllGather -> Gemm overlap for QKV gets disabled + """ + + tp_comm_overlap_disable_fc1: bool = False + """ + If true, the AllGather -> Gemm overlap for FC1 layer of MLP gets disabled + """ + + tp_comm_bootstrap_backend: str = 'nccl' + """ + Set the bootstrapping backend out of 'nccl', 'mpi', and 'gloo' + """ + + ################### + # Pipeline Parallel + ################### + pipeline_dtype: torch.dtype = None + """dtype used in p2p communication, usually params_dtype""" + + variable_seq_lengths: bool = False + """Support for variable sequence lengths across microbatches. Setting this communicates the size + of tensors during pipeline parallelism communication, because of this extra overhead it + should only be set if the sequence length varies by microbatch within a global batch. + """ + + overlap_p2p_comm: bool = False + """When True some of the peer to peer communication for pipeline parallelism will overlap with + computation. Must be False if batch_p2p_comm is true. + """ + + batch_p2p_comm: bool = True + """Use batch_isend_irecv instead of individual isend/irecv calls. Must be False if + overlap_p2p_comm is True. + """ + + batch_p2p_sync: bool = True + """When using batch_isend_irecv, do a cuda.device.synchronize afterward to work around a bug in + older version of PyTorch. + """ + + use_ring_exchange_p2p: bool = False + """Use custom ring_exchange kernel instead of torch.distributed.batch_isend_irecv(). Requires + custom built torch with torch.distributed.ring_exchange. + """ + + deallocate_pipeline_outputs: bool = False + """If True, output data is deallocated after the tensor is sent to the next pipeline stage. + Helps with saving memory, does nothing when pipeline parallel is not used. + """ + + defer_embedding_wgrad_compute: bool = False + """If true, defers the embedding WGRAD GEMMs while pipeline flush is + taking place enabling us to hide pipeline flush latency. Defaults to False. + """ + + wgrad_deferral_limit: int = 0 + """This value tunes the number of micro-batches for which the embedding weight gradient compute + needs to be deferred to pipeline flush, this argument is invalid if + `defer_embedding_wgrad_compute` is False. + Defaults to 0, which means all micro-batches are deferred. + """ + + pipeline_model_parallel_split_rank: Optional[int] = None + """If int, rank where encoder and decoder should be split in cases where the model has both an + encoder and decoder (e.g., T5). Ignored if None. + """ + + ################### + # CPU Offloading + ################### + cpu_offloading: bool = False + """When set to True, all the activations are offloaded to the CPU asynchronously.""" + + cpu_offloading_num_layers: int = 0 + """Tells the number of transformer layers for which activations has to be offloaded.""" + + _cpu_offloading_context: ContextManager = ( + None + # Used for internal use only, not to be set by a user. + # TODO: Need to move to the 'right' place when possible. + ) + """For internal use only, do not set.""" + + cpu_offloading_activations: bool = True + """If True, offloads the activations to CPU.""" + + cpu_offloading_weights: bool = True + """If True, offloads the weights to CPU.""" + + ################### + # Timing + ################### + barrier_with_L1_time: bool = True + """If true, use barrier with level 1 time measurements. It is up to the user to make sure + calling barrier with their timers will not result in hangs. 
This can happen if for example + the user adds a level 1 timer that is not called by all ranks. + """ + + def __post_init__(self): + """Python dataclass method that is used to modify attributes after initialization. + See https://docs.python.org/3/library/dataclasses.html#post-init-processing for more + details. + """ + if self.sequence_parallel: + if self.tensor_model_parallel_size <= 1: + raise ValueError("Can not use sequence paralllelism without tensor parallelism") + + if self.pipeline_model_parallel_size > 1: + if self.pipeline_dtype is None: + raise ValueError( + "When using pipeline parallelism, pipeline_dtype must be specified" + ) + + if self.autocast_dtype is None: + self.autocast_dtype = self.params_dtype + + if self.defer_embedding_wgrad_compute and self.pipeline_model_parallel_size == 1: + raise ValueError( + "Cannot defer embedding wgrad compute when pipeline model parallel is not used" + ) + + if self.defer_embedding_wgrad_compute and not self.gradient_accumulation_fusion: + raise ValueError( + "Cannot defer embedding wgrad compute when gradient accumulation fusion is not used" + ) + + if self.defer_embedding_wgrad_compute and self.wgrad_deferral_limit < 0: + raise ValueError( + "Wgrad deferral limit should be greater than or equal to 0 when it is enabled!" + ) + + if self.expert_model_parallel_size > 1 and self.tensor_model_parallel_size > 1: + if self.sequence_parallel is False: + raise ValueError( + "When using expert parallelism and tensor parallelism, sequence parallelism " + "must be used" + ) diff --git a/megatron/core/models/T5/__init__.py b/megatron/core/models/T5/__init__.py new file mode 100644 index 0000000000..2551f81e65 --- /dev/null +++ b/megatron/core/models/T5/__init__.py @@ -0,0 +1,2 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +from .t5_model import T5Model diff --git a/megatron/core/models/T5/t5_model.py b/megatron/core/models/T5/t5_model.py new file mode 100644 index 0000000000..bce998c6e8 --- /dev/null +++ b/megatron/core/models/T5/t5_model.py @@ -0,0 +1,422 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +from typing import List, Literal, Optional + +import torch +from torch import Tensor + +from megatron.core import InferenceParams, tensor_parallel +from megatron.core.config_logger import has_config_logger_enabled, log_config_to_disk +from megatron.core.models.common.embeddings.language_model_embedding import LanguageModelEmbedding +from megatron.core.models.common.embeddings.rotary_pos_embedding import RotaryEmbedding +from megatron.core.models.common.language_module.language_module import LanguageModule +from megatron.core.transformer.enums import ModelType +from megatron.core.transformer.module import MegatronModule +from megatron.core.transformer.spec_utils import ModuleSpec +from megatron.core.transformer.transformer_block import TransformerBlock +from megatron.core.transformer.transformer_config import TransformerConfig + + +class T5LMHead(MegatronModule): + """Masked LM head for T5 + + Args: + config (TransformerConfig): transformer config + parallel_output (bool): wether output logits being distributed or not. + vocab_size (int): vocabulary size + pre_process (bool): Include embedding layer + share_embeddings_and_output_weights (bool): When True, input + embeddings and output logit weights are shared. 
+ """ + + def __init__( + self, + config: TransformerConfig, + parallel_output: bool, + vocab_size: int, + pre_process: bool = True, + share_embeddings_and_output_weights: bool = False, + ): + super(T5LMHead, self).__init__(config=config) + + if has_config_logger_enabled(config): + log_config_to_disk(config, locals(), prefix=type(self).__name__) + + self.parallel_output = parallel_output + + self.output_layer = tensor_parallel.ColumnParallelLinear( + config.hidden_size, + vocab_size, + config=config, + init_method=config.init_method, + bias=share_embeddings_and_output_weights, + skip_bias_add=not share_embeddings_and_output_weights, + gather_output=not self.parallel_output, + skip_weight_param_allocation=pre_process and share_embeddings_and_output_weights, + ) + + def forward(self, hidden_states: Tensor, word_embeddings_weight: Tensor) -> Tensor: + """Forward pass. + + Args: + hidden_states (Tensor): output hidden states from decoder + word_embeddings_weight (Tensor): word embedding weight + + Returns: + Tensor: logits tensor + """ + + logits, _ = self.output_layer(hidden_states, weight=word_embeddings_weight) + return logits + + +class T5Model(LanguageModule): + """T5 Language model. + + Args: + config (TransformerConfig): transformer config + + encoder_config (TransformerConfig): encoder transformer config + + transformer_encoder_layer_spec (ModuleSpec): transformer layer + customization specs for encoder + + transformer_decoder_layer_spec (ModuleSpec): transformer layer + customization specs for decoder + + vocab_size (int): vocabulary size + + max_sequence_length (int): maximum size of sequence. This is used for positional embedding + + pre_process (bool): Include embedding layer (used with pipeline parallelism) + + post_process (bool): Include an output layer (used with pipeline parallelism) + + fp16_lm_cross_entropy (bool, optional): Defaults to False + + parallel_output (bool): Do not gather the outputs, + keep them split across tensor parallel ranks + + share_embeddings_and_output_weights (bool): When True, + input embeddings and output logit weights are shared. Defaults to False. + + position_embedding_type (string): Position embedding type. + Options ['learned_absolute', 'rope']. + Defaults is 'learned_absolute'. + + rotary_percent (float): Percent of rotary dimension to use for rotary position embeddings. + Defaults to 1.0 (100%). Ignored unless position_embedding_type is 'rope'. + + seq_len_interpolation_factor (float): scale of linearly interpolating + RoPE for longer sequences. The value must be a float larger than 1.0. + Defaults to None. + + add_encoder (bool): Create the encoder (used with pipeline parallelism). + When using pipelining, the encoder will only be created on a subset + of the pipeline ranks. + + add_decoder (bool): Include an output layer (used with pipeline parallelism). + As with `add_encoder`, when using this model and pipelining, + the decoder will only be created on a subset of the pipeline ranks. 
+ """ + + def __init__( + self, + config: TransformerConfig, + encoder_config: TransformerConfig, + transformer_encoder_layer_spec: ModuleSpec, + transformer_decoder_layer_spec: ModuleSpec, + vocab_size: int, + max_sequence_length: int, + pre_process: bool = True, + post_process: bool = True, + fp16_lm_cross_entropy: bool = False, + parallel_output: bool = True, + share_embeddings_and_output_weights: bool = False, + position_embedding_type: Literal['learned_absolute', 'rope'] = 'learned_absolute', + rotary_percent: float = 1.0, + seq_len_interpolation_factor: Optional[float] = None, + add_encoder: bool = True, + add_decoder: bool = True, + ): + + super(T5Model, self).__init__(config=config) + + self.config: TransformerConfig = config + self.encoder_config: TransformerConfig = encoder_config + self.transformer_encoder_layer_spec: ModuleSpec = transformer_encoder_layer_spec + self.transformer_decoder_layer_spec: ModuleSpec = transformer_decoder_layer_spec + self.vocab_size = vocab_size + self.max_sequence_length = max_sequence_length + self.pre_process = pre_process + self.post_process = post_process + self.add_encoder = add_encoder + self.add_decoder = add_decoder + self.fp16_lm_cross_entropy = fp16_lm_cross_entropy + self.parallel_output = parallel_output + self.share_embeddings_and_output_weights = share_embeddings_and_output_weights + self.position_embedding_type = position_embedding_type + self.encoder_hidden_state = None + + self.model_type = ModelType.encoder_and_decoder + + # Tells schedules.py that this model has a skip connection + # between the encoder's output and the decoder + # (and hence both the encoder and decoder's tensors are required for correct backprop). + self.xattn_needed = True + + # specify the position embeddings as a member + # variable in the T5 class so that they are easy to + # find for `finalize_model_grads._allreduce_position_embedding_grads` + self.position_embeddings = None + if self.pre_process: + self.embedding = LanguageModelEmbedding( + config=self.config, + vocab_size=self.vocab_size, + max_sequence_length=self.max_sequence_length, + position_embedding_type=self.position_embedding_type, + ) + self.position_embeddings = self.embedding.position_embeddings + + # Rotary Position Embeddings + if self.position_embedding_type == 'rope': + self.rotary_pos_emb = RotaryEmbedding( + kv_channels=self.config.kv_channels, + rotary_percent=rotary_percent, + rotary_interleaved=self.config.rotary_interleaved, + seq_len_interpolation_factor=seq_len_interpolation_factor, + use_cpu_initialization=self.config.use_cpu_initialization, + ) + + # Transformer encoder + encoder_spec, decoder_spec = ( + self.transformer_encoder_layer_spec, + self.transformer_decoder_layer_spec, + ) + if self.add_encoder: + self.encoder = TransformerBlock( + config=self.encoder_config, + spec=encoder_spec, + pre_process=self.pre_process, + post_process=self.post_process, + ) + else: + self.encoder = None + + if self.add_decoder: + # Transformer decoder + self.decoder = TransformerBlock( + config=self.config, + spec=decoder_spec, + pre_process=self.pre_process, + post_process=self.post_process, + ) + else: + self.decoder = None + + # Output + if post_process: + self.lm_head = T5LMHead( + config, + parallel_output, + self.vocab_size, + self.pre_process, + self.share_embeddings_and_output_weights, + ) + self.output_layer = self.lm_head.output_layer + + if self.pre_process or self.post_process: + self.setup_embeddings_and_output_layer() + + def forward( + self, + encoder_input_ids: Tensor, + 
decoder_input_ids: Tensor, + encoder_attn_mask: Tensor, + decoder_attn_mask: Tensor, + encoder_decoder_attn_mask: Tensor, + lm_labels: Tensor = None, + encoder_hidden_states: Tensor = None, + output_encoder_hidden_only: bool = False, + inference_params: InferenceParams = None, + ) -> Tensor: + """Forward pass. + + Args: + encoder_input_ids (Tensor): input ids for encoder + decoder_input_ids (Tensor): input ids for decoder + encoder_attn_mask (Tensor): self-attention mask for encoder + decoder_attn_mask (Tensor): self-attention mask for decoder + encoder_decoder_attn_mask (Tensor): cross-attention mask between encoder and decoder + lm_labels (Tensor): labels for decoder output + inference_params (InferenceParams): relevant arguments for inferencing + + Returns: + Tensor: loss tensor + """ + + (encoder_attn_mask, decoder_attn_mask, encoder_decoder_attn_mask) = ( + t5_extended_attention_mask( + [encoder_attn_mask, decoder_attn_mask, encoder_decoder_attn_mask] + ) + ) + + ## Encoder forward + if encoder_hidden_states is None: + + # Encoder position ids + encoder_position_ids = t5_position_ids(encoder_input_ids) + + # Encoder embedding. + if self.pre_process: + encoder_input = self.embedding( + input_ids=encoder_input_ids, position_ids=encoder_position_ids + ) + else: + # intermediate stage of pipeline + encoder_input = None + + # Rotary positional embeddings + rotary_pos_emb = None + if self.position_embedding_type == 'rope': + rotary_seq_len = self.rotary_pos_emb.get_rotary_seq_len( + inference_params, self.encoder, encoder_input, self.config + ) + rotary_pos_emb = self.rotary_pos_emb(rotary_seq_len) + + # Run encoder. + if self.add_encoder: + encoder_hidden_states = self.encoder( + hidden_states=encoder_input, + attention_mask=encoder_attn_mask, + inference_params=inference_params, + rotary_pos_emb=rotary_pos_emb, + ) + else: + encoder_hidden_states = self.encoder_hidden_state + + if not self.add_decoder or output_encoder_hidden_only: + return encoder_hidden_states + + ## Decoder forward + # Decoder position ids + decoder_position_ids = t5_position_ids(decoder_input_ids) + + # Decoder embedding. + if self.pre_process: + decoder_input = self.embedding( + input_ids=decoder_input_ids, position_ids=decoder_position_ids + ) + else: + # intermediate stage of pipeline + decoder_input = None ### should it take encoder_hidden_states + + # Rotary positional embeddings + rotary_pos_emb = None + if self.position_embedding_type == 'rope': + rotary_seq_len = self.rotary_pos_emb.get_rotary_seq_len( + inference_params, self.decoder, decoder_input, self.config + ) + rotary_pos_emb = self.rotary_pos_emb(rotary_seq_len) + + # Run decoder. 
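+        # The decoder attends to its own (causally masked) inputs via self-attention and to
+        # the encoder output via cross-attention: `context` receives `encoder_hidden_states`
+        # and `context_mask` receives the encoder-decoder mask built by
+        # t5_extended_attention_mask above. The result is in [s, b, h] (sequence-first) format.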
+ decoder_hidden_states = self.decoder( + hidden_states=decoder_input, + attention_mask=decoder_attn_mask, + context=encoder_hidden_states, + context_mask=encoder_decoder_attn_mask, + inference_params=inference_params, + rotary_pos_emb=rotary_pos_emb, + ) + + if self.post_process: + lm_logits = self.lm_head( + decoder_hidden_states, self.shared_embedding_or_output_weight() + ) + if lm_labels is None: + # [s b h] => [b s h] + return lm_logits.transpose(0, 1).contiguous() + else: + # [b s] => [s b] + lm_loss = self.compute_language_model_loss(lm_labels, lm_logits) + return lm_loss + else: + return decoder_hidden_states + + def set_input_tensor(self, input_tensor): + """See megatron.model.transformer.set_input_tensor()""" + + # This is usually handled in schedules.py but some inference code still + # gives us non-lists or None + if not isinstance(input_tensor, list): + input_tensor = [input_tensor] + + if self.add_encoder and self.add_decoder: + assert ( + len(input_tensor) == 1 + ), 'input_tensor should only be length 1 for stage with both encoder and decoder' + self.encoder.set_input_tensor(input_tensor[0]) + elif self.add_encoder: + assert ( + len(input_tensor) == 1 + ), 'input_tensor should only be length 1 for stage with only encoder' + self.encoder.set_input_tensor(input_tensor[0]) + elif self.add_decoder: + if len(input_tensor) == 2: + self.decoder.set_input_tensor(input_tensor[0]) + self.encoder_hidden_state = input_tensor[1] + elif len(input_tensor) == 1: + self.decoder.set_input_tensor(None) + self.encoder_hidden_state = input_tensor[0] + else: + raise Exception('input_tensor must have either length 1 or 2') + else: + raise Exception('Stage must have at least either encoder or decoder') + + def shared_embedding_or_output_weight(self) -> Tensor: + """Function to share the input embeddings and output logit weights.""" + + if self.pre_process: + return self.embedding.word_embeddings.weight + elif self.post_process: + return self.lm_head.output_layer.weight + return None + + +def t5_extended_attention_mask(attention_mask_list: List[Tensor]) -> List[Tensor]: + """Creates the extended attention mask + + Converts the attention mask of dimension [batch size, seq_len, seq_len] + to [batch size, 1, seq_len, seq_len] + + Args: + attention_mask (Tensor): The input attention mask + + Returns: + Tensor: The extended binary attention mask + """ + + def attn_mask_postprocess(attn_mask): + # [b, 1, s, s] + extended_attention_mask = attn_mask.unsqueeze(1) + return extended_attention_mask + + return [ + (attn_mask_postprocess(attn_mask) if attn_mask is not None else None) + for attn_mask in attention_mask_list + ] + + +def t5_position_ids(token_ids: Tensor) -> Tensor: + """Calculate position ids from token ids + Args: + token_ids (Tensor): input tokens + + Returns: + Tensor: position ids + """ + seq_length = token_ids.size(1) + position_ids = torch.arange(seq_length, dtype=torch.long, device=token_ids.device) + position_ids = position_ids.unsqueeze(0).expand_as(token_ids) + + return position_ids diff --git a/megatron/core/models/T5/t5_spec.py b/megatron/core/models/T5/t5_spec.py new file mode 100644 index 0000000000..ecdcdbc260 --- /dev/null +++ b/megatron/core/models/T5/t5_spec.py @@ -0,0 +1,248 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
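+# Layer/block specs for the Megatron-Core T5 model. Illustrative usage of the getters
+# defined below (the layer counts here are placeholders, not values from this module):
+#
+#     encoder_block_spec = get_t5_encoder_with_transformer_engine_block_spec(num_layers=12)
+#     decoder_block_spec = get_t5_decoder_with_transformer_engine_block_spec(num_layers=12)
+#
+# The resulting block specs can then be passed to T5Model as the encoder/decoder layer specs.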
+from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add +from megatron.core.tensor_parallel.layers import ColumnParallelLinear, RowParallelLinear +from megatron.core.transformer.attention import ( + CrossAttention, + CrossAttentionSubmodules, + SelfAttention, + SelfAttentionSubmodules, +) +from megatron.core.transformer.dot_product_attention import DotProductAttention +from megatron.core.transformer.enums import AttnMaskType +from megatron.core.transformer.identity_op import IdentityOp +from megatron.core.transformer.mlp import MLP, MLPSubmodules +from megatron.core.transformer.spec_utils import ModuleSpec +from megatron.core.transformer.transformer_block import TransformerBlockSubmodules +from megatron.core.transformer.transformer_layer import TransformerLayer, TransformerLayerSubmodules + +try: + from megatron.core.extensions.transformer_engine import ( + TEColumnParallelLinear, + TEDotProductAttention, + TELayerNormColumnParallelLinear, + TENorm, + TERowParallelLinear, + ) + + HAVE_TE = True +except ImportError: + HAVE_TE = False + +try: + import apex # pylint: disable=unused-import + + from megatron.core.fusions.fused_layer_norm import FusedLayerNorm + + HAVE_APEX = True + LNImpl = FusedLayerNorm +except ImportError: + import warnings + + from megatron.core.transformer.torch_layer_norm import WrappedTorchLayerNorm + + warnings.warn(f'Apex is not installed. Falling back to Torch LayerNorm') + LNImpl = WrappedTorchLayerNorm + + +def encoder_model_with_transformer_engine_default_spec() -> ModuleSpec: + """T5 encoder TE spec (uses Transformer Engine components).""" + + return ModuleSpec( + module=TransformerLayer, + submodules=TransformerLayerSubmodules( + self_attention=ModuleSpec( + module=SelfAttention, + params={"attn_mask_type": AttnMaskType.arbitrary}, + submodules=SelfAttentionSubmodules( + linear_qkv=TELayerNormColumnParallelLinear, + core_attention=TEDotProductAttention, + linear_proj=TERowParallelLinear, + q_layernorm=IdentityOp, + k_layernorm=IdentityOp, + ), + ), + self_attn_bda=get_bias_dropout_add, + mlp=ModuleSpec( + module=MLP, + submodules=MLPSubmodules( + linear_fc1=TELayerNormColumnParallelLinear, linear_fc2=TERowParallelLinear + ), + ), + mlp_bda=get_bias_dropout_add, + ), + ) + + +def decoder_model_with_transformer_engine_default_spec() -> ModuleSpec: + """T5 decoder TE spec (uses Transformer Engine components).""" + + return ModuleSpec( + module=TransformerLayer, + submodules=TransformerLayerSubmodules( + self_attention=ModuleSpec( + module=SelfAttention, + params={"attn_mask_type": AttnMaskType.causal}, + submodules=SelfAttentionSubmodules( + linear_qkv=TELayerNormColumnParallelLinear, + core_attention=TEDotProductAttention, + linear_proj=TERowParallelLinear, + q_layernorm=IdentityOp, + k_layernorm=IdentityOp, + ), + ), + self_attn_bda=get_bias_dropout_add, + pre_cross_attn_layernorm=TENorm, + cross_attention=ModuleSpec( + module=CrossAttention, + params={"attn_mask_type": AttnMaskType.arbitrary}, + submodules=CrossAttentionSubmodules( + linear_q=TEColumnParallelLinear, + linear_kv=TEColumnParallelLinear, + core_attention=TEDotProductAttention, + linear_proj=TERowParallelLinear, + ), + ), + cross_attn_bda=get_bias_dropout_add, + mlp=ModuleSpec( + module=MLP, + submodules=MLPSubmodules( + linear_fc1=TELayerNormColumnParallelLinear, linear_fc2=TERowParallelLinear + ), + ), + mlp_bda=get_bias_dropout_add, + ), + ) + + +def encoder_model_with_local_spec() -> ModuleSpec: + """T5 encoder local spec (uses Megatron-Core components).""" + + return 
ModuleSpec( + module=TransformerLayer, + submodules=TransformerLayerSubmodules( + input_layernorm=LNImpl, + self_attention=ModuleSpec( + module=SelfAttention, + params={"attn_mask_type": AttnMaskType.arbitrary}, + submodules=SelfAttentionSubmodules( + linear_qkv=ColumnParallelLinear, + core_attention=DotProductAttention, + linear_proj=RowParallelLinear, + q_layernorm=IdentityOp, + k_layernorm=IdentityOp, + ), + ), + self_attn_bda=get_bias_dropout_add, + pre_mlp_layernorm=LNImpl, + mlp=ModuleSpec( + module=MLP, + submodules=MLPSubmodules( + linear_fc1=ColumnParallelLinear, linear_fc2=RowParallelLinear + ), + ), + mlp_bda=get_bias_dropout_add, + sharded_state_dict_keys_map={ + 'input_layernorm.': 'self_attention.linear_qkv.layer_norm_', + 'pre_mlp_layernorm.': 'mlp.linear_fc1.layer_norm_', + }, + ), + ) + + +def decoder_model_with_local_spec() -> ModuleSpec: + """T5 decoder local spec (uses Megatron-Core components).""" + + return ModuleSpec( + module=TransformerLayer, + submodules=TransformerLayerSubmodules( + input_layernorm=LNImpl, + self_attention=ModuleSpec( + module=SelfAttention, + params={"attn_mask_type": AttnMaskType.causal}, + submodules=SelfAttentionSubmodules( + linear_qkv=ColumnParallelLinear, + core_attention=DotProductAttention, + linear_proj=RowParallelLinear, + q_layernorm=IdentityOp, + k_layernorm=IdentityOp, + ), + ), + self_attn_bda=get_bias_dropout_add, + pre_cross_attn_layernorm=LNImpl, + cross_attention=ModuleSpec( + module=CrossAttention, + params={"attn_mask_type": AttnMaskType.arbitrary}, + submodules=CrossAttentionSubmodules( + linear_q=ColumnParallelLinear, + linear_kv=ColumnParallelLinear, + core_attention=DotProductAttention, + linear_proj=RowParallelLinear, + ), + ), + cross_attn_bda=get_bias_dropout_add, + pre_mlp_layernorm=LNImpl, + mlp=ModuleSpec( + module=MLP, + submodules=MLPSubmodules( + linear_fc1=ColumnParallelLinear, linear_fc2=RowParallelLinear + ), + ), + mlp_bda=get_bias_dropout_add, + sharded_state_dict_keys_map={ + 'input_layernorm.': 'self_attention.linear_qkv.layer_norm_', + 'pre_mlp_layernorm.': 'mlp.linear_fc1.layer_norm_', + }, + ), + ) + + +def get_t5_encoder_with_transformer_engine_block_spec( + num_layers: int, +) -> TransformerBlockSubmodules: + """T5 encoder block spec for Transformer Engine + + Args: + config (TransformerConfig): config, containing number of layers for encoder + """ + + layer_spec = encoder_model_with_transformer_engine_default_spec() + block_spec = TransformerBlockSubmodules([layer_spec] * num_layers, layer_norm=TENorm) + return block_spec + + +def get_t5_decoder_with_transformer_engine_block_spec( + num_layers: int, +) -> TransformerBlockSubmodules: + """T5 decoder block spec for Transformer Engine + + Args: + config (TransformerConfig): config, containing number of layers for decoder + """ + + layer_spec = decoder_model_with_transformer_engine_default_spec() + block_spec = TransformerBlockSubmodules([layer_spec] * num_layers, layer_norm=TENorm) + return block_spec + + +def get_t5_encoder_with_local_block_spec(num_layers: int) -> TransformerBlockSubmodules: + """T5 encoder block spec for local (uses Megatron-Core components) + + Args: + num_layers (int): number of encoder layers + """ + + layer_spec = encoder_model_with_local_spec() + block_spec = TransformerBlockSubmodules([layer_spec] * num_layers, layer_norm=TENorm) + return block_spec + + +def get_t5_decoder_with_local_block_spec(num_layers: int) -> TransformerBlockSubmodules: + """T5 decoder block spec for local (uses Megatron-Core components) + + Args: + 
num_layers (int): number of decoder layers + """ + + layer_spec = decoder_model_with_local_spec() + block_spec = TransformerBlockSubmodules([layer_spec] * num_layers, layer_norm=TENorm) + return block_spec diff --git a/megatron/core/models/__init__.py b/megatron/core/models/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/megatron/core/models/bert/__init__.py b/megatron/core/models/bert/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/megatron/core/models/bert/bert_layer_specs.py b/megatron/core/models/bert/bert_layer_specs.py new file mode 100644 index 0000000000..cd51c124c9 --- /dev/null +++ b/megatron/core/models/bert/bert_layer_specs.py @@ -0,0 +1,92 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add +from megatron.core.tensor_parallel.layers import ColumnParallelLinear, RowParallelLinear +from megatron.core.transformer.attention import SelfAttention, SelfAttentionSubmodules +from megatron.core.transformer.dot_product_attention import DotProductAttention +from megatron.core.transformer.enums import AttnMaskType +from megatron.core.transformer.identity_op import IdentityOp +from megatron.core.transformer.mlp import MLP, MLPSubmodules +from megatron.core.transformer.spec_utils import ModuleSpec +from megatron.core.transformer.transformer_layer import TransformerLayer, TransformerLayerSubmodules + +try: + from megatron.core.extensions.transformer_engine import ( + TEDotProductAttention, + TELayerNormColumnParallelLinear, + TERowParallelLinear, + ) + + HAVE_TE = True +except ImportError: + HAVE_TE = False + +try: + import apex # pylint: disable=unused-import + + from megatron.core.fusions.fused_layer_norm import FusedLayerNorm + + HAVE_APEX = True + LNImpl = FusedLayerNorm +except ImportError: + import warnings + + from megatron.core.transformer.torch_layer_norm import WrappedTorchLayerNorm + + warnings.warn(f'Apex is not installed. 
Falling back to Torch LayerNorm') + LNImpl = WrappedTorchLayerNorm + +# Use this spec to use lower level Transformer Engine modules (required for fp8 training) +bert_layer_with_transformer_engine_spec = ModuleSpec( + module=TransformerLayer, + submodules=TransformerLayerSubmodules( + self_attention=ModuleSpec( + module=SelfAttention, + params={"attn_mask_type": AttnMaskType.padding}, + submodules=SelfAttentionSubmodules( + linear_qkv=TELayerNormColumnParallelLinear, + core_attention=TEDotProductAttention, + linear_proj=TERowParallelLinear, + q_layernorm=IdentityOp, + k_layernorm=IdentityOp, + ), + ), + self_attn_bda=get_bias_dropout_add, + mlp=ModuleSpec( + module=MLP, + submodules=MLPSubmodules( + linear_fc1=TELayerNormColumnParallelLinear, linear_fc2=TERowParallelLinear + ), + ), + mlp_bda=get_bias_dropout_add, + ), +) + +# Use this spec for an implementation using only modules in megatron core +bert_layer_local_spec = ModuleSpec( + module=TransformerLayer, + submodules=TransformerLayerSubmodules( + input_layernorm=LNImpl, + self_attention=ModuleSpec( + module=SelfAttention, + params={"attn_mask_type": AttnMaskType.padding}, + submodules=SelfAttentionSubmodules( + linear_qkv=ColumnParallelLinear, + core_attention=DotProductAttention, + linear_proj=RowParallelLinear, + q_layernorm=IdentityOp, + k_layernorm=IdentityOp, + ), + ), + self_attn_bda=get_bias_dropout_add, + pre_mlp_layernorm=LNImpl, + mlp=ModuleSpec( + module=MLP, + submodules=MLPSubmodules(linear_fc1=ColumnParallelLinear, linear_fc2=RowParallelLinear), + ), + mlp_bda=get_bias_dropout_add, + sharded_state_dict_keys_map={ + 'input_layernorm.': 'self_attention.linear_qkv.layer_norm_', + 'pre_mlp_layernorm.': 'mlp.linear_fc1.layer_norm_', + }, + ), +) diff --git a/megatron/core/models/bert/bert_lm_head.py b/megatron/core/models/bert/bert_lm_head.py new file mode 100644 index 0000000000..fd26ebd16f --- /dev/null +++ b/megatron/core/models/bert/bert_lm_head.py @@ -0,0 +1,54 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +import torch +from torch import Tensor + +from megatron.core.transformer.module import MegatronModule +from megatron.core.transformer.transformer_config import TransformerConfig +from megatron.core.transformer.utils import get_linear_layer + +try: + import apex + + from megatron.core.fusions.fused_layer_norm import FusedLayerNorm + + HAVE_APEX = True + LNImpl = FusedLayerNorm +except ImportError: + import warnings + + from megatron.core.transformer.torch_layer_norm import WrappedTorchLayerNorm + + warnings.warn(f'Apex is not installed. Falling back to Torch LayerNorm') + LNImpl = WrappedTorchLayerNorm + + +class BertLMHead(MegatronModule): + """Masked LM head for Bert. + + Args: + hidden_size: hidden size + config (TransformerConfig): TransformerConfig object + """ + + def __init__(self, hidden_size: int, config: TransformerConfig): + super().__init__(config=config) + + # TODO: Should switch this to TE ? 
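+        # The head transforms the encoder output with dense -> GELU -> LayerNorm (see
+        # forward() below); the vocabulary projection itself lives in BertModel.output_layer.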
+ self.dense = get_linear_layer( + hidden_size, hidden_size, config.init_method, config.perform_initialization + ) + + setattr(self.dense.weight, 'sequence_parallel', config.sequence_parallel) + setattr(self.dense.bias, 'sequence_parallel', config.sequence_parallel) + + self.layer_norm = LNImpl( + config=config, hidden_size=hidden_size, eps=config.layernorm_epsilon + ) + + self.gelu = torch.nn.functional.gelu + + def forward(self, hidden_states: Tensor) -> Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.gelu(hidden_states) + hidden_states = self.layer_norm(hidden_states) + return hidden_states diff --git a/megatron/core/models/bert/bert_model.py b/megatron/core/models/bert/bert_model.py new file mode 100644 index 0000000000..eb08d4cfd6 --- /dev/null +++ b/megatron/core/models/bert/bert_model.py @@ -0,0 +1,366 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +import os +import warnings +from typing import Literal, Optional + +import torch +from torch import Tensor + +from megatron.core import parallel_state, tensor_parallel +from megatron.core.config_logger import has_config_logger_enabled, log_config_to_disk +from megatron.core.models.bert.bert_layer_specs import bert_layer_local_spec +from megatron.core.models.bert.bert_lm_head import BertLMHead +from megatron.core.models.bert.pooler import Pooler +from megatron.core.models.common.embeddings.language_model_embedding import LanguageModelEmbedding +from megatron.core.models.common.embeddings.rotary_pos_embedding import RotaryEmbedding +from megatron.core.models.common.language_module.language_module import LanguageModule +from megatron.core.transformer.enums import AttnMaskType, ModelType +from megatron.core.transformer.spec_utils import ModuleSpec +from megatron.core.transformer.transformer_block import TransformerBlock +from megatron.core.transformer.transformer_config import TransformerConfig +from megatron.core.transformer.utils import get_linear_layer +from megatron.core.utils import get_te_version as _get_te_version +from megatron.core.utils import is_te_min_version + + +def get_te_version(): + """Included for backwards compatibility.""" + warnings.warn("`get_te_version` will be deprecated in a future release") + return _get_te_version() + + +class BertModel(LanguageModule): + """Transformer language model. + + Args: + config (TransformerConfig): transformer config + num_tokentypes (int) : Set to 2 when args.bert_binary_head is True, and 0 otherwise. + Defaults to 0. + transformer_layer_spec (ModuleSpec): Specifies module to use for transformer layers + vocab_size (int): vocabulary size + max_sequence_length (int): maximum size of sequence. This is used for positional embedding + pre_process (bool): Include embedding layer (used with pipeline parallelism) + post_process (bool): Include an output layer (used with pipeline parallelism) + parallel_output (bool): Do not gather the outputs, keep them split across tensor parallel + ranks + share_embeddings_and_output_weights (bool): When True, input embeddings and output logit + weights are shared. Defaults to False. + position_embedding_type (string): Position embedding type. + Options ['learned_absolute', 'rope']. Defaults is 'learned_absolute'. + rotary_percent (float): Percent of rotary dimension to use for rotary position embeddings. + Defaults to 1.0 (100%). Ignored unless position_embedding_type is 'rope'. 
+ """ + + def __init__( + self, + config: TransformerConfig, + num_tokentypes: int, + transformer_layer_spec: ModuleSpec, + vocab_size: int, + max_sequence_length: int, + pre_process: bool = True, + post_process: bool = True, + fp16_lm_cross_entropy: bool = False, + parallel_output: bool = True, + share_embeddings_and_output_weights: bool = False, + position_embedding_type: Literal['learned_absolute', 'rope'] = 'learned_absolute', + rotary_percent: float = 1.0, + seq_len_interpolation_factor: Optional[float] = None, + add_binary_head=True, + return_embeddings=False, + ): + super(BertModel, self).__init__(config=config) + + if has_config_logger_enabled(config): + log_config_to_disk(config, locals(), prefix=type(self).__name__) + + if return_embeddings: + assert self.post_process and self.add_binary_head + + self.config: TransformerConfig = config + self.transformer_layer_spec: ModuleSpec = transformer_layer_spec + self.vocab_size = vocab_size + self.max_sequence_length = max_sequence_length + self.pre_process = pre_process + self.post_process = post_process + self.fp16_lm_cross_entropy = fp16_lm_cross_entropy + self.parallel_output = parallel_output + self.share_embeddings_and_output_weights = share_embeddings_and_output_weights + self.position_embedding_type = position_embedding_type + self.add_binary_head = add_binary_head + self.return_embeddings = return_embeddings + + # megatron core pipelining currently depends on model type + self.model_type = ModelType.encoder_or_decoder + + self.attn_mask_dimensions = self._sanity_check_attention_and_get_attn_mask_dimension() + + # Embeddings. + if self.pre_process: + self.embedding = LanguageModelEmbedding( + config=self.config, + vocab_size=self.vocab_size, + max_sequence_length=self.max_sequence_length, + position_embedding_type=position_embedding_type, + num_tokentypes=num_tokentypes, + ) + + if self.position_embedding_type == 'rope': + self.rotary_pos_emb = RotaryEmbedding( + kv_channels=self.config.kv_channels, + rotary_percent=rotary_percent, + rotary_interleaved=self.config.rotary_interleaved, + seq_len_interpolation_factor=seq_len_interpolation_factor, + use_cpu_initialization=self.config.use_cpu_initialization, + ) + + # Transformer. + self.encoder = TransformerBlock( + config=self.config, + spec=self.transformer_layer_spec, + pre_process=self.pre_process, + post_process=self.post_process, + ) + + # Output + if post_process: + # TODO: Make sure you are passing in the mpu_vocab_size properly + self.lm_head = BertLMHead(config.hidden_size, config) + + self.output_layer = tensor_parallel.ColumnParallelLinear( + config.hidden_size, + self.vocab_size, + config=config, + init_method=config.init_method, + bias=True, + skip_bias_add=False, + gather_output=not self.parallel_output, + skip_weight_param_allocation=pre_process and share_embeddings_and_output_weights, + ) + + self.binary_head = None + if self.add_binary_head: + # TODO: Shoudl switch this to TE ? + self.binary_head = get_linear_layer( + config.hidden_size, 2, config.init_method, config.perform_initialization + ) + + self.pooler = Pooler( + config.hidden_size, config.init_method, config, config.sequence_parallel + ) + + if self.pre_process or self.post_process: + self.setup_embeddings_and_output_layer() + + # pylint: disable=line-too-long + def _sanity_check_attention_and_get_attn_mask_dimension(self) -> str: + """We do some checks and return attention mask dimensions for self attention + + Transformer engine library underwent a lot of change. 
So we need to change the dimensions of
+        the attention mask depending on the TE version. We also sanity check some arguments.
+
+        1. With the local layer spec, the attention mask dimension is [b,1,s,s]
+        2. With Transformer Engine >= 1.10, all 3 backends are supported with a padding mask and [b,1,1,s]
+        3. With Transformer Engine >= 1.7 but < 1.10
+            a) flash and fused attention use a padding mask with [b,1,1,s]
+            b) unfused attention works with an arbitrary mask with [b,1,s,s]
+        4. With Transformer Engine < 1.7
+            flash and fused attention are not supported; unfused attention works with a padding mask and [b,1,s,s]
+
+        By default (no NVTE_*_ATTN environment flags set), the fused path is used for
+        Transformer Engine versions >= 1.7 and the unfused path otherwise.
+
+        Returns:
+            str: A string showing the format of the attn mask dimensions
+        """
+        attn_mask_dimensions = None
+        # For the local layer spec we just use b1ss
+        if self.transformer_layer_spec == bert_layer_local_spec:
+            attn_mask_dimensions = "b1ss"
+        else:
+            attn_mask_type = self.transformer_layer_spec.submodules.self_attention.params[
+                'attn_mask_type'
+            ]
+            flash_attention_enabled = os.getenv('NVTE_FLASH_ATTN') == '1'
+            fused_attention_enabled = os.getenv('NVTE_FUSED_ATTN') == '1'
+            # For TE >= 1.10 we always use a padding mask and b11s
+            if is_te_min_version("1.10.0"):
+                attn_mask_dimensions = "b11s"
+                if attn_mask_type != AttnMaskType.padding:
+                    warnings.warn(
+                        f'For TE versions >= 1.10, flash/fused/unfused support padding mask. Setting attention mask from {attn_mask_type} to padding'
+                    )
+                    self.transformer_layer_spec.submodules.self_attention.params[
+                        'attn_mask_type'
+                    ] = AttnMaskType.padding
+            # For 1.7 <= TE < 1.10 the flash and fused paths use a padding mask with b11s and the unfused path uses an arbitrary mask with b1ss
+            elif is_te_min_version("1.7.0"):
+                if flash_attention_enabled or fused_attention_enabled:
+                    attn_mask_dimensions = "b11s"
+                else:
+                    if attn_mask_type != AttnMaskType.arbitrary:
+                        warnings.warn(
+                            f'For TE versions >= 1.7 but < 1.10, unfused path supports only arbitrary mask. Setting attention mask from {attn_mask_type} to arbitrary'
+                        )
+                        self.transformer_layer_spec.submodules.self_attention.params[
+                            'attn_mask_type'
+                        ] = AttnMaskType.arbitrary
+                    attn_mask_dimensions = "b1ss"
+            # For TE < 1.7 we only support unfused attention with b1ss and a padding mask
+            else:
+                attn_mask_dimensions = "b1ss"
+                assert not flash_attention_enabled and not fused_attention_enabled, (
+                    "Flash and fused attention is not supported with transformer engine version "
+                    "< 1.7. Set NVTE_FLASH_ATTN=0 and NVTE_FUSED_ATTN=0 or upgrade transformer "
+                    "engine >= 1.7"
+                )
+
+        return attn_mask_dimensions
+
+    def bert_extended_attention_mask(self, attention_mask: Tensor) -> Tensor:
+        """Creates the extended attention mask
+
+        Converts the attention mask of dimension
+        [batch size, seq length] to [batch size, 1, seq length, seq length]
+        or [batch size, 1, 1, seq length] and makes it binary
+
+        Args:
+            attention_mask (Tensor): The input attention mask
+
+        Returns:
+            Tensor: The extended binary attention mask
+        """
+        # We create a 3D attention mask from a 2D tensor mask.
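+        # Worked example (illustrative values): for attention_mask = [[1, 1, 0]] (b=1, s=3),
+        # the "b1ss" branch forms the per-example outer product
+        # [[[[1, 1, 0], [1, 1, 0], [0, 0, 0]]]] of shape [1, 1, 3, 3], while the "b11s"
+        # branch just unsqueezes to shape [1, 1, 1, 3]. The `< 0.5` comparison below then
+        # flips the convention, so True marks positions that must NOT be attended to.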
+ if self.attn_mask_dimensions == "b1ss": + # [b, 1, s] + attention_mask_b1s = attention_mask.unsqueeze(1) + # [b, s, 1] + attention_mask_bs1 = attention_mask.unsqueeze(2) + # [b, s, s] + attention_mask_bss = attention_mask_b1s * attention_mask_bs1 + # [b, 1, s, s] + extended_attention_mask = attention_mask_bss.unsqueeze(1) + else: + # [b, 1, 1, s] + extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(1) + + # Convert attention mask to binary: + extended_attention_mask = extended_attention_mask < 0.5 + + return extended_attention_mask + + def bert_position_ids(self, token_ids): + """Position ids for bert model""" + # Create position ids + seq_length = token_ids.size(1) + position_ids = torch.arange(seq_length, dtype=torch.long, device=token_ids.device) + position_ids = position_ids.unsqueeze(0).expand_as(token_ids) + + return position_ids + + def set_input_tensor(self, input_tensor: Tensor) -> None: + """Sets input tensor to the model. + + See megatron.model.transformer.set_input_tensor() + + Args: + input_tensor (Tensor): Sets the input tensor for the model. + """ + # This is usually handled in schedules.py but some inference code still + # gives us non-lists or None + if not isinstance(input_tensor, list): + input_tensor = [input_tensor] + + assert len(input_tensor) == 1, 'input_tensor should only be length 1 for gpt/bert' + self.encoder.set_input_tensor(input_tensor[0]) + + def forward( + self, + input_ids: Tensor, + attention_mask: Tensor, + tokentype_ids: Tensor = None, + lm_labels: Tensor = None, + inference_params=None, + ): + """Forward function of BERT model + + Forward function of the BERT Model This function passes the input tensors + through the embedding layer, and then the encoder and finally into the post + processing layer (optional). + + It either returns the Loss values if labels are given or the final hidden units + """ + extended_attention_mask = self.bert_extended_attention_mask(attention_mask) + + if parallel_state.is_pipeline_first_stage(): + input_ids = input_ids + position_ids = self.bert_position_ids(input_ids) + else: + position_ids = None + input_ids = None + + # Encoder embedding. + if self.pre_process: + encoder_input = self.embedding( + input_ids=input_ids, position_ids=position_ids, tokentype_ids=tokentype_ids + ) + else: + # intermediate stage of pipeline + # encoder will get hidden_states from encoder.input_tensor + encoder_input = None + + # Rotary positional embeddings (Why not move this into BERT/GPTEmberdding ?) + rotary_pos_emb = None + if self.position_embedding_type == 'rope': + rotary_seq_len = self.rotary_pos_emb.get_rotary_seq_len( + inference_params, self.encoder, encoder_input, self.config + ) + rotary_pos_emb = self.rotary_pos_emb(rotary_seq_len) + + # Run encoder. + hidden_states = self.encoder( + hidden_states=encoder_input, + attention_mask=extended_attention_mask, + inference_params=inference_params, + rotary_pos_emb=rotary_pos_emb, + ) + if not self.post_process: + return hidden_states + + if self.add_binary_head: + pooled_output = self.pooler(hidden_states, 0) + + if self.return_embeddings: + embeddings = torch.transpose(hidden_states, 0, 1) + masks = torch.sum(attention_mask, dim=1) + # Collect masked embeddings. 
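+                # Each sequence is mean-pooled over tokens 1 .. mask-1 (dropping the leading
+                # [CLS]-style token and the final valid token), yielding one float32 vector of
+                # size [hidden_size] per example on the current CUDA device.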
+ output = torch.zeros( + size=(embeddings.shape[0], embeddings.shape[2]), + dtype=torch.float32, + device=torch.cuda.current_device(), + ) + for i, (embedding, mask) in enumerate(zip(embeddings, masks)): + output[i, :] = torch.mean(embedding[1 : mask - 1], dim=0) + return output + + # logits and loss + output_weight = None + if self.share_embeddings_and_output_weights: + output_weight = self.shared_embedding_or_output_weight() + + hidden_states_after_lm_head = self.lm_head(hidden_states=hidden_states) + logits, _ = self.output_layer(hidden_states_after_lm_head, weight=output_weight) + + binary_logits = None + if self.binary_head is not None: + binary_logits = self.binary_head(pooled_output) + + if lm_labels is None: + # [s b h] => [b s h] + return logits.transpose(0, 1).contiguous(), binary_logits + + loss = self.compute_language_model_loss(lm_labels, logits) + + return loss, binary_logits diff --git a/megatron/core/models/bert/pooler.py b/megatron/core/models/bert/pooler.py new file mode 100644 index 0000000000..e0de1a845a --- /dev/null +++ b/megatron/core/models/bert/pooler.py @@ -0,0 +1,52 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +import torch +from torch import Tensor + +from megatron.core import tensor_parallel +from megatron.core.transformer.module import MegatronModule +from megatron.core.transformer.transformer_config import TransformerConfig +from megatron.core.transformer.utils import get_linear_layer + + +class Pooler(MegatronModule): + """Pooler layer. + + Pool hidden states of a specific token (for example start of the + sequence) and add a linear transformation followed by a tanh. + + Args: + hidden_size (int): The hidden size_ + init_method (callable): weight initialization method for the linear layer. bias is set to zero. + config (TransformerConfig): The transformer configuration + sequence_parallel (bool): Using squence parallel ? Defaults to False + """ + + def __init__( + self, + hidden_size: int, + init_method: callable, + config: TransformerConfig, + sequence_parallel: bool = False, + ): + super(Pooler, self).__init__(config) + # TODO: Shoudl switch this to TE ? + self.dense = get_linear_layer( + hidden_size, hidden_size, init_method, config.perform_initialization + ) + self.sequence_parallel = sequence_parallel + + def forward(self, hidden_states: Tensor, sequence_index=0): + # hidden_states: [s, b, h] + # sequence_index: index of the token to pool. + + # gather data along sequence dimensions + # same pooler is run on all tensor parallel nodes + if self.sequence_parallel: + hidden_states = tensor_parallel.gather_from_sequence_parallel_region( + hidden_states, tensor_parallel_output_grad=False + ) + + pooled = hidden_states[sequence_index, :, :] + pooled = self.dense(pooled) + pooled = torch.tanh(pooled) + return pooled diff --git a/megatron/core/models/common/__init__.py b/megatron/core/models/common/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/megatron/core/models/common/embeddings/__init__.py b/megatron/core/models/common/embeddings/__init__.py new file mode 100644 index 0000000000..865f96da5d --- /dev/null +++ b/megatron/core/models/common/embeddings/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+ +from .rope_utils import apply_rotary_pos_emb +from .rotary_pos_embedding import RotaryEmbedding +from .yarn_rotary_pos_embedding import YarnRotaryEmbedding, _yarn_get_mscale diff --git a/megatron/core/models/common/embeddings/language_model_embedding.py b/megatron/core/models/common/embeddings/language_model_embedding.py new file mode 100644 index 0000000000..bc1a2de9cb --- /dev/null +++ b/megatron/core/models/common/embeddings/language_model_embedding.py @@ -0,0 +1,137 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +from typing import Literal + +import torch +from torch import Tensor + +from megatron.core import tensor_parallel +from megatron.core.transformer.module import MegatronModule +from megatron.core.transformer.transformer_config import TransformerConfig + + +class LanguageModelEmbedding(MegatronModule): + """Language model embeddings. + + Args: + config (TransformerConfig): config object with all necessary configs for TransformerBlock + vocab_size (int): vocabulary size + max_sequence_length (int): maximum size of sequence. This + is used for positional embedding + add_position_embedding (bool): Add a position embedding. + embedding_dropout_prob (float): dropout probability for embeddings + num_tokentypes (int): Set to 0 without binary head, and 2 with a binary head . Defaults to 0. + """ + + def __init__( + self, + config: TransformerConfig, + vocab_size: int, + max_sequence_length: int, + position_embedding_type: Literal['learned_absolute', 'rope', 'none'] = 'learned_absolute', + num_tokentypes: int = 0, + ): + super().__init__(config=config) + + self.config: TransformerConfig = config + self.vocab_size: int = vocab_size + self.max_sequence_length: int = max_sequence_length + self.add_position_embedding: bool = position_embedding_type == 'learned_absolute' + self.num_tokentypes = num_tokentypes + self.reduce_scatter_embeddings = ( + (not self.add_position_embedding) + and self.num_tokentypes <= 0 + and self.config.sequence_parallel + ) + + # Word embeddings (parallel). + self.word_embeddings = tensor_parallel.VocabParallelEmbedding( + num_embeddings=self.vocab_size, + embedding_dim=self.config.hidden_size, + init_method=self.config.init_method, + reduce_scatter_embeddings=self.reduce_scatter_embeddings, + config=self.config, + ) + + # Position embedding (serial). + if self.add_position_embedding: + self.position_embeddings = torch.nn.Embedding( + self.max_sequence_length, self.config.hidden_size + ) + + # Initialize the position embeddings. + if self.config.perform_initialization: + self.config.init_method(self.position_embeddings.weight) + + if self.num_tokentypes > 0: + self.tokentype_embeddings = torch.nn.Embedding( + self.num_tokentypes, self.config.hidden_size + ) + # Initialize the token-type embeddings. 
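+            # Token-type embeddings distinguish segments (e.g. BERT's sentence A/B when a
+            # binary head is used) and are added to the word/position embeddings in forward()
+            # whenever tokentype_ids is provided.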
+ if self.config.perform_initialization: + self.config.init_method(self.tokentype_embeddings.weight) + else: + self.tokentype_embeddings = None + + # Embeddings dropout + self.embedding_dropout = torch.nn.Dropout(self.config.hidden_dropout) + + def zero_parameters(self): + """Zero out all parameters in embedding.""" + self.word_embeddings.weight.data.fill_(0) + self.word_embeddings.weight.shared = True + self.position_embeddings.weight.data.fill_(0) + self.position_embeddings.weight.shared = True + if self.num_tokentypes > 0: + self.tokentype_embeddings.weight.data.fill_(0) + self.tokentype_embeddings.weight.shared = True + + def forward(self, input_ids: Tensor, position_ids: Tensor, tokentype_ids: int = None) -> Tensor: + """Forward pass of the embedding module. + + Args: + input_ids (Tensor): The input tokens + position_ids (Tensor): The position id's used to calculate position embeddings + tokentype_ids (int): The token type ids. Used when args.bert_binary_head is set to True. Defaults to None + + Returns: + Tensor: The output embeddings + """ + word_embeddings = self.word_embeddings(input_ids) + if self.add_position_embedding: + position_embeddings = self.position_embeddings(position_ids) + embeddings = word_embeddings + position_embeddings + else: + embeddings = word_embeddings + + if not self.reduce_scatter_embeddings: + # Data format change to avoid explicit tranposes : [b s h] --> [s b h]. + embeddings = embeddings.transpose(0, 1).contiguous() + + if tokentype_ids is not None: + assert self.tokentype_embeddings is not None + # [b s h] -> [s b h] (So that it can be added with embeddings) + tokentype_embedding = self.tokentype_embeddings(tokentype_ids).permute(1, 0, 2) + embeddings = embeddings + tokentype_embedding + else: + assert self.tokentype_embeddings is None + + # If the input flag for fp32 residual connection is set, convert for float. + if self.config.fp32_residual_connection: + embeddings = embeddings.float() + + # Dropout. + if self.config.sequence_parallel: + if not self.reduce_scatter_embeddings: + embeddings = tensor_parallel.scatter_to_sequence_parallel_region(embeddings) + # `scatter_to_sequence_parallel_region` returns a view, which prevents + # the original tensor from being garbage collected. Clone to facilitate GC. + # Has a small runtime cost (~0.5%). + if self.config.clone_scatter_output_in_embedding: + embeddings = embeddings.clone() + with tensor_parallel.get_cuda_rng_tracker().fork(): + embeddings = self.embedding_dropout(embeddings) + else: + embeddings = self.embedding_dropout(embeddings) + + return embeddings diff --git a/megatron/core/models/common/embeddings/rope_utils.py b/megatron/core/models/common/embeddings/rope_utils.py new file mode 100644 index 0000000000..accb251961 --- /dev/null +++ b/megatron/core/models/common/embeddings/rope_utils.py @@ -0,0 +1,191 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
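+# Helpers for applying rotary position embeddings (RoPE). The core rotation implemented by
+# _apply_rotary_pos_emb_bshd below is, for each position and rotary channel pair,
+#
+#     t_rot = t * cos(freqs) * mscale + rotate_half(t) * sin(freqs) * mscale
+#
+# where rotate_half maps the last dimension [x1, x2] to [-x2, x1] (chunked or interleaved,
+# depending on rotary_interleaved).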
+ +from __future__ import annotations + +from typing import TYPE_CHECKING, Optional + +if TYPE_CHECKING: + from megatron.core.transformer.transformer_config import TransformerConfig + +import logging + +import torch +from torch import Tensor + +from megatron.core import parallel_state + +logger = logging.getLogger(__name__) + +try: + from apex.transformer.functional import ( + fused_apply_rotary_pos_emb, + fused_apply_rotary_pos_emb_thd, + ) + + HAVE_APPLY_ROPE_FUSION = True +except ImportError: + HAVE_APPLY_ROPE_FUSION = False + + +def get_pos_emb_on_this_cp_rank(pos_emb: Tensor, seq_dim: int) -> Tensor: + """Get the position embedding on the current context parallel rank. + + Args: + pos_emb (Tensor): Positional embedding tensor + seq_dim (int): Sequence dimension + """ + cp_size = parallel_state.get_context_parallel_world_size() + cp_rank = parallel_state.get_context_parallel_rank() + cp_idx = torch.tensor( + [cp_rank, (2 * cp_size - cp_rank - 1)], device="cpu", pin_memory=True + ).cuda(non_blocking=True) + pos_emb = pos_emb.view( + *pos_emb.shape[:seq_dim], 2 * cp_size, -1, *pos_emb.shape[(seq_dim + 1) :] + ) + pos_emb = pos_emb.index_select(seq_dim, cp_idx) + pos_emb = pos_emb.view(*pos_emb.shape[:seq_dim], -1, *pos_emb.shape[(seq_dim + 2) :]) + return pos_emb + + +def _rotate_half(x: Tensor, rotary_interleaved: bool) -> Tensor: + """Change sign so the last dimension becomes [-odd, +even] + + Args: + x (Tensor): Input tensor + + Returns: + Tensor: Tensor rotated half + """ + if not rotary_interleaved: + x1, x2 = torch.chunk(x, 2, dim=-1) + return torch.cat((-x2, x1), dim=-1) + else: + x1 = x[:, :, :, ::2] + x2 = x[:, :, :, 1::2] + x_new = torch.stack((-x2, x1), dim=-1) + return x_new.view(x_new.shape[0], x_new.shape[1], x_new.shape[2], -1) + + +def _apply_rotary_pos_emb_bshd( + t: Tensor, + freqs: Tensor, + rotary_interleaved: bool = False, + multi_latent_attention: bool = False, + mscale: float = 1.0, +) -> Tensor: + """Apply rotary positional embedding to input tensor T. + + check https://kexue.fm/archives/8265 for detailed formulas + + Args: + t (Tensor): Input tensor T is of shape [seq_length, ... , dim] + freqs (Tensor): Rotary Positional embedding tensor freq is of shape [seq_length, ..., dim] + + Returns: + Tensor: The input tensor after applying RoPE + """ + rot_dim = freqs.shape[-1] + + # ideally t_pass is empty so rotary pos embedding is applied to all tensor t + t, t_pass = t[..., :rot_dim], t[..., rot_dim:] + + if multi_latent_attention: + x1 = t[..., 0::2] + x2 = t[..., 1::2] + t = torch.cat((x1, x2), dim=-1) + + # first part is cosine component + # second part is sine component, need to change signs with _rotate_half method + cos_ = (torch.cos(freqs) * mscale).to(t.dtype) + sin_ = (torch.sin(freqs) * mscale).to(t.dtype) + + t = (t * cos_) + (_rotate_half(t, rotary_interleaved) * sin_) + return torch.cat((t, t_pass), dim=-1) + + +def _apply_rotary_pos_emb_thd( + t: Tensor, + cu_seqlens: Tensor, + freqs: Tensor, + rotary_interleaved: bool = False, + multi_latent_attention: bool = False, + mscale: float = 1.0, +) -> Tensor: + """A baseline implementation of applying RoPE for `thd` format. + + Args: + t (Tensor): Input tensor T is of shape [t, h, d] + cu_seqlens(Tensor): Cumulative sum of sequence lengths in a batch for `t`, + with shape [b + 1] and dtype torch.int32. + freqs (Tensor): Rotary Positional embedding tensor freq is of shape [max_s, 1, 1, d] + + Returns: + Tensor: Shape [t, h, d]. The input tensor after applying RoPE. 
+ """ + + seqlens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist() + return torch.cat( + [ + _apply_rotary_pos_emb_bshd( + x.unsqueeze(1), + freqs[: x.size(0)], + rotary_interleaved=rotary_interleaved, + multi_latent_attention=multi_latent_attention, + mscale=mscale, + ) + for x in torch.split(t, seqlens) + ] + ).squeeze(1) + + +def apply_rotary_pos_emb( + t: Tensor, + freqs: Tensor, + config: TransformerConfig, + cu_seqlens: Optional[Tensor] = None, + mscale: float = 1.0, +): + """ + Reroute to the appropriate apply_rotary_pos_emb function depending on + fused/unfused kernels, or bshd (conventional) / thd (packed seq) format + """ + if config.apply_rope_fusion and not HAVE_APPLY_ROPE_FUSION: + # setting apply_rope_fusion in config to False + # so that subsequent queries to this config also return False + config.apply_rope_fusion = False + if not getattr(apply_rotary_pos_emb, "printed_fused_warning", False): + logger.warning( + "Setting apply_rope_fusion to false because its implementation" + " is not included in Apex. Try upgrading to the latest version" + ) + apply_rotary_pos_emb.printed_fused_warning = True + + if getattr(config, "multi_latent_attention", False) and config.rotary_interleaved: + logger.warning( + "rotary_interleaved is not supported with multi_latent_attention, setting it to False" + ) + config.rotary_interleaved = False + + if config.apply_rope_fusion: + if cu_seqlens is None: + return fused_apply_rotary_pos_emb(t, freqs, transpose_output_memory=True) + else: + return fused_apply_rotary_pos_emb_thd(t, cu_seqlens, freqs) + else: + if cu_seqlens is None: + return _apply_rotary_pos_emb_bshd( + t, + freqs, + rotary_interleaved=config.rotary_interleaved, + multi_latent_attention=config.multi_latent_attention, + mscale=mscale, + ) + else: + return _apply_rotary_pos_emb_thd( + t, + cu_seqlens, + freqs, + rotary_interleaved=config.rotary_interleaved, + multi_latent_attention=config.multi_latent_attention, + mscale=mscale, + ) diff --git a/megatron/core/models/common/embeddings/rotary_pos_embedding.py b/megatron/core/models/common/embeddings/rotary_pos_embedding.py new file mode 100644 index 0000000000..5232faec60 --- /dev/null +++ b/megatron/core/models/common/embeddings/rotary_pos_embedding.py @@ -0,0 +1,186 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +from __future__ import annotations + +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from megatron.core.transformer.transformer_config import TransformerConfig + from megatron.core.transformer.transformer_block import TransformerBlock + +import logging +import math + +import torch +from torch import Tensor, nn + +from megatron.core import parallel_state +from megatron.core.models.common.embeddings.rope_utils import ( # for backward compatibility; pylint: disable=unused-import + _apply_rotary_pos_emb_bshd, + _apply_rotary_pos_emb_thd, + _rotate_half, + apply_rotary_pos_emb, + get_pos_emb_on_this_cp_rank, +) + +logger = logging.getLogger(__name__) + + +__all__ = ['RotaryEmbedding'] + + +class RotaryEmbedding(nn.Module): + """Rotary Embedding for language model. + + Args: + kv_channels (int): Projection weights dimension in multi-head attention. Obtained + from transformer config + rotary_percent (float): Percent of rotary dimension to use for rotary position + embeddings. + rotary_interleaved (bool, optional): If True, interleaved rotary position embeddings. + Defaults to False. + seq_len_interpolation_factor (float, optional): scale of linearly interpolating RoPE + for longer sequences. 
The value must be a float larger than 1.0. Defaults to None + rotary_base (int, optional): Base period for rotary position embeddings. Defaults to + 10000. + rope_scaling (bool, optional): Apply rope scaling as used in llama 3.1 + use_cpu_initialization (bool, optional): If False, initialize the inv_freq directly + on the GPU. Defaults to False + """ + + def __init__( + self, + kv_channels: int, + rotary_percent: float, + rotary_interleaved: bool = False, + seq_len_interpolation_factor: float = None, + rotary_base: int = 10000, + rope_scaling: bool = False, + use_cpu_initialization: bool = False, + ) -> None: + super().__init__() + + dim = kv_channels + if rotary_percent < 1.0: + dim = int(dim * rotary_percent) + self.rotary_interleaved = rotary_interleaved + + self.seq_len_interpolation_factor = seq_len_interpolation_factor + device = 'cpu' if use_cpu_initialization else torch.cuda.current_device() + self.inv_freq = 1.0 / ( + rotary_base ** (torch.arange(0, dim, 2, dtype=torch.float32, device=device) / dim) + ) + + if rope_scaling: + self.inv_freq = self._apply_scaling(self.inv_freq) + + def _apply_scaling( + self, + freqs, + factor=8, + low_freq_factor=1, + high_freq_factor=4, + original_max_position_embeddings=8192, + ): + # This implementation is adapted from: + # https://github.com/huggingface/transformers/blob/2a5a6ad18aa22e98429bb5ecb880660328030ea0/src/transformers/modeling_rope_utils.py#L303-L343 + + factor = factor # `8` in the original implementation + low_freq_factor = low_freq_factor # `1` in the original implementation + high_freq_factor = high_freq_factor # `4` in the original implementation + old_context_len = original_max_position_embeddings # `8192` in the original implementation + + low_freq_wavelen = old_context_len / low_freq_factor + high_freq_wavelen = old_context_len / high_freq_factor + + wavelen = 2 * math.pi / freqs + # wavelen < high_freq_wavelen: do nothing + # wavelen > low_freq_wavelen: divide by factor + inv_freq_llama = torch.where(wavelen > low_freq_wavelen, freqs / factor, freqs) + # otherwise: interpolate between the two, using a smooth factor + smooth_factor = (old_context_len / wavelen - low_freq_factor) / ( + high_freq_factor - low_freq_factor + ) + smoothed_inv_freq = ( + 1 - smooth_factor + ) * inv_freq_llama / factor + smooth_factor * inv_freq_llama + is_medium_freq = ~(wavelen < high_freq_wavelen) * ~(wavelen > low_freq_wavelen) + inv_freq_llama = torch.where(is_medium_freq, smoothed_inv_freq, inv_freq_llama) + + return inv_freq_llama + + def forward(self, max_seq_len: int, offset: int = 0) -> Tensor: + """Forward pass of RoPE embedding. + + Args: + max_seq_len (int): Maximum size of sequence + offset (int, optional): _description_. Defaults to 0. + + Returns: + Tensor: Embeddings after applying RoPE. 
+ """ + if self.inv_freq.device.type == 'cpu': + # move `inv_freq` to GPU once at the first micro-batch forward pass + self.inv_freq = self.inv_freq.to(device=torch.cuda.current_device()) + seq = ( + torch.arange(max_seq_len, device=self.inv_freq.device, dtype=self.inv_freq.dtype) + + offset + ) + + if self.seq_len_interpolation_factor is not None: + seq *= 1 / self.seq_len_interpolation_factor + + freqs = torch.outer(seq, self.inv_freq) + # first part even vector components, second part odd vector components, + # 2 * dim in dimension size + if not self.rotary_interleaved: + emb = torch.cat((freqs, freqs), dim=-1) + else: + emb = torch.stack((freqs.view(-1, 1), freqs.view(-1, 1)), dim=-1).view( + freqs.shape[0], -1 + ) + # emb [seq_length, .., dim] + emb = emb[:, None, None, :] + if parallel_state.get_context_parallel_world_size() > 1: + # slice rotary_pos_emb along sequence dimension and select the parition of the current + # CP rank + emb = get_pos_emb_on_this_cp_rank(emb, 0) + return emb + + def _load_from_state_dict(self, state_dict, prefix, *args, **kwargs): + state_dict.pop(f'{prefix}inv_freq', None) + return super()._load_from_state_dict(state_dict, prefix, *args, **kwargs) + + def get_rotary_seq_len( + self, + inference_params, + transformer: TransformerBlock, + transformer_input: Tensor, + transformer_config: TransformerConfig, + ) -> float: + """Function to get the rotary sequence length. + + Args: + inference_params : Used during Inference time + transformer (TransformerBlock): The transformer block (decoder/encoder) used + by the model + transformer_input (Tensor): Input tensor to the transformer + transformer_config (TransformerConfig): Transformer config used by the model + + Returns: + float: The rotary sequence length + """ + if inference_params is not None: + rotary_seq_len = inference_params.max_sequence_length + else: + if transformer.input_tensor is not None: + rotary_seq_len = transformer.input_tensor.size(0) + else: + rotary_seq_len = transformer_input.size(0) + + if transformer_config.sequence_parallel: + rotary_seq_len *= transformer_config.tensor_model_parallel_size + + rotary_seq_len *= transformer_config.context_parallel_size + + return rotary_seq_len diff --git a/megatron/core/models/common/embeddings/yarn_rotary_pos_embedding.py b/megatron/core/models/common/embeddings/yarn_rotary_pos_embedding.py new file mode 100644 index 0000000000..14d147ea34 --- /dev/null +++ b/megatron/core/models/common/embeddings/yarn_rotary_pos_embedding.py @@ -0,0 +1,169 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +from __future__ import annotations + +import logging +import math + +import torch +from torch import Tensor + +from megatron.core import parallel_state +from megatron.core.models.common.embeddings.rope_utils import get_pos_emb_on_this_cp_rank +from megatron.core.models.common.embeddings.rotary_pos_embedding import RotaryEmbedding + +logger = logging.getLogger(__name__) + + +class YarnRotaryEmbedding(RotaryEmbedding): + """Yarn Rotary Embedding for language model. + + Args: + kv_channels (int): Projection weights dimension in multi-head attention. Obtained from + transformer config + rotary_percent (float): Percent of rotary dimension to use for rotary position embeddings. + rotary_interleaved (bool, optional): If True, interleaved rotary position embeddings. + Defaults to False. + seq_len_interpolation_factor (float, optional): scale of linearly interpolating RoPE for + longer sequences. The value must be a float larger than 1.0. 
Defaults to None + rotary_base (float, optional): Base period for rotary position embeddings. Defaults to + 10000. + use_cpu_initialization (bool, optional): If False, initialize the inv_freq directly on + the GPU. Defaults to False + scaling_factor (float, optional): Scaling factor for Yarn RoPE. Defaults to 1.0. + original_max_position_embeddings (int, optional): Original maximum position embeddings + length. Defaults to 4096. + beta_fast (float, optional): Fast beta value for Yarn RoPE. Defaults to 32. + beta_slow (float, optional): Slow beta value for Yarn RoPE. Defaults to 1. + mscale (float, optional): Mscale value for Yarn RoPE. Defaults to 1. + mscale_all_dim (float, optional): Mscale all dim value for Yarn RoPE. Defaults to 0. + """ + + def __init__( + self, + kv_channels: int, + rotary_percent: float = 1.0, + rotary_interleaved: bool = False, + seq_len_interpolation_factor: float = None, + rotary_base: float = 10000.0, + use_cpu_initialization: bool = False, + scaling_factor: float = 1.0, + original_max_position_embeddings: int = 4096, + beta_fast: float = 32.0, + beta_slow: float = 1.0, + mscale: float = 1.0, + mscale_all_dim: float = 0.0, + ): + self.dim = kv_channels + self.rotary_base = rotary_base + self.scaling_factor = scaling_factor + self.original_max_position_embeddings = original_max_position_embeddings + self.beta_fast = beta_fast + self.beta_slow = beta_slow + self.mscale = mscale + self.mscale_all_dim = mscale_all_dim + + device = 'cpu' if use_cpu_initialization else torch.cuda.current_device() + self.inv_freq_extra = 1.0 / ( + self.rotary_base + ** (torch.arange(0, self.dim, 2, dtype=torch.float32, device=device) / self.dim) + ) + self.inv_freq_inter = 1.0 / ( + self.scaling_factor + * self.rotary_base + ** (torch.arange(0, self.dim, 2, dtype=torch.float32, device=device) / self.dim) + ) + super().__init__( + kv_channels, + rotary_percent, + rotary_interleaved, + seq_len_interpolation_factor, + rotary_base, + use_cpu_initialization, + ) + + def forward(self, max_seq_len: int, offset: int = 0) -> Tensor: + + assert ( + not self.rotary_interleaved + ), "Yarn RoPE does not support interleaved rotary embeddings" + + if self.inv_freq_extra.device.type == 'cpu': + # move `inv_freq_extra` to GPU once at the first micro-batch forward pass + self.inv_freq_extra = self.inv_freq_extra.to(device=torch.cuda.current_device()) + + if self.inv_freq_inter.device.type == 'cpu': + # move `inv_freq_inter` to GPU once at the first micro-batch forward pass + self.inv_freq_inter = self.inv_freq_inter.to(device=torch.cuda.current_device()) + + low, high = _yarn_find_correction_range( + self.beta_fast, + self.beta_slow, + self.dim, + self.rotary_base, + self.original_max_position_embeddings, + ) + inv_freq_mask = 1.0 - _yarn_linear_ramp_mask(low, high, self.dim // 2).to( + device=self.inv_freq_extra.device, dtype=torch.float32 + ) + inv_freq = self.inv_freq_inter * (1 - inv_freq_mask) + self.inv_freq_extra * inv_freq_mask + + seq = ( + torch.arange( + max_seq_len, device=self.inv_freq_extra.device, dtype=self.inv_freq_extra.dtype + ) + + offset + ) + + freqs = torch.outer(seq, inv_freq) + + _mscale = float( + _yarn_get_mscale(self.scaling_factor, self.mscale) + / _yarn_get_mscale(self.scaling_factor, self.mscale_all_dim) + ) + + emb = torch.cat((freqs, freqs), dim=-1) + # emb [seq_length, .., dim] + emb = emb[:, None, None, :] + if parallel_state.get_context_parallel_world_size() > 1: + # slice rotary_pos_emb along sequence dimension + # and select the parition of the current CP rank + emb 
= get_pos_emb_on_this_cp_rank(emb, 0) + return emb, _mscale + + +# Inverse dim formula to find dim based on number of rotations +def _yarn_find_correction_dim( + num_rotations: float, dim: int, rotary_base: float = 10000, max_position_embeddings: int = 2048 +) -> float: + return (dim * math.log(max_position_embeddings / (num_rotations * 2 * math.pi))) / ( + 2 * math.log(rotary_base) + ) + + +# Find dim range bounds based on rotations +def _yarn_find_correction_range( + low_rot: float, + high_rot: float, + dim: int, + rotary_base: float = 10000, + max_position_embeddings: int = 2048, +) -> tuple[int, int]: + low = math.floor(_yarn_find_correction_dim(low_rot, dim, rotary_base, max_position_embeddings)) + high = math.ceil(_yarn_find_correction_dim(high_rot, dim, rotary_base, max_position_embeddings)) + return max(low, 0), min(high, dim - 1) # Clamp values just in case + + +def _yarn_linear_ramp_mask(min: float, max: float, dim: int) -> Tensor: + if min == max: + max += 0.001 # Prevent singularity + + linear_func = (torch.arange(dim, dtype=torch.float32) - min) / (max - min) + ramp_func = torch.clamp(linear_func, 0, 1) + return ramp_func + + +def _yarn_get_mscale(scale: float = 1, mscale: float = 1) -> float: + if scale <= 1: + return 1.0 + return 0.1 * mscale * math.log(scale) + 1.0 diff --git a/megatron/core/models/common/language_module/__init__.py b/megatron/core/models/common/language_module/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/megatron/core/models/common/language_module/language_module.py b/megatron/core/models/common/language_module/language_module.py new file mode 100644 index 0000000000..7075e57f98 --- /dev/null +++ b/megatron/core/models/common/language_module/language_module.py @@ -0,0 +1,204 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +import logging +from typing import Optional, Tuple + +import torch +from torch import Tensor + +from megatron.core import parallel_state, tensor_parallel +from megatron.core.dist_checkpointing.mapping import ShardedStateDict +from megatron.core.fusions.fused_cross_entropy import fused_vocab_parallel_cross_entropy +from megatron.core.transformer.module import MegatronModule +from megatron.core.transformer.transformer_config import TransformerConfig +from megatron.core.utils import make_tp_sharded_tensor_for_checkpoint + + +class LanguageModule(MegatronModule): + """Base language module that has common helper functions used across GPT, BERT etc. + + Args: + config (TransformerConfig): Input transformer config for the model + """ + + def __init__(self, config: TransformerConfig) -> None: + super().__init__(config=config) + + def compute_language_model_loss(self, labels: Tensor, logits: Tensor) -> Tensor: + """Computes the language model loss (Cross entropy across vocabulary) + + Args: + labels (Tensor): The labels of dimension [batch size, seq length] + logits (Tensor): The final logits returned by the output layer of the transformer model + + Returns: + Tensor: Loss tensor of dimensions [batch size, sequence_length] + """ + # [b s] => [s b] + labels = labels.transpose(0, 1).contiguous() + if self.config.cross_entropy_loss_fusion: + loss = fused_vocab_parallel_cross_entropy(logits, labels) + else: + loss = tensor_parallel.vocab_parallel_cross_entropy(logits, labels) + + # [s b] => [b, s] + loss = loss.transpose(0, 1).contiguous() + return loss + + def setup_embeddings_and_output_layer(self) -> None: + """Sets up embedding layer in first stage and output layer in last stage. 
+ + This function initalizes word embeddings in the final stage when we are + using pipeline parallelism and sharing word embeddings, and sets up param + attributes on the embedding and output layers. + """ + + # Set `is_embedding_or_output_parameter` attribute. + if self.pre_process: + self.embedding.word_embeddings.weight.is_embedding_or_output_parameter = True + if self.post_process and self.output_layer.weight is not None: + self.output_layer.weight.is_embedding_or_output_parameter = True + + if not self.share_embeddings_and_output_weights: + return + + if parallel_state.get_pipeline_model_parallel_world_size() == 1: + # Zero out wgrad if sharing embeddings between two layers on same + # pipeline stage to make sure grad accumulation into main_grad is + # correct and does not include garbage values (e.g., from torch.empty). + self.shared_embedding_or_output_weight().zero_out_wgrad = True + return + + if parallel_state.is_pipeline_first_stage() and self.pre_process and not self.post_process: + self.shared_embedding_or_output_weight().shared_embedding = True + + if self.post_process and not self.pre_process: + assert not parallel_state.is_pipeline_first_stage() + # set word_embeddings weights to 0 here, then copy first + # stage's weights using all_reduce below. + self.output_layer.weight.data.fill_(0) + self.output_layer.weight.shared = True + self.output_layer.weight.shared_embedding = True + + # Parameters are shared between the word embeddings layers, and the + # heads at the end of the model. In a pipelined setup with more than + # one stage, the initial embedding layer and the head are on different + # workers, so we do the following: + # 1. Create a second copy of word_embeddings on the last stage, with + # initial parameters of 0.0. + # 2. Do an all-reduce between the first and last stage to ensure that + # the two copies of word_embeddings start off with the same + # parameter values. + # 3. In the training loop, before an all-reduce between the grads of + # the two word_embeddings layers to ensure that every applied weight + # update is the same on both stages. + + # Ensure that first and last stages have the same initial parameter + # values. + if torch.distributed.is_initialized(): + if parallel_state.is_rank_in_embedding_group(): + weight = self.shared_embedding_or_output_weight() + weight.data = weight.data.cuda() + torch.distributed.all_reduce( + weight.data, group=parallel_state.get_embedding_group() + ) + + elif not getattr(LanguageModule, "embedding_warning_printed", False): + logging.getLogger(__name__).warning( + "Distributed processes aren't initialized, so the output layer " + "is not initialized with weights from the word embeddings. " + "If you are just manipulating a model this is fine, but " + "this needs to be handled manually. If you are training " + "something is definitely wrong." + ) + LanguageModule.embedding_warning_printed = True + + def shared_embedding_or_output_weight(self) -> Tensor: + """Gets the emedding weight or output logit weights when share embedding and output weights set to True. 
+ + Returns: + Tensor: During pre processing it returns the input embeddings weight while during post processing it returns the final output layers weight + """ + if self.pre_process: + return self.embedding.word_embeddings.weight + elif self.post_process: + return self.output_layer.weight + return None + + def sharded_state_dict( + self, + prefix: str = '', + sharded_offsets: Tuple[Tuple[int, int, int]] = (), + metadata: Optional[dict] = None, + ) -> ShardedStateDict: + """Sharded state dict implementation that handles the output layer weights tying. + + Args: + prefix (str): Module name prefix. + sharded_offsets (tuple): PP related offsets, expected to be empty at this module level. + metadata (Optional[Dict]): metadata controlling sharded state dict creation. + + Returns: + ShardedStateDict: sharded state dict for the LanguageModel + """ + assert not sharded_offsets, "Unexpected sharded offsets" + sharded_state_dict = super().sharded_state_dict(prefix, sharded_offsets, metadata) + + first_stage_word_emb_key = f'{prefix}embedding.word_embeddings.weight' + output_layer_weight_key = f'{prefix}output_layer.weight' + output_layer_bias_key = f'{prefix}output_layer.bias' + + if self.share_embeddings_and_output_weights: + self.tie_embeddings_and_output_weights_state_dict( + sharded_state_dict, output_layer_weight_key, first_stage_word_emb_key + ) + elif self.post_process: + # Make sure the output layer follows the embeddings padding logic + sharded_state_dict[output_layer_weight_key].allow_shape_mismatch = True + + # Regardless of sharing the output weights with embeddings, we must handle the bias padding + if self.post_process and output_layer_bias_key in sharded_state_dict: + sharded_state_dict[output_layer_bias_key].allow_shape_mismatch = True + + return sharded_state_dict + + def tie_embeddings_and_output_weights_state_dict( + self, + sharded_state_dict: ShardedStateDict, + output_layer_weight_key: str, + first_stage_word_emb_key: str, + ) -> None: + """Ties the embedding and output weights in a given sharded state dict. + + Args: + sharded_state_dict (ShardedStateDict): state dict with the weight to tie + output_layer_weight_key (str): key of the output layer weight in the state dict. + This entry will be replaced with a tied version + first_stage_word_emb_key (str): this must be the same as the + ShardedTensor.key of the first stage word embeddings. 
+ + Returns: None, acts in-place + """ + if not self.post_process: + # No output layer + assert output_layer_weight_key not in sharded_state_dict, sharded_state_dict.keys() + return + + if self.pre_process: + # Output layer is equivalent to the embedding already + return + + # Replace the default output layer with a one sharing the weights with the embedding + del sharded_state_dict[output_layer_weight_key] + tensor = self.shared_embedding_or_output_weight() + last_stage_word_emb_replica_id = ( + 1, # copy of first stage embedding + 0, + parallel_state.get_data_parallel_rank(with_context_parallel=True), + ) + + sharded_state_dict[output_layer_weight_key] = make_tp_sharded_tensor_for_checkpoint( + tensor=tensor, + key=first_stage_word_emb_key, + replica_id=last_stage_word_emb_replica_id, + allow_shape_mismatch=True, + ) diff --git a/megatron/core/models/common/vision_module/__init__.py b/megatron/core/models/common/vision_module/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/megatron/core/models/common/vision_module/vision_module.py b/megatron/core/models/common/vision_module/vision_module.py new file mode 100644 index 0000000000..5dc51873a4 --- /dev/null +++ b/megatron/core/models/common/vision_module/vision_module.py @@ -0,0 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +"""Megatron Vision Module.""" + +from megatron.core.transformer.module import MegatronModule +from megatron.core.transformer.transformer_config import TransformerConfig + + +# Note: This is only a stub at the moment. This will be expanded in follow-up changes. +class VisionModule(MegatronModule): + """Base vision module that has common helper functions used across CLIP, ViT, etc. + + Args: + config (TransformerConfig): Input transformer config for the model + """ + + def __init__(self, config: TransformerConfig) -> None: + super().__init__(config=config) diff --git a/megatron/core/models/gpt/__init__.py b/megatron/core/models/gpt/__init__.py new file mode 100644 index 0000000000..8bbecfcb09 --- /dev/null +++ b/megatron/core/models/gpt/__init__.py @@ -0,0 +1,2 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +from .gpt_model import GPTModel diff --git a/megatron/core/models/gpt/gpt_layer_specs.py b/megatron/core/models/gpt/gpt_layer_specs.py new file mode 100755 index 0000000000..1db68dc886 --- /dev/null +++ b/megatron/core/models/gpt/gpt_layer_specs.py @@ -0,0 +1,247 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
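The spec builders added below return ModuleSpec trees that describe which module class (and which submodules) to build for each transformer layer; they are consumed by TransformerBlock/GPTModel rather than instantiated directly. A small sketch of inspecting a locally-built spec (illustrative only; assumes Megatron-Core is importable, and the Apex/TE import fallbacks below may emit warnings):

from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_local_spec

spec = get_gpt_layer_local_spec(num_experts=None, qk_layernorm=True)
print(spec.module)                                             # TransformerLayer
print(spec.submodules.self_attention.module)                   # SelfAttention
print(spec.submodules.self_attention.submodules.q_layernorm)   # layer-norm impl selected by qk_layernorm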
+ +from typing import Optional + +from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add +from megatron.core.tensor_parallel.layers import ColumnParallelLinear, RowParallelLinear +from megatron.core.transformer.attention import SelfAttention, SelfAttentionSubmodules +from megatron.core.transformer.dot_product_attention import DotProductAttention +from megatron.core.transformer.enums import AttnMaskType +from megatron.core.transformer.identity_op import IdentityOp +from megatron.core.transformer.mlp import MLP, MLPSubmodules +from megatron.core.transformer.moe.moe_layer import MoELayer, MoESubmodules +from megatron.core.transformer.moe.shared_experts import SharedExpertMLP +from megatron.core.transformer.multi_latent_attention import ( + MLASelfAttention, + MLASelfAttentionSubmodules, +) +from megatron.core.transformer.spec_utils import ModuleSpec +from megatron.core.transformer.transformer_layer import TransformerLayer, TransformerLayerSubmodules + +try: + from megatron.core.extensions.transformer_engine import ( + TEColumnParallelGroupedLinear, + TEColumnParallelLinear, + TEDotProductAttention, + TELayerNormColumnParallelLinear, + TENorm, + TERowParallelGroupedLinear, + TERowParallelLinear, + ) + + HAVE_TE = True +except ImportError: + HAVE_TE = False + +try: + import apex # pylint: disable=unused-import + + from megatron.core.fusions.fused_layer_norm import FusedLayerNorm + + HAVE_APEX = True + LNImpl = FusedLayerNorm +except ImportError: + import warnings + + from megatron.core.transformer.torch_layer_norm import WrappedTorchLayerNorm + + warnings.warn('Apex is not installed. Falling back to Torch LayerNorm') + LNImpl = WrappedTorchLayerNorm + + +def get_gpt_layer_with_transformer_engine_spec( + num_experts: Optional[int] = None, + moe_grouped_gemm: Optional[bool] = False, + qk_layernorm: Optional[bool] = False, + multi_latent_attention: Optional[bool] = False, + fp8: Optional[str] = None, +) -> ModuleSpec: + """Use this spec to use lower-level Transformer Engine modules (required for fp8 training). + + + Args: + num_experts (int, optional): Number of experts. Defaults to None. + moe_grouped_gemm (bool, optional): To use Grouped GEMM. Defaults to False. + qk_layernorm (bool, optional): To use layernorm for queries/keys. Defaults to False. + fp8 (str, optional): Flag to decide the linear layer spec for MoE. Defaults to None. 
+ + Returns: + ModuleSpec: Module specification with TE modules + """ + mlp = _get_mlp_module_spec( + use_te=True, num_experts=num_experts, moe_grouped_gemm=moe_grouped_gemm, fp8=fp8 + ) + + if multi_latent_attention: + return ModuleSpec( + module=TransformerLayer, + submodules=TransformerLayerSubmodules( + self_attention=ModuleSpec( + module=MLASelfAttention, + params={"attn_mask_type": AttnMaskType.causal}, + submodules=MLASelfAttentionSubmodules( + linear_q_proj=TEColumnParallelLinear, + linear_q_down_proj=TEColumnParallelLinear, + linear_q_up_proj=TEColumnParallelLinear, + linear_kv_down_proj=TEColumnParallelLinear, + linear_kv_up_proj=TEColumnParallelLinear, + core_attention=TEDotProductAttention, + linear_proj=TERowParallelLinear, + q_layernorm=TENorm if qk_layernorm else IdentityOp, + kv_layernorm=TENorm if qk_layernorm else IdentityOp, + ), + ), + self_attn_bda=get_bias_dropout_add, + pre_mlp_layernorm=TENorm if num_experts else IdentityOp, + input_layernorm=TENorm if num_experts else IdentityOp, + mlp=mlp, + mlp_bda=get_bias_dropout_add, + ), + ) + else: + return ModuleSpec( + module=TransformerLayer, + submodules=TransformerLayerSubmodules( + self_attention=ModuleSpec( + module=SelfAttention, + params={"attn_mask_type": AttnMaskType.causal}, + submodules=SelfAttentionSubmodules( + linear_qkv=TELayerNormColumnParallelLinear, + core_attention=TEDotProductAttention, + linear_proj=TERowParallelLinear, + # TENorm significantly harms convergence when used + # for QKLayerNorm; we instead use the Apex implementation. + q_layernorm=FusedLayerNorm if qk_layernorm else IdentityOp, + k_layernorm=FusedLayerNorm if qk_layernorm else IdentityOp, + ), + ), + self_attn_bda=get_bias_dropout_add, + pre_mlp_layernorm=TENorm if num_experts else IdentityOp, + mlp=mlp, + mlp_bda=get_bias_dropout_add, + ), + ) + + +def get_gpt_layer_local_spec( + num_experts: Optional[int] = None, + moe_grouped_gemm: Optional[bool] = False, + qk_layernorm: Optional[bool] = False, + multi_latent_attention: Optional[bool] = False, +) -> ModuleSpec: + """Use this spec for an implementation using only modules in Megatron-Core. + + + Args: + num_experts (int, optional): Number of experts. Defaults to None. + moe_grouped_gemm (bool, optional): To use Grouped GEMM. Defaults to False. + qk_layernorm (bool, optional): To use layernorm for queries/keys. Defaults to False. 
+ + Returns: + ModuleSpec: Module specification with Megatron-Core modules + """ + mlp = _get_mlp_module_spec( + use_te=False, num_experts=num_experts, moe_grouped_gemm=moe_grouped_gemm + ) + if multi_latent_attention: + return ModuleSpec( + module=TransformerLayer, + submodules=TransformerLayerSubmodules( + self_attention=ModuleSpec( + module=MLASelfAttention, + params={"attn_mask_type": AttnMaskType.causal}, + submodules=MLASelfAttentionSubmodules( + linear_q_proj=ColumnParallelLinear, + linear_q_down_proj=ColumnParallelLinear, + linear_q_up_proj=ColumnParallelLinear, + linear_kv_down_proj=ColumnParallelLinear, + linear_kv_up_proj=ColumnParallelLinear, + core_attention=DotProductAttention, + linear_proj=RowParallelLinear, + q_layernorm=LNImpl if qk_layernorm else IdentityOp, + kv_layernorm=LNImpl if qk_layernorm else IdentityOp, + ), + ), + self_attn_bda=get_bias_dropout_add, + pre_mlp_layernorm=LNImpl if num_experts else IdentityOp, + input_layernorm=LNImpl if num_experts else IdentityOp, + mlp=mlp, + mlp_bda=get_bias_dropout_add, + ), + ) + else: + return ModuleSpec( + module=TransformerLayer, + submodules=TransformerLayerSubmodules( + input_layernorm=LNImpl, + self_attention=ModuleSpec( + module=SelfAttention, + params={"attn_mask_type": AttnMaskType.causal}, + submodules=SelfAttentionSubmodules( + linear_qkv=ColumnParallelLinear, + core_attention=DotProductAttention, + linear_proj=RowParallelLinear, + q_layernorm=LNImpl if qk_layernorm else IdentityOp, + k_layernorm=LNImpl if qk_layernorm else IdentityOp, + ), + ), + self_attn_bda=get_bias_dropout_add, + pre_mlp_layernorm=LNImpl, + mlp=mlp, + mlp_bda=get_bias_dropout_add, + sharded_state_dict_keys_map={ + 'input_layernorm.': 'self_attention.linear_qkv.layer_norm_', + 'pre_mlp_layernorm.': 'mlp.linear_fc1.layer_norm_', + }, + ), + ) + + +def _get_mlp_module_spec( + use_te: Optional[bool] = True, + num_experts: Optional[int] = None, + moe_grouped_gemm: Optional[bool] = False, + fp8: Optional[str] = None, +) -> ModuleSpec: + """Helper function to get module spec for MLP/MoE""" + if num_experts is None: + # Dense MLP w/ or w/o TE modules. + return ModuleSpec( + module=MLP, + submodules=MLPSubmodules( + linear_fc1=TELayerNormColumnParallelLinear if use_te else ColumnParallelLinear, + linear_fc2=TERowParallelLinear if use_te else RowParallelLinear, + ), + ) + else: + # Mixture of experts with modules in megatron core. + if use_te and moe_grouped_gemm: + linear_fc1 = TEColumnParallelGroupedLinear + linear_fc2 = TERowParallelGroupedLinear + elif use_te and fp8: + linear_fc1 = TEColumnParallelLinear + linear_fc2 = TERowParallelLinear + else: + linear_fc1 = ColumnParallelLinear + linear_fc2 = RowParallelLinear + + use_te_grouped_gemm = use_te and TEColumnParallelGroupedLinear is not None + + return ModuleSpec( + module=MoELayer, + submodules=MoESubmodules( + experts=( + MLPSubmodules(linear_fc1=linear_fc1, linear_fc2=linear_fc2) + if not moe_grouped_gemm or use_te_grouped_gemm + else None + ), + shared_experts=ModuleSpec( + module=SharedExpertMLP, + params={"gate": False}, + submodules=MLPSubmodules( + linear_fc1=TEColumnParallelLinear if use_te else ColumnParallelLinear, + linear_fc2=TERowParallelLinear if use_te else RowParallelLinear, + ), + ), + ), + ) diff --git a/megatron/core/models/gpt/gpt_model.py b/megatron/core/models/gpt/gpt_model.py new file mode 100644 index 0000000000..bd52f89680 --- /dev/null +++ b/megatron/core/models/gpt/gpt_model.py @@ -0,0 +1,290 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
+ +from collections import OrderedDict +from typing import Dict, Literal, Optional + +from torch import Tensor + +from megatron.core import InferenceParams, tensor_parallel +from megatron.core.config_logger import has_config_logger_enabled, log_config_to_disk +from megatron.core.dist_checkpointing.mapping import ShardedStateDict +from megatron.core.models.common.embeddings.language_model_embedding import LanguageModelEmbedding +from megatron.core.models.common.embeddings.rotary_pos_embedding import RotaryEmbedding +from megatron.core.models.common.language_module.language_module import LanguageModule +from megatron.core.packed_seq_params import PackedSeqParams +from megatron.core.transformer.enums import ModelType +from megatron.core.transformer.spec_utils import ModuleSpec +from megatron.core.transformer.transformer_block import TransformerBlock +from megatron.core.transformer.transformer_config import TransformerConfig + + +class GPTModel(LanguageModule): + """GPT Transformer language model. + + Args: + config (TransformerConfig): + Transformer config + transformer_layer_spec (ModuleSpec): + Specifies module to use for transformer layers + vocab_size (int): + Vocabulary size + max_sequence_length (int): + maximum size of sequence. This is used for positional embedding + pre_process (bool, optional): + Include embedding layer (used with pipeline parallelism). Defaults to True. + post_process (bool, optional): + Include an output layer (used with pipeline parallelism). Defaults to True. + fp16_lm_cross_entropy (bool, optional): + Defaults to False. + parallel_output (bool, optional): + Do not gather the outputs, keep them split across tensor + parallel ranks. Defaults to True. + share_embeddings_and_output_weights (bool, optional): + When True, input embeddings and output logit weights are shared. Defaults to False. + position_embedding_type (Literal[learned_absolute,rope], optional): + Position embedding type.. Defaults to 'learned_absolute'. + rotary_percent (float, optional): + Percent of rotary dimension to use for rotary position embeddings. + Ignored unless position_embedding_type is 'rope'. Defaults to 1.0. + rotary_base (int, optional): + Base period for rotary position embeddings. Ignored unless + position_embedding_type is 'rope'. + Defaults to 10000. + seq_len_interpolation_factor (Optional[float], optional): + scale of linearly interpolating RoPE for longer sequences. + The value must be a float larger than 1.0. Defaults to None. 
+ """ + + def __init__( + self, + config: TransformerConfig, + transformer_layer_spec: ModuleSpec, + vocab_size: int, + max_sequence_length: int, + pre_process: bool = True, + post_process: bool = True, + fp16_lm_cross_entropy: bool = False, + parallel_output: bool = True, + share_embeddings_and_output_weights: bool = False, + position_embedding_type: Literal['learned_absolute', 'rope', 'none'] = 'learned_absolute', + rotary_percent: float = 1.0, + rotary_base: int = 10000, + rope_scaling: bool = False, + seq_len_interpolation_factor: Optional[float] = None, + ) -> None: + super().__init__(config=config) + + if has_config_logger_enabled(config): + log_config_to_disk(config, locals(), prefix=type(self).__name__) + + self.transformer_layer_spec: ModuleSpec = transformer_layer_spec + self.vocab_size = vocab_size + self.max_sequence_length = max_sequence_length + self.pre_process = pre_process + self.post_process = post_process + self.fp16_lm_cross_entropy = fp16_lm_cross_entropy + self.parallel_output = parallel_output + self.share_embeddings_and_output_weights = share_embeddings_and_output_weights + self.position_embedding_type = position_embedding_type + + # megatron core pipelining currently depends on model type + # TODO: remove this dependency ? + self.model_type = ModelType.encoder_or_decoder + + # These 4 attributes are needed for TensorRT-LLM export. + self.max_position_embeddings = max_sequence_length + self.rotary_percent = rotary_percent + self.rotary_base = rotary_base + self.rotary_scaling = rope_scaling + + if self.pre_process: + self.embedding = LanguageModelEmbedding( + config=self.config, + vocab_size=self.vocab_size, + max_sequence_length=self.max_sequence_length, + position_embedding_type=position_embedding_type, + ) + + if self.position_embedding_type == 'rope' and not self.config.multi_latent_attention: + self.rotary_pos_emb = RotaryEmbedding( + kv_channels=self.config.kv_channels, + rotary_percent=rotary_percent, + rotary_interleaved=self.config.rotary_interleaved, + seq_len_interpolation_factor=seq_len_interpolation_factor, + rotary_base=rotary_base, + rope_scaling=rope_scaling, + use_cpu_initialization=self.config.use_cpu_initialization, + ) + + # Transformer. + self.decoder = TransformerBlock( + config=self.config, + spec=transformer_layer_spec, + pre_process=self.pre_process, + post_process=self.post_process, + ) + + # Output + if post_process: + if self.config.defer_embedding_wgrad_compute: + # The embedding activation buffer preserves a reference to the input activations + # of the final embedding projection layer GEMM. It will hold the activations for + # all the micro-batches of a global batch for the last pipeline stage. Once we are + # done with all the back props for all the microbatches for the last pipeline stage, + # it will be in the pipeline flush stage. During this pipeline flush we use the + # input activations stored in embedding activation buffer and gradient outputs + # stored in gradient buffer to calculate the weight gradients for the embedding + # final linear layer. 
+ self.embedding_activation_buffer = [] + self.grad_output_buffer = [] + else: + self.embedding_activation_buffer = None + self.grad_output_buffer = None + + self.output_layer = tensor_parallel.ColumnParallelLinear( + config.hidden_size, + self.vocab_size, + config=config, + init_method=config.init_method, + bias=False, + skip_bias_add=False, + gather_output=not self.parallel_output, + skip_weight_param_allocation=self.pre_process + and self.share_embeddings_and_output_weights, + embedding_activation_buffer=self.embedding_activation_buffer, + grad_output_buffer=self.grad_output_buffer, + ) + + if self.pre_process or self.post_process: + self.setup_embeddings_and_output_layer() + + if has_config_logger_enabled(self.config): + log_config_to_disk( + self.config, self.state_dict(), prefix=f'{type(self).__name__}_init_ckpt' + ) + + def set_input_tensor(self, input_tensor: Tensor) -> None: + """Sets input tensor to the model. + + See megatron.model.transformer.set_input_tensor() + + Args: + input_tensor (Tensor): Sets the input tensor for the model. + """ + # This is usually handled in schedules.py but some inference code still + # gives us non-lists or None + if not isinstance(input_tensor, list): + input_tensor = [input_tensor] + + assert len(input_tensor) == 1, 'input_tensor should only be length 1 for gpt/bert' + self.decoder.set_input_tensor(input_tensor[0]) + + def forward( + self, + input_ids: Tensor, + position_ids: Tensor, + attention_mask: Tensor, + decoder_input: Tensor = None, + labels: Tensor = None, + inference_params: InferenceParams = None, + packed_seq_params: PackedSeqParams = None, + extra_block_kwargs: dict = None, + runtime_gather_output: Optional[bool] = None, + ) -> Tensor: + """Forward function of the GPT Model This function passes the input tensors + through the embedding layer, and then the decoeder and finally into the post + processing layer (optional). + + It either returns the Loss values if labels are given or the final hidden units + + Args: + runtime_gather_output (bool): Gather output at runtime. Default None means + `parallel_output` arg in the constructor will be used. + """ + # If decoder_input is provided (not None), then input_ids and position_ids are ignored. + # Otherwise, apply embedding layer on input_ids and position_ids to get decoder_input. + + # Decoder embedding. + if decoder_input is not None: + pass + elif self.pre_process: + decoder_input = self.embedding(input_ids=input_ids, position_ids=position_ids) + else: + # intermediate stage of pipeline + # decoder will get hidden_states from encoder.input_tensor + decoder_input = None + + # Rotary positional embeddings (embedding is None for PP intermediate devices) + rotary_pos_emb = None + if self.position_embedding_type == 'rope' and not self.config.multi_latent_attention: + rotary_seq_len = self.rotary_pos_emb.get_rotary_seq_len( + inference_params, self.decoder, decoder_input, self.config + ) + rotary_pos_emb = self.rotary_pos_emb(rotary_seq_len) + + # Run decoder. 
+ hidden_states = self.decoder( + hidden_states=decoder_input, + attention_mask=attention_mask, + inference_params=inference_params, + rotary_pos_emb=rotary_pos_emb, + packed_seq_params=packed_seq_params, + **(extra_block_kwargs or {}), + ) + + if not self.post_process: + return hidden_states + + # logits and loss + output_weight = None + if self.share_embeddings_and_output_weights: + output_weight = self.shared_embedding_or_output_weight() + logits, _ = self.output_layer( + hidden_states, weight=output_weight, runtime_gather_output=runtime_gather_output + ) + + if has_config_logger_enabled(self.config): + payload = OrderedDict( + { + 'input_ids': input_ids, + 'position_ids': position_ids, + 'attention_mask': attention_mask, + 'decoder_input': decoder_input, + 'logits': logits, + } + ) + log_config_to_disk(self.config, payload, prefix='input_and_logits') + + if labels is None: + # [s b h] => [b s h] + return logits.transpose(0, 1).contiguous() + + loss = self.compute_language_model_loss(labels, logits) + + return loss + + def sharded_state_dict( + self, prefix: str = '', sharded_offsets: tuple = (), metadata: Optional[Dict] = None + ) -> ShardedStateDict: + """Sharded state dict implementation for GPTModel backward-compatibility + (removing extra state). + + Args: + prefix (str): Module name prefix. + sharded_offsets (tuple): PP related offsets, expected to be empty at this module level. + metadata (Optional[Dict]): metadata controlling sharded state dict creation. + + Returns: + ShardedStateDict: sharded state dict for the GPTModel + """ + sharded_state_dict = super().sharded_state_dict(prefix, sharded_offsets, metadata) + output_layer_extra_state_key = f'{prefix}output_layer._extra_state' + + # Old GPT checkpoints only stored the output layer weight key. So we remove the + # _extra_state key but check that it doesn't contain any data anyway + output_extra_state = sharded_state_dict.pop(output_layer_extra_state_key, None) + assert not ( + output_extra_state and output_extra_state.data + ), f'Expected output layer extra state to be empty, got: {output_extra_state}' + + return sharded_state_dict diff --git a/megatron/core/models/mamba/__init__.py b/megatron/core/models/mamba/__init__.py new file mode 100644 index 0000000000..5aaf852401 --- /dev/null +++ b/megatron/core/models/mamba/__init__.py @@ -0,0 +1,2 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +from .mamba_model import MambaModel diff --git a/megatron/core/models/mamba/mamba_layer_specs.py b/megatron/core/models/mamba/mamba_layer_specs.py new file mode 100755 index 0000000000..e5fa9efa72 --- /dev/null +++ b/megatron/core/models/mamba/mamba_layer_specs.py @@ -0,0 +1,67 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
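mamba_layer_specs.py below defines a single mamba_stack_spec describing the three layer types (Mamba mixer, attention, MLP) that MambaStack can interleave; MambaModel, added further below, consumes it much like GPTModel consumes a GPT layer spec. A rough construction sketch with illustrative values; it assumes Transformer Engine, the Mamba SSM kernels, and initialized model parallelism, none of which is shown here:

from megatron.core.models.mamba import MambaModel
from megatron.core.models.mamba.mamba_layer_specs import mamba_stack_spec
from megatron.core.transformer.transformer_config import TransformerConfig

config = TransformerConfig(num_layers=4, hidden_size=256, num_attention_heads=8)
model = MambaModel(
    config=config,
    mamba_stack_spec=mamba_stack_spec,
    vocab_size=32000,
    max_sequence_length=4096,
    mamba_ssm_ngroups=8,            # default used by the NVIDIA Mamba2 8b models, per the docstring below
    hybrid_attention_ratio=0.0,     # 0.0 / 0.0 keeps the stack pure Mamba
    hybrid_mlp_ratio=0.0,
)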
+ +from megatron.core.extensions.transformer_engine import ( + TEDotProductAttention, + TELayerNormColumnParallelLinear, + TERowParallelLinear, +) +from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add +from megatron.core.ssm.mamba_block import MambaStack, MambaStackSubmodules +from megatron.core.ssm.mamba_layer import MambaLayer, MambaLayerSubmodules +from megatron.core.ssm.mamba_mixer import MambaMixer, MambaMixerSubmodules +from megatron.core.transformer.attention import SelfAttention, SelfAttentionSubmodules +from megatron.core.transformer.enums import AttnMaskType +from megatron.core.transformer.mlp import MLP, MLPSubmodules +from megatron.core.transformer.spec_utils import ModuleSpec +from megatron.core.transformer.transformer_layer import TransformerLayer, TransformerLayerSubmodules + +mamba_stack_spec = ModuleSpec( + module=MambaStack, + submodules=MambaStackSubmodules( + mamba_layer=ModuleSpec( + module=MambaLayer, + submodules=MambaLayerSubmodules( + mixer=ModuleSpec( + module=MambaMixer, + submodules=MambaMixerSubmodules( + in_proj=TELayerNormColumnParallelLinear, out_proj=TERowParallelLinear + ), + ), + mamba_bda=get_bias_dropout_add, + ), + ), + # Started with spec from gpt_layer_specs.py (with MLP removed) + # Using the TE spec because we had problems getting the non-TE spec + # working + attention_layer=ModuleSpec( + module=TransformerLayer, + submodules=TransformerLayerSubmodules( + self_attention=ModuleSpec( + module=SelfAttention, + params={"attn_mask_type": AttnMaskType.causal}, + submodules=SelfAttentionSubmodules( + linear_qkv=TELayerNormColumnParallelLinear, + core_attention=TEDotProductAttention, + linear_proj=TERowParallelLinear, + ), + ), + self_attn_bda=get_bias_dropout_add, + ), + ), + # Started with spec from gpt_layer_specs.py + # Using the TE spec because we had problems getting the non-TE spec + # working + mlp_layer=ModuleSpec( + module=TransformerLayer, + submodules=TransformerLayerSubmodules( + mlp=ModuleSpec( + module=MLP, + submodules=MLPSubmodules( + linear_fc1=TELayerNormColumnParallelLinear, linear_fc2=TERowParallelLinear + ), + ), + mlp_bda=get_bias_dropout_add, + ), + ), + ), +) diff --git a/megatron/core/models/mamba/mamba_model.py b/megatron/core/models/mamba/mamba_model.py new file mode 100644 index 0000000000..5794b1b41a --- /dev/null +++ b/megatron/core/models/mamba/mamba_model.py @@ -0,0 +1,228 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +from typing import Literal, Optional + +from torch import Tensor + +from megatron.core import InferenceParams, tensor_parallel +from megatron.core.config_logger import has_config_logger_enabled, log_config_to_disk +from megatron.core.models.common.embeddings.language_model_embedding import LanguageModelEmbedding +from megatron.core.models.common.embeddings.rotary_pos_embedding import RotaryEmbedding +from megatron.core.models.common.language_module.language_module import LanguageModule +from megatron.core.transformer.enums import ModelType +from megatron.core.transformer.spec_utils import ModuleSpec, build_module +from megatron.core.transformer.transformer_config import TransformerConfig + + +class MambaModel(LanguageModule): + """Mamba language model. + + Args: + config (TransformerConfig): Transformer config + mamba_stack_spec (ModuleSpec): Specifies the modules to use for the various layer types + vocab_size (int): Vocabulary size + max_sequence_length (int): maximum size of sequence. 
+ This is used for positional embedding + pre_process (bool, optional): Include embedding layer + (used with pipeline parallelism). Defaults to True. + mamba_ssm_ngroups (int, optional): Specifies the number of groups to use. + The default value is 8, as in the NVIDIA Mamba2 (pure and hybrid) 8b. + However, in the original Mamba2 paper, the checkpoints use a setting of 1. + Defaults to 8. + hybrid_attention_ratio (float, optional): The target ratio of attention + layers to total layers + hybrid_mlp_ratio (float, optional): The target ratio of mlp layers to total layers + hybrid_override_pattern (str, optional): The hybrid layer pattern to override with + post_process (bool, optional): Include an output layer (used with pipeline parallelism). + Defaults to True. + fp16_lm_cross_entropy (bool, optional): Defaults to False. + parallel_output (bool, optional): Do not gather the outputs, keep them split across tensor + parallel ranks. Defaults to True. + share_embeddings_and_output_weights (bool, optional): When True, input embeddings and + output logit weights are shared. Defaults to False. + position_embedding_type (Literal[learned_absolute,rope,none], optional): Position + embedding type. Defaults to 'none'. + rotary_percent (float, optional): Percent of rotary dimension to use for rotary position + embeddings. Ignored unless position_embedding_type is 'rope'. Defaults to 1.0. + rotary_base (int, optional): Base period for rotary position embeddings. Ignored unless + position_embedding_type is 'rope'. Defaults to 10000. + seq_len_interpolation_factor (Optional[float], optional): scale of linearly + interpolating RoPE for longer sequences. The value must be a float larger than 1.0. + Defaults to None. + """ + + def __init__( + self, + config: TransformerConfig, + mamba_stack_spec: ModuleSpec, + vocab_size: int, + max_sequence_length: int, + mamba_ssm_ngroups: int = 8, + pre_process: bool = True, + hybrid_attention_ratio: float = 0.0, + hybrid_mlp_ratio: float = 0.0, + hybrid_override_pattern: str = None, + post_process: bool = True, + fp16_lm_cross_entropy: bool = False, + parallel_output: bool = True, + share_embeddings_and_output_weights: bool = False, + # Mamba with no attention has no need for position embeddings, so none is default + position_embedding_type: Literal['learned_absolute', 'rope', 'none'] = 'none', + rotary_percent: float = 1.0, + rotary_base: int = 10000, + seq_len_interpolation_factor: Optional[float] = None, + ) -> None: + super().__init__(config=config) + + if has_config_logger_enabled(config): + log_config_to_disk(config, locals(), prefix=type(self).__name__) + + self.mamba_stack_spec: ModuleSpec = mamba_stack_spec + self.vocab_size = vocab_size + self.max_sequence_length = max_sequence_length + self.mamba_ssm_ngroups = mamba_ssm_ngroups + self.pre_process = pre_process + self.hybrid_attention_ratio = hybrid_attention_ratio + self.hybrid_mlp_ratio = hybrid_mlp_ratio + self.hybrid_override_pattern = hybrid_override_pattern + self.post_process = post_process + self.fp16_lm_cross_entropy = fp16_lm_cross_entropy + self.parallel_output = parallel_output + self.share_embeddings_and_output_weights = share_embeddings_and_output_weights + self.position_embedding_type = position_embedding_type + + # megatron core pipelining currently depends on model type + # TODO: remove this dependency ? 
+ self.model_type = ModelType.encoder_or_decoder + + if self.pre_process: + self.embedding = LanguageModelEmbedding( + config=self.config, + vocab_size=self.vocab_size, + max_sequence_length=self.max_sequence_length, + position_embedding_type=position_embedding_type, + ) + + if self.position_embedding_type == 'rope': + self.rotary_pos_emb = RotaryEmbedding( + kv_channels=self.config.kv_channels, + rotary_percent=rotary_percent, + seq_len_interpolation_factor=seq_len_interpolation_factor, + rotary_base=rotary_base, + use_cpu_initialization=self.config.use_cpu_initialization, + ) + + self.decoder = build_module( + mamba_stack_spec, + self.config, + mamba_ssm_ngroups=self.mamba_ssm_ngroups, + pre_process=self.pre_process, + hybrid_attention_ratio=self.hybrid_attention_ratio, + hybrid_mlp_ratio=self.hybrid_mlp_ratio, + hybrid_override_pattern=self.hybrid_override_pattern, + post_process=self.post_process, + dtype=config.params_dtype, + ) + + # Output + if post_process: + self.output_layer = tensor_parallel.ColumnParallelLinear( + config.hidden_size, + self.vocab_size, + config=config, + init_method=config.init_method, + bias=False, + skip_bias_add=False, + gather_output=not self.parallel_output, + skip_weight_param_allocation=self.pre_process + and self.share_embeddings_and_output_weights, + ) + + if self.pre_process or self.post_process: + self.setup_embeddings_and_output_layer() + + def set_input_tensor(self, input_tensor: Tensor) -> None: + """Sets input tensor to the model. + + See megatron.model.transformer.set_input_tensor() + + Args: + input_tensor (Tensor): Sets the input tensor for the model. + """ + # This is usually handled in schedules.py but some inference code still + # gives us non-lists or None + if not isinstance(input_tensor, list): + input_tensor = [input_tensor] + + assert len(input_tensor) == 1, 'input_tensor should only be length 1 for gpt/bert' + self.decoder.set_input_tensor(input_tensor[0]) + + def forward( + self, + input_ids: Tensor, + position_ids: Tensor, + attention_mask: Tensor, + decoder_input: Tensor = None, + labels: Tensor = None, + inference_params: InferenceParams = None, + ) -> Tensor: + """Forward function of the Mamba model. This function passes the input tensors + through the embedding layer, and then the decoder and finally into the post + processing layer (optional). + + It either returns the Loss values if labels are given or the final hidden units + """ + # If decoder_input is provided (not None), then input_ids and position_ids are ignored. + # Otherwise, apply embedding layer on input_ids and position_ids to get decoder_input. + + # Decoder embedding. + if decoder_input is not None: + pass + elif self.pre_process: + decoder_input = self.embedding(input_ids=input_ids, position_ids=position_ids) + else: + # intermediate stage of pipeline + # decoder will get hidden_states from encoder.input_tensor + decoder_input = None + + rotary_pos_emb = None + if self.position_embedding_type == 'rope': + rotary_seq_len = self.rotary_pos_emb.get_rotary_seq_len( + inference_params, self.decoder, decoder_input, self.config + ) + rotary_pos_emb = self.rotary_pos_emb(rotary_seq_len) + + # The following assert will currently fail when running inference. + # Commented out for now. 
+ # TODO (duncan/rwaleffe): (1) confirm that the externally-generated + # attention mask is not needed and is ignored by the model in + # inference mode, (2) reduce the size of the externally-generated + # attention mask to prevent CPU OOM (as we did for training), (3) + # force the attention mask passed to the model in inference mode to + # be None, so this assert will succeed. + # assert attention_mask is None, "The attention mask is ignored and should be set to None" + + # Run decoder. + hidden_states = self.decoder( + hidden_states=decoder_input, + attention_mask=attention_mask, + inference_params=inference_params, + rotary_pos_emb=rotary_pos_emb, + ) + + if not self.post_process: + return hidden_states + + # logits and loss + output_weight = None + if self.share_embeddings_and_output_weights: + output_weight = self.shared_embedding_or_output_weight() + logits, _ = self.output_layer(hidden_states, weight=output_weight) + + if labels is None: + # [s b h] => [b s h] + return logits.transpose(0, 1).contiguous() + + loss = self.compute_language_model_loss(labels, logits) + + return loss diff --git a/megatron/core/models/multimodal/__init__.py b/megatron/core/models/multimodal/__init__.py new file mode 100644 index 0000000000..f8011007a5 --- /dev/null +++ b/megatron/core/models/multimodal/__init__.py @@ -0,0 +1 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. diff --git a/megatron/core/models/multimodal/llava_model.py b/megatron/core/models/multimodal/llava_model.py new file mode 100644 index 0000000000..29f18ee725 --- /dev/null +++ b/megatron/core/models/multimodal/llava_model.py @@ -0,0 +1,660 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +import logging +from collections import namedtuple +from functools import partial +from typing import List, Optional + +import torch + +from megatron.core import InferenceParams, tensor_parallel +from megatron.core.config_logger import has_config_logger_enabled, log_config_to_disk +from megatron.core.extensions.transformer_engine import TEDotProductAttention +from megatron.core.models.gpt import GPTModel +from megatron.core.models.vision.clip_vit_model import CLIPViTModel, get_num_image_embeddings +from megatron.core.models.vision.multimodal_projector import MultimodalProjector +from megatron.core.parallel_state import get_tensor_model_parallel_world_size +from megatron.core.transformer import MegatronModule +from megatron.core.transformer.spec_utils import ModuleSpec +from megatron.core.transformer.transformer_config import TransformerConfig +from megatron.core.utils import is_te_min_version + +IMAGE_TOKEN_INDEX = -200 # ID for images in the input sequence. +IGNORE_INDEX = -100 # ID for labels that should be ignored. + + +# Note: This is under development and may be missing features. +class LLaVAModel(MegatronModule): + """LLaVA multi-modal model. + + Args: + language_transformer_config (TransformerConfig): Transformer config for the language model. + language_transformer_layer_spec (ModuleSpec): Language model spec. + language_vocab_size (int): Language model vocabulary size. + language_max_sequence_length (int): Language model maximum sequence length. + vision_transformer_config (TransformerConfig): Transformer config for the vision model. + vision_transformer_layer_spec (ModuleSpec): Vision model spec. + drop_vision_class_token (bool): Drop vision class token(s) before the language model. + vision_projection_config (TransformerConfig): Vision projection config. 
+ vision_projection_layer_spec (ModuleSpec): Vision projection spec. + vision_projection_type (str): Type of the vision projection. Default: 2-layer MLP. + allow_missing_vision_projection_checkpoint (bool): Allow vision projection weights to be + missing when loading a checkpoint. Default False. + parallel_output (bool): Keep outputs split across tensor parallel ranks. + This is typically True for training and False for inference. + language_position_embedding_type (str): Language model position embedding type. + language_rotary_percent (float): RoPE percent. Defaults to 1.0. + pre_process (bool): Include embedding layer in the decoder (used with pipeline parallel). + post_process (bool): Include output layer in the decoder (used with pipeline parallel). + add_encoder (bool): Construct the encoder (used with pipeline parallel). + When we use pipelining, the encoder will live on only the first stage + add_decoder (bool): Construct the decoder (used with pipeline parallel). + When we use pipelining, the decoder will live on every stage after the first one. + img_h (int): Input image height. + img_w (int): Input image width. + patch_dim (int): The size of each image patch side. + language_rotary_base (int): RoPE base. + """ + + def __init__( + self, + language_transformer_config: TransformerConfig, + language_transformer_layer_spec: ModuleSpec, + language_vocab_size: int, + language_max_sequence_length: int, + vision_transformer_config: TransformerConfig, + vision_transformer_layer_spec: ModuleSpec, + drop_vision_class_token: bool, + vision_projection_config: TransformerConfig, + vision_projection_layer_spec: ModuleSpec, + vision_projection_type: str = "mlp", + allow_missing_vision_projection_checkpoint: bool = False, + parallel_output: bool = True, + language_position_embedding_type: str = 'learned_absolute', + language_rotary_percent: float = 1.0, + pre_process: bool = True, + post_process: bool = True, + add_encoder: bool = True, + add_decoder: bool = True, + img_h: int = 336, + img_w: int = 336, + patch_dim: int = 14, + language_rotary_base: int = 10000, + language_rope_scaling: bool = False, + ) -> None: + super().__init__(config=language_transformer_config) + + if has_config_logger_enabled(language_transformer_config): + log_config_to_disk(language_transformer_config, locals(), prefix=type(self).__name__) + + logging.getLogger(__name__).warning( + "LLaVA model is under active development. " + "It may be missing features and its methods may change." + ) + + self.pre_process = pre_process + self.post_process = post_process + self.add_encoder = add_encoder + self.add_decoder = add_decoder + + self.encoder_hidden_state = None + self.vision_model = None + self.vision_projection = None + self.language_model = None + + self.sequence_parallel_lm = language_transformer_config.sequence_parallel + if self.sequence_parallel_lm: + assert ( + language_transformer_layer_spec.submodules.self_attention.submodules.core_attention + == TEDotProductAttention + ), "Sequence Parallelism is supported only with Transformer Engine DotProductAttention." + self.tp_comm_overlap_lm = language_transformer_config.tp_comm_overlap + + # This attribute is needed to check if an all-reduce is required + # on the word embeddings inside `finalize_model_grads._allreduce_word_embedding_grads`. 
+ self.share_embeddings_and_output_weights = False + if self.add_decoder: + self.language_model = GPTModel( + config=language_transformer_config, + transformer_layer_spec=language_transformer_layer_spec, + vocab_size=language_vocab_size, + max_sequence_length=language_max_sequence_length, + parallel_output=parallel_output, + position_embedding_type=language_position_embedding_type, + rotary_percent=language_rotary_percent, + pre_process=self.pre_process, + post_process=self.post_process, + rotary_base=language_rotary_base, + rope_scaling=language_rope_scaling, + ) + self.share_embeddings_and_output_weights = ( + self.language_model.share_embeddings_and_output_weights + ) + self._language_max_sequence_length = language_max_sequence_length + self._language_is_pipeline_parallel = ( + language_transformer_config.pipeline_model_parallel_size > 1 + ) + + class_token_len = 1 + if self.add_encoder: + self._drop_vision_class_token = drop_vision_class_token + add_class_token = True + if vision_transformer_config.vision_model_type == "siglip": + class_token_len = 0 + add_class_token = False + error_msg = ( + "Siglip does not support vision class token, " + "set disable-vision-class-token to False." + ) + assert not self._drop_vision_class_token, error_msg + self.vision_model = CLIPViTModel( + vision_transformer_config, + vision_transformer_layer_spec, + img_h=img_h, + img_w=img_w, + class_token_len=class_token_len, + patch_dim=patch_dim, + model_subtype=vision_transformer_config.vision_model_type, + add_class_token=add_class_token, + ) + # Map (intermediate) vision model outputs to the language model input dimension. + self.vision_projection = MultimodalProjector( + vision_projection_config, + vision_projection_layer_spec, + vision_projection_type, + vision_transformer_config.hidden_size, # input size to the projection. + ) + # Ignore missing weights for the vision projection during checkpoint loading. + # This should be disabled by default but can be enabled if your checkpoint contains + # pretrained vision and language models but not the projection from vision model + # outputs to language model inputs. 
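For illustration only (not part of the patch): the checkpoint behaviour described in the comment above builds on PyTorch's `register_load_state_dict_post_hook`. A minimal sketch with a hypothetical two-layer module shows how pruning expected-missing names from `incompatible_keys` lets `load_state_dict(strict=True)` succeed:

```python
import torch
from functools import partial

def ignore_missing(param_names, module, incompatible_keys):
    # Drop names we expect to be absent so strict loading does not raise.
    for name in list(incompatible_keys.missing_keys):
        if name in param_names:
            incompatible_keys.missing_keys.remove(name)

model = torch.nn.Sequential(torch.nn.Linear(4, 4), torch.nn.Linear(4, 2))
model.register_load_state_dict_post_hook(partial(ignore_missing, {"1.weight", "1.bias"}))

# Checkpoint only covers the first layer; the hook keeps strict=True happy.
checkpoint = {"0.weight": torch.zeros(4, 4), "0.bias": torch.zeros(4)}
model.load_state_dict(checkpoint, strict=True)
```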
+ if allow_missing_vision_projection_checkpoint: + vision_projection_param_names = [ + f"vision_projection.{name}" + for name in self.vision_projection.state_dict().keys() + ] + self.vision_projection.register_load_state_dict_post_hook( + partial(_load_state_dict_hook_ignore_param_names, vision_projection_param_names) + ) + + self._img_seq_len = get_num_image_embeddings( + img_h, + img_w, + patch_dim, + vision_transformer_config.vision_model_type, + drop_vision_class_token, + class_token_len, + ) + + def shared_embedding_or_output_weight(self): + """This is a convenience method to surface the language model's word embeddings, which is + necessary for `finalize_model_grads._allreduce_word_embedding_grads`.""" + if self.add_decoder: + return self.language_model.shared_embedding_or_output_weight() + return None + + def set_input_tensor(self, input_tensor) -> None: + """Set model chunk input tensor.""" + # This is usually handled in schedules.py but some inference code still + # gives us non-lists or None + if not isinstance(input_tensor, list): + input_tensor = [input_tensor] + assert len(input_tensor) == 1, 'input_tensor should only be length 1 for llava' + + if self.add_encoder and self.add_decoder: + self.vision_model.set_input_tensor(input_tensor[0]) + elif self.add_encoder: + self.vision_model.set_input_tensor(input_tensor[0]) + elif self.pre_process: + self.encoder_hidden_state = input_tensor[0] + else: + self.language_model.set_input_tensor(input_tensor[0]) + + def freeze( + self, freeze_language_model: bool, freeze_vision_model: bool, freeze_vision_projection: bool + ): + """Freeze model modules. + + Make specific modules non-trainable by setting requires_grad to False. + + Args: + freeze_language_model (bool): Freeze the language model module. + freeze_vision_model (bool): Freeze the vision model module. + freeze_vision_projection (bool): Freeze the vision projection module. + """ + modules = [] + if freeze_language_model and self.language_model is not None: + modules.append(self.language_model) + if freeze_vision_model and self.vision_model is not None: + modules.append(self.vision_model) + if freeze_vision_projection and self.vision_projection is not None: + modules.append(self.vision_projection) + + for module in modules: + for param in module.parameters(): + param.requires_grad = False + + def _preprocess_data( + self, + image_embeddings, + language_embeddings, + input_ids, + loss_mask, + labels, + use_inference_kv_cache, + image_token_index, + num_image_tiles, + attention_mask, + ): + """Preprocess input data before input to language model. + + This function is adopted from + https://github.com/huggingface/transformers/blob/85817d98fb60977c97e3014196a462b732d2ed1a/src/transformers/models/llava_next/modeling_llava_next.py#L409 + for our input data conventions. + + image_token_index = -200 indicates the image position in the input_ids = [0, 1, -200, 2, 3] + and labels = [1, -200, 2, 3, 4], for example. + We want to replace the image position (-200) with image_embeddings and return the following: + - final_embeddings = [0, 1, image_embeddings, 2, 3], + - final_labels = [1, -100, 2, 3, 4] + - final_loss_mask = [1, 0, 0, 1, 1] + + This function handles samples without images (text-only sample). It also handles samples + with images that are split into multiples tiles. + + If pipeline parallelism is not used, then self.pre_process and self.post_process + are both True and we update both input embeddings, labels and loss masks (if available). 
+ + If pipeline parallelism is used, then we do the following + - the first language model chunk has self.pre_process = True and + self.post_process = False. We update input embeddings. + - the middle language model chunk(s) has self.pre_process = False and + self.post_process = False. We don't need to update anything. + - the last language model chunk has self.pre_process = False and + self.post_process = True. We update labels and loss mask. + + TODO: This function should adjust the attention mask too. + Currently, we assume the language model uses a causal mask. + + Returns: + final_embedding (torch.Tensor): image and text embeddings [combined_seq_len, b, h]. + final_labels (torch.Tensor): labels for image and text positions [b, combined_seq_len]. + final_loss_mask (torch.Tensor): loss mask [b, combined_seq_len]. + """ + assert self.add_decoder, "input text preprocessing is only needed for the language model" + + # No pre- or postprocessing needed. + # With pipeline parallel > 2, this means a chunk in the middle of the model. + if not self.pre_process and not self.post_process: + return language_embeddings, loss_mask, labels, attention_mask + + # If using the inference KV cache, the image tokens are already computed. + if use_inference_kv_cache: + return language_embeddings, loss_mask, labels, attention_mask + + img_seq_len = self._img_seq_len + batch_size, text_seq_len = input_ids.shape + + has_labels = labels is not None + if has_labels: + assert ( + labels.shape == loss_mask.shape + ), f"mismatching labels shape {labels.shape} and loss mask shape {loss_mask.shape}" + + # Create indices for new text and label positions. + with torch.no_grad(): + image_token_mask = input_ids == image_token_index + num_images_per_sample = torch.sum(image_token_mask, dim=-1) + + # Number of tiles per sample. + num_image_tiles_batch = num_image_tiles.split(num_images_per_sample.tolist(), dim=0) + num_image_tiles_batch = torch.tensor( + [x.sum() for x in num_image_tiles_batch], device=input_ids.device + ) + + # Sequence length for each sample is the image sequence length multiplied by + # the number of tiles for that image, minus image token indices, + # plus text sequence length. + seq_lens = num_image_tiles_batch * img_seq_len - num_images_per_sample + text_seq_len + max_seq_len = seq_lens.max() + # Pipeline parallel expects fixed input size. Check if we need to pad. + if ( + self._language_is_pipeline_parallel + and max_seq_len < self._language_max_sequence_length + ): + max_seq_len = self._language_max_sequence_length + + if self.sequence_parallel_lm: + if self.tp_comm_overlap_lm: + # If shorter: Pad to language_max_sequence_length to use TP Comm overlap. + # If longer: Gets truncated later. + if max_seq_len < self._language_max_sequence_length: + padded_seq_len = self._language_max_sequence_length + else: + # Pad to multiple of tp size for sequence parallelism + tp_world_size = get_tensor_model_parallel_world_size() + padded_seq_len = int( + (max_seq_len + (tp_world_size - 1)) // tp_world_size * tp_world_size + ) + sp_padding_needed = padded_seq_len - max_seq_len + max_seq_len = padded_seq_len + batch_indices, non_image_indices = torch.where(input_ids != image_token_index) + + # New position ids for the text tokens, shifted by the image sequence length. + # E.g. for input_ids = [-200, 1, 2, 3] and img_seq_len = 576, we get + # new_position_ids = [576, 577, 578, 579]. text_position_ids are then [577, 578, 579]. 
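A standalone toy version of the position-id arithmetic in the comment above (illustrative only; `img_seq_len` is set to 3 rather than the 576 used in the comment, and one tile per image is assumed):

```python
import torch

IMAGE_TOKEN_INDEX = -200
img_seq_len = 3  # toy value

input_ids = torch.tensor([[0, 1, IMAGE_TOKEN_INDEX, 2, 3]])
image_token_mask = input_ids == IMAGE_TOKEN_INDEX

# Each image token expands to img_seq_len embedding slots (one tile assumed here).
lens = image_token_mask.int().clone()
lens[image_token_mask] = img_seq_len - 1
new_position_ids = torch.cumsum(lens + 1, dim=-1) - 1
print(new_position_ids)  # tensor([[0, 1, 4, 5, 6]]) -> text tokens sit at positions 0, 1, 5, 6
```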
+ image_token_mask_lens = image_token_mask.int().clone() + # -1 is for the removed image token index. + image_token_mask_lens[image_token_mask] = num_image_tiles * img_seq_len - 1 + # +1 is needed here for the cumulative sum. -1 is adjusting for zero-based indexing. + new_position_ids = torch.cumsum((image_token_mask_lens + 1), dim=-1) - 1 + text_position_ids = new_position_ids[batch_indices, non_image_indices] + + # Labels are shifted to left by one. + # So, shift text position ids and non-image indices to left by one. + if has_labels: + label_text_position_ids = text_position_ids - 1 + valid_label_text_position_ids = label_text_position_ids >= 0 + label_text_position_ids = label_text_position_ids[valid_label_text_position_ids] + + label_batch_indices = batch_indices[valid_label_text_position_ids] + + label_non_image_indices = non_image_indices - 1 + valid_label_non_image_indices = label_non_image_indices >= 0 + label_non_image_indices = label_non_image_indices[valid_label_non_image_indices] + + # Create a mask for the image embedding positions. + images_mask = torch.full( + (batch_size, max_seq_len), True, dtype=torch.bool, device=input_ids.device + ) + # No images in the text positions. + images_mask[batch_indices, text_position_ids] = False + # Samples can have different amount of images tokens. + # new_position_ids[:, -1] gives the last text position id for each sample. + # Padding is needed when the number of image tokens differs. + first_padding_idx = new_position_ids[:, -1] + 1 + images_mask[ + torch.arange(max_seq_len, device=first_padding_idx.device).repeat(batch_size, 1) + >= first_padding_idx.unsqueeze(1) + ] = False + + # Create the final input embedding (if this is the first language model stage). + final_embedding = None + if self.pre_process: + embed_dim = language_embeddings.shape[-1] + final_embedding = torch.zeros( + batch_size, + max_seq_len, + embed_dim, + dtype=language_embeddings.dtype, + device=language_embeddings.device, + ) + + # Put text embeddings to the text positions in the result tensor. + final_embedding[batch_indices, text_position_ids] = language_embeddings[ + batch_indices, non_image_indices + ] + + # Put image embeddings to image positions. + final_embedding[images_mask] = ( + image_embeddings.permute(1, 0, 2).reshape(-1, embed_dim).contiguous() + ) + + # Create the final labels and loss mask (if this is the last language model stage). + final_labels, final_loss_mask = None, None + if has_labels: + final_labels = torch.full( + (batch_size, max_seq_len), IGNORE_INDEX, dtype=labels.dtype, device=labels.device + ) + final_loss_mask = torch.full( + (batch_size, max_seq_len), 0, dtype=loss_mask.dtype, device=loss_mask.device + ) + + # Put text labels and loss mask to the text positions. + final_labels[label_batch_indices, label_text_position_ids] = labels[ + label_batch_indices, label_non_image_indices + ] + + final_loss_mask[batch_indices, text_position_ids] = loss_mask[ + batch_indices, non_image_indices + ] + + # For labels, pick the last label index that got dropped by the shift to left. + label_extra_text_position_ids = seq_lens - 1 + batch_range = torch.arange(len(label_extra_text_position_ids)) + final_labels[batch_range, label_extra_text_position_ids] = labels[batch_range, -1] + + # Loss mask the image positions. + final_loss_mask[images_mask] = 0 + + # Loss mask last text position just before an image + # so that text token does not need to predict the first image token. 
+ batch_image_indices, image_indices = torch.where(image_token_mask) + # Indices just before image tokens. If it's -1, skip it. + before_image_indices = image_indices - 1 + valid = before_image_indices >= 0 + valid_batch_image_indices = batch_image_indices[valid] + valid_before_image_indices = before_image_indices[valid] + # Map those indices those position ids. + valid_before_image_indices = new_position_ids[ + valid_batch_image_indices, valid_before_image_indices + ] + + final_loss_mask[valid_batch_image_indices, valid_before_image_indices] = 0 + + if final_embedding is not None and has_labels: + assert ( + final_embedding.shape[:2] == final_labels.shape == final_loss_mask.shape + ), "unexpected shapes after data preprocessing" + + truncate_labels = has_labels and final_labels.shape[1] > self._language_max_sequence_length + if truncate_labels: + final_labels = final_labels[:, : self._language_max_sequence_length] + final_loss_mask = final_loss_mask[:, : self._language_max_sequence_length] + + if final_embedding is not None: + final_embedding = final_embedding.transpose(1, 0).contiguous() + # Truncate if exceeding the language model's max sequence length. + if final_embedding.shape[0] > self._language_max_sequence_length: + final_embedding = final_embedding[: self._language_max_sequence_length] + if self.sequence_parallel_lm: + # Create an attention mask. This ensures correct computation. + # This is done even when no padding was done as we set mask_type to + # 'padding' or 'padding_causal' when using SP. + if attention_mask is None: + # Create base attention mask with original seq len to indicate valid tokens + attention_mask = ( + torch.ones( + ( + final_embedding.shape[1], + final_embedding.shape[0] - sp_padding_needed, + ), + device=final_embedding.device, + ) + .unsqueeze(1) + .unsqueeze(1) + ) # [b, 1, 1, final seq len - sp_padding_needed] + if sp_padding_needed > 0: + # Add the padding portion of the mask + attention_mask = torch.nn.functional.pad(attention_mask, (0, sp_padding_needed)) + if is_te_min_version("1.7.0"): + # Attention mask True/False meaning flipped in 1.7.0 + attention_mask = attention_mask < 0.5 + final_embedding = tensor_parallel.scatter_to_sequence_parallel_region( + final_embedding + ) + + return final_embedding, final_labels, final_loss_mask, attention_mask + + def forward( + self, + images: torch.Tensor, + input_ids: torch.Tensor, + position_ids: torch.Tensor, + attention_mask: torch.Tensor, + labels: Optional[torch.Tensor] = None, + loss_mask: Optional[torch.Tensor] = None, + inference_params: Optional[InferenceParams] = None, + num_image_tiles: Optional[List[int]] = None, + image_token_index: Optional[int] = IMAGE_TOKEN_INDEX, + runtime_gather_output: Optional[bool] = None, + ) -> torch.Tensor: + """Forward function of the LLaVA model. + + Args: + images (torch.Tensor): input images of shape [num_tiles, img_h, img_w]. + num_tiles means the number of image tiles in this batch. + num_tiles = 0 if the batch doesn't contain images. + input_ids (torch.Tensor): input text ids [batch, text_seq_len]. + position_ids (torch.Tensor): input text position ids [batch, text_seq_len]. + attention_mask (torch.Tensor): Language model attention mask + [batch, 1, 1, combined_seq_len]. + labels (torch.Tensor): Optional target text labels [batch, combined_seq_len]. + loss_mask (torch.Tensor): Text loss mask [batch, text_seq_len]. + inference_params (InferenceParams): Inference-time parameters including KV cache. + num_image_tiles (list of int): Number of tiles per image. 
Default 1 tile per image. + image_token_index (int): ID for input images. + runtime_gather_output (bool): Gather output at runtime. Default None means + `parallel_output` arg in the constructor will be used. + + Returns: + output (torch.Tensor): Loss of shape [b, s] if labels are provided, + otherwise logits of shape [b, s, vocab_size]. + loss_mask (torch.Tensor): Loss mask expanded to combined sequence length. Shape [b, s]. + """ + use_inference_kv_cache = ( + inference_params is not None + and "image_tokens_count" in inference_params.key_value_memory_dict + ) + has_images = images.shape[0] > 0 + + # If running inference, we can skip image token computation + # if they were computed already earlier for this sample. + if use_inference_kv_cache: + image_embeddings = None + elif self.add_encoder and not has_images: + # If no images provided, use an empty image embeddings tensor. + image_embeddings = torch.tensor([], dtype=images.dtype, device=images.device).reshape( + 0, 0, 0 + ) + elif self.add_encoder and has_images: + image_embeddings = self.vision_model(images) # [num_tiles, img_seq_len, h_vision] + if self._drop_vision_class_token: + image_embeddings = image_embeddings[:, self.vision_model.class_token_len :, :] + # contiguous() required as `permute` can sparsify the tensor and this breaks pipelining + image_embeddings = image_embeddings.permute( + 1, 0, 2 + ).contiguous() # [img_seq_len, num_tiles, h_vision] + + # map vision model output size to language model input size. + image_embeddings = self.vision_projection( + image_embeddings + ) # [img_seq_len, num_tiles, h_language] + + # TODO: Support batched inference. + # In inference, the language model KV cache will be updated for image token positions. + # Store the image tokens sequence length to be used as an offset to the KV cache later. + if inference_params is not None: + inference_params.key_value_memory_dict["image_tokens_count"] = ( + image_embeddings.shape[0] * image_embeddings.shape[1] + ) + else: + image_embeddings = self.encoder_hidden_state + + if not self.add_decoder: + return image_embeddings, loss_mask + + language_embeddings = None + if self.pre_process: + input_ids_text = input_ids.clone() + input_ids_text[input_ids_text == image_token_index] = 0 + # Note: This adds absolute position embedding but not RoPE. + # Each image is counted as one position. + # RoPE is added in language_model forward. Each image embedding is one position. + if self.sequence_parallel_lm: + # Pad to nearest multiple of TP world size for embedding. + tp_world_size = get_tensor_model_parallel_world_size() + padded_seq_len = ( + int( + (input_ids_text.shape[1] + tp_world_size - 1) + // tp_world_size + * tp_world_size + ) + - input_ids_text.shape[1] + ) + if padded_seq_len != 0: + input_ids_text = torch.nn.functional.pad(input_ids_text, (0, padded_seq_len)) + if position_ids is not None: + position_ids = torch.nn.functional.pad(position_ids, (0, padded_seq_len)) + language_embeddings = self.language_model.embedding( + input_ids=input_ids_text, position_ids=position_ids + ) # [text_seq_len, b, h_language] + if self.sequence_parallel_lm: + # Gather the language embeddings back. + # We use the full embedding to insert image embeddings + # and then scatter to avoid load imbalance. + language_embeddings = tensor_parallel.gather_from_sequence_parallel_region( + language_embeddings, tensor_parallel_output_grad=False + ) + # Remove the padding done for SP as we'll need new padding calculation + # after image embeddings are inserted. 
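As an aside, the pad-to-a-multiple arithmetic used for sequence parallelism above can be sketched in isolation (hypothetical helper, toy numbers):

```python
def sp_padding(seq_len: int, tp_world_size: int) -> int:
    """Right padding needed so seq_len becomes a multiple of tp_world_size."""
    return (seq_len + tp_world_size - 1) // tp_world_size * tp_world_size - seq_len

print(sp_padding(100, 8))  # 4 -> padded length 104
print(sp_padding(96, 8))   # 0 -> already a multiple
```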
+ if padded_seq_len != 0: + language_embeddings = language_embeddings[:-padded_seq_len] + language_embeddings = language_embeddings.transpose( + 1, 0 + ).contiguous() # [b, text_seq_len, h_language] + + # Assume 1 tile per image if the number of tiles is not provided. + if num_image_tiles is None: + num_image_tiles = torch.ones(images.shape[0], dtype=torch.int, device=input_ids.device) + + # Preprocess input, labels and loss mask. + combined_embeddings, new_labels, new_loss_mask, attention_mask = self._preprocess_data( + image_embeddings, + language_embeddings, + input_ids, + loss_mask, + labels, + use_inference_kv_cache, + image_token_index, + num_image_tiles, + attention_mask, + ) # [combined_seq_len, b, h_language], [b, combined_seq_len], [b, combined_seq_len] + + output = self.language_model( + input_ids=None, + position_ids=None, + attention_mask=attention_mask, + decoder_input=combined_embeddings, + labels=new_labels, + inference_params=inference_params, + runtime_gather_output=runtime_gather_output, + ) + + if labels is None or loss_mask is None: + return output + + return output, new_loss_mask + + +def _load_state_dict_hook_ignore_param_names( + param_names: List[str], module: torch.nn.Module, incompatible_keys: namedtuple +): + """Hook to ignore missing keys during checkpoint loading. + + By default, this should not be used to avoid accidentally missing weights in checkpoint loading. + + Example use case: Use this if you want to load a checkpoint that contains vision and language + model weights but not the vision projection weights. + + Args: + param_names (list str): Parameter names allowed to be missing when calling load_state_dict. + module (torch.nn.Module): The torch module this hook applies to. Required by the torch API. + incompatible_keys (namedtuple): Namedtuple with fields missing_keys and unexpected_keys, + which collect the missing and unexpected keys, respectively. + """ + for param_name in param_names: + if param_name in incompatible_keys.missing_keys: + logging.getLogger(__name__).warning( + f"{param_name} being removed from incompatible_keys.missing_keys in LlavaModel" + ) + incompatible_keys.missing_keys.remove(param_name) diff --git a/megatron/core/models/multimodal/llava_spec.py b/megatron/core/models/multimodal/llava_spec.py new file mode 100644 index 0000000000..40e58d0bfc --- /dev/null +++ b/megatron/core/models/multimodal/llava_spec.py @@ -0,0 +1,87 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
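Editor's note, for orientation before the spec file below: at the shape level, the vision path in `llava_model.py` above is a permute followed by a projection into the language model's hidden size. A rough stand-in (the `Linear` is only a placeholder for `MultimodalProjector`, and all sizes are assumed):

```python
import torch

num_tiles, img_seq_len, h_vision, h_language = 2, 576, 1024, 4096  # assumed sizes
vit_out = torch.randn(num_tiles, img_seq_len, h_vision)            # vision tower output

# Stand-in for MultimodalProjector: map vision hidden size to language hidden size.
projector = torch.nn.Linear(h_vision, h_language)

image_embeddings = projector(vit_out.permute(1, 0, 2).contiguous())
print(image_embeddings.shape)  # torch.Size([576, 2, 4096]) == [img_seq_len, num_tiles, h_language]
```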
+from megatron.core.extensions.transformer_engine import ( + TEDotProductAttention, + TELayerNormColumnParallelLinear, + TENorm, + TERowParallelLinear, +) +from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add +from megatron.core.models.gpt.gpt_layer_specs import _get_mlp_module_spec +from megatron.core.tensor_parallel.layers import ColumnParallelLinear, RowParallelLinear +from megatron.core.transformer.attention import SelfAttention, SelfAttentionSubmodules +from megatron.core.transformer.dot_product_attention import DotProductAttention +from megatron.core.transformer.enums import AttnMaskType +from megatron.core.transformer.identity_op import IdentityOp +from megatron.core.transformer.spec_utils import ModuleSpec +from megatron.core.transformer.transformer_layer import TransformerLayer, TransformerLayerSubmodules + +try: + import apex # pylint: disable=unused-import + + from megatron.core.fusions.fused_layer_norm import FusedLayerNorm + + HAVE_APEX = True + LNImpl = FusedLayerNorm +except ImportError: + import warnings + + from megatron.core.transformer.torch_layer_norm import WrappedTorchLayerNorm + + warnings.warn(f'Apex is not installed. Falling back to Torch LayerNorm') + LNImpl = WrappedTorchLayerNorm + + +def decoder_model_with_transformer_engine_default_spec( + num_experts: int = None, moe_grouped_gemm: bool = False, qk_layernorm: bool = False +) -> ModuleSpec: + """LLava decoder TE spec (uses Transformer Engine components).""" + mlp = _get_mlp_module_spec( + use_te=True, num_experts=num_experts, moe_grouped_gemm=moe_grouped_gemm + ) + return ModuleSpec( + module=TransformerLayer, + submodules=TransformerLayerSubmodules( + self_attention=ModuleSpec( + module=SelfAttention, + params={"attn_mask_type": AttnMaskType.causal}, + submodules=SelfAttentionSubmodules( + linear_qkv=TELayerNormColumnParallelLinear, + core_attention=TEDotProductAttention, + linear_proj=TERowParallelLinear, + q_layernorm=TENorm if qk_layernorm else IdentityOp, + k_layernorm=TENorm if qk_layernorm else IdentityOp, + ), + ), + self_attn_bda=get_bias_dropout_add, + mlp=mlp, + mlp_bda=get_bias_dropout_add, + ), + ) + + +def decoder_model_with_local_default_spec( + num_experts: int = None, moe_grouped_gemm: bool = False, qk_layernorm: bool = False +) -> ModuleSpec: + """LLava decoder local spec.""" + mlp = _get_mlp_module_spec( + use_te=False, num_experts=num_experts, moe_grouped_gemm=moe_grouped_gemm + ) + return ModuleSpec( + module=TransformerLayer, + submodules=TransformerLayerSubmodules( + input_layernorm=LNImpl, + self_attention=ModuleSpec( + module=SelfAttention, + params={"attn_mask_type": AttnMaskType.causal}, + submodules=SelfAttentionSubmodules( + linear_qkv=ColumnParallelLinear, + core_attention=DotProductAttention, + linear_proj=RowParallelLinear, + ), + ), + self_attn_bda=get_bias_dropout_add, + pre_mlp_layernorm=LNImpl, + mlp=mlp, + mlp_bda=get_bias_dropout_add, + ), + ) diff --git a/megatron/core/models/retro/__init__.py b/megatron/core/models/retro/__init__.py new file mode 100644 index 0000000000..ea7cea6d8f --- /dev/null +++ b/megatron/core/models/retro/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +""" +Exports: + + - RetroConfig: configuration dataclass for RetroModel. + - RetroModel: The Retro model. + - get_retro_decoder_block_spec: Get spec for Retro decoder transformer block. 
+""" + +from .config import RetroConfig +from .decoder_spec import get_retro_decoder_block_spec +from .model import RetroModel diff --git a/megatron/core/models/retro/base_attention.py b/megatron/core/models/retro/base_attention.py new file mode 100644 index 0000000000..ee8656d96a --- /dev/null +++ b/megatron/core/models/retro/base_attention.py @@ -0,0 +1,43 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +"""Base class for decoder and encoder attention modules.""" + +from megatron.core.models.retro.config import RetroConfig +from megatron.core.transformer.attention import CrossAttention, CrossAttentionSubmodules +from megatron.core.transformer.enums import AttnMaskType +from megatron.core.transformer.module import MegatronModule + + +class BaseRetroCrossAttention(MegatronModule): + """Base class for Retro cross attention, for both encoder & decoder layers. + + This class collects the retro arguments below (i.e., num neighbors, chunk + length, and retrieve length) for use in Retro's custom cross attention + operators. + + Args: + config (RetroConfig): Retro config. + submodules (CrossAttentionSubmodules): Cross attention submodules. + layer_number (int): Layer number within transformer block. + attn_mask_type (AttnMaskType): Mask type ('causal' or 'padding'). + """ + + def __init__( + self, + config: RetroConfig, + submodules: CrossAttentionSubmodules, + layer_number: int = 1, + attn_mask_type: AttnMaskType = AttnMaskType.padding, + ): + super().__init__(config=config) + + self.attn = CrossAttention( + config=config, + submodules=submodules, + layer_number=layer_number, + attn_mask_type=attn_mask_type, + ) + + self.retro_num_neighbors = config.retro_num_neighbors + self.retro_chunk_length = config.retro_chunk_length + self.retro_retrieved_length = config.retro_retrieved_length diff --git a/megatron/core/models/retro/config.py b/megatron/core/models/retro/config.py new file mode 100644 index 0000000000..d4b5c9684b --- /dev/null +++ b/megatron/core/models/retro/config.py @@ -0,0 +1,85 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +"""Configuration dataclass for a RetroModel.""" + +import os +from dataclasses import dataclass + +from megatron.core.transformer import TransformerConfig +from megatron.core.utils import is_te_min_version + + +@dataclass +class RetroConfig(TransformerConfig): + """Configuration object for Retro models.""" + + # Retro. + retro_project_dir: str = None + """Retro project directory, which contains the preprocessed data for for pretraining. This + directory is built during preprocessing (see tools/retro/README.md), and contains + subdirectories for the chunk database and pretraining neighbors. + """ + + retro_block_size: int = None + """Number of records to load per data file, as saved during preprocessing. Block processing is + used for efficient data preprocessing. 
+ """ + + retro_chunk_length: int = None + """Chunk length used for performing chunked cross-attention (CCA).""" + + retro_encoder_num_layers: int = 2 + """Number of layers to use for the retrieval encoder.""" + + retro_encoder_hidden_dropout: float = 0.1 + """Hidden dropout for retrieval encoder.""" + + retro_encoder_attention_dropout: float = 0.1 + """Attention dropout for retrieval encoder.""" + + retro_neighbor_dirs: dict = None + """Directory names of saved neighbor id files for train, valid, and test datasets.""" + + retro_num_neighbors: int = 2 + """Number of neighbors to retrieve during pretraining.""" + + retro_num_retrieved_chunks: int = 2 + """Number of chunks to retrieve from the retrieval database.""" + + retro_retrieved_length: int = None + """Cached value of retro_num_retrieved_chunks * retro_chunk_length (i.e., the total number of + retrieved tokens; neighbor + continuation). + """ + + retro_split_preprocessing: str = None + """Data split used during data preprocessing.""" + + retro_verify_neighbor_count: bool = True + """Verify that len(GPT dataset) == len(saved neighbors).""" + + def __post_init__(self) -> None: + """Validate Retro config.""" + + super().__post_init__() + + # Validate Transformer Engine version. + if is_te_min_version("1.3"): + try: + assert os.getenv("NVTE_FLASH_ATTN") == "0" + assert os.getenv("NVTE_FUSED_ATTN") == "0" + except Exception as e: + raise Exception( + "When using Transformer Engine >= 1.3, environment vars NVTE_FLASH_ATTN " + "and NVTE_FUSED_ATTN must both be defined and set to '0'. " + "Currently, NVTE_FLASH_ATTN == %s, NVTE_FUSED_ATTN == %s." + % ( + os.getenv("NVTE_FLASH_ATTN", "[unset]"), + os.getenv("NVTE_FUSED_ATTN", "[unset]"), + ) + ) + + # Preprocessing split should be defined. + assert self.retro_split_preprocessing is not None + + # Pre-compute retrieved length. + self.retro_retrieved_length = self.retro_num_retrieved_chunks * self.retro_chunk_length diff --git a/megatron/core/models/retro/decoder_attention.py b/megatron/core/models/retro/decoder_attention.py new file mode 100644 index 0000000000..6b7a04d884 --- /dev/null +++ b/megatron/core/models/retro/decoder_attention.py @@ -0,0 +1,305 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +"""Retro's cross attention modules for the decoder block.""" + +from functools import partial + from typing import Callable + +import numpy as np +import torch +from torch import Tensor + +from megatron.core import InferenceParams +from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add +from megatron.core.models.retro.base_attention import BaseRetroCrossAttention +from megatron.core.models.retro.config import RetroConfig +from megatron.core.models.retro.utils import get_all_true_mask +from megatron.core.transformer import ModuleSpec +from megatron.core.transformer.attention import CrossAttentionSubmodules +from megatron.core.transformer.enums import AttnMaskType +from megatron.core.transformer.module import MegatronModule +from megatron.core.transformer.transformer_block import TransformerBlock + + +class RetroDecoderCrossAttention(BaseRetroCrossAttention): + """Retro decoder's chunked cross attention operator. + + See this paper for more details: https://arxiv.org/abs/2112.04426. + Neighboring chunks retrieved from the chunk database are used here for + chunked-cross attention. + + ** Note about 'encoder_block_spec' ** + + Retro is an encoder-decoder model that uses its encoder for encoding + neighboring chunks that are retrieved from a chunk database.
These + encoded neighbors are then used in the decoder stack for performing + chunked-cross attention (see paper link above). + + In contrast to the T5 model, the encoder and decoder are computationally + intertwined, since the input to the encoder is the output of the self- + attention of the first decoder layer. As such, the encoder block itself + is instantiated within the first Retro decoder layer, in order to receive + the self-attention's output. (Note, that only the first decoder layer + instantiates an encoder block, and the remaining decoder layers use the + encoder output from the first decoder layer.) + + Args: + config (RetroConfig): Retro config. + submodules (CrossAttentionSubmodules): Cross attention submodules. + layer_number (int): Layer number within transformer block. + attn_mask_type (AttnMaskType): Mask type ('causal' or 'padding'). + encoder_block_spec (ModuleSpec): The first Retro decoder layer is provided with a transformer block spec to construct the neighbor encoder. + """ + + def __init__( + self, + config: RetroConfig, + submodules: CrossAttentionSubmodules, + layer_number: int = 1, + attn_mask_type: AttnMaskType = AttnMaskType.padding, + encoder_block_spec: ModuleSpec = None, + ): + super().__init__( + config=config, + submodules=submodules, + layer_number=layer_number, + attn_mask_type=attn_mask_type, + ) + + if encoder_block_spec: + self.encoder = TransformerBlock( + config=config, spec=encoder_block_spec, pre_process=True, post_process=False + ) + # self._encoder_key = 'encoder' # ... necessary? + else: + self.encoder = None + + def forward( + self, + hidden_states: Tensor, + attention_mask: Tensor, + key_value_states: Tensor = None, + inference_params: InferenceParams = None, + # rotary_pos_emb: Tensor = None, # ... unsupported for retro. + ) -> dict: + """Cross attention for Retro decoder. + + Notation: + ns : Sequence length. + bs : Batch size. + d : Hidden size. + l : Number of chunks per sample (i.e., seq_length/chunk_length). + m : Number of tokens per chunk. + k : Number of neighbors. + r : Number of retrieved tokens (neighbors + continuation). + + Args: + hidden_states (Tensor): Transformer layer hidden states. + attention_mask (Tensor): Attention mask. + key_value_states (Tensor): Neighbor embeddings if first decoder layer, else encoder output. + inference_params (InferenceParams): Inference params. + + Returns: + A dict consisting of the attention output and context, along with other scalars necessary for performing the downstream bias-dropout-add. + """ + + # hidden_states: [ ns, bs, d ] + # key_value_states: [ r, k*bs*l, d ] + + ns, bs, d = hidden_states.shape + l = int(np.ceil(ns / self.retro_chunk_length)) + + # Retrieve neighbors. + if self.encoder: + + # Sequence length remainder. + first_ns = ns % self.retro_chunk_length + + # Case 1: Sequence length not divisible by chunk length. + if first_ns > 0: + + # Split sequence into first partial chunk & remaining chunks. + first_chunk, rest_chunk = hidden_states[:first_ns], hidden_states[first_ns:] + + # Pad partial chunk with zeros. + first_chunk = torch.nn.functional.pad( + first_chunk, (0, 0, 0, 0, 0, self.retro_chunk_length - first_ns), 'constant', 0 + ) + + # Concatenate padded chunk with remaining chunks. + chunked_output = torch.cat((first_chunk, rest_chunk), dim=0) # [ l*m, bs, d ] + + # Case 2: Sequence length is divisible by chunk length. + else: + chunked_output = hidden_states # [ l*m, bs, d ] + + # Chunk & permute hidden states. 
+ # - hidden_states: [ l*m, bs, d ] + # - chunked_output: [ m, bs*l, d ] + chunked_output = ( + chunked_output.reshape(l, self.retro_chunk_length, bs, d) + .permute(1, 2, 0, 3) + .reshape(self.retro_chunk_length, bs * l, d) + .contiguous() + ) + + # flash attn: [ b, h, sq, sk ] + # fused attn: [ b, 1, 1, sq ] + chunked_output_mask = get_all_true_mask( + size=(1, 1, chunked_output.shape[0], key_value_states.shape[0]), + device=chunked_output.device, + ) + + # Encode neighbors. (Note: 'key_value_states' re-assigned here.) + key_value_states = self.encoder( + hidden_states=key_value_states, + attention_mask=attention_mask, + context=chunked_output, + context_mask=chunked_output_mask, + inference_params=inference_params, + ) # [ r, k*bs*l, d ] + key_value_states = key_value_states.reshape( + self.retro_retrieved_length * self.retro_num_neighbors, bs * l, d + ) # [ r*k, bs*l, d ] + + # Attend starting at last token of first chunk. + pad = (ns - 1) % self.retro_chunk_length + attending_chunks = hidden_states[pad:] + + # Pad attending tokens to sequence length. + padded_chunks = torch.nn.functional.pad( + attending_chunks, (0, 0, 0, 0, 0, self.retro_chunk_length - 1), 'constant', 0 + ) + + # Permute attending chunks. + # - padded_chunks: [ l*m, bs, d ] + # - padded_chunked_output: [ m, bs*l, d ] (matches 'chunked_output' above) + padded_chunked_output = padded_chunks.reshape(l, self.retro_chunk_length, bs, d).permute( + 1, 2, 0, 3 + ) + padded_chunked_output = padded_chunked_output.reshape( + self.retro_chunk_length, bs * l, d + ).contiguous() + + # flash attn: [ b, h, sq, sk ] + # fused attn: [ b, 1, 1, sq ] + padded_chunked_output_mask = get_all_true_mask( + size=(1, 1, padded_chunked_output.shape[0], key_value_states.shape[0]), + device=padded_chunked_output.device, + ) + + # Attend to encoded neighbors. + attention_output, attention_bias = self.attn( + hidden_states=padded_chunked_output, + attention_mask=padded_chunked_output_mask, + key_value_states=key_value_states, + ) + + # Return dimensions for bias-dropout step. + return { + "ns": ns, + "bs": bs, + "d": d, + "l": l, + "pad": pad, + "attention_output": attention_output, # [ m, bs*l, d ] + "attention_bias": attention_bias, # [ d ] + "context": key_value_states, # [ r*k, bs*l, d ] + } + + +class RetroDecoderBiasDropoutAdd(MegatronModule): + """Retro decoder's bias-dropout-add operator. + + This operator takes care of reshaping and permuting the output from the + chunk dimension to the sequence dimension. + + Args: + config (RetroConfig): Retro config. + """ + + def __init__(self, config: RetroConfig): + super().__init__(config=config) + self.retro_chunk_length = config.retro_chunk_length + + @classmethod + def _forward( + cls, + x_with_bias: dict, + residual: Tensor, + prob: float, + retro_chunk_length: int, + bias_dropout_add: Callable, + ) -> Tensor: + """Per-chunk bias-dropout-add. + + Args: + x_with_bias (dict): Attention output and bias, along with other Retro relevant parameters. + residual (Tensor): Transformer layer residual. + prob (float): Dropout probability. + retro_chunk_length (int): Retro chunk length (e.g., 64). + bias_dropout_add (Callable): Bias-dropout-add function. + + Returns: + Output of bias-dropout-add. + """ + + # Extract input dict. 
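The chunk/permute bookkeeping used by the cross attention above, and inverted in this bias-dropout-add, can be checked in isolation (illustrative sketch with toy sizes):

```python
import torch

# Toy sizes: l chunks of length m, batch bs, hidden d.
l, m, bs, d = 4, 3, 2, 8
ns = l * m
hidden_states = torch.randn(ns, bs, d)

# Forward permute used before cross attention: [ l*m, bs, d ] -> [ m, bs*l, d ].
chunked = hidden_states.reshape(l, m, bs, d).permute(1, 2, 0, 3).reshape(m, bs * l, d)

# Inverse permute used in the bias-dropout-add: [ m, bs*l, d ] -> [ l*m, bs, d ].
restored = chunked.reshape(m, bs, l, d).permute(2, 0, 1, 3).reshape(l * m, bs, d)
assert torch.equal(restored, hidden_states)
```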
+ ns = x_with_bias["ns"] + bs = x_with_bias["bs"] + d = x_with_bias["d"] + l = x_with_bias["l"] + pad = x_with_bias["pad"] + attention_output = x_with_bias["attention_output"] # [ m, bs*l, d ] + attention_bias = x_with_bias["attention_bias"] # [ d ] + + # Re-enable torch grad to enable fused optimization. + with torch.enable_grad(): + + # Bias-dropout-add. + x = bias_dropout_add( + ( + attention_output, + None if attention_bias is None else attention_bias.expand_as(attention_output), + ), + torch.zeros_like(attention_output), + prob, + ) + + # Permute chunks back to sequence dimension. + # 1. [ m, bs*l, d ] + # 2. [ m, bs, l, d ] + # 3. [ l, m, bs, d ] + # 4. [ m*l, bs, d ] == [ ns, bs, d ] + x = ( + x.reshape(retro_chunk_length, bs, l, d) + .permute(2, 0, 1, 3) + .reshape(retro_chunk_length * l, bs, d) + ) + + # Prepend zeros for non-attending tokens. + x = torch.nn.functional.pad(x, (0, 0, 0, 0, pad, 0), 'constant', 0)[ + :ns + ] # [ ns, bs, d ] + + # Add residual. [ ns, bs, d ] + x = x + residual + + # Output. [ ns, bs, d ] + return x + + def forward(self, training: bool, fused: bool) -> partial: + """Retro decoder bias-dropout-add. + + Args: + training (bool): If training, then apply dropout. + fused (bool): Fuse bias-dropout-add. + + Returns: + The partial function for performing bias-dropout-add. + """ + return partial( + self._forward, + retro_chunk_length=self.retro_chunk_length, + bias_dropout_add=get_bias_dropout_add(training, fused), + ) diff --git a/megatron/core/models/retro/decoder_spec.py b/megatron/core/models/retro/decoder_spec.py new file mode 100644 index 0000000000..2ad234b96b --- /dev/null +++ b/megatron/core/models/retro/decoder_spec.py @@ -0,0 +1,185 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +"""Specs for Retro decoder.""" + +import typing + +from megatron.core import parallel_state +from megatron.core.models.gpt.gpt_layer_specs import ( + get_gpt_layer_local_spec, + get_gpt_layer_with_transformer_engine_spec, +) +from megatron.core.models.retro.config import RetroConfig +from megatron.core.models.retro.decoder_attention import ( + RetroDecoderBiasDropoutAdd, + RetroDecoderCrossAttention, +) +from megatron.core.models.retro.encoder_spec import get_retro_encoder_block_spec +from megatron.core.tensor_parallel.layers import ColumnParallelLinear, RowParallelLinear +from megatron.core.transformer import ModuleSpec +from megatron.core.transformer.attention import CrossAttentionSubmodules +from megatron.core.transformer.dot_product_attention import DotProductAttention +from megatron.core.transformer.transformer_block import ( + TransformerBlockSubmodules, + get_num_layers_to_build, +) + +try: + import apex # pylint: disable=unused-import + + from megatron.core.fusions.fused_layer_norm import FusedLayerNorm + + HAVE_APEX = True + LNImpl = FusedLayerNorm +except ImportError: + import warnings + + from megatron.core.transformer.torch_layer_norm import WrappedTorchLayerNorm + + warnings.warn(f'Apex is not installed. Falling back to Torch LayerNorm') + LNImpl = WrappedTorchLayerNorm + +try: + from megatron.core.extensions.transformer_engine import ( + TEColumnParallelLinear, + TEDotProductAttention, + TENorm, + TERowParallelLinear, + ) + + HAVE_TE = True +except ImportError: + HAVE_TE = False + + +def get_retro_decoder_layer_te_spec( + encoder_block_spec: typing.Union[ModuleSpec, TransformerBlockSubmodules, None] = None +) -> ModuleSpec: + """Retro decoder TE spec (uses Transformer Engine components). 
+ + A Retro decoder layer uses custom attention and bias-dropout-add operators + to perform chunked-cross attention. Additionally, the first Retro decoder + layer instantiates an entire encoder transformer block. As such, the decoder + cross attention module takes an optional encoder block spec, which is only + provided for the first Retro decoder layer. + + Args: + encoder_block_spec (ModuleSpec): Retro encoder block spec, to be provided for + the first Retro decoder layer. + + Returns: + A module spec with Transformer Engine modules. + """ + spec = get_gpt_layer_with_transformer_engine_spec() + spec.submodules.pre_cross_attn_layernorm = TENorm + spec.submodules.cross_attention = ModuleSpec( + module=RetroDecoderCrossAttention, + params={"encoder_block_spec": encoder_block_spec}, + submodules=CrossAttentionSubmodules( + linear_q=TEColumnParallelLinear, + linear_kv=TEColumnParallelLinear, + core_attention=TEDotProductAttention, + linear_proj=TERowParallelLinear, + ), + ) + spec.submodules.cross_attn_bda = ModuleSpec(module=RetroDecoderBiasDropoutAdd) + return spec + + +def get_retro_decoder_layer_local_spec( + encoder_block_spec: typing.Optional[ModuleSpec] = None, +) -> ModuleSpec: + """Retro decoder local spec (uses Megatron-Core components). + + A Retro decoder layer uses custom attention and bias-dropout-add operators + to perform chunked-cross attention. Additionally, the first Retro decoder + layer instantiates an entire encoder transformer block. As such, the decoder + cross attention module takes an optional encoder block spec, which is only + provided for the first Retro decoder layer. + + Args: + encoder_block_spec (ModuleSpec): Retro encoder block spec, to be provided + for the first Retro decoder layer. + + Returns: + A module spec with local modules. + """ + spec = get_gpt_layer_local_spec() + spec.submodules.pre_cross_attn_layernorm = LNImpl + spec.submodules.cross_attention = ModuleSpec( + module=RetroDecoderCrossAttention, + params={"encoder_block_spec": encoder_block_spec}, + submodules=CrossAttentionSubmodules( + linear_q=ColumnParallelLinear, + linear_kv=ColumnParallelLinear, + core_attention=DotProductAttention, + linear_proj=RowParallelLinear, + ), + ) + spec.submodules.cross_attn_bda = ModuleSpec(module=RetroDecoderBiasDropoutAdd) + return spec + + +def get_retro_decoder_block_spec( + config: RetroConfig, use_transformer_engine: bool +) -> TransformerBlockSubmodules: + """Retro decoder block spec. + + Retro decoder block implementation details: + - The retro decoder block consists of interleaved GPT layers + and customized Retro decoder layers. + - The Retro decoder layers are spaced three layers apart, + and start on layer 6 or 9 (depending on the total number of layers). + - The first decoder layer instantiates an encoder block, + and it therefore passes in an encoder_block_spec. + + Args: + config (RetroConfig): Retro config. + use_transformer_engine (bool): If True, use Transformer Engine (instead of local modules. + + Returns: + Transformer block submodules for the given spec. + """ + + # Num layers. + assert ( + parallel_state.get_pipeline_model_parallel_world_size() == 1 + ), "retro does not currently support pipeline parallelism." + assert ( + parallel_state.get_virtual_pipeline_model_parallel_world_size() is None + ), "retro does not currently support virtual pipeline parallelism." + num_layers = get_num_layers_to_build(config) + + # Retro layer numbers. 
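For concreteness, the layer-placement rule described in the docstring (and implemented just below) gives, for example:

```python
def retro_layer_numbers(num_layers: int) -> list:
    start = 6 if num_layers <= 15 else 9
    return list(range(start, num_layers + 1, 3))

print(retro_layer_numbers(12))  # [6, 9, 12]
print(retro_layer_numbers(24))  # [9, 12, 15, 18, 21, 24]
```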
+ retro_layer_start = 6 if num_layers <= 15 else 9 + retro_layer_numbers = list(range(retro_layer_start, num_layers + 1, 3)) + + # Layer specs. + gpt_layer_spec = ( + get_gpt_layer_with_transformer_engine_spec() + if use_transformer_engine + else get_gpt_layer_local_spec() + ) + get_retro_decoder_layer_spec = ( + get_retro_decoder_layer_te_spec + if use_transformer_engine + else get_retro_decoder_layer_local_spec + ) + retro_layer_spec = get_retro_decoder_layer_spec() + retro_layer_spec_with_retriever = get_retro_decoder_layer_spec( + get_retro_encoder_block_spec(config, use_transformer_engine) + ) + + layer_specs = [] + for layer_number in range(1, num_layers + 1): + if layer_number == retro_layer_numbers[0]: + layer_specs.append(retro_layer_spec_with_retriever) + elif layer_number in retro_layer_numbers: + layer_specs.append(retro_layer_spec) + else: + layer_specs.append(gpt_layer_spec) + + # Block spec. + block_spec = TransformerBlockSubmodules(layer_specs=layer_specs) + + return block_spec diff --git a/megatron/core/models/retro/encoder_attention.py b/megatron/core/models/retro/encoder_attention.py new file mode 100644 index 0000000000..76625abe33 --- /dev/null +++ b/megatron/core/models/retro/encoder_attention.py @@ -0,0 +1,226 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +"""Retro's cross attention modules for the encoder block.""" + +from functools import partial +from typing import Callable, List, Optional, Tuple, Type + +import torch +from torch import Tensor + +from megatron.core import InferenceParams +from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add +from megatron.core.models.retro.base_attention import BaseRetroCrossAttention +from megatron.core.models.retro.config import RetroConfig +from megatron.core.models.retro.utils import get_all_true_mask +from megatron.core.transformer.module import MegatronModule + + +class RetroEncoderCrossAttention(BaseRetroCrossAttention): + """Retro encoder's cross attention operator. + + See this paper for more details: https://arxiv.org/abs/2112.04426. + Neighboring chunks are retrieved from the chunk database, encoded, and + used by the decoder layers for chunked cross attention. + + Args: + config (RetroConfig): Retro config. + submodules (CrossAttentionSubmodules): Cross attention submodules. + layer_number (int): Layer number within transformer block. + attn_mask_type (AttnMaskType): Mask type ('causal' or 'padding'). + """ + + def forward( + self, + hidden_states: Tensor, + attention_mask: Tensor, + key_value_states: Tensor = None, + inference_params: InferenceParams = None, + # rotary_pos_emb: Tensor = None, # unsupported for retro. + ) -> List[Tuple[Tensor, Optional[Tensor], Tensor]]: + """Cross attention for Retro encoder. + + Notation: + ns : Sequence length. + bs : Batch size. + d : Hidden size. + l : Number of chunks per sample (i.e., seq_length/chunk_length). + k : Number of neighbors. + r : Number of retrieved tokens (neighbors + continuation). + + Args: + hidden_states (Tensor): Transformer layer hidden states. + attention_mask (Tensor): Attention mask. + key_value_states (Tensor): Neighbor embeddings. + inference_params (InferenceParams): Inference params. + + Returns: + List of tuples, where each tuple is (attention_output, attention_bias, residual). + """ + + # Input shape. [ r, bs*l*k, d ] + ns, bs, d = hidden_states.shape + + # Reshape sequence into neighboring chunks. 
+ # - hidden_states: [ r, bs*l*k, d ] + # - chunked_outputs: [ r, bs*l, k, d ] + chunked_outputs = hidden_states.reshape( + self.retro_retrieved_length, -1, self.retro_num_neighbors, d + ) + + # flash attn: [ b, h, sq, sk ] + # fused attn: [ b, 1, 1, sq ] + chunked_output_mask = get_all_true_mask( + size=(1, 1, chunked_outputs.shape[0], key_value_states.shape[0]), + device=chunked_outputs.device, + ) + + # Per-chunk attention. + attention_output_tuples = [] + for k in range(self.retro_num_neighbors): + + # Attend to current neighboring chunks. + # - chunked_output: [ r, bs*l, d ] + # - key_value_states: [ m, bs*l, d ] + # - attention_output: [ r, bs*l, d ] + # - attention_bias: [ d ] + chunked_output = chunked_outputs[:, :, k].contiguous() + attention_output, attention_bias = self.attn( + hidden_states=chunked_output, # Q (neighbor embedding) + attention_mask=chunked_output_mask, + key_value_states=key_value_states, # K, V (hidden act) + ) + + # Residual connection. [ r, bs*l, d ] + residual = chunked_output + + # Collect tensors. + attention_output_tuples.append((attention_output, attention_bias, residual)) + + # Output. (List[Tuple[( [ r, bs*l, d ], [ d ] )]]) + return attention_output_tuples + + +class RetroEncoderBiasDropoutAdd(MegatronModule): + """Retro encoder's bias-dropout-add operator. + + This operator applies bias-dropout-add individually on each neighboring + chunk that is retrieved from the chunk database. + + Args: + config (RetroConfig): Retro config. + """ + + def __init__(self, config: RetroConfig): + super().__init__(config=config) + self.retro_num_neighbors = config.retro_num_neighbors + + @classmethod + def _forward( + cls, + x_with_bias: List[Tuple[Tensor, Optional[Tensor], Tensor]], + residual: Tensor, + prob: float, + retro_num_neighbors: int, + bias_dropout_add: Callable, + ) -> Tensor: + """Per-chunk bias-dropout-add. + + Args: + x_with_bias (dict): Attention output and bias tuple. + residual (Tensor): Transformer layer residual. + prob (float): Dropout probability. + retro_num_neighbors (int): Number of retrieved neighbor chunks (e.g., 2). + bias_dropout_add (Callable): Bias-dropout-add function. + + Returns: + Output of bias-dropout-add. + """ + + # Re-enable torch grad to enable fused optimization. + with torch.enable_grad(): + + # Per-neighbor bias-dropout-add. + # - attention_output: [ r, bs*l, d ] + # - attention_bias: [ d ] + # - residual: [ r, bs*l, d ] + # - output: [ r, bs*l, d ] + outputs = [ + bias_dropout_add( + ( + attention_output, + None if attention_bias is None else attention_bias.expand_as(residual), + ), + residual, + prob, + ) + for attention_output, attention_bias, residual in x_with_bias + ] + + # Concatenate outputs (to shape [r, k*bs*l, d]; see notation above). + r, _, d = outputs[0].shape + output = torch.stack(outputs, dim=1).reshape(r, -1, d) + + # Output. [ r, k*bs*l, d ] + return output + + def forward(self, training: bool, fused: bool) -> partial: + """Retro decoder bias-dropout-add. + + Args: + training (bool): If training, then apply dropout. + fused (bool): Fuse bias-dropout-add. + + Returns: + A partial function for performing bias-dropout-add. + """ + return partial( + self._forward, + retro_num_neighbors=self.retro_num_neighbors, + bias_dropout_add=get_bias_dropout_add(training, fused), + ) + + +class RetroEncoderLayerNorm(MegatronModule): + """Retro encoder's layernorm operator. 
+ + This operator applies layernorm individually on each neighboring chunk that + is retrieved from the chunk database, and then concatenates the chunks into + a single tensor. + + Args: + config (RetroConfig): Retro config. + submodules (Type): Layer norm class. (Named 'submodules' to fit external interface.) + """ + + def __init__(self, config: RetroConfig, submodules: Type, **kwargs: dict): + super().__init__(config=config) + norm_class = submodules + self.norm = norm_class(config=config, **kwargs) + self.retro_num_neighbors = config.retro_num_neighbors + + def forward(self, input: Tensor) -> Tensor: + """Per-chunk layer norm. + + Args: + input (Tensor): Input chunks, concatenated into a single tensor. + + Returns: + Output of the layer norm. + """ + + # Input shape: [ r, k*bs*l, d ]. (see notation above in attention module) + + # Split input into 'num_neighbors' tensors. + chunk_size = input.shape[1] // self.retro_num_neighbors + inputs = torch.split(input, chunk_size, dim=1) + + # Norm. + outputs = [self.norm(inp.contiguous()) for inp in inputs] + + # Concatenate layer norms (to shape [r, k*bs*l, d]; see notation above). + r, _, d = inputs[0].shape + output = torch.stack(outputs, dim=1).reshape(r, -1, d) + + # Output. [ r, k*bs*l, d ] + return output diff --git a/megatron/core/models/retro/encoder_spec.py b/megatron/core/models/retro/encoder_spec.py new file mode 100644 index 0000000000..b8a969bd84 --- /dev/null +++ b/megatron/core/models/retro/encoder_spec.py @@ -0,0 +1,168 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +"""Specs for Retro encoder.""" + +from megatron.core.models.gpt.gpt_layer_specs import ( + get_gpt_layer_local_spec, + get_gpt_layer_with_transformer_engine_spec, +) +from megatron.core.models.retro.config import RetroConfig +from megatron.core.models.retro.encoder_attention import ( + RetroEncoderBiasDropoutAdd, + RetroEncoderCrossAttention, + RetroEncoderLayerNorm, +) +from megatron.core.tensor_parallel.layers import ColumnParallelLinear, RowParallelLinear +from megatron.core.transformer import ModuleSpec +from megatron.core.transformer.attention import CrossAttentionSubmodules +from megatron.core.transformer.dot_product_attention import DotProductAttention +from megatron.core.transformer.enums import AttnMaskType +from megatron.core.transformer.mlp import MLP, MLPSubmodules +from megatron.core.transformer.transformer_block import TransformerBlockSubmodules + +try: + from megatron.core.extensions.transformer_engine import ( + TEColumnParallelLinear, + TEDotProductAttention, + TENorm, + TERowParallelLinear, + ) + + HAVE_TE = True +except ImportError: + HAVE_TE = False + +try: + import apex # pylint: disable=unused-import + + from megatron.core.fusions.fused_layer_norm import FusedLayerNorm + + HAVE_APEX = True + LNImpl = FusedLayerNorm +except ImportError: + import warnings + + from megatron.core.transformer.torch_layer_norm import WrappedTorchLayerNorm + + warnings.warn(f'Apex is not installed. Falling back to Torch LayerNorm') + LNImpl = WrappedTorchLayerNorm + + +def get_retro_encoder_layer_te_spec() -> ModuleSpec: + """Retro encoder TE spec (uses Transformer Engine components). + + A Retro encoder layer uses custom attention, bias-dropout-add, and layernorm + operators to encode neighboring chunks that are retrieved from the chunk + database. Each operator is responsible for iterating the retrieved chunks + and processing them individually. + + Returns: + A module spec if Transformer Engine modules. 
+ """ + spec = get_gpt_layer_with_transformer_engine_spec() + spec.submodules.pre_cross_attn_layernorm = TENorm + spec.submodules.cross_attention = ModuleSpec( + module=RetroEncoderCrossAttention, + params={"attn_mask_type": AttnMaskType.padding}, + submodules=CrossAttentionSubmodules( + linear_q=TEColumnParallelLinear, + linear_kv=TEColumnParallelLinear, + core_attention=TEDotProductAttention, + linear_proj=TERowParallelLinear, + ), + ) + spec.submodules.cross_attn_bda = ModuleSpec(module=RetroEncoderBiasDropoutAdd) + spec.submodules.pre_mlp_layernorm = ModuleSpec(module=RetroEncoderLayerNorm, submodules=TENorm) + spec.submodules.mlp = ModuleSpec( + module=MLP, + submodules=MLPSubmodules(linear_fc1=TEColumnParallelLinear, linear_fc2=TERowParallelLinear), + ) + return spec + + +def get_retro_encoder_layer_local_spec() -> ModuleSpec: + """Retro encoder local spec (uses Megatron-Core components). + + A Retro encoder layer uses custom attention, bias-dropout-add, and layernorm + operators to encode neighboring chunks that are retrieved from the chunk + database. Each operator is responsible for iterating the retrieved chunks + and processing them individually. + + Returns: + A module spec if local modules. + """ + spec = get_gpt_layer_local_spec() + spec.submodules.pre_cross_attn_layernorm = LNImpl + spec.submodules.cross_attention = ModuleSpec( + module=RetroEncoderCrossAttention, + params={"attn_mask_type": AttnMaskType.padding}, + submodules=CrossAttentionSubmodules( + linear_q=ColumnParallelLinear, + linear_kv=ColumnParallelLinear, + core_attention=DotProductAttention, + linear_proj=RowParallelLinear, + ), + ) + spec.submodules.cross_attn_bda = ModuleSpec(module=RetroEncoderBiasDropoutAdd) + spec.submodules.pre_mlp_layernorm = ModuleSpec(module=RetroEncoderLayerNorm, submodules=LNImpl) + spec.submodules.mlp = ModuleSpec( + module=MLP, + submodules=MLPSubmodules(linear_fc1=ColumnParallelLinear, linear_fc2=RowParallelLinear), + ) + spec.submodules.sharded_state_dict_keys_map = { + 'input_layernorm.': 'self_attention.linear_qkv.layer_norm_' + } # pre_mlp_layernorm doesn't need remapping + return spec + + +def get_retro_encoder_block_spec( + config: RetroConfig, use_transformer_engine: bool +) -> TransformerBlockSubmodules: + """Retro encoder block spec. + + The retro encoder block consists of one customized Retro encoder layer + (layer 1), and all of the following layers are standard GPT layers. + + Args: + config (RetroConfig): Retro config. + use_transformer_engine (bool): If True, use Transformer Engine (instead of local modules). + + Returns: + Transformer block submodules for the given spec. + """ + + # Num layers. + num_layers = config.retro_encoder_num_layers + retro_layer_numbers = [1] + + # Layer specs. 
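+    # Illustrative note (the encoder depth used here is an assumption, not a value
+    # taken from this patch): with config.retro_encoder_num_layers == 2, the loop
+    # below yields layer_specs == [retro_layer_spec, gpt_layer_spec], i.e. only
+    # layer 1 uses the customized Retro encoder layer and every remaining layer is
+    # a standard GPT layer.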
+ gpt_layer_spec = ( + get_gpt_layer_with_transformer_engine_spec() + if use_transformer_engine + else get_gpt_layer_local_spec() + ) + get_retro_encoder_layer_spec = ( + get_retro_encoder_layer_te_spec + if use_transformer_engine + else get_retro_encoder_layer_local_spec + ) + retro_layer_spec = get_retro_encoder_layer_spec() + for spec in (gpt_layer_spec, retro_layer_spec): + spec.params["hidden_dropout"] = config.retro_encoder_hidden_dropout + spec.submodules.self_attention.params["attn_mask_type"] = AttnMaskType.padding + spec.submodules.self_attention.submodules.core_attention = ModuleSpec( + module=TEDotProductAttention if use_transformer_engine else DotProductAttention, + params={"attention_dropout": config.retro_encoder_attention_dropout}, + ) + + layer_specs = [] + for layer_number in range(1, num_layers + 1): + if layer_number in retro_layer_numbers: + layer_specs.append(retro_layer_spec) + else: + layer_specs.append(gpt_layer_spec) + + # Block spec. + block_spec = TransformerBlockSubmodules(layer_specs=layer_specs) + + return block_spec diff --git a/megatron/core/models/retro/model.py b/megatron/core/models/retro/model.py new file mode 100644 index 0000000000..8142c91f7a --- /dev/null +++ b/megatron/core/models/retro/model.py @@ -0,0 +1,99 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +"""Retro Model.""" +from typing import Dict, Optional + +from torch import Tensor + +from megatron.core import InferenceParams +from megatron.core.dist_checkpointing.mapping import ShardedStateDict +from megatron.core.models.gpt import GPTModel + + +class RetroModel(GPTModel): + """Retro Model. + + A Retro model mostly re-uses the GPTModel interface, with the only difference + being the embedding of the 'context' this is used by Retro for processing + neighbor tokens. This embedded context is then forwarded to the Transformer + Block. + """ + + def forward( + self, + input_ids: Tensor, + position_ids: Tensor, + attention_mask: Tensor, + context_input_ids: Tensor = None, + context_position_ids: Tensor = None, + context_mask: Tensor = None, + decoder_input: Tensor = None, + labels: Tensor = None, + inference_params: InferenceParams = None, + ) -> Tensor: + """RetroModel forward method. + + Foward input tokens & mask, along with neighbor tokens & mask, through + the Retro model.. + + Args: + input_ids (Tensor): Input token IDs. + position_ids (Tensor): Input position IDs. + attention_mask (Tensor): Input attention mask. + context_input_ids (Tensor): Context (i.e., neighbor) token IDs. + context_position_ids (Tensor): Context (i.e., neighbor) position IDs. + context_mask (Tensor): Context (i.e., neighbor) attention mask. + decoder_input (Tensor): When using pipeline parallelism, input_ids and position_ids will only be used on the first stage, and for all other stages decoder_input will be provided via communication from the previous stage. + labels (Tensor): The labels of dimension [batch size, seq length]. + inference_params (InferenceParams): Parameters for inference. + + Returns: + Output tensor of forward pass. + """ + + # Argument shapes: + # Notation: + # ns : Sequence length. + # bs : Batch size. + # d : Hidden size. + # l : Number of chunks per sample (i.e., seq_length/chunk_length). + # k : Number of neighbors. + # r : Number of retrieved tokens (neighbors + continuation). + # - input_ids: [ bs, ns ] + # - context_ids: [ k*bs*l, r ] + # - context: [ r, k*bs*l, d ] + # - output: [ ns, bs, d ] + + # Context embedding (e.g., for Retro neighbor tokens). 
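+        # Illustrative shape example (assumed values, for intuition only): with
+        # bs=2, ns=2048, a chunk length of 64 (so l=32), k=2 neighbors and r=128
+        # retrieved tokens per neighbor, context_input_ids has shape
+        # [ k*bs*l, r ] = [128, 128] and the embedded context computed below has
+        # shape [ r, k*bs*l, d ] = [128, 128, d].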
+ if context_input_ids is not None: + context = self.embedding(context_input_ids, context_position_ids) + else: + context = None + + # Call GPTModel.forward, and pass in embedded context. + return super().forward( + input_ids=input_ids, + position_ids=position_ids, + attention_mask=attention_mask, + decoder_input=decoder_input, + labels=labels, + inference_params=inference_params, + extra_block_kwargs={"context": context, "context_mask": context_mask}, + ) + + def sharded_state_dict( + self, prefix: str = '', sharded_offsets: tuple = (), metadata: Optional[Dict] = None + ) -> ShardedStateDict: + """Get sharded state dict. + + Args: + prefix (str): Module name prefix. + sharded_offsets (tuple): Offsets of local shard within global tensor. + metadata (Optional[Dict]): Shard metadata. + + Returns: + A ? + """ + metadata = metadata or {} + metadata['non_homogeneous_layers'] = True + return super().sharded_state_dict(prefix, sharded_offsets, metadata) diff --git a/megatron/core/models/retro/utils.py b/megatron/core/models/retro/utils.py new file mode 100644 index 0000000000..7d83c5d306 --- /dev/null +++ b/megatron/core/models/retro/utils.py @@ -0,0 +1,24 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +import os + +import torch + + +def get_config_path(project_dir: str) -> str: + """Config copy stored within retro project dir.""" + return os.path.join(project_dir, "config.json") + + +def get_gpt_data_dir(project_dir: str) -> str: + """Get project-relative directory of GPT bin/idx datasets.""" + return os.path.join(project_dir, "data") + + +# ** Note ** : Retro's compatibility between cross attention and Flash/Fused +# Attention is currently a work in progress. We default to returning None for +# now. +# def get_all_true_mask(size, device): +# return torch.full(size=size, fill_value=True, dtype=torch.bool, device=device) +def get_all_true_mask(size, device): + return None diff --git a/megatron/core/models/vision/__init__.py b/megatron/core/models/vision/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/megatron/core/models/vision/clip_vit_model.py b/megatron/core/models/vision/clip_vit_model.py new file mode 100644 index 0000000000..53c3feddee --- /dev/null +++ b/megatron/core/models/vision/clip_vit_model.py @@ -0,0 +1,190 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +from typing import Optional, Union + +import torch + +from megatron.core.config_logger import has_config_logger_enabled, log_config_to_disk +from megatron.core.extensions.transformer_engine import TENorm +from megatron.core.models.common.vision_module.vision_module import VisionModule +from megatron.core.transformer.enums import ModelType +from megatron.core.transformer.spec_utils import ModuleSpec, build_module +from megatron.core.transformer.transformer_block import TransformerBlock +from megatron.core.transformer.transformer_config import TransformerConfig + + +# Note: This is under development and is missing features like position embedding interpolation. +class CLIPViTModel(VisionModule): + """CLIP ViT vision model. + + Args: + transformer_config (TransformerConfig): Transformer config. + transformer_layer_spec (ModuleSpec): Specifies module to use for transformer layers. + ln_pre_impl (ModuleSpec or type): Specifies the layer norm type to use for ln_pre. + add_class_token (bool, optional): Include a class token. Defaults to True. + class_token_len (int): Class token length. Defaults to 1 but 8 may be faster. + patch_dim (int): Image patch size. 
+ img_h (int): Input image height. + img_w (int): Input image width. + """ + + def __init__( + self, + transformer_config: TransformerConfig, + transformer_layer_spec: ModuleSpec, + ln_pre_impl: Union[ModuleSpec, type] = TENorm, + ln_post_impl: Union[ModuleSpec, type] = TENorm, + add_class_token: bool = True, + class_token_len: int = 1, + patch_dim: int = 14, + img_h: int = 336, + img_w: int = 336, + model_subtype: str = "clip", + ) -> None: + + error_msg = f"CLIPViTModel model subtype {model_subtype} is not supported." + assert model_subtype in ["clip", "siglip"], error_msg + + if model_subtype == "siglip": + assert class_token_len == 0, "SigLIP does not support class tokens." + assert not add_class_token, "SigLIP does not support class tokens." + + super().__init__(config=transformer_config) + + if has_config_logger_enabled(transformer_config): + log_config_to_disk(transformer_config, locals(), prefix=type(self).__name__) + + self.class_token_len = class_token_len + self.visual_hidden_size = transformer_config.hidden_size + self.patch_dim = patch_dim + self.img_h = img_h + self.img_w = img_w + + assert self.img_h % self.patch_dim == 0 + assert self.img_w % self.patch_dim == 0 + self.num_patches_per_dim_h = self.img_h // self.patch_dim + self.num_patches_per_dim_w = self.img_w // self.patch_dim + self.num_patches = self.num_patches_per_dim_h * self.num_patches_per_dim_w + + self.add_class_token = add_class_token + self.class_token_len = class_token_len + + self.seq_length = self.num_patches + (self.class_token_len if self.add_class_token else 0) + + self.ln_pre = None + self.ln_post = None + if model_subtype == "clip": + self.ln_pre = build_module( + ln_pre_impl, + config=transformer_config, + hidden_size=self.visual_hidden_size, + eps=transformer_config.layernorm_epsilon, + ) + conv_bias = False + padding = 0 + if model_subtype == "siglip": + self.ln_post = build_module( + ln_post_impl, + config=transformer_config, + hidden_size=self.visual_hidden_size, + eps=transformer_config.layernorm_epsilon, + ) + conv_bias = True + padding = "valid" + + self.conv1 = torch.nn.Conv2d( + in_channels=3, + out_channels=self.visual_hidden_size, + kernel_size=self.patch_dim, + stride=self.patch_dim, + bias=conv_bias, + padding=padding, + ) + + self.position_ids = torch.arange(self.seq_length).expand(1, -1).cuda() + + self.position_embeddings = torch.nn.Embedding(self.seq_length, self.visual_hidden_size) + + self.add_class_token = add_class_token + if self.add_class_token: + self.class_token = torch.nn.Parameter( + torch.randn(1, self.class_token_len, self.visual_hidden_size) + ) + + self.model_type = ModelType.encoder_or_decoder + + # Transformer layers. + # TODO: Make pre_process and post_process configurable. + # NOTE: a final layer norm and/or linear layer in some implementations are omitted here. + # They can be added separately where needed. + self.decoder = TransformerBlock( + config=transformer_config, + spec=transformer_layer_spec, + pre_process=True, + post_process=False, + ) + + def set_input_tensor(self, input_tensor: torch.Tensor) -> None: + """Sets input tensor to the model. + + Args: + input_tensor (Tensor): Sets the input tensor for the model. + """ + self.decoder.set_input_tensor(input_tensor) + + def forward( + self, x: torch.Tensor, attention_mask: Optional[torch.Tensor] = None + ) -> torch.Tensor: + """Forward function of the CLIP ViT Model. This function passes the input tensors + through the embedding layer and then the transformer. 
+ + Args: + x (torch.Tensor): input data of shape [batch, img_h, img_w] + attention_mask (torch.Tensor with dtype=bool): Attention mask to use. + + Returns: + x (torch.Tensor): output after final transformer block of shape [b, s, h]. + """ + x = self.conv1(x) # shape = [batch, hidden_size, grid, grid] + x = x.reshape(x.shape[0], x.shape[1], -1) # [batch, hidden_size, grid ** 2] + x = x.permute(0, 2, 1) # [batch, grid ** 2, hidden_size] + + if self.add_class_token: + class_token = self.class_token.expand( + x.shape[0], -1, -1 + ) # [batch, class_token_len, hidden_size] + x = torch.cat( + [class_token, x], dim=1 + ) # [batch, grid ** 2 + class_token_len, hidden_size] + + assert x.shape[1] == self.seq_length, f"{x.shape[1]} != {self.seq_length}" + x = x + self.position_embeddings(self.position_ids) + if self.ln_pre: + x = self.ln_pre(x) + x = x.permute(1, 0, 2) # [b, s, h] -> [s, b, h] + # `permute` can make the tensor non-contiguous, breaking pipelining. + x = x.contiguous() + + x = self.decoder(x, attention_mask) + x = x.permute(1, 0, 2) # [s, b, h] -> [b, s, h] + x = x.contiguous() + if self.ln_post: + x = self.ln_post(x) + return x + + +def get_num_image_embeddings( + img_h, img_w, patch_dim, vision_model_type, disable_vision_class_token, class_token_len +): + """Get the number of image embeddings per image tile.""" + if vision_model_type == "siglip": + keep_class_token = False + elif vision_model_type == "clip": + keep_class_token = not disable_vision_class_token + + num_patches_per_dim_h = img_h // patch_dim + num_patches_per_dim_w = img_w // patch_dim + num_patches = num_patches_per_dim_h * num_patches_per_dim_w + num_image_embeddings_per_tile = num_patches + (class_token_len if keep_class_token else 0) + + return num_image_embeddings_per_tile diff --git a/megatron/core/models/vision/multimodal_projector.py b/megatron/core/models/vision/multimodal_projector.py new file mode 100644 index 0000000000..18e62c68a5 --- /dev/null +++ b/megatron/core/models/vision/multimodal_projector.py @@ -0,0 +1,67 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +from megatron.core import tensor_parallel +from megatron.core.transformer.mlp import MLP, MLPSubmodules +from megatron.core.transformer.module import MegatronModule +from megatron.core.transformer.spec_utils import ModuleSpec, build_module +from megatron.core.transformer.transformer_config import TransformerConfig +from megatron.core.utils import make_viewless_tensor + + +class MultimodalProjector(MegatronModule): + """ + MultimodalProjector will take the encoded input with input_size hidden state and project + it into the hidden size of the language model for multimodal training. When projector is + type affine linear_fc1 from submodules is used. 
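+
+    For example (illustrative): projector_type="mlp" builds a standard MLP from
+    `submodules`, while projector_type="affine" builds a single linear layer
+    (submodules.linear_fc1) that maps `input_size` to the language model's hidden size.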
+ + Args: + transformer_config (TransformerConfig): Transformer config + submodules (MLPSubmodules): Specifies MLP submodules for mlp type projector + projector_type (str): Projector type + input_size (int): Input size from feature encoder + """ + + def __init__( + self, + config: TransformerConfig, + submodules: MLPSubmodules, + projector_type: str, + input_size: int, + ): + super().__init__(config=config) + self.projector_type = projector_type + + assert submodules is not None, "MLPSubmodules must be provided" + + if self.projector_type == "mlp": + self.encoder = MLP(config=config, submodules=submodules, input_size=input_size) + elif self.projector_type == "affine": + self.encoder = build_module( + submodules.linear_fc1, + input_size, + config.hidden_size, + config=config, + init_method=config.init_method, + gather_output=True, + bias=config.add_bias_linear, + skip_bias_add=True, + is_expert=False, + tp_comm_buffer_name=None, + ) + else: + raise Exception(f"Unsupported multimodal projection type {self.projector_type}") + + def forward(self, hidden_states): + # Run encoder. + encoder_output, encoder_output_bias = self.encoder(hidden_states) + + if encoder_output_bias is not None: + encoder_output = encoder_output + encoder_output_bias + + # the encoder produces "viewed" tensor. This will result in schedule.py's + # deallocate_output_tensor() throwing an error, so a viewless tensor is + # created to prevent this. + encoder_output = make_viewless_tensor( + inp=encoder_output, requires_grad=True, keep_graph=True + ) + + return encoder_output diff --git a/megatron/core/models/vision/vit_layer_specs.py b/megatron/core/models/vision/vit_layer_specs.py new file mode 100644 index 0000000000..da9066b007 --- /dev/null +++ b/megatron/core/models/vision/vit_layer_specs.py @@ -0,0 +1,95 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +from megatron.core.extensions.transformer_engine import ( + TEDotProductAttention, + TELayerNormColumnParallelLinear, + TERowParallelLinear, +) +from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add +from megatron.core.tensor_parallel.layers import ColumnParallelLinear, RowParallelLinear +from megatron.core.transformer.attention import SelfAttention, SelfAttentionSubmodules +from megatron.core.transformer.dot_product_attention import DotProductAttention +from megatron.core.transformer.enums import AttnMaskType +from megatron.core.transformer.identity_op import IdentityOp +from megatron.core.transformer.mlp import MLP, MLPSubmodules +from megatron.core.transformer.spec_utils import ModuleSpec +from megatron.core.transformer.transformer_layer import TransformerLayer, TransformerLayerSubmodules + +try: + import apex # pylint: disable=unused-import + + from megatron.core.fusions.fused_layer_norm import FusedLayerNorm + + HAVE_APEX = True + LNImpl = FusedLayerNorm +except ImportError: + import warnings + + from megatron.core.transformer.torch_layer_norm import WrappedTorchLayerNorm + + warnings.warn(f'Apex is not installed. 
Falling back to Torch LayerNorm') + LNImpl = WrappedTorchLayerNorm + + +# Use this spec to use lower level Transformer Engine modules (required for fp8 training) +def get_vit_layer_with_transformer_engine_spec() -> ModuleSpec: + ''' + Returns ViT layer spec with Transformer Engine layers + ''' + mlp = _get_mlp_module_spec(use_te=True) + return ModuleSpec( + module=TransformerLayer, + submodules=TransformerLayerSubmodules( + self_attention=ModuleSpec( + module=SelfAttention, + params={"attn_mask_type": AttnMaskType.no_mask}, + submodules=SelfAttentionSubmodules( + linear_qkv=TELayerNormColumnParallelLinear, + core_attention=TEDotProductAttention, + linear_proj=TERowParallelLinear, + ), + ), + self_attn_bda=get_bias_dropout_add, + pre_mlp_layernorm=IdentityOp, + mlp=mlp, + mlp_bda=get_bias_dropout_add, + ), + ) + + +def get_vit_layer_with_local_spec() -> ModuleSpec: + ''' + Returns ViT layer spec with Mcore local layers + ''' + mlp = _get_mlp_module_spec(use_te=False) + return ModuleSpec( + module=TransformerLayer, + submodules=TransformerLayerSubmodules( + input_layernorm=LNImpl, + self_attention=ModuleSpec( + module=SelfAttention, + params={"attn_mask_type": AttnMaskType.causal}, + submodules=SelfAttentionSubmodules( + linear_qkv=ColumnParallelLinear, + core_attention=DotProductAttention, + linear_proj=RowParallelLinear, + ), + ), + self_attn_bda=get_bias_dropout_add, + pre_mlp_layernorm=LNImpl, + mlp=mlp, + mlp_bda=get_bias_dropout_add, + ), + ) + + +# Helper function to get module spec for MLP/MoE +def _get_mlp_module_spec(use_te: bool = True) -> ModuleSpec: + # Dense MLP w/ or w/o TE modules. + return ModuleSpec( + module=MLP, + submodules=MLPSubmodules( + linear_fc1=TELayerNormColumnParallelLinear if use_te else ColumnParallelLinear, + linear_fc2=TERowParallelLinear if use_te else RowParallelLinear, + ), + ) diff --git a/megatron/core/num_microbatches_calculator.py b/megatron/core/num_microbatches_calculator.py new file mode 100644 index 0000000000..5850e512ca --- /dev/null +++ b/megatron/core/num_microbatches_calculator.py @@ -0,0 +1,498 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +"""Megatron Core number of microbatches calculators.""" + +import logging +from abc import ABC, abstractmethod +from typing import List, Optional, Union + +logger = logging.getLogger(__name__) + +# TODO: global_var merge into mcore? +_GLOBAL_NUM_MICROBATCHES_CALCULATOR: Union[ + 'ConstantNumMicroBatchesCalculator', 'RampupBatchsizeNumMicroBatchesCalculator' +] = None + + +def get_num_microbatches() -> int: + """Get number of microbatches.""" + return _GLOBAL_NUM_MICROBATCHES_CALCULATOR.get() + + +def get_current_global_batch_size() -> int: + """Get current global batch size.""" + return _GLOBAL_NUM_MICROBATCHES_CALCULATOR.get_current_global_batch_size() + + +def get_micro_batch_size() -> int: + """Get micro batch size.""" + return _GLOBAL_NUM_MICROBATCHES_CALCULATOR.get_micro_batch_size() + + +def get_current_running_global_batch_size() -> int: + """Get current running global batch size, taking into account number of DP replicas might be + incompatible with true global batch size if `decrease_batch_size_if_needed` is True.""" + return _GLOBAL_NUM_MICROBATCHES_CALCULATOR.get_current_running_global_batch_size() + + +def update_num_microbatches( + consumed_samples: int, consistency_check: bool = True, verbose: bool = False +) -> None: + """Update number of microbatches. + + Args: + consumed_samples (int): + Number of samples consumed. 
+ consistency_check (bool, optional): + Option to check current schedule's consistency. Defaults to True. + verbose (bool, optional): + Option to control logging. Defaults to False. + """ + _GLOBAL_NUM_MICROBATCHES_CALCULATOR.update(consumed_samples, consistency_check, verbose) + + +def init_num_microbatches_calculator( + rank: int, + rampup_batch_size: Optional[List[int]], + global_batch_size: int, + micro_batch_size: int, + data_parallel_size: int, + decrease_batch_size_if_needed: bool = False, +) -> None: + """Initialize number of microbatches calculator. Supporting backward compatibility. + + Args: + rank (int): + Rank of the GPU, only rank 0 will log the information. + rampup_batch_size (Optional[List[int]]): + Rampup batch size, should be in format of [start_global_batch_size, + batch_size_increment, ramup_samples]. + global_batch_size (int): + Global batch size for the model. + micro_batch_size (int): + Micro batch size at initialization. + data_parallel_size (int): + Data parallel size. + decrease_batch_size_if_needed (bool, optional): + If true, scale down batch size to ensure divisibility by DP size * microbatch size. + Defaults to False. + """ + _configure_global_num_microbatches_calculator( + rank, + rampup_batch_size, + global_batch_size, + micro_batch_size, + data_parallel_size, + decrease_batch_size_if_needed, + init=True, + ) + + +def destroy_num_microbatches_calculator(): + """Destroy number of microbatches calculator.""" + global _GLOBAL_NUM_MICROBATCHES_CALCULATOR + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = None + + +def reconfigure_num_microbatches_calculator( + rank: int, + rampup_batch_size: Optional[List[int]], + global_batch_size: int, + micro_batch_size: int, + data_parallel_size: int, + decrease_batch_size_if_needed: bool = False, +) -> None: + """Reconfigure number of microbatches calculator. Supporting backward compatibility. + + Args: + rank (int): + Rank of the GPU, only rank 0 will log the information. + rampup_batch_size (Optional[List[int]]): + Rampup batch size, should be in format of + [start_global_batch_size, batch_size_increment, ramup_samples]. + global_batch_size (int): + Global batch size for the model. + micro_batch_size (int): + Micro batch size at initialization. + data_parallel_size (int): + Data parallel size. + decrease_batch_size_if_needed (bool, optional): + If true, scale down batch size to ensure divisibility by DP size * microbatch size. + Defaults to False. + """ + _configure_global_num_microbatches_calculator( + rank, + rampup_batch_size, + global_batch_size, + micro_batch_size, + data_parallel_size, + decrease_batch_size_if_needed, + init=False, + ) + + +def _configure_global_num_microbatches_calculator( + rank: int, + rampup_batch_size: Optional[List[int]], + global_batch_size: int, + micro_batch_size: int, + data_parallel_size: int, + decrease_batch_size_if_needed: bool = False, + init: bool = False, +) -> None: + """Configure number of microbatches calculator. Can be used for initialization and + reconfiguration. + + Args: + rank (int): + Rank of the GPU, only rank 0 will log the information. + rampup_batch_size (Optional[List[int]]): + Rampup batch size, should be in format of + [start_global_batch_size, batch_size_increment, ramup_samples]. + global_batch_size (int): + Global batch size for the model. + micro_batch_size (int): + Micro batch size at initialization. + data_parallel_size (int): + Data parallel size. 
+ decrease_batch_size_if_needed (bool, optional): + If true, scale down batch size to ensure divisibility by DP size * microbatch size. + Defaults to False. + init (bool, optional): + If true, initialize the calculator. Defaults to False. + """ + global _GLOBAL_NUM_MICROBATCHES_CALCULATOR + + if init: + assert ( + _GLOBAL_NUM_MICROBATCHES_CALCULATOR is None + ), 'num microbatches calculator is already initialized.' + + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = _build_num_microbatches_calculator( + rank, + rampup_batch_size, + global_batch_size, + micro_batch_size, + data_parallel_size, + decrease_batch_size_if_needed, + ) + + +def _build_num_microbatches_calculator( + rank: int, + rampup_batch_size: Optional[List[int]], + global_batch_size: int, + micro_batch_size: int, + data_parallel_size: int, + decrease_batch_size_if_needed: bool, +) -> Union['ConstantNumMicroBatchesCalculator', 'RampupBatchsizeNumMicroBatchesCalculator']: + """Build number of microbatches calculator. Internal helper method. + + Args: + rank (int): + Rank of the GPU, only rank 0 will log the information. + rampup_batch_size (Optional[List[int]]): + Rampup batch size, should be in format of + [start_global_batch_size, batch_size_increment, ramup_samples]. + global_batch_size (int): + Global batch size for the model. + micro_batch_size (int): + Micro batch size at initialization. + data_parallel_size (int): + Data parallel size. + decrease_batch_size_if_needed (bool): + If true, scale down batch size to ensure divisibility by DP size * microbatch size. + + """ + + # Constant batch size. + if rampup_batch_size is None: + num_microbatches_calculator = ConstantNumMicroBatchesCalculator( + global_batch_size, + micro_batch_size, + data_parallel_size, + decrease_batch_size_if_needed, + rank, + ) + if rank == 0: + logger.info( + f'setting number of microbatches to constant {num_microbatches_calculator.get()}' + ) + # Batch size ramp up. + else: + assert len(rampup_batch_size) == 3, ( + 'expected the following ' + 'format: --rampup-batch-size ' + ' ' + ) + start_global_batch_size = int(rampup_batch_size[0]) + batch_size_increment = int(rampup_batch_size[1]) + ramup_samples = int(rampup_batch_size[2]) + if rank == 0: + logger.info( + f'will use batch size rampup starting from global batch size ' + f'{start_global_batch_size} to global batch size {global_batch_size} with batch' + f'size increments {batch_size_increment} over {ramup_samples} samples.' 
+ ) + num_microbatches_calculator = RampupBatchsizeNumMicroBatchesCalculator( + global_batch_size, + micro_batch_size, + data_parallel_size, + decrease_batch_size_if_needed, + rank, + start_global_batch_size, + batch_size_increment, + ramup_samples, + ) + + return num_microbatches_calculator + + +def _round(batch_size: int, divisor: int) -> int: + """Round `batch_size` down to nearest batch size divisible by `divisor`.""" + return (batch_size // divisor) * divisor + + +class NumMicroBatchesCalculator(ABC): + """Base class for number of microbatches calculator.""" + + def __init__(self) -> None: + self.num_micro_batches = None + self.current_global_batch_size = None + self.micro_batch_size = None + self.current_running_global_batch_size = None + + def get(self) -> int: + """Get number of microbatches.""" + return self.num_micro_batches + + def get_current_global_batch_size(self) -> int: + """Get current global batch size.""" + return self.current_global_batch_size + + def get_micro_batch_size(self) -> int: + """Get current global batch size.""" + return self.micro_batch_size + + def get_current_running_global_batch_size(self) -> int: + """Get current running global batch size. If decrease_batch_size_if_needed is False, + this just equals global batch size.""" + return self.current_running_global_batch_size + + @abstractmethod + def update(self, consumed_samples, consistency_check, verbose=False) -> None: + """Update number of microbatches depending on batch size rampup.""" + pass + + +class ConstantNumMicroBatchesCalculator(NumMicroBatchesCalculator): + """Calculator of number of microbatches with constant global batch size. + + Args: + global_batch_size (int): + Global batch size. + micro_batch_size (int): + Micro batch size. + data_parallel_size (int): + Data parallel size. + decrease_batch_size_if_needed (bool): + If true, decrease batch size to ensure divisibility by DP size * microbatch size + (if needed). + rank (int): + Rank (to determine whether logging should be performed). 
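+
+    Example (illustrative values; doctest-style sketch):
+        >>> calc = ConstantNumMicroBatchesCalculator(
+        ...     global_batch_size=16, micro_batch_size=2, data_parallel_size=2,
+        ...     decrease_batch_size_if_needed=False, rank=0)
+        >>> calc.get()
+        4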
+ """ + + def __init__( + self, + global_batch_size: int, + micro_batch_size: int, + data_parallel_size: int, + decrease_batch_size_if_needed: bool, + rank: int, + ) -> None: + + micro_batch_times_data_parallel_size = micro_batch_size * data_parallel_size + if decrease_batch_size_if_needed: + running_global_batch_size = _round( + global_batch_size, micro_batch_times_data_parallel_size + ) + assert running_global_batch_size % micro_batch_times_data_parallel_size == 0 + if rank == 0: + logger.info( + f'decreasing batch size from {global_batch_size} to {running_global_batch_size}' + f'to keep divisiblity by micro_batch_size={micro_batch_size} * ' + f'data_parallel_size={data_parallel_size}' + ) + self.num_micro_batches = ( + running_global_batch_size // micro_batch_times_data_parallel_size + ) + else: + assert global_batch_size % micro_batch_times_data_parallel_size == 0, ( + 'global batch size ({}) is not divisible by micro batch size ({})' + ' times data parallel size ({})'.format( + global_batch_size, micro_batch_size, data_parallel_size + ) + ) + running_global_batch_size = global_batch_size + self.num_micro_batches = global_batch_size // micro_batch_times_data_parallel_size + assert ( + self.num_micro_batches >= 1 + ), 'number of microbatches should be at least 1, got {}.'.format(self.num_micro_batches) + + self.current_global_batch_size = global_batch_size + self.current_running_global_batch_size = running_global_batch_size + self.micro_batch_size = micro_batch_size + + def update(self, consumed_samples, consistency_check, verbose=False) -> None: + pass + + +class RampupBatchsizeNumMicroBatchesCalculator(NumMicroBatchesCalculator): + """Calculator of number of microbatches with batch size rampup. + Over `steps = (global-batch-size - start-batch-size) / batch_size_increment` increment batch + size from start-batch-size to global-batch-size using rampup-samples / steps + samples. + + Args: + global_batch_size (int): + Global batch size post rampup. + micro_batch_size (int): + Micro batch size. + data_parallel_size (int): + Data parallel size. + decrease_batch_size_if_needed (bool): + If true, decrease batch size to ensure divisibility by DP size * microbatch size + (if needed). + rank (int): + Rank (to determine whether logging should be performed). + start_global_batch_size (int): + Global batch size to start with. + batch_size_increment (int): + Global batch size increments. + ramup_samples (int): + Number of samples to use ramp up global + batch size from `start_global_batch_size` to `global_batch_size`. 
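+
+    Example (illustrative values; doctest-style sketch):
+        >>> calc = RampupBatchsizeNumMicroBatchesCalculator(
+        ...     global_batch_size=1024, micro_batch_size=2, data_parallel_size=64,
+        ...     decrease_batch_size_if_needed=False, rank=0,
+        ...     start_global_batch_size=256, batch_size_increment=128,
+        ...     ramup_samples=300000)
+        >>> calc.get()  # starts at 256 // (2 * 64)
+        2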
+ """ + + def __init__( + self, + global_batch_size: int, + micro_batch_size: int, + data_parallel_size: int, + decrease_batch_size_if_needed: bool, + rank: int, + start_global_batch_size: int, + batch_size_increment: int, + ramup_samples: int, + ) -> None: + assert global_batch_size > 0, 'global batch size should be positive, got {}.'.format( + global_batch_size + ) + assert start_global_batch_size > 0, 'start batch size should be positive, got {}.'.format( + start_global_batch_size + ) + assert batch_size_increment > 0, 'batch size increment should be positive, got {}.'.format( + batch_size_increment + ) + assert ramup_samples >= 0, 'ramp-up samples should be non-negative, got {}.'.format( + ramup_samples + ) + + self.global_batch_size = global_batch_size + self.micro_batch_size = micro_batch_size + self.data_parallel_size = data_parallel_size + self.decrease_batch_size_if_needed = decrease_batch_size_if_needed + self.rank = rank + self.start_global_batch_size = start_global_batch_size + self.batch_size_increment = batch_size_increment + self.ramup_samples = ramup_samples + + self.micro_batch_times_data_parallel_size = self.micro_batch_size * self.data_parallel_size + assert self.micro_batch_times_data_parallel_size > 0 + self.current_global_batch_size = None + + diff_batch_size = self.global_batch_size - self.start_global_batch_size + assert diff_batch_size >= 0, ( + 'expected global batch size to be greater than or equal to start batch size, ' + f'got {self.global_batch_size} and {self.start_global_batch_size}' + ) + assert diff_batch_size % batch_size_increment == 0, ( + 'expected ' + f'global batch size interval ({diff_batch_size}) to be divisible by global batch ' + f'size increment ({batch_size_increment})' + ) + + num_increments = diff_batch_size // self.batch_size_increment + self.rampup_samples_per_increment = self.ramup_samples / num_increments + + # Initialize number of microbatches. + self.update(0, consistency_check=False, verbose=True) + + def update(self, consumed_samples: int, consistency_check: bool, verbose: bool = False) -> None: + """Update number of microbatches. + + Args: + consumed_samples (int): Number of samples consumed. + consistency_check (bool): Option to check current schedule's consistency. + verbose (bool, optional): Option to control logging. Defaults to False. + """ + + # Update current global batch size. + global_batch_size_changed = False + old_current_global_batch_size = self.current_global_batch_size + if consumed_samples > self.ramup_samples: + self.current_global_batch_size = self.global_batch_size + else: + steps = int(consumed_samples / self.rampup_samples_per_increment) + self.current_global_batch_size = ( + self.start_global_batch_size + steps * self.batch_size_increment + ) + assert self.current_global_batch_size <= self.global_batch_size + + if old_current_global_batch_size != self.current_global_batch_size: + global_batch_size_changed = True + if self.rank == 0 and global_batch_size_changed and verbose: + if old_current_global_batch_size is None: + logger.info(f'setting initial batch size to {self.current_global_batch_size}') + else: + logger.info( + f'ramping up batch size from {old_current_global_batch_size} to ' + f'{self.current_global_batch_size}' + ) + + # Check consistency of the current global batch size. 
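+        # Worked example (illustrative values): with start_global_batch_size=256,
+        # batch_size_increment=128, global_batch_size=1024 and ramup_samples=300000,
+        # each of the (1024 - 256) // 128 = 6 increments spans 50000 samples; after
+        # 125000 consumed samples, steps = int(125000 / 50000) = 2 and the current
+        # global batch size is 256 + 2 * 128 = 512, which is divisible by
+        # micro_batch_size * data_parallel_size = 2 * 64 = 128, so the check below passes.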
+ if consistency_check and not self.decrease_batch_size_if_needed: + assert ( + self.current_global_batch_size % self.micro_batch_times_data_parallel_size == 0 + ), ( + 'current global ' + 'batch size ({}) is not divisible by micro-batch-size ({}) times' + 'data parallel size ({})'.format( + self.current_global_batch_size, self.micro_batch_size, self.data_parallel_size + ) + ) + + if ( + self.decrease_batch_size_if_needed + and self.current_global_batch_size % self.micro_batch_times_data_parallel_size != 0 + ): + self.current_running_global_batch_size = _round( + self.current_global_batch_size, self.micro_batch_times_data_parallel_size + ) + if self.rank == 0 and global_batch_size_changed and verbose: + logger.info( + f'decreasing batch size from {self.current_global_batch_size} to ' + f'{self.current_running_global_batch_size} to keep divisiblity by ' + f'micro_batch_size={self.micro_batch_size} * ' + f'data_parallel_size={self.data_parallel_size}' + ) + assert ( + self.current_running_global_batch_size % self.micro_batch_times_data_parallel_size + == 0 + ) + else: + self.current_running_global_batch_size = self.current_global_batch_size + + self.num_micro_batches = ( + self.current_running_global_batch_size // self.micro_batch_times_data_parallel_size + ) diff --git a/megatron/core/optimizer/__init__.py b/megatron/core/optimizer/__init__.py new file mode 100644 index 0000000000..4a83564ce7 --- /dev/null +++ b/megatron/core/optimizer/__init__.py @@ -0,0 +1,445 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +import logging +from typing import Callable, Dict, List, Optional, Tuple + +import torch + +try: + from transformer_engine.pytorch.optimizers import FusedAdam as Adam + from transformer_engine.pytorch.optimizers import FusedSGD as SGD +except ImportError: + try: + from apex.optimizers import FusedAdam as Adam + from apex.optimizers import FusedSGD as SGD + except ImportError: + import warnings + + warnings.warn( + f'Transformer Engine and Apex are not installed. Falling back to Torch optimizers.' + ) + + # Apex's FusedAdam is a drop-in replacement for torch's AdamW. + # pylint: disable-next=line-too-long. + # See https://github.com/NVIDIA/apex/blob/7b73b12361068a10b0f44844534613f252a5ea75/apex/optimizers/fused_adam.py#L16. + from torch.optim import AdamW as Adam, SGD + +from megatron.core import mpu + +from ..distributed.param_and_grad_buffer import _ParamAndGradBuffer +from ..transformer.module import MegatronModule +from ..utils import log_single_rank +from .distrib_optimizer import DistributedOptimizer +from .grad_scaler import ConstantGradScaler, DynamicGradScaler +from .optimizer import ( + ChainedOptimizer, + Float16OptimizerWithFloat16Params, + FP32Optimizer, + MegatronOptimizer, +) +from .optimizer_config import OptimizerConfig + +logger = logging.getLogger(__name__) + + +def _get_param_groups( + model_chunks: List[MegatronModule], + no_weight_decay_cond: Optional[Callable], + scale_lr_cond: Optional[Callable], + lr_mult: float, + lr: float, + min_lr: float, + decoupled_lr: Optional[float], + decoupled_min_lr: Optional[float], +) -> List[Dict]: + """Create parameter groups for optimizer. + + Creates parameter groups based on weight decay condition (regularized vs + non regularized), learning rate scale condition (lr vs lr_mult * lr), + and whether it is expert parameters. scale_lr_cond is used during finetuning + where head of the network requires a scaled version of the base learning rate. 
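+
+    For example (illustrative, assuming the default conditions below): a bias or
+    1-D norm parameter lands in a wd_mult=0.0 group, a parameter marked
+    expert-parallel (its `allreduce` attribute set to False) goes into a separate
+    is_expert_parallel group, and everything else uses wd_mult=1.0 with the base
+    learning rate.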
+ + Args: + model_chunks (List[MegatronModule]): model chunks to create parameter + groups for. + no_weight_decay_cond (func, optional): function to determine whether a + parameter should not perform weight decay. + scale_lr_cond (func, optional): function to determine whether a parameter + should have a scaled learning rate. + lr_mult (float): learning rate multiplier for parameters that + satisfy scale_lr_cond. + lr (float): learning rate. + min_lr (float): minimum learning rate. + decoupled_lr (Optional[float]): optional decoupled learning rate. + decoupled_min_lr (Optional[float]): optional decoupled minimum learning rate. + + Returns: + List of parameter groups. + """ + + use_decoupled_learning_rate = decoupled_lr is not None + + # Map (wd_mult, lr_mult, is_expert_parallel, is_decoupled_lr) to params. + params_map = {} + for model_chunk in model_chunks: + for name, param in model_chunk.named_parameters(): + if not param.requires_grad: + continue + + is_expert_parallel = not getattr(param, 'allreduce', True) + + if no_weight_decay_cond is not None: + no_wd = no_weight_decay_cond(name, param) + else: + # Do not regularize biases and norm parameters. + no_wd = name.endswith(".bias") or len(param.shape) == 1 + + if scale_lr_cond is not None: + scale_lr = scale_lr_cond(name, param) + else: + scale_lr = False + + if not no_wd and not scale_lr: + wd_mult, _lr_mult = 1.0, 1.0 + elif not no_wd and scale_lr: + wd_mult, _lr_mult = 1.0, lr_mult + elif no_wd and not scale_lr: + wd_mult, _lr_mult = 0.0, 1.0 + else: + wd_mult, _lr_mult = 0.0, lr_mult + + is_decoupled_lr = False + # For input/embedding and output layer: embedding.word_embeddings.weight / + # output_layer.weight. + if use_decoupled_learning_rate and getattr( + param, 'is_embedding_or_output_parameter', False + ): + is_decoupled_lr = True + + key = (wd_mult, _lr_mult, is_expert_parallel, is_decoupled_lr) + if key not in params_map: + params_map[key] = [] + params_map[key].append(param) + + param_groups = [] + for (wd_mult, _lr_mult, is_expert_parallel, is_decoupled_lr), params in params_map.items(): + assert len(params) > 0 + param_group = { + 'params': params, + 'wd_mult': wd_mult, + 'lr_mult': _lr_mult, + 'is_expert_parallel': is_expert_parallel, + 'is_decoupled_lr': is_decoupled_lr, + } + param_groups.append(param_group) + + param_groups = _update_min_and_max_lr_in_param_groups( + param_groups, + lr=lr, + min_lr=min_lr, + decoupled_lr=decoupled_lr, + decoupled_min_lr=decoupled_min_lr, + ) + + return param_groups + + +def _update_min_and_max_lr_in_param_groups( + param_groups: List[Dict], + lr: float, + min_lr: float, + decoupled_lr: Optional[float], + decoupled_min_lr: Optional[float], +) -> List[Dict]: + """ + Updates `max_lr` and `min_lr` values in each parameter group, and returns new list. + By default, each group will use `lr` / `min_lr` as `max_lr` / `min_lr`. + If `decoupled_lr` is provided, then `decoupled_lr` / `decoupled_min_lr` will be used + as `max_lr` / `min_lr` for the input and output layer. + + Args: + param_groups (List): parameter groups whose 'max_lr' and `min_lr` fields need to + be adjusted. + lr (float): learning rate. + min_lr (float): minimum learning rate. + decoupled_lr (Optional[float]): optional decoupled learning rate. + decoupled_min_lr (Optional[float]): optional decoupled minimum learning rate. + + Returns: + List of adjusted parameter groups. 
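+
+    Example (illustrative values): with lr=3e-4, min_lr=3e-5 and decoupled_lr=None,
+    every group gets max_lr=3e-4 and min_lr=3e-5; with decoupled_lr=1e-4, groups
+    flagged is_decoupled_lr=True (input embedding / output layer parameters) get
+    max_lr=1e-4 while keeping min_lr=3e-5, since decoupled_min_lr defaults to min_lr.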
+ """ + + if decoupled_min_lr is None: + decoupled_min_lr = min_lr + + for param_group in param_groups: + if param_group['is_decoupled_lr']: + assert decoupled_lr is not None + param_group['max_lr'] = decoupled_lr + param_group['min_lr'] = decoupled_min_lr + else: + param_group['max_lr'] = lr + param_group['min_lr'] = min_lr + return param_groups + + +def _get_param_groups_and_buffers( + model_chunks: List[MegatronModule], + model_chunk_offset: int, + config: OptimizerConfig, + no_weight_decay_cond: Optional[Callable], + scale_lr_cond: Optional[Callable], + lr_mult: float, + filter_fn: Callable, + buffer_name: str, +) -> Tuple[List[Dict], Dict[int, List[_ParamAndGradBuffer]]]: + """Returns parameter groups and buffer for optimizer. + + Args: + model_chunks (List[MegatronModule]): model chunks to create parameter + groups for. + model_chunk_offset (int): offset of model_chunks in global model_chunks list. + config (OptimizerConfig): optimizer configuration object. + no_weight_decay_cond (func, optional): function to determine whether a + parameter should not perform weight decay. + scale_lr_cond (func, optional): function to determine whether a parameter + should have a scaled learning rate. + lr_mult (float): learning rate multiplier for parameters that + satisfy scale_lr_cond. + lr (float): learning rate. + min_lr (float): minimum learning rate. + filter_fn (callable): filtering function for param_groups. + buffer_name (str): name of buffer. + + Returns: + List of parameter groups and dictionary of model chunk IDs to buffers. + """ + param_groups = _get_param_groups( + model_chunks, + no_weight_decay_cond, + scale_lr_cond, + lr_mult, + lr=config.lr, + min_lr=config.min_lr, + decoupled_lr=config.decoupled_lr, + decoupled_min_lr=config.decoupled_min_lr, + ) + param_groups = list(filter(filter_fn, param_groups)) + buffers = {} + for model_chunk_idx, model_chunk in enumerate(model_chunks): + if hasattr(model_chunk, buffer_name): + buffers[model_chunk_idx + model_chunk_offset] = getattr(model_chunk, buffer_name) + + return param_groups, buffers + + +def _get_megatron_optimizer_based_on_param_groups( + config: OptimizerConfig, + model_chunks: List[MegatronModule], + param_groups: List, + per_model_buffers: Optional[Dict[int, List[_ParamAndGradBuffer]]] = None, + model_parallel_group: Optional[torch.distributed.ProcessGroup] = None, + data_parallel_group: Optional[torch.distributed.ProcessGroup] = None, + data_parallel_group_gloo: Optional[torch.distributed.ProcessGroup] = None, + data_parallel_group_idx: Optional[int] = None, +) -> MegatronOptimizer: + """Get Megatron optimizer based on parameter groups. + + Args: + config (OptimizerConfig): optimizer configuration object. + model_chunks (list): list of model chunks. + param_groups (list): list of parameter groups. + per_model_buffers (dict, optional): buffers for distributed optimizer. Defaults to None. + data_parallel_group (torch.distributed.ProcessGroup, optional): data-parallel group for + distributed optimizer. Defaults to None. + data_parallel_group_gloo (torch.distributed.ProcessGroup, optional): gloo data-parallel + group for distributed optimizer. Defaults to None. + data_parallel_group_idx (int, optional): data-parallel group index for distributed + optimizer. Defaults to None. + + Returns: + Instance of MegatronOptimizer. 
+ """ + if config.optimizer == 'adam': + optimizer = Adam( + param_groups, + lr=config.lr, + weight_decay=config.weight_decay, + betas=(config.adam_beta1, config.adam_beta2), + eps=config.adam_eps, + ) + + def init_state_fn(opt): + for group in opt.param_groups: + for p in group['params']: + if len(opt.state[p]) == 0: + opt.state[p]['exp_avg'] = torch.zeros_like(p.data) + opt.state[p]['exp_avg_sq'] = torch.zeros_like(p.data) + + elif config.optimizer == 'sgd': + optimizer = SGD( + param_groups, + lr=config.lr, + weight_decay=config.weight_decay, + momentum=config.sgd_momentum, + ) + init_state_fn = None + else: + raise Exception('{} optimizer is not supported.'.format(config.optimizer)) + + # Mixed precision optimizer. + # - Note: both the Float16Optimizer and the DistributedOptimizer inherit + # from the MixedPrecisionOptimizer, which manages any optimizer where + # the model params and main params are distinct. + if config.fp16 or config.bf16 or config.use_distributed_optimizer: + + # Grad scaler: + # if loss-scale is provided, instantiate the constant scaler. + # if we are using fp16 and loss-scale is not present, use a + # dynamic scaler. + # otherwise we are running in bf16 with no loss-scale so + # leave it as None. + grad_scaler = None + + # Constant loss scale. + if config.loss_scale: + grad_scaler = ConstantGradScaler(config.loss_scale) + + # Dynamic loss scale. + else: + if config.fp16: + grad_scaler = DynamicGradScaler( + initial_scale=config.initial_loss_scale, + min_scale=config.min_loss_scale, + growth_factor=2.0, + backoff_factor=0.5, + growth_interval=config.loss_scale_window, + hysteresis=config.hysteresis, + ) + + optimizer_args = [optimizer, config, grad_scaler, init_state_fn] + if config.use_distributed_optimizer: + optimizer = DistributedOptimizer( + *optimizer_args, + model_chunks=model_chunks, + per_model_buffers=per_model_buffers, + data_parallel_group=data_parallel_group, + data_parallel_group_gloo=data_parallel_group_gloo, + data_parallel_group_idx=data_parallel_group_idx, + ) + else: + optimizer = Float16OptimizerWithFloat16Params(*optimizer_args) + setattr(optimizer, 'model_parallel_group', model_parallel_group) + else: + # FP32 optimizer. + optimizer = FP32Optimizer(optimizer, config, init_state_fn) + setattr(optimizer, 'model_parallel_group', model_parallel_group) + + return optimizer + + +def get_megatron_optimizer( + config: OptimizerConfig, + model_chunks: List[MegatronModule], + no_weight_decay_cond: Optional[Callable] = None, + scale_lr_cond: Optional[Callable] = None, + lr_mult: float = 1.0, +) -> MegatronOptimizer: + """Retrieve the Megatron optimizer for model chunks. + + We use separate optimizers for expert parameters and non-expert parameters. + + Args: + config (OptimizerConfig): optimizer configuration object. + model_chunks (List[MegatronModule]): model chunks to get optimizer for. + no_weight_decay_cond (func, optional): function to determine whether a parameter + should not perform weight decay. Defaults to None. + scale_lr_cond (func, optional): function to determine whether a parameter + should have a scaled learning rate. Defaults to None. + lr_mult (float, optional): learning rate multiplier for parameters that + satisfy scale_lr_cond. Defaults to 1.0. + + Returns: + Instance of MegatronOptimizer. + """ + + log_single_rank(logger, logging.INFO, f'Setting up optimizer with config {config}') + + # Separate out first model chunk if overlapping param AG with optimizer step. 
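+    # Illustrative note (the chunk count is an assumption): with 4 virtual-pipeline
+    # model chunks and the flag enabled, two dense optimizers are built below --
+    # one for [chunk 0] with overlap_param_gather_with_optimizer_step=True and one
+    # for chunks 1-3 with the flag disabled; with the flag off, a single dense
+    # optimizer covers all chunks.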
+ if config.overlap_param_gather_with_optimizer_step: + all_dense_model_chunks = [[model_chunks[0]], model_chunks[1:]] + overlap_param_gather_with_optimizer_step_flags = [True, False] + else: + all_dense_model_chunks = [model_chunks] + overlap_param_gather_with_optimizer_step_flags = [False] + model_parallel_rank = torch.distributed.get_rank(mpu.get_model_parallel_group()) + + optimizers = [] + model_chunk_offset = 0 + for dense_model_chunks, overlap_param_gather_with_optimizer_step in zip( + all_dense_model_chunks, overlap_param_gather_with_optimizer_step_flags + ): + param_groups, buffers = _get_param_groups_and_buffers( + dense_model_chunks, + model_chunk_offset=model_chunk_offset, + config=config, + no_weight_decay_cond=no_weight_decay_cond, + scale_lr_cond=scale_lr_cond, + lr_mult=lr_mult, + filter_fn=lambda g: not g['is_expert_parallel'], + buffer_name='buffers', + ) + for model_chunk in dense_model_chunks: + model_chunk.overlap_param_gather_with_optimizer_step = ( + overlap_param_gather_with_optimizer_step + ) + optimizers.append( + _get_megatron_optimizer_based_on_param_groups( + config, + model_chunks=dense_model_chunks, + param_groups=param_groups, + per_model_buffers=buffers, + model_parallel_group=mpu.get_model_parallel_group(), + data_parallel_group=mpu.get_data_parallel_group(with_context_parallel=True), + data_parallel_group_gloo=mpu.get_data_parallel_group_gloo( + with_context_parallel=True + ), + data_parallel_group_idx=model_parallel_rank, + ) + ) + model_chunk_offset += 1 + + moe_param_groups, moe_buffers = _get_param_groups_and_buffers( + model_chunks, + model_chunk_offset=0, + config=config, + no_weight_decay_cond=no_weight_decay_cond, + scale_lr_cond=scale_lr_cond, + lr_mult=lr_mult, + filter_fn=lambda g: g['is_expert_parallel'], + buffer_name='expert_parallel_buffers', + ) + if len(moe_param_groups) > 0: + model_parallel_world_size = torch.distributed.get_world_size(mpu.get_model_parallel_group()) + expert_parallel_rank = mpu.get_expert_model_parallel_rank() + optimizers.append( + _get_megatron_optimizer_based_on_param_groups( + config, + model_chunks=model_chunks, + param_groups=moe_param_groups, + per_model_buffers=moe_buffers, + model_parallel_group=mpu.get_model_parallel_group(with_expert_parallel=True), + data_parallel_group=mpu.get_data_modulo_expert_parallel_group( + with_context_parallel=True + ), + data_parallel_group_gloo=mpu.get_data_modulo_expert_parallel_group_gloo( + with_context_parallel=True + ), + data_parallel_group_idx=expert_parallel_rank * model_parallel_world_size + + model_parallel_rank, + ) + ) + + if len(optimizers) == 1: + return optimizers[0] + + return ChainedOptimizer(optimizers) diff --git a/megatron/core/optimizer/clip_grads.py b/megatron/core/optimizer/clip_grads.py new file mode 100644 index 0000000000..708ccd019e --- /dev/null +++ b/megatron/core/optimizer/clip_grads.py @@ -0,0 +1,193 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+ +"""Gradient clipping.""" + +import os +from typing import List, Optional, Union + +import torch +from torch import inf + +try: + from transformer_engine.pytorch.optimizers import ( + multi_tensor_applier, + multi_tensor_l2norm, + multi_tensor_scale, + ) + + l2_norm_impl = multi_tensor_l2norm + multi_tensor_scale_impl = multi_tensor_scale +except ImportError: + try: + import amp_C + from apex.multi_tensor_apply import multi_tensor_applier + + l2_norm_impl = amp_C.multi_tensor_l2norm + multi_tensor_scale_impl = amp_C.multi_tensor_scale + except ImportError: + import warnings + + warnings.warn( + f'Transformer Engine and Apex are not installed. ' + 'Falling back to local implementations of multi_tensor_applier, ' + 'multi_tensor_l2norm, and multi_tensor_scale' + ) + + from megatron.core.utils import ( + local_multi_tensor_applier, + local_multi_tensor_l2_norm, + local_multi_tensor_scale, + ) + + multi_tensor_applier = local_multi_tensor_applier + l2_norm_impl = local_multi_tensor_l2_norm + multi_tensor_scale_impl = local_multi_tensor_scale + + +from ..tensor_parallel import param_is_not_tensor_parallel_duplicate +from ..transformer.module import param_is_not_shared + + +def get_grad_norm_fp32( + grads_for_norm: Union[List[torch.Tensor], torch.Tensor], + norm_type: Union[int, float] = 2, + model_parallel_group: Optional[torch.distributed.ProcessGroup] = None, +) -> float: + """Calculate the norm of gradients in fp32. + + This is adapted from torch.nn.utils.clip_grad.clip_grad_norm_ and + added functionality to handle model parallel parameters. + + Arguments: + grads_for_norm (Iterable[Tensor] or Tensor): an iterable of Tensors or a single + Tensor that will be used for calculating the grad norm. + norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for + infinity norm. + model_parallel_group (group): given the nature of the distributed + optimizer, this is passed as an argument. + + Returns: + Total norm of the parameters (viewed as a single vector). + """ + + if isinstance(grads_for_norm, torch.Tensor): + grads_for_norm = [grads_for_norm] + + # Norm parameters. + norm_type = float(norm_type) + total_norm = 0.0 + + # Calculate norm. + if norm_type == inf: + total_norm = max(grad.abs().max() for grad in grads_for_norm) + total_norm_cuda = torch.tensor([float(total_norm)], dtype=torch.float, device='cuda') + # Take max across all model-parallel GPUs. + torch.distributed.all_reduce( + total_norm_cuda, op=torch.distributed.ReduceOp.MAX, group=model_parallel_group + ) + total_norm = total_norm_cuda[0].item() + + else: + if norm_type == 2.0: + dummy_overflow_buf = torch.tensor([0], dtype=torch.int, device='cuda') + # Use apex's multi-tensor applier for efficiency reasons. + # Multi-tensor applier takes a function and a list of list + # and performs the operation on that list all in one kernel. + if grads_for_norm: + grad_norm, _ = multi_tensor_applier( + l2_norm_impl, + dummy_overflow_buf, + [grads_for_norm], + False, # no per-parameter norm + ) + else: + grad_norm = torch.tensor([0], dtype=torch.float, device='cuda') + # Since we will be summing across data parallel groups, + # we need the pow(norm-type). + total_norm = grad_norm**norm_type + + else: + for grad in grads_for_norm: + grad_norm = torch.norm(grad, norm_type) + total_norm += grad_norm**norm_type + + # Sum across all model-parallel GPUs. 
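+        # At this point total_norm holds the p-th power of the local norm (for the
+        # default norm_type=2.0, the sum of squared gradient elements owned by this
+        # rank). The all-reduce below adds these partial sums across model-parallel
+        # ranks, and the final norm is the p-th root:
+        #   total_norm = (sum over ranks of sum_i |g_i|^p) ** (1 / p)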
+ torch.distributed.all_reduce( + total_norm, op=torch.distributed.ReduceOp.SUM, group=model_parallel_group + ) + total_norm = total_norm.item() ** (1.0 / norm_type) + + return total_norm + + +def clip_grad_by_total_norm_fp32( + parameters: Union[List[torch.Tensor], torch.Tensor], + max_norm: Union[int, float], + total_norm: float, +): + """Clips gradient of an iterable of parameters in fp32 by total norm. + + Note that the gradients are modified in place. + + Args: + parameters (Iterable[Tensor] or Tensor): an iterable of Tensors or a + single Tensor that will have gradients normalized. + max_norm (float or int): max norm of the gradients. + total_norm (float): total norm of the gradients. + """ + # Grads. + grads = [] + for param in parameters: + if param.grad is not None: + assert param.grad.type() == 'torch.cuda.FloatTensor' + grads.append(param.grad.detach()) + + # Scale. + clip_coeff = max_norm / (total_norm + 1.0e-6) + if clip_coeff < 1.0: + dummy_overflow_buf = torch.tensor([0], dtype=torch.int, device='cuda') + multi_tensor_applier( + multi_tensor_scale_impl, dummy_overflow_buf, [grads, grads], clip_coeff + ) + + +def count_zeros_fp32( + parameters: Union[List[torch.Tensor], torch.Tensor], + model_parallel_group: torch.distributed.ProcessGroup, +) -> float: + """Counts the number of zeros in gradients associated with the passed-in list of + parameters. + + Args: + parameters (Iterable[Tensor] or Tensor): an iterable of Tensors or a + single Tensor that will have the number of zeros in its corresponding + gradient counted. + model_parallel_group (torch.distributed.ProcessGroup, optional): model-parallel + group over which grad norm needs to be aggregated. + """ + + if isinstance(parameters, torch.Tensor): + parameters = [parameters] + + # Filter parameters based on: + # - grad should not be none + # - parameter should not be shared + # - should not be a replica due to tensor model parallelism + total_num_zeros = torch.tensor([0.0], dtype=torch.float, device='cuda') + for param in parameters: + grad_not_none = param.grad is not None + is_not_shared = param_is_not_shared(param) + is_not_tp_duplicate = param_is_not_tensor_parallel_duplicate(param) + if grad_not_none and is_not_shared and is_not_tp_duplicate: + grad = param.grad.detach() + num_zeros = grad.numel() - torch.count_nonzero(grad) + total_num_zeros = num_zeros + total_num_zeros + + # Sum across all model-parallel GPUs. + torch.distributed.all_reduce( + total_num_zeros, op=torch.distributed.ReduceOp.SUM, group=model_parallel_group + ) + + total_num_zeros = total_num_zeros.item() + + return total_num_zeros diff --git a/megatron/core/optimizer/distrib_optimizer.py b/megatron/core/optimizer/distrib_optimizer.py new file mode 100644 index 0000000000..dfa8d51979 --- /dev/null +++ b/megatron/core/optimizer/distrib_optimizer.py @@ -0,0 +1,1839 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +"""Megatron distributed optimizer.""" + + +import itertools +import warnings +from dataclasses import replace +from logging import getLogger +from typing import Callable, Dict, List, Optional, Tuple + +import torch + +HAVE_APEX_OR_TE = True +try: + from transformer_engine.pytorch.optimizers import FusedAdam as Adam +except ImportError: + try: + from apex.optimizers import FusedAdam as Adam + except ImportError: + from torch.optim import Adam + + HAVE_APEX_OR_TE = False + +from .. 
import tensor_parallel +from ..config_logger import has_config_logger_enabled, log_config_to_disk +from ..dist_checkpointing import ShardedTensor +from ..dist_checkpointing.dict_utils import nested_values +from ..dist_checkpointing.mapping import ( + LocalNonpersistentObject, + ShardedObject, + ShardedStateDict, + ShardedTensorFactory, +) +from ..dist_checkpointing.utils import extract_sharded_tensors_and_factories +from ..distributed.param_and_grad_buffer import _ParamAndGradBuffer, partition_buckets +from ..transformer.module import MegatronModule +from ..utils import is_float8tensor +from .grad_scaler import MegatronGradScaler +from .optimizer import ( + MixedPrecisionOptimizer, + _multi_tensor_copy_this_to_that, + _zero_grad_group_helper, +) +from .optimizer_config import OptimizerConfig + +try: + # This will be used when "--fp8-param-gather" is enabled. + # When BF16/FP16 parameters don't exist, we need to cast the FP32 main parameters to + # FP8 directly in the optimizer. + from transformer_engine.pytorch.cpp_extensions import cast_to_fp8 +except: + pass + +logger = getLogger(__name__) + + +class Range: + """ + A range represents a start and end points for indexing a shard + from a full tensor. + + Args: + start (int): Start index. + end (int): End index. + """ + + def __init__(self, start: int, end: int): + self.start = start + self.end = end + self.size = end - start + + def normalize(self, start: int = 0): + """Shift start/end indexes to start at new start index. + + Both start and end indexes will be shifted by [new start] - [old start]. + + Args: + start (int): New start index. + """ + return Range(start, start + self.size) + + def __str__(self): + return "%d,%d [%d]" % (self.start, self.end, self.size) + + def __len__(self): + return self.end - self.start + + +class DistributedOptimizer(MixedPrecisionOptimizer): + """Distributed optimizer, for all data types (fp16, bf16, and fp32). + + See __init__() below for argument details. + """ + + @classmethod + def _build_model_gbuf_param_range_map( + cls, + param_world_index_map: Dict[torch.nn.Parameter, Tuple], + gbuf_world_range: Range, + bucket_offset: int, + ): + """ + Build mapping from param reference to grad buffer shard ranges. + + This method builds a mapping from parameter references to grad + buffer shard ranges, specific to each data-parallel (DP) rank's + set of 'owned' parameters. Each grad buffer (padded to be an even + multiple of DP-world-size) is conceptually divided into DP-world-size + contiguous regions, where each DP rank 'owns' a contiguous region. + Ownership in this sense means DP rank is responsible for reducing + the relevant subset of grads, and updating the relevant subset of + params. + + This conceptual partitioning of the grad buffer does NOT respect + parameter boundaries, and as such it is assumed that each created + range references a shard (or subset) of the full parameter. It is + easiest to think of each DP rank as operating (i.e., reducing, + gathering) purely on views into the grad buffer, for all model-to- + main & main-to-model operations. + + This method creates four ranges: + - The param's range within the entire grad buffer (i.e., world index). + - The param's range within the relevant grad bucket's buffer. + - The param's range within the DP rank's local view of the grad buffer. + - The param's range within itself (i.e., its shard). + """ + + # Param range map. + param_range_map = {} + for param, param_world_indexes in param_world_index_map.items(): + + # Param range. 
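To make the shard arithmetic that follows concrete, here is a self-contained toy walk-through. It uses a stripped-down stand-in for the `Range` class above and made-up sizes: a 1024-element DP shard starting at world offset 1024, and a parameter spanning world indexes 1500..3000.

```python
class Range:
    """Stand-in for the Range class above (start/end/size plus normalize)."""
    def __init__(self, start, end):
        self.start, self.end, self.size = start, end, end - start
    def normalize(self, start=0):
        return Range(start, start + self.size)

gbuf_world_range = Range(1024, 2048)              # this DP rank's shard of the bucket
param_world_start, param_world_end = 1500, 3000   # a param that extends past the shard

local_start = max(0, param_world_start - gbuf_world_range.start)                  # 476
local_end = min(gbuf_world_range.size, param_world_end - gbuf_world_range.start)  # 1024
param_local_range = Range(local_start, local_end)                                 # slice of the shard
param_world_range = param_local_range.normalize(local_start + gbuf_world_range.start)
sub_param_range = param_local_range.normalize(max(0, gbuf_world_range.start - param_world_start))

print(param_local_range.size, (param_world_range.start, param_world_range.end), sub_param_range.end)
# 548 (1500, 2048) 548
```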
+ param_world_start, param_world_end, _ = param_world_indexes + param_local_start = max(0, param_world_start - gbuf_world_range.start) + param_local_end = min(gbuf_world_range.size, param_world_end - gbuf_world_range.start) + + # Add param, if within local gbuf range. + if param_local_end > param_local_start: + param_local_range = Range(param_local_start, param_local_end) + param_world_range = param_local_range.normalize( + param_local_start + gbuf_world_range.start + ) + param_world_range_in_bucket = Range( + param_world_range.start - bucket_offset, param_world_range.end - bucket_offset + ) + sub_param_start = max(0, gbuf_world_range.start - param_world_start) + sub_param_range = param_local_range.normalize(sub_param_start) + param_range_map[param] = { + "gbuf_world": param_world_range, + "gbuf_world_in_bucket": param_world_range_in_bucket, + "gbuf_local": param_local_range, + "param": sub_param_range, + } + + return param_range_map + + @classmethod + def _build_model_gbuf_range(cls, param_and_grad_buffer: _ParamAndGradBuffer, bucket_index: int): + """ + Build mapping between params and their grad buffers. + + This method does the initial setup for the method above. This setup + includes determining the shard ranges into the param_and_grad_buffer + for each data-parallel (DP) rank. Each DP rank keeps range info for + all other DP ranks, for the purpose of creating args for + reduce-scatter and all-gather. + """ + + data_parallel_rank = torch.distributed.get_rank(param_and_grad_buffer.data_parallel_group) + data_parallel_world_size = param_and_grad_buffer.data_parallel_group.size() + + bucket = param_and_grad_buffer.buckets[bucket_index] + gbuf_size = bucket.grad_data.numel() + assert ( + gbuf_size % data_parallel_world_size == 0 + ), f"Each bucket's buffer size should be divisible by {data_parallel_world_size}" + max_gbuf_range_size = gbuf_size // data_parallel_world_size + + # All world ranges (i.e., across all data parallel ranks). + gbuf_world_all_ranges = [] + for r in range(data_parallel_world_size): + # Compute start of chunk in this bucket. + gbuf_world_start = r * max_gbuf_range_size + gbuf_world_end = min(gbuf_size, gbuf_world_start + max_gbuf_range_size) + # Add bucket's offset in grad buffer. + gbuf_world_range = Range( + gbuf_world_start + bucket.offset, gbuf_world_end + bucket.offset + ) + gbuf_world_all_ranges.append(gbuf_world_range) + + # Local DP's ranges. + gbuf_world_range = gbuf_world_all_ranges[data_parallel_rank] + + # Get each param's ranges. + param_range_map = cls._build_model_gbuf_param_range_map( + param_and_grad_buffer.param_index_map, gbuf_world_range, bucket.offset + ) + + # Group into dict. + data = {"param_map": param_range_map} + + return data + + @classmethod + def _build_gbuf_range_map(cls, param_and_grad_buffer: _ParamAndGradBuffer): + """ + Build mapping between params and their grad buffers. These mappings are + partitioned according to data type. + + Iterate through all buckets of grad buffer to construct param ranges + that this rank "owns" (the dp_rank'th shard of each bucket, where each + shard is 1/dp_world_size of the bucket). + + Args: + param_and_grad_buffer (_ParamAndGradBuffer): buffer to build mapping for. 
+ """ + return { + (param_and_grad_buffer.param_dtype, param_and_grad_buffer.grad_dtype): [ + cls._build_model_gbuf_range(param_and_grad_buffer, bucket_index) + for bucket_index in range(len(param_and_grad_buffer.buckets)) + ] + } + + @classmethod + def _build_model_param_gbuf_map( + cls, gbuf_ranges: List[Dict] + ) -> Dict[torch.nn.Parameter, Tuple]: + """ + Create a reverse of the gbuf_ranges, for referencing in opposite direction. + """ + param_gbuf_map = {} + for gbuf_index, gbuf_range_map in enumerate(gbuf_ranges): + for dtype, gbuf_range_map_for_all_buckets in gbuf_range_map.items(): + for bucket_index, gbuf_range_map in enumerate(gbuf_range_map_for_all_buckets): + for param, _ in gbuf_range_map["param_map"].items(): + assert param not in param_gbuf_map, ( + "Param should not be in param_gbuf_map; each param only belongs " + "to a single bucket." + ) + param_gbuf_map[param] = (gbuf_index, dtype, bucket_index) + return param_gbuf_map + + @classmethod + def _build_optimizer_group_ranges(cls, param_groups: List[Dict], gbuf_ranges: List[Dict]): + """ + Create optimizer groups. + + Given the set of parameter shard ranges that are owned by the current + data-parallel (DP) rank, gather the set of parameters that will be + used (in the method below) to create the current DP's optimizer + groups. + """ + + # Param group map. + # World param group map. + # - Store a mapping of for all parameters + # across all DP ranks. This is necessary because it is our first + # cross reference between the DDP mappings and the optimizer group + # parameters. This mapping only for use in the next step of building + # the local mapping over this DP rank's parameters. + world_param_group_map = {} + for group_index, group in enumerate(param_groups): + for param in group["params"]: + assert param.requires_grad + world_param_group_map[param] = group_index + + # Optimizer group ranges & param-group mapping. + # - Build a mapping from groups to their contained parameters, and also + # from parameters to their containing group index and order within + # the group. The group index and order are particularly important for + # saving and loading checkpoints. + local_param_group_map = {} + group_ranges = [{"params": []} for _ in param_groups] + for gbuf_range_map in gbuf_ranges: + for dtype, gbuf_range_map_for_all_buckets in gbuf_range_map.items(): + for gbuf_range_map in gbuf_range_map_for_all_buckets: + for param in gbuf_range_map["param_map"]: + group_index = world_param_group_map[param] + group_range = group_ranges[group_index] + group_range["params"].append(param) + local_param_group_map[param] = (group_index, len(group_range["params"]) - 1) + + # Squeeze zero-size group ranges. + for group_index, group_range in enumerate(group_ranges): + group_range["orig_group"] = param_groups[group_index] + group_range["orig_group_idx"] = param_groups[group_index] + + return local_param_group_map, group_ranges + + @classmethod + def _build_model_and_main_param_groups( + cls, + gbuf_ranges: List[Dict], + param_gbuf_map: Dict[torch.nn.Parameter, Tuple], + opt_group_ranges: List, + ): + """ + Create main parameter groups needed for the optimizer step. + + These groups encompass both: 1) groups used by this class, for + reducing/gather, and 2) groups used by the inner optimizer for the + parameter update. Given that the conceptual grad buffer partitioning + (created in earlier method) doesn't respect parameter boundaries, + the optimizer operates on shards of the model parameters, rather than + the full parameters. 
+ """ + + # Parameter groups: + # model_float16_groups: original float16 parameters + # model_fp32_groups: original fp32 parameters + # shard_float16_groups: shards of original float16 parameters + # shard_fp32_groups: shards of original fp32 parameters + # shard_fp32_from_float16_groups: fp32 copy of float16 parameters + model_float16_groups = [] + model_fp32_groups = [] + shard_float16_groups = [] + shard_fp32_groups = [] + shard_fp32_from_float16_groups = [] + + # Allocate (or slice) each group's param shard. + for group_range in opt_group_ranges: + + # Params of this group. + model_float16_params_this_group = [] + model_fp32_params_this_group = [] + shard_float16_params_this_group = [] + shard_fp32_params_this_group = [] + shard_fp32_from_float16_params_this_group = [] + model_float16_groups.append(model_float16_params_this_group) + model_fp32_groups.append(model_fp32_params_this_group) + shard_float16_groups.append(shard_float16_params_this_group) + shard_fp32_groups.append(shard_fp32_params_this_group) + shard_fp32_from_float16_groups.append(shard_fp32_from_float16_params_this_group) + + for model_param in group_range["params"]: + + assert model_param.requires_grad + + gbuf_index, dtype, bucket_index = param_gbuf_map[model_param] + gbuf_range = gbuf_ranges[gbuf_index][dtype][bucket_index] + param_range = gbuf_range["param_map"][model_param]["param"] + + # fp16, bf16 params. + if model_param.type() in ['torch.cuda.HalfTensor', 'torch.cuda.BFloat16Tensor']: + + # Clone model -> main. + shard_model_param = model_param.detach().view(-1)[ + param_range.start : param_range.end + ] + + # If we use FP8 params to initialize FP32 main params (compared to using the + # bf16/fp16 params to initialize the main params), there will be a loss of + # precision at the beginning of training (this problem will not occur if the + # training is long enough or if the main params are loaded from a checkpoint). + if is_float8tensor(model_param) and hasattr( + model_param, 'get_high_precision_init_val' + ): + shard_main_param = ( + model_param.get_high_precision_init_val() + .view(-1)[param_range.start : param_range.end] + .clone() + .to(shard_model_param.device) + .float() + ) + model_param.clear_high_precision_init_val() + else: + shard_main_param = shard_model_param.clone().float() + + tensor_parallel.copy_tensor_model_parallel_attributes( + shard_model_param, model_param + ) + tensor_parallel.copy_tensor_model_parallel_attributes( + shard_main_param, model_param + ) + if hasattr(model_param, 'shared'): + shard_model_param.shared = model_param.shared + shard_main_param.shared = model_param.shared + + # Add to group. + model_float16_params_this_group.append(model_param) + shard_float16_params_this_group.append(shard_model_param) + shard_fp32_from_float16_params_this_group.append(shard_main_param) + + # fp32 params. + elif model_param.type() == 'torch.cuda.FloatTensor': + shard_model_param = model_param.view(-1)[param_range.start : param_range.end] + model_fp32_params_this_group.append(model_param) + shard_fp32_params_this_group.append(shard_model_param) + tensor_parallel.copy_tensor_model_parallel_attributes( + shard_model_param, model_param + ) + if hasattr(model_param, 'shared'): + shard_model_param.shared = model_param.shared + + else: + raise TypeError( + 'Wrapped parameters must be one of ' + 'torch.cuda.FloatTensor, ' + 'torch.cuda.HalfTensor, or ' + 'torch.cuda.BFloat16Tensor. ' + 'Received {}'.format(model_param.type()) + ) + + # Update optimizer's params. 
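The float16 branch above boils down to "keep a half-precision view of the owned slice plus an fp32 master copy of it". A minimal stand-alone sketch of that pattern, with made-up slice bounds, bf16 on CPU, and none of the tensor-parallel attribute copying:

```python
import torch

model_param = torch.nn.Parameter(torch.randn(1024).bfloat16())
start, end = 256, 512                                          # this rank's owned slice

shard_model_param = model_param.detach().view(-1)[start:end]   # view into the param storage
shard_main_param = shard_model_param.clone().float()           # fp32 master copy for the optimizer

# After the optimizer steps on shard_main_param, the result is written back
# into the bf16 view (and therefore into the original parameter storage).
shard_model_param.copy_(shard_main_param.to(torch.bfloat16))
print(shard_model_param.dtype, shard_main_param.dtype)  # torch.bfloat16 torch.float32
```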
+ group_range["orig_group"]["params"] = [ + *shard_fp32_params_this_group, + *shard_fp32_from_float16_params_this_group, + ] + + return ( + model_float16_groups, + model_fp32_groups, + shard_float16_groups, + shard_fp32_groups, + shard_fp32_from_float16_groups, + ) + + def __init__( + self, + optimizer: torch.optim.Optimizer, + config: OptimizerConfig, + grad_scaler: MegatronGradScaler, + init_state_fn: Optional[Callable], + model_chunks: List[MegatronModule], + per_model_buffers: Dict[int, List[_ParamAndGradBuffer]], + data_parallel_group: torch.distributed.ProcessGroup, + data_parallel_group_gloo: torch.distributed.ProcessGroup, + data_parallel_group_idx: int, + ): + """ + Distributed optimizer, for all data types (fp16, bf16, and fp32). + + The steps in this method create the core mapping between param and grad buffers, + parameters, and parameter shard ranges, that is needed for converting between model + param indexes and main parameter shard indexes. This method also updates the optimizer + parameter groups with the newly created shards. + + Args: + optimizer (torch.optim.Optimizer): base optimizer such as Adam or SGD. + config (OptimizerConfig): configuration object for optimizer. + grad_scaler (MegatronGradScaler): used for scaling gradients. Note that + this can be None. This case happens when `bf16 = True` and we don't + use any loss scale. Note that for `bf16 = True`, we can have + a constant gradient scaler. Also for `bf16 = False`, we + always require a grad scaler. + init_state_fn (Callable, optional): function to initialize state in the optimizer. + model_chunks (List[MegatronModule]): list of model chunks. + per_model_buffers (Dict[int, List[ParamAndGradBuffer]]): the implementation of the + distributed optimizer is centered on using a contiguous buffer for + communicating grads & params between the model state and the optimizer state. + You can find a more detailed description in + https://github.com/NVIDIA/Megatron-LM/blob/main/docs/source/distrib_optimizer.md. + data_parallel_group (torch.distributed.ProcessGroup): data-parallel group to use to + all-gather params after optimizer.step(). + data_parallel_group_gloo (torch.distributed.ProcessGroup): gloo data-parallel group + (used in checkpoint loading and saving). + data_parallel_group_idx (int): index in data-parallel group (used by + distributed checkpointing logic). + """ + + if has_config_logger_enabled(config): + log_config_to_disk(config, locals(), prefix=type(self).__name__) + + assert ( + HAVE_APEX_OR_TE + ), f'Please install Apex or Transformer Engine to use DistributedOptimizer.' + + super().__init__(optimizer, config, grad_scaler, init_state_fn) + self.model_chunks = model_chunks + self.ddp_config = self.model_chunks[0].ddp_config + for model_chunk in self.model_chunks: + assert self.ddp_config == model_chunk.ddp_config + + assert isinstance( + optimizer, Adam + ), "Only Adam currently supported, due to checkpointing requirements." + + # Model grad buffer ranges. 
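Reading ahead, the constructor flattens `per_model_buffers` into a single buffer list while remembering which model chunk each global buffer index came from; with made-up placeholder buffers the bookkeeping looks like this:

```python
import itertools

per_model_buffers = {0: ["buf_a", "buf_b"], 1: ["buf_c"]}   # placeholders for _ParamAndGradBuffer

buffers = list(itertools.chain(*per_model_buffers.values()))
gbuf_idx_to_model_idx_map = {}
gbuf_idx = 0
for model_idx, model_buffers in per_model_buffers.items():
    for _ in model_buffers:
        gbuf_idx_to_model_idx_map[gbuf_idx] = model_idx
        gbuf_idx += 1

print(buffers)                    # ['buf_a', 'buf_b', 'buf_c']
print(gbuf_idx_to_model_idx_map)  # {0: 0, 1: 0, 2: 1}
```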
+ assert per_model_buffers is not None, "per_model_buffers must be provided" + self.buffers = list(itertools.chain(*per_model_buffers.values())) + self.per_model_buffers = per_model_buffers + self.data_parallel_group = data_parallel_group + self.data_parallel_group_gloo = data_parallel_group_gloo + self.data_parallel_group_idx = data_parallel_group_idx + + self.gbuf_idx_to_model_idx_map = {} + gbuf_idx = 0 + for model_idx, buffers in self.per_model_buffers.items(): + for _ in buffers: + self.gbuf_idx_to_model_idx_map[gbuf_idx] = model_idx + gbuf_idx += 1 + + self.per_model_bucket_groups = {} + for model_idx, buffers in self.per_model_buffers.items(): + self.per_model_bucket_groups[model_idx] = partition_buckets(buffers) + + self.gbuf_ranges = [] + self.per_bucket_numel = [] + self.per_bucket_numel_unpadded = [] + for buffer in self.buffers: + + self.per_bucket_numel.append( + { + (buffer.param_dtype, buffer.grad_dtype): [ + bucket.grad_data.numel() for bucket in buffer.buckets + ] + } + ) + self.per_bucket_numel_unpadded.append( + { + (buffer.param_dtype, buffer.grad_dtype): [ + bucket.numel_unpadded for bucket in buffer.buckets + ] + } + ) + self.gbuf_ranges.append(self._build_gbuf_range_map(buffer)) + self.model_param_gbuf_map = self._build_model_param_gbuf_map(self.gbuf_ranges) + + # Optimizer ranges. + (self.model_param_group_index_map, self.opt_group_ranges) = ( + self._build_optimizer_group_ranges(self.optimizer.param_groups, self.gbuf_ranges) + ) + + # Allocate main param shards. + ( + self.model_float16_groups, + self.model_fp32_groups, + self.shard_float16_groups, + self.shard_fp32_groups, + self.shard_fp32_from_float16_groups, + ) = self._build_model_and_main_param_groups( + self.gbuf_ranges, self.model_param_gbuf_map, self.opt_group_ranges + ) + + # Update optimizer groups. + # - Also, leverage state_dict() and load_state_dict() to + # recast preexisting per-param state tensors. + self.optimizer.param_groups = [g["orig_group"] for g in self.opt_group_ranges] + self.optimizer.load_state_dict(self.optimizer.state_dict()) + + def enable_pre_hook(self): + """ + Enable forward pre-hook needed for param all-gather overlap with forward compute. + """ + warnings.warn( + "`DistributedOptimizer.enable_pre_hook` will be deprecated in a future release. " + "Use `DistributedDataParallel.enable_forward_pre_hook` directly." + ) + for model_chunk in self.model_chunks: + model_chunk.enable_forward_pre_hook() + + def disable_pre_hook(self): + """ + Disable forward pre-hook needed for param all-gather overlap with forward compute. + """ + warnings.warn( + "`DistributedOptimizer.disable_pre_hook` will be deprecated in a future release. " + "Use `DistributedDataParallel.disable_forward_pre_hook` directly." + ) + for model_chunk in self.model_chunks: + model_chunk.disable_forward_pre_hook() + + def _get_model_param_range_map(self, param: torch.nn.Parameter): + """ + Given a model param, get the index sub-range of the param that this + data-parallel rank owns. + """ + gbuf_index, dtype, bucket_index = self.model_param_gbuf_map[param] + gbuf_range_map = self.gbuf_ranges[gbuf_index][dtype][bucket_index] + param_range_map = gbuf_range_map["param_map"][param] + return param_range_map + + def get_model_parallel_group(self) -> torch.distributed.ProcessGroup: + """ + With the distributed optimizer, the model parallel group is the + entire world. + """ + return None + + def state_dict(self): + """ + The state dict contains all non-DP-rank-dependent (i.e., non-parameter- + related) optimizer variables. 
The returned state dict can be stored in + the standard model/RNG checkpoint file. The parameter and dependent + optimizer state (e.g., exp_avg, exp_avg_sq) are stored in a separate + checkpoint file by calling 'save_parameter_state()'. + """ + + inner_state_dict = self.optimizer.state_dict() + state_dict = {} + + # Extract 'step', for non-Apex/TE support. + if not HAVE_APEX_OR_TE: + steps = list(set([s["step"].item() for s in inner_state_dict["state"].values()])) + assert len(steps) == 1 + step = steps[0] + + # Optimizer state (do not store parameter state here). + state_dict['optimizer'] = {k: v for k, v in inner_state_dict.items() if k != "state"} + for param_group in state_dict["optimizer"]["param_groups"]: + del param_group["params"] + if not HAVE_APEX_OR_TE: + # Native PyTorch param group requires step (i.e., iteration). + param_group["step"] = step + + # Grad scaler state. + if self.grad_scaler: + state_dict['grad_scaler'] = self.grad_scaler.state_dict() + + return state_dict + + def load_state_dict(self, state_dict): + """Load the state dict. + + As detailed in state_dict(), the state dict contains all non- + parameter-related variables. This method is notably longer than + state_dict(), because the Torch optimizers state has yet to be + allocated at this point, and so we must do a cross referencing between + the optimizers state (and the ordering it expects for parameter state) + and this DP rank's shards. The optimizer at this point does not contain + any tensor dimension information, so we must get these dimensions from + the DP shards mapped during DistributedOptimizer.__init__(). + + The tensor parameter state is loaded via load_parameter_state(), and + so this method also must populate the loaded state dict with dummy + tensor data (i.e., via torch.empty() below). This will be overwritten + during load_parameter_state(). + + ** Note: Torch optimizer's state structure. ** + The Torch optimizer stores its state in two levels. The top level is a + list of groups, where each group contains a list of integer indexes + (corresponding to parameters) that index into a master parameter list + that is shared by all groups. As such, three values are necessary for + maintaining this ordering: + + - group_index : The group to which a parameter belongs. + - group_order : The index of a parameter within its group. + - state_order : The index of a parameter within the shared parameter + list. + """ + + # Get the Torch optimizer's state dict. + # - This 'inner' optimizer at this point is unallocated, and only + # contains an integer ordering of parameters within each group, and + # the ordering of parameters within its flattened parameter state + # list. + inner_state_dict = self.optimizer.state_dict() + state_dict_param_groups = [ + {**group, "params": list(inner_state_dict["param_groups"][idx]["params"])} + for idx, group in enumerate(state_dict["optimizer"]["param_groups"]) + ] + + # Allocate or retrieve optimizer state (i.e., tensors). + if len(self.optimizer.state) == 0: + # Allocate empty optimizer state if not previously initialized. + # - If len(self.optimizer.state) == 0, this means that the optimizer + # state has not been previously initialized. Once it has been + # initialized, we skip this code block to avoid reallocating + # empty tensors (i.e., torch.empty), which in turn reduces memory + # fragmentation. + # - Real data is overwritten during load_parameter_state(). 
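The group_index / group_order / state_order indirection described in the docstring can be reduced to a few lines. With made-up group contents, the integer stored at `param_groups[group_index]["params"][group_order]` is the key into the flat per-parameter state:

```python
# Unallocated inner optimizer: groups hold integer parameter indexes, not tensors.
param_groups = [
    {"params": [0, 1], "lr": 1e-3},
    {"params": [2, 3], "lr": 1e-4},
]
state = {}  # filled with {"exp_avg": ..., "exp_avg_sq": ...} keyed by state_order

group_index, group_order = 1, 0   # second group, first parameter in it
state_order = param_groups[group_index]["params"][group_order]
state[state_order] = {"exp_avg": None, "exp_avg_sq": None}   # dummy, overwritten later
print(state_order, sorted(state))  # 2 [2]
```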
+            state_dict_state = []
+            for gbuf_range_maps in self.gbuf_ranges:
+                for gbuf_range_map_for_all_buckets in gbuf_range_maps.values():
+                    for gbuf_range_map in gbuf_range_map_for_all_buckets:
+                        for model_param, param_range_map in gbuf_range_map["param_map"].items():
+
+                            # Get parameter ordering information (see method docstring
+                            # for details).
+                            group_index, group_order = self.model_param_group_index_map[model_param]
+                            state_order = inner_state_dict["param_groups"][group_index]["params"][
+                                group_order
+                            ]
+
+                            # Allocate dummy tensors.
+                            numel = len(param_range_map["gbuf_world"])
+                            init_shard = lambda: torch.empty(
+                                (numel,), dtype=torch.float32, device=torch.cuda.current_device()
+                            )
+
+                            state_dict_state.append(
+                                (state_order, {"exp_avg": init_shard(), "exp_avg_sq": init_shard()})
+                            )
+
+            # Sort by state order (see method docstring for details).
+            state_dict_state.sort(key=lambda s: s[0])
+            state_dict_state = {s[0]: s[1] for s in state_dict_state}
+
+        else:
+            # Retrieve existing optimizer state.
+            state_dict_state = inner_state_dict["state"]
+
+        # Extract 'step', for non-Apex/TE support.
+        if not HAVE_APEX_OR_TE:
+            steps = list(set([g["step"] for g in state_dict["optimizer"]["param_groups"]]))
+            assert len(steps) == 1
+            step = torch.tensor(steps[0], dtype=torch.float)
+
+            for s in state_dict_state.values():
+                # Native PyTorch state dict requires step (i.e., iteration).
+                s["step"] = step
+
+        # Optimizer.
+        self.optimizer.load_state_dict(
+            {"state": state_dict_state, "param_groups": state_dict_param_groups}
+        )
+
+        # Grad scaler.
+        if 'grad_scaler' not in state_dict:
+            if self.config.fp16:
+                logger.info(
+                    '***WARNING*** found an old checkpoint, will not ' 'load grad scaler ...'
+                )
+        else:
+            if self.grad_scaler:
+                self.grad_scaler.load_state_dict(state_dict['grad_scaler'])
+            else:
+                logger.info(
+                    '***WARNING*** found the grad scaler in the '
+                    'checkpoint but it is None in the class. '
+                    'Skipping loading grad scaler ...'
+                )
+
+        if 'param_state' in state_dict:
+            assert 'param_state_sharding_type' in state_dict, state_dict.keys()
+            param_state = state_dict['param_state']
+            sharding_type = state_dict['param_state_sharding_type']
+            logger.info(f'Loading distributed optimizer sharded state of type {sharding_type}')
+            if sharding_type == 'dp_zero_gather_scatter':
+                self.load_parameter_state_from_dp_zero(param_state)
+            elif sharding_type == 'fully_sharded_bucket_space':
+                self.load_parameter_state_from_fs_bucket_space(param_state)
+            elif sharding_type == 'fully_sharded_model_space':
+                self.load_parameter_state_from_fs_model_space(param_state)
+            else:
+                raise NotImplementedError(f'Unknown sharding_type: {sharding_type}')
+
+    def get_parameter_state_fs_bucket_space(self):
+        """Get internal representation of parameter state without any copies and modifications.
+
+        This is referred to as "fully sharded bucket space" because the optimizer state is
+        fully sharded (e.g. no gather involved) and bucket-centric (the state
+        follows the internal structure of the Distributed Optimizer buckets)
+        as opposed to model-centric (typical structure of PyT optimizers)
+        """
+        state = {
+            "per_bucket_numel": self.per_bucket_numel,
+            "per_bucket_numel_unpadded": self.per_bucket_numel_unpadded,
+        }
+        for gbuf_idx, gbuf_range_maps in enumerate(self.gbuf_ranges):
+
+            # Iterate grad buffers (by data type).
+            dtype_state = {}
+            assert len(gbuf_range_maps) == 1, "single dtype supported, for now."
+ for dtype, gbuf_range_map_for_all_buckets in gbuf_range_maps.items(): + buckets_state = [] + for bucket_idx, gbuf_range_map in enumerate(gbuf_range_map_for_all_buckets): + bucket_state = [] + for model_param, param_range_map in gbuf_range_map["param_map"].items(): + + # Main param & optimizer states. + group_index, group_order = self.model_param_group_index_map[model_param] + main_param = self.optimizer.param_groups[group_index]["params"][group_order] + optim_state = self.optimizer.state[main_param] + + tensors = { + "param": main_param, + **optim_state, + "gbuf_local_start": param_range_map["gbuf_local"].start, + "gbuf_local_end": param_range_map["gbuf_local"].end, + } + bucket_state.append(tensors) + buckets_state.append(bucket_state) + dtype_state[dtype] = buckets_state + state[gbuf_idx] = dtype_state + return state + + def get_parameter_state_dp_zero(self): + """Get parameter state (i.e., parameter & optimizer tensors). + + This method performs two steps: + - For each DP rank, copy param & optimizer shards to contiguous CPU + buffers (e.g., one buffer each for main_param, exp_avg, and + exp_avg_sq). + - Gather contiguous buffers on DP rank 0 and concatenate to world + buffers. + """ + + # Data parallelism variables. + data_parallel_world_size = self.data_parallel_group_gloo.size() + data_parallel_rank = torch.distributed.get_rank(self.data_parallel_group_gloo) + data_parallel_group_gloo = self.data_parallel_group_gloo + data_parallel_global_ranks = torch.distributed.get_process_group_ranks( + self.data_parallel_group_gloo + ) + + # Collect param states. + state = {"buckets_coalesced": True} + for gbuf_idx, gbuf_range_maps in enumerate(self.gbuf_ranges): + + # Iterate grad buffers (by data type). + dtype_state = {} + assert len(gbuf_range_maps) == 1, "single dtype supported, for now." + for dtype, gbuf_range_map_for_all_buckets in gbuf_range_maps.items(): + buffer_numel_unpadded = self.buffers[gbuf_idx].numel_unpadded + # Create coalesced tensors for all state related to parameters in this buffer. + world_tensors = {} + if data_parallel_rank == 0: + world_tensors = { + key: torch.zeros( + (buffer_numel_unpadded,), dtype=torch.float32, device="cpu" + ) + for key in ("param", "exp_avg", "exp_avg_sq") + } + world_tensors["numel_unpadded"] = buffer_numel_unpadded + offset_in_world_tensors = 0 + for bucket_idx, gbuf_range_map in enumerate(gbuf_range_map_for_all_buckets): + + # Compute local DP contiguous shard's size. + gbuf_world_numel = self.buffers[gbuf_idx].buckets[bucket_idx].grad_data.numel() + assert gbuf_world_numel % data_parallel_world_size == 0 + gbuf_local_numel = gbuf_world_numel // data_parallel_world_size + + gbuf_world_numel_unpadded = ( + self.buffers[gbuf_idx].buckets[bucket_idx].numel_unpadded + ) + assert gbuf_world_numel_unpadded <= gbuf_world_numel + + local_shards = { + key: torch.zeros((gbuf_local_numel,), dtype=torch.float32, device="cpu") + for key in ("param", "exp_avg", "exp_avg_sq") + } + + # Build contiguous DP rank shards (for param + optim states). + for model_param, param_range_map in gbuf_range_map["param_map"].items(): + + # Main param & optimizer states. + group_index, group_order = self.model_param_group_index_map[model_param] + main_param = self.optimizer.param_groups[group_index]["params"][group_order] + optim_state = self.optimizer.state[main_param] + + tensors = {"param": main_param, **optim_state} + + # Copy states into contiguous shard. 
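The gather step that follows has a simple single-process analogue: every DP rank contributes a fixed-size local shard, rank 0 concatenates them in rank order, and the padding that made the bucket divisible by the DP world size is dropped. A sketch with made-up sizes and no `torch.distributed`:

```python
import torch

dp_world_size = 4
gbuf_world_numel, gbuf_world_numel_unpadded = 16, 13
gbuf_local_numel = gbuf_world_numel // dp_world_size   # 4

# Stand-ins for the per-rank local_shards["param"] buffers.
local_shards = [torch.full((gbuf_local_numel,), float(rank)) for rank in range(dp_world_size)]

# What torch.distributed.gather + torch.cat produce on DP rank 0, minus the padding.
world_tensor = torch.cat(local_shards)[:gbuf_world_numel_unpadded]
print(world_tensor.shape)   # torch.Size([13])
```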
+ gbuf_local_start = param_range_map["gbuf_local"].start + gbuf_local_end = param_range_map["gbuf_local"].end + for key in local_shards: + local_shards[key][gbuf_local_start:gbuf_local_end].data.copy_( + tensors[key].detach().cpu() + ) + + # Gather contiguous shards on DP rank 0. + for key, send_tensor in local_shards.items(): + + # Gather tensor list. + if data_parallel_rank == 0: + recv_tensors = [ + torch.zeros((gbuf_local_numel,), dtype=torch.float32, device="cpu") + for _ in range(data_parallel_world_size) + ] + else: + recv_tensors = None + + # Gather. + torch.distributed.gather( + send_tensor, + recv_tensors, + data_parallel_global_ranks[0], + data_parallel_group_gloo, + ) + + # Concatenate. + if data_parallel_rank == 0: + recv_tensors_concatenated = torch.cat(recv_tensors) + # Copy this bucket's collected all-gather tensors into the right place + # in the tensor for the buffer. The tensor for the buffer gets rid of + # the padding between buckets. + start = offset_in_world_tensors + end = offset_in_world_tensors + gbuf_world_numel_unpadded + world_tensors[key][start:end].copy_( + recv_tensors_concatenated[:gbuf_world_numel_unpadded] + ) + + offset_in_world_tensors += gbuf_world_numel_unpadded + + # Collect world state. + dtype_state[dtype] = world_tensors + state[gbuf_idx] = dtype_state + + return state + + def save_parameter_state(self, filename: str): + """Save the distributed parameter state on DP rank 0. + + Args: + filename (str): path to save parameter state to. + """ + + state_dict = self.get_parameter_state_dp_zero() + if torch.distributed.get_rank(self.data_parallel_group) == 0: + torch.save(state_dict, filename) + + def sharded_state_dict( + self, + model_sharded_state_dict: ShardedStateDict, + is_loading: bool = False, + sharding_type: str = 'fully_sharded_model_space', + ): + """ + Chooses between 3 param state sharding implementations as requested by `sharding_type`. + + Regular state dict parameters are saved on DP rank 0 and loaded on all ranks. + """ + if not is_loading and sharding_type == 'fully_sharded_bucket_space': + logger.warning( + '`fully_sharded_bucket_space` sharding for DistributedOptimizer' + ' checkpoint is deprecated and will be removed in the future.' + ' Please switch to `full_sharded_model_space`.' + ) + + state_dict = self.state_dict() + if sharding_type != 'fully_sharded_model_space': + # State dict differs between different model parallel groups + state_dict = { + k: ShardedObject( + f'optimizer.distributed.dp_group_idx_{self.data_parallel_group_idx}.{k}', + v, + (1,), + (0,), + replica_id=torch.distributed.get_rank(self.data_parallel_group), + ) + for k, v in state_dict.items() + } + + if is_loading: + # Call the distributed optimizer's specialized load_state_dict(), + # which conditionally skips re-allocating the optimizer's state if + # already initialized, which in turn reduces memory fragmentation. 
+ self.load_state_dict(self.state_dict()) + + if sharding_type == 'fully_sharded_bucket_space': + param_state = self.sharded_param_state_fs_bucket_space( + model_sharded_state_dict, is_loading + ) + + elif sharding_type == 'dp_zero_gather_scatter': + param_state = self.sharded_param_state_dp_zero(model_sharded_state_dict, is_loading) + elif sharding_type == 'fully_sharded_model_space': + param_state = self.sharded_param_state_fs_model_space( + model_sharded_state_dict, is_loading + ) + else: + raise NotImplementedError(f'Unknown sharding_type: {sharding_type}') + + state_dict['param_state'] = param_state + state_dict['param_state_sharding_type'] = sharding_type + return state_dict + + def sharded_param_state_dp_zero( + self, model_sharded_state_dict: ShardedStateDict, is_loading: bool = False + ): + """Naive implementation which reuses gather/scatter from the legacy ckpt format. + + During saving, gathers the parameters state on DP rank 0 and saves a ShardedObject + with fixed TPxPP structure. During loading, loads the saved data on DP rank 0 + (None on other ranks). Relies on the parameters scatter done in load_state_dict. + """ + if is_loading: + param_state_data = None + else: + # Gather on rank 0 + param_state_data = self.get_parameter_state_dp_zero() + + if torch.distributed.get_rank(self.data_parallel_group) == 0: + # Fixed TPxPP. Save on DP rank 0 only + param_state = ShardedObject( + f'optimizer.distributed.dp_group_idx_{self.data_parallel_group_idx}.param_state', + param_state_data, + (1,), + (0,), + ) + else: + # DP ranks > 0 don't save. During loading, the param_state needs to be None. + param_state = LocalNonpersistentObject(None) + + return param_state + + def sharded_param_state_fs_bucket_space( + self, model_sharded_state_dict: ShardedStateDict, is_loading: bool = False + ): + """Sharded state dict where each noncontiguous buffer is a separate ShardedTensor. + + Results in fully parallel save and load without any inter-process + communication or intermediate buffers/copies. + """ + data_parallel_rank = torch.distributed.get_rank(self.data_parallel_group) + data_parallel_world_size = torch.distributed.get_world_size(self.data_parallel_group) + + state = self.get_parameter_state_fs_bucket_space() + # per_bucket_numel metadata is saved separately for each TPxPP domain. + for per_bucket_key in ('per_bucket_numel', 'per_bucket_numel_unpadded'): + key = ( + f'optimizer.distributed.dp_group_idx_{self.data_parallel_group_idx}' + f'.{per_bucket_key}' + ) + state[per_bucket_key] = ShardedObject( + key, state[per_bucket_key], (1,), (0,), replica_id=data_parallel_rank + ) + + for gbuf_idx, gbuf_range_maps in enumerate(self.gbuf_ranges): + for dtype, gbuf_range_map_for_all_buckets in state[gbuf_idx].items(): + for bucket_idx, bucket_state in enumerate(gbuf_range_map_for_all_buckets): + # Compute local DP contiguous shard's size. + gbuf_world_numel = self.buffers[gbuf_idx].buckets[bucket_idx].grad_data.numel() + assert gbuf_world_numel % data_parallel_world_size == 0 + gbuf_local_numel = gbuf_world_numel // data_parallel_world_size + + sharded_bucket_key = ( + f'optimizer.distributed.dp_group_idx_{self.data_parallel_group_idx}' + f'.gbuf_idx_{gbuf_idx}.dtype_{dtype}.bucket_idx_{bucket_idx}' + ) + + # The global ckpt tensors must be fully covered. + # We add extra empty padding if necessary + assert bucket_state, 'empty bucket encountered' + + # Insert padding between parameter tensors to ensure full coverage as needed. 
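The gap-filling logic below can be exercised on its own with made-up offsets: wherever consecutive parameter shards do not tile the local bucket shard, a `padding` entry is inserted so the checkpoint tensors cover the whole `[0, gbuf_local_numel)` range.

```python
bucket_state = [
    {"gbuf_local_start": 0,  "gbuf_local_end": 10},
    {"gbuf_local_start": 12, "gbuf_local_end": 20},   # gap at 10..12
]
gbuf_local_numel = 24                                  # trailing gap at 20..24

padded = []
for entry in bucket_state:
    if padded and padded[-1]["gbuf_local_end"] != entry["gbuf_local_start"]:
        padded.append({"gbuf_local_start": padded[-1]["gbuf_local_end"],
                       "gbuf_local_end": entry["gbuf_local_start"], "padding": True})
    padded.append({**entry, "padding": False})
if padded[-1]["gbuf_local_end"] != gbuf_local_numel:
    padded.append({"gbuf_local_start": padded[-1]["gbuf_local_end"],
                   "gbuf_local_end": gbuf_local_numel, "padding": True})

print([(e["gbuf_local_start"], e["gbuf_local_end"], e["padding"]) for e in padded])
# [(0, 10, False), (10, 12, True), (12, 20, False), (20, 24, True)]
```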
+ all_pad_tensors = {} + for i in range(len(bucket_state) - 1): + next_param_start = bucket_state[i + 1]['gbuf_local_start'] + cur_param_end = bucket_state[i]['gbuf_local_end'] + if next_param_start != cur_param_end: + pad_tensors = { + k: torch.empty( + next_param_start - cur_param_end, dtype=v.dtype, device=v.device + ) + for k, v in bucket_state[i].items() + if isinstance(v, torch.Tensor) + } + all_pad_tensors[i + 1] = { + **pad_tensors, + 'gbuf_local_start': cur_param_end, + 'gbuf_local_end': next_param_start, + 'padding': True, + } + + # Insert from end so that insertion positions are still correct. + indices_to_insert = sorted(list(all_pad_tensors.keys())) + for index_to_insert in reversed(indices_to_insert): + bucket_state.insert(index_to_insert, all_pad_tensors[index_to_insert]) + + if bucket_state[-1]['gbuf_local_end'] != gbuf_local_numel: + pad_tensors = { + k: torch.empty( + gbuf_local_numel - bucket_state[-1]['gbuf_local_end'], + dtype=v.dtype, + device=v.device, + ) + for k, v in bucket_state[-1].items() + if isinstance(v, torch.Tensor) + } + bucket_state.append( + { + **pad_tensors, + 'gbuf_local_start': bucket_state[-1]['gbuf_local_end'], + 'gbuf_local_end': gbuf_local_numel, + 'padding': True, + } + ) + + # Each tensor is mapped to a slice (`flattened_range`) + # of a DP-local shard of size `gbuf_local_numel`. + for bucket_params_idx in range(len(bucket_state)): + tensors = bucket_state[bucket_params_idx] + gbuf_local_start = tensors.pop('gbuf_local_start') + gbuf_local_end = tensors.pop('gbuf_local_end') + if 'padding' not in tensors: + tensors['padding'] = False + + for key in tensors: + if key == 'padding': + tensors[key] = LocalNonpersistentObject(tensors[key]) + continue + assert tensors[key].shape == (gbuf_local_end - gbuf_local_start,), ( + tensors[key].shape, + gbuf_local_start, + gbuf_local_end, + ) + + tensors[key] = ShardedTensor( + f'{sharded_bucket_key}.{key}', + tensors[key], + tensors[key].dtype, + (gbuf_local_numel,), + (data_parallel_world_size * gbuf_local_numel,), + (data_parallel_rank * gbuf_local_numel,), + axis_fragmentations=(data_parallel_world_size,), + flattened_range=slice(gbuf_local_start, gbuf_local_end), + allow_shape_mismatch=True, + ) + return state + + def sharded_param_state_fs_model_space( + self, model_sharded_state_dict: ShardedStateDict, is_loading: bool = False + ): + """Sharded state dict where each buffer is mapped to corresponding model param. + + In this approach the optimizer state tensors are directly related to model parameters + by linking them with metadata from `model_sharded_state_dict`. + This will allow changing TP and PP while using DistOpt (as with other optimizers). + """ + + param_to_sharded_metadata = {} + model_sharded_state_dict, _ = extract_sharded_tensors_and_factories( + model_sharded_state_dict + ) + for sh_base in nested_values(model_sharded_state_dict): + param_to_sharded_metadata[sh_base.data] = sh_base + + prefix = 'optimizer.state' + state = {} + + # Not stored in the checkpoint, used only to identify params in + # `sharded_param_state_fs_model_space`. 
+ param_idx = 0 + for gbuf_range_maps in self.gbuf_ranges: + for gbuf_range_map_for_all_buckets in gbuf_range_maps.values(): + for gbuf_range_map in gbuf_range_map_for_all_buckets: + for model_param, param_range_map in gbuf_range_map["param_map"].items(): + group_index, group_order = self.model_param_group_index_map[model_param] + param_range = param_range_map['param'] + + main_param = self.optimizer.param_groups[group_index]["params"][group_order] + optim_state = self.optimizer.state[main_param] + + tensors = {"fp32_param": main_param, **optim_state} + # Match optimizer parameter with model ShardedTensor (or + # ShardedTensorFactory). + try: + sharded_metadata = param_to_sharded_metadata[model_param] + except KeyError as e: + raise ValueError( + f'Model param {model_param} not in model_sharded_state_dict' + ) from e + + # Set DP corresponding replica_id coordinate to 0. + assert ( + len(sharded_metadata.replica_id) == 3 + ), f'Expected replica_id format (PP, TP, DP), got: {sharded_metadata}' + replica_id = (*sharded_metadata.replica_id[:2], 0) + + # Instantiate ShardedTensor (or ShardedTensorFactory) for optimizer + # params. + for state_key, state_ten in tensors.items(): + replace_kwargs = dict( + key=f'{prefix}.{state_key}.{sharded_metadata.key}', + data=state_ten, + dtype=state_ten.dtype, + flattened_range=slice(param_range.start, param_range.end), + replica_id=replica_id, + ) + if isinstance(sharded_metadata, ShardedTensorFactory): + replace_kwargs.pop('dtype') + tensors[state_key] = replace(sharded_metadata, **replace_kwargs) + tensors[state_key].validate_metadata_integrity() + state[param_idx] = tensors + param_idx += 1 + return state + + def load_parameter_state_from_fs_bucket_space(self, state_dict): + """Loads the parameter state from an internal representation. + + Inverse of the `get_parameter_state_fs_bucket_space` method. + """ + logger.warning( + '`fully_sharded_bucket_space` sharding for DistributedOptimizer' + 'checkpoint is deprecated. Please switch to `full_sharded_model_space`' + ) + + if state_dict is not None and "per_bucket_numel_unpadded" in state_dict: + per_bucket_numel_unpadded_in_checkpoint = state_dict["per_bucket_numel_unpadded"] + assert self.per_bucket_numel_unpadded == per_bucket_numel_unpadded_in_checkpoint, ( + f"Number of unpadded elements in each bucket need to be the same in current run " + f"({self.per_bucket_numel_unpadded}) and checkpoint " + f"({per_bucket_numel_unpadded_in_checkpoint})" + ) + + for gbuf_idx, gbuf_range_maps in enumerate(self.gbuf_ranges): + assert len(gbuf_range_maps) == 1, "single dtype supported, for now." + for dtype, gbuf_range_map_for_all_buckets in gbuf_range_maps.items(): + for bucket_idx, gbuf_range_map in enumerate(gbuf_range_map_for_all_buckets): + bucket_state = state_dict[gbuf_idx][dtype][bucket_idx] + bucket_state = [ + bucket_state_elem + for bucket_state_elem in bucket_state + if not bucket_state_elem['padding'] + ] + + assert len(bucket_state) == len(gbuf_range_map["param_map"]), ( + len(bucket_state), + len(gbuf_range_map["param_map"]), + ) + for src_tensors, (model_param, param_range_map) in zip( + bucket_state, gbuf_range_map["param_map"].items() + ): + # Main param & optimizer states. 
+ group_index, group_order = self.model_param_group_index_map[model_param] + main_param = self.optimizer.param_groups[group_index]["params"][group_order] + optim_state = self.optimizer.state[main_param] + + dst_tensors = {"param": main_param, **optim_state} + for key in dst_tensors: + dst_tensors[key].copy_(src_tensors[key]) + + @torch.no_grad() + def load_parameter_state_from_fs_model_space(self, state_dict): + """Loads the parameter state from a "model space" representation. + + Inverse of the `sharded_param_state_fs_model_space` method. + """ + param_idx = 0 # matching order with `sharded_param_state_fs_model_space` + for gbuf_range_maps in self.gbuf_ranges: + for gbuf_range_map_for_all_buckets in gbuf_range_maps.values(): + for gbuf_range_map in gbuf_range_map_for_all_buckets: + for model_param, param_range_map in gbuf_range_map["param_map"].items(): + group_index, group_order = self.model_param_group_index_map[model_param] + main_param = self.optimizer.param_groups[group_index]["params"][group_order] + optim_state = self.optimizer.state[main_param] + + src_tensors = state_dict[param_idx] + dst_tensors = {"fp32_param": main_param, **optim_state} + for key in dst_tensors: + dst_tensors[key].copy_(src_tensors[key]) + + param_idx += 1 + + @classmethod + def _update_legacy_world_tensors(cls, old_tensors, new_numels): + '''Reshard buckets (where each bucket is a tensor) to new target + numels, where the total numel remains the same.''' + + old_total = sum([t.numel() for t in old_tensors]) + new_total = sum(new_numels) + + assert old_total == new_total + + unified_tensor = torch.cat(old_tensors, dim=0) + + new_tensors = [] + start_idx = 0 + for new_numel in new_numels: + new_tensors.append(unified_tensor[start_idx : (start_idx + new_numel)]) + start_idx += new_numel + + return new_tensors + + def load_parameter_state_from_dp_zero_legacy(self, state_dict): + """Load parameter state (i.e., parameter & optimizer tensors) from DP 0 rank, + using the legacy checkpoint format as described below. + + The difference between this method and `load_parameter_state_from_dp_zero_modern()` + is that this method is used for updating the format of checkpoints that + were saved using code from before Feb 13, 2024. Starting on this date, a + new format was used (i.e., different format for the parameter mapping and + bucket sharding). + + Use arg `--ckpt-convert-update-legacy-dist-opt-format` to call this + method, along with `--ckpt-convert-format` and `--ckpt-convert-save` to + update a legacy-format checkpoint to the modern format. + """ + + # Data parallelism variables. + data_parallel_world_size = self.data_parallel_group_gloo.size() + data_parallel_rank = torch.distributed.get_rank(self.data_parallel_group_gloo) + data_parallel_group_gloo = self.data_parallel_group_gloo + data_parallel_global_ranks = torch.distributed.get_process_group_ranks( + self.data_parallel_group_gloo + ) + + # Scatter tensors to all DP ranks. 
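`_update_legacy_world_tensors` above is essentially "concatenate, then re-split to new sizes"; a runnable reduction of the same idea with toy tensors:

```python
import torch

def reshard(old_tensors, new_numels):
    """Concatenate old per-bucket tensors and split them to new per-bucket sizes."""
    assert sum(t.numel() for t in old_tensors) == sum(new_numels)
    unified = torch.cat(old_tensors, dim=0)
    out, start = [], 0
    for numel in new_numels:
        out.append(unified[start:start + numel])
        start += numel
    return out

resharded = reshard([torch.arange(6.0), torch.arange(4.0)], [3, 7])
print([t.numel() for t in resharded])   # [3, 7]
```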
+ for gbuf_idx, gbuf_range_maps in enumerate(self.gbuf_ranges): + for dtype, gbuf_range_map_for_all_buckets in gbuf_range_maps.items(): + if data_parallel_rank == 0: + buffer_numel_unpadded = self.buffers[gbuf_idx].numel_unpadded + model_numels = [b.numel_unpadded for b in self.buffers[gbuf_idx].buckets] + checkpoint_numels = [ + t.numel() for t in state_dict[gbuf_idx][torch.float32]["param"] + ] + assert sum(model_numels) == sum(checkpoint_numels) + for key in ("param", "exp_avg", "exp_avg_sq"): + legacy_world_tensors = self._update_legacy_world_tensors( + state_dict[gbuf_idx][torch.float32][key], + [ + self.buffers[gbuf_idx].buckets[bi].numel_unpadded + for bi in range(len(gbuf_range_map_for_all_buckets)) + ], + ) + offset_in_world_tensors = 0 + for bucket_idx, gbuf_range_map in enumerate(gbuf_range_map_for_all_buckets): + # Compute local DP contiguous shard's size. + gbuf_world_numel = ( + self.buffers[gbuf_idx].buckets[bucket_idx].grad_data.numel() + ) + assert gbuf_world_numel % data_parallel_world_size == 0 + gbuf_local_numel = gbuf_world_numel // data_parallel_world_size + gbuf_world_numel_unpadded = ( + self.buffers[gbuf_idx].buckets[bucket_idx].numel_unpadded + ) + assert gbuf_world_numel_unpadded <= gbuf_world_numel + + # Contiguous local shards (received from DP rank 0). + recv_tensor = torch.empty( + (gbuf_local_numel,), dtype=torch.float32, device="cpu" + ) + + # Scatter tensor list. + if data_parallel_rank == 0: + + start = offset_in_world_tensors + end = offset_in_world_tensors + gbuf_world_numel_unpadded + + world_tensor = legacy_world_tensors[bucket_idx] + assert ( + world_tensor.numel() == gbuf_world_numel_unpadded + ), "%d vs. %d." % (world_tensor.numel(), gbuf_world_numel_unpadded) + offset_in_world_tensors += gbuf_world_numel_unpadded + + # Pad world_tensor to gbuf_world_numel. Don't pad at the front, + # pad at the back. + world_tensor = torch.nn.functional.pad( + world_tensor, (0, gbuf_world_numel - gbuf_world_numel_unpadded) + ) + assert world_tensor.numel() == gbuf_world_numel + gbuf_start_idxs = list(range(0, gbuf_world_numel, gbuf_local_numel)) + send_tensors = [ + world_tensor[i : (i + gbuf_local_numel)] for i in gbuf_start_idxs + ] + else: + send_tensors = None + + # Scatter. + torch.distributed.scatter( + recv_tensor, + send_tensors, + data_parallel_global_ranks[0], + data_parallel_group_gloo, + ) + + # Copy local contiguous shards to param/optim shards. + for model_param, param_range_map in gbuf_range_map["param_map"].items(): + + # Main param & optimizer states. + group_index, group_order = self.model_param_group_index_map[model_param] + main_param = self.optimizer.param_groups[group_index]["params"][ + group_order + ] + if key == "param": + tensor_to_copy_into = main_param + else: + optim_state = self.optimizer.state[main_param] + tensor_to_copy_into = optim_state[key] + + # Copy states into contiguous shard. + gbuf_local_start = param_range_map["gbuf_local"].start + gbuf_local_end = param_range_map["gbuf_local"].end + tensor_to_copy_into.data.copy_( + recv_tensor[gbuf_local_start:gbuf_local_end] + ) + + def load_parameter_state_from_dp_zero(self, state_dict, *, update_legacy_format=False): + """Load parameter state (i.e., parameter & optimizer tensors) from DP 0 rank, + using the new checkpoint format with coalesced state across buckets. + + This method performs the reverse of get_parameter_state_dp_zero(): + - Scatter contiguous buffers from DP rank 0 to each DP rank (each DP + rank receives its relevant subset of the world buffers). 
+ - For each DP rank, copy param & optimizer shards from contiguous CPU + buffers. (e.g., one buffer each for main_param, exp_avg, and + exp_avg_sq). + """ + + # Selectively load from a legacy checkpoint. The legacy format was used + # prior to Feb 13, 2024. + if update_legacy_format: + return self.load_parameter_state_from_dp_zero_legacy(state_dict) + + # Data parallelism variables. + data_parallel_world_size = self.data_parallel_group_gloo.size() + data_parallel_rank = torch.distributed.get_rank(self.data_parallel_group_gloo) + data_parallel_group_gloo = self.data_parallel_group_gloo + data_parallel_global_ranks = torch.distributed.get_process_group_ranks( + self.data_parallel_group_gloo + ) + + if data_parallel_rank == 0: + # Do nothing if "--fp8-param-gather" is not used. + self.split_state_dict_if_needed(state_dict) + + # Scatter tensors to all DP ranks. + for gbuf_idx, gbuf_range_maps in enumerate(self.gbuf_ranges): + for dtype, gbuf_range_map_for_all_buckets in gbuf_range_maps.items(): + if data_parallel_rank == 0: + buffer_numel_unpadded = self.buffers[gbuf_idx].numel_unpadded + checkpoint_numel_unpadded = state_dict[gbuf_idx][dtype]["numel_unpadded"] + assert buffer_numel_unpadded == checkpoint_numel_unpadded, ( + f"Number of unpadded elements must be same in current run " + f"({buffer_numel_unpadded}) and checkpoint ({checkpoint_numel_unpadded})" + ) + for key in ("param", "exp_avg", "exp_avg_sq"): + offset_in_world_tensors = 0 + for bucket_idx, gbuf_range_map in enumerate(gbuf_range_map_for_all_buckets): + # Compute local DP contiguous shard's size. + gbuf_world_numel = ( + self.buffers[gbuf_idx].buckets[bucket_idx].grad_data.numel() + ) + assert gbuf_world_numel % data_parallel_world_size == 0 + gbuf_local_numel = gbuf_world_numel // data_parallel_world_size + gbuf_world_numel_unpadded = ( + self.buffers[gbuf_idx].buckets[bucket_idx].numel_unpadded + ) + assert gbuf_world_numel_unpadded <= gbuf_world_numel + + # Contiguous local shards (received from DP rank 0). + recv_tensor = torch.zeros( + (gbuf_local_numel,), dtype=torch.float32, device="cpu" + ) + + # Scatter tensor list. + if data_parallel_rank == 0: + world_tensors = state_dict[gbuf_idx][dtype][key] + + start = offset_in_world_tensors + end = offset_in_world_tensors + gbuf_world_numel_unpadded + assert 0 <= start < end <= world_tensors.numel() + world_tensor = world_tensors[start:end] + offset_in_world_tensors += gbuf_world_numel_unpadded + + # Pad world_tensor to gbuf_world_numel. Don't pad at the front, + # pad at the back. + world_tensor = torch.nn.functional.pad( + world_tensor, (0, gbuf_world_numel - gbuf_world_numel_unpadded) + ) + assert world_tensor.numel() == gbuf_world_numel + gbuf_start_idxs = list(range(0, gbuf_world_numel, gbuf_local_numel)) + send_tensors = [ + world_tensor[i : (i + gbuf_local_numel)] for i in gbuf_start_idxs + ] + else: + send_tensors = None + + # Scatter. + torch.distributed.scatter( + recv_tensor, + send_tensors, + data_parallel_global_ranks[0], + data_parallel_group_gloo, + ) + + # Copy local contiguous shards to param/optim shards. + for model_param, param_range_map in gbuf_range_map["param_map"].items(): + + # Main param & optimizer states. 
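On the save side the bucket padding was dropped; on this load side, rank 0 re-adds it before splitting the world tensor into equal per-rank shards for the scatter. A single-process sketch of that preparation, with made-up sizes and no `torch.distributed`:

```python
import torch
import torch.nn.functional as F

dp_world_size = 4
gbuf_world_numel, gbuf_world_numel_unpadded = 16, 13
gbuf_local_numel = gbuf_world_numel // dp_world_size

world_tensor = torch.arange(float(gbuf_world_numel_unpadded))   # unpadded checkpoint slice
world_tensor = F.pad(world_tensor, (0, gbuf_world_numel - gbuf_world_numel_unpadded))
send_tensors = [world_tensor[i:i + gbuf_local_numel]
                for i in range(0, gbuf_world_numel, gbuf_local_numel)]
print(send_tensors[-1].tolist())   # [12.0, 0.0, 0.0, 0.0] -> last rank receives the padding
```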
+ group_index, group_order = self.model_param_group_index_map[model_param] + main_param = self.optimizer.param_groups[group_index]["params"][ + group_order + ] + if key == "param": + tensor_to_copy_into = main_param + else: + optim_state = self.optimizer.state[main_param] + tensor_to_copy_into = optim_state[key] + + # Copy states into contiguous shard. + gbuf_local_start = param_range_map["gbuf_local"].start + gbuf_local_end = param_range_map["gbuf_local"].end + tensor_to_copy_into.data.copy_( + recv_tensor[gbuf_local_start:gbuf_local_end] + ) + + def split_state_dict_if_needed(self, state_dict): + """ + When "--fp8-param-gather" is disabled, weights and biases are stored in the same + `ParamAndGradBuffer`. So, when saving a checkpoint, the optimizer's main parameters are + saved in a single continuous tensor (this also applies to "exp_avg" and "exp_avg_sq"). + + However, when "--fp8-param-gather" is enabled, weights(in fp8 dtype) and biases(in bf16/fp16 + dtype) are stored in separate `ParamAndGradBuffer`. Therefore, when we enabled + "--fp8-param-gather", and want to load a checkpoint saved without "--fp8-param-gather", we + need to split the weights(fp8) and biases(bf16/fp16) in the static_dict into two separate + tensors. + """ + # Skip if there is no fp8 buffers. + fp8_gbuf_indices = [] + for gbuf_idx, gbuf_range_maps in enumerate(self.gbuf_ranges): + for dtype, _ in gbuf_range_maps.items(): + if is_float8tensor(self.buffers[gbuf_idx].params[0]): + fp8_gbuf_indices.append(gbuf_idx) + if len(fp8_gbuf_indices) == 0: + return + + dtype_to_gbuf_idx = {} + for key in state_dict.keys(): + if key != 'buckets_coalesced': + for dtype in state_dict[key].keys(): + assert dtype not in dtype_to_gbuf_idx + if dtype[0] == torch.uint8: + # If the `state_dict`` already contains a torch.uint8 buffer, we assumed + # that the fp8 weights and fp16/bf16 biases in the checkpoint are already + # separated. In this case, no action is required, so we can return directly. + return + dtype_to_gbuf_idx[dtype] = key + + # 1. Replace the gbuf_idx in the checkpoint with the new gbuf_idx. + # 2. Copy the non-tensor data (i.e., the "buckets_coalesced") to `new_state_dict`. + new_state_dict = {'buckets_coalesced': state_dict['buckets_coalesced']} + for gbuf_idx, gbuf_range_maps in enumerate(self.gbuf_ranges): + for dtype, _ in gbuf_range_maps.items(): + if not is_float8tensor(self.buffers[gbuf_idx].params[0]): + new_state_dict[gbuf_idx] = state_dict[dtype_to_gbuf_idx[dtype]] + + for fp8_gbuf_idx in fp8_gbuf_indices: + # Note that `self.buffers[fp8_gbuf_idx].params[0].dtype` is the dummy dtype of + # `Float8Tensor`, not torch.uint8. + non_fp8_param_and_grad_dtype = ( + self.buffers[fp8_gbuf_idx].params[0].dtype, + self.buffers[fp8_gbuf_idx].grad_dtype, + ) + + # Iterate through all buffers to find the one that needs to be split. + non_fp8_gbuf_idx = None + for gbuf_idx, gbuf_range_maps in enumerate(self.gbuf_ranges): + for dtype, _ in gbuf_range_maps.items(): + if dtype == non_fp8_param_and_grad_dtype: + non_fp8_gbuf_idx = gbuf_idx + assert non_fp8_gbuf_idx is not None + + # We need the fp8_flags to determine the order of weight (fp8) and bias (fp16/bf16) in + # the buffer. 
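The split performed below can be illustrated with a toy coalesced tensor: `fp8_flags` records, in buffer order, whether each parameter slice belongs to the fp8 buffer or to the bf16/fp16 buffer, and each slice is copied into the corresponding new coalesced tensor. Sizes and flags here are made up:

```python
import torch

numels = [4, 2, 3, 1]                      # per-parameter sizes, in buffer order
fp8_flags = [True, False, True, False]     # weight (fp8) / bias (non-fp8) pattern
tensor = torch.arange(float(sum(numels)))  # coalesced checkpoint tensor

fp8_parts, non_fp8_parts, offset = [], [], 0
for numel, is_fp8 in zip(numels, fp8_flags):
    (fp8_parts if is_fp8 else non_fp8_parts).append(tensor[offset:offset + numel])
    offset += numel

print(torch.cat(fp8_parts).tolist())      # [0.0, 1.0, 2.0, 3.0, 6.0, 7.0, 8.0]
print(torch.cat(non_fp8_parts).tolist())  # [4.0, 5.0, 9.0]
```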
+            index_to_fp8_map = {}
+            for index in self.buffers[fp8_gbuf_idx].param_indices:
+                assert index not in index_to_fp8_map
+                index_to_fp8_map[index] = True
+            for index in self.buffers[non_fp8_gbuf_idx].param_indices:
+                assert index not in index_to_fp8_map
+                index_to_fp8_map[index] = False
+            param_indices = (
+                self.buffers[fp8_gbuf_idx].param_indices
+                + self.buffers[non_fp8_gbuf_idx].param_indices
+            )
+            assert min(param_indices) == 0
+            assert max(param_indices) == len(param_indices) - 1
+            fp8_flags = []
+            for i in range(len(param_indices)):
+                fp8_flags.append(index_to_fp8_map[i])
+
+            fp8_buffer = self.buffers[fp8_gbuf_idx]
+            non_fp8_buffer = self.buffers[non_fp8_gbuf_idx]
+
+            fp8_idx = len(fp8_buffer.params) - 1
+            non_fp8_idx = len(non_fp8_buffer.params) - 1
+            offsets, fp8_offsets, non_fp8_offsets = [0], [0], [0]
+
+            # Because the parameters in `ParamAndGradBuffer` are traversed in reverse order, the
+            # flag here also needs to be traversed in reverse order.
+            for fp8_flag in fp8_flags[::-1]:
+                if fp8_flag:
+                    numel = fp8_buffer.params[fp8_idx].nelement()
+                    fp8_idx -= 1
+                    offsets.append(offsets[-1] + numel)
+                    fp8_offsets.append(fp8_offsets[-1] + numel)
+                else:
+                    numel = non_fp8_buffer.params[non_fp8_idx].nelement()
+                    non_fp8_idx -= 1
+                    offsets.append(offsets[-1] + numel)
+                    non_fp8_offsets.append(non_fp8_offsets[-1] + numel)
+
+            # Split the target buffer into two separate buffers.
+            fp8_state_dict, non_fp8_state_dict = {}, {}
+            for key in ['param', 'exp_avg', 'exp_avg_sq']:
+                tensor = state_dict[non_fp8_gbuf_idx][non_fp8_param_and_grad_dtype][key]
+                fp8_tensor = torch.empty([fp8_offsets[-1]], dtype=tensor.dtype)
+                non_fp8_tensor = torch.empty([non_fp8_offsets[-1]], dtype=tensor.dtype)
+
+                fp8_idx, non_fp8_idx = 0, 0
+                for i in range(len(offsets) - 1):
+                    if fp8_flags[-(i + 1)]:
+                        fp8_tensor[fp8_offsets[fp8_idx] : fp8_offsets[fp8_idx + 1]].copy_(
+                            tensor[offsets[i] : offsets[i + 1]]
+                        )
+                        fp8_idx += 1
+                    else:
+                        non_fp8_tensor[
+                            non_fp8_offsets[non_fp8_idx] : non_fp8_offsets[non_fp8_idx + 1]
+                        ].copy_(tensor[offsets[i] : offsets[i + 1]])
+                        non_fp8_idx += 1
+
+                fp8_state_dict[key] = fp8_tensor
+                non_fp8_state_dict[key] = non_fp8_tensor
+
+            fp8_state_dict['numel_unpadded'] = fp8_offsets[-1]
+            non_fp8_state_dict['numel_unpadded'] = non_fp8_offsets[-1]
+
+            # Add the two separate buffers into `new_state_dict`.
+            new_state_dict[fp8_gbuf_idx] = {}
+            new_state_dict[fp8_gbuf_idx][(torch.uint8, fp8_buffer.grad_dtype)] = fp8_state_dict
+            new_state_dict[non_fp8_gbuf_idx][non_fp8_param_and_grad_dtype] = non_fp8_state_dict
+
+        # Inplace update state_dict
+        state_dict.clear()
+        for key, value in new_state_dict.items():
+            state_dict[key] = value
+
+    def load_parameter_state(self, filename: str, *, update_legacy_format=False):
+        """Load the distributed parameter state from disk.
+
+        Args:
+            filename (str): path to load parameter state from.
+        """
+        state_dict = None
+        if torch.distributed.get_rank(self.data_parallel_group) == 0:
+            state_dict = torch.load(filename)
+
+        self.load_parameter_state_from_dp_zero(
+            state_dict, update_legacy_format=update_legacy_format
+        )
+
+    def zero_grad(self, set_to_none: bool = True):
+        """
+        Zeroes grads for the model-related parameters, i.e., model_float16_groups
+        and model_fp32_groups. We additionally zero the remaining groups as a
+        memory optimization to reduce fragmentation; in the case of
+        set_to_none==True, the space used by this field can be safely deallocated.
+
+        Args:
+            set_to_none (bool): if true, set grads to None.
+ """ + for groups in ( + self.model_float16_groups, + self.model_fp32_groups, + self.shard_float16_groups, # grad empty/unused here? + self.shard_fp32_groups, # throws grad-access warning + self.shard_fp32_from_float16_groups, + ): + for group in groups: + _zero_grad_group_helper(group, set_to_none) + + def _collect_main_grad_data_for_unscaling(self): + """ + Note: this should be equivalent to the float-16 optimizer's method, + but written differently, so the two should be combined. + """ + return [ + param.grad.data for group in self.optimizer.param_groups for param in group["params"] + ] + + def _get_model_and_main_params_data_float16(self): + """ + Get aligned list of model and main params. + """ + model_data = [] + main_data = [] + for model_group, main_group in zip( + self.shard_float16_groups, self.shard_fp32_from_float16_groups + ): + for model_param, main_param in zip(model_group, main_group): + model_data.append(model_param.data) + main_data.append(main_param.data) + return model_data, main_data + + def _copy_model_grads_to_main_grads(self): + """ + Copy model grads to main grads. + + Since this step follows a reduce-scatter through the DDP's grad + buffer, this method is responsible for copying the updated grads + from the grad buffer to the main shard's grad field. + """ + + # Utility method for copying group grads. + def copy_group_grads(model_groups, shard_main_groups): + for model_group, shard_main_group in zip(model_groups, shard_main_groups): + for model_param, shard_main_param in zip(model_group, shard_main_group): + + param_range_map = self._get_model_param_range_map(model_param) + param_range = param_range_map["param"] + assert param_range.size == shard_main_param.nelement() + + model_grad = model_param.main_grad + shard_model_grad = model_grad.view(-1)[param_range.start : param_range.end] + shard_main_param.grad = shard_model_grad.float() + + # Copy model groups to shard groups. + copy_group_grads(self.model_float16_groups, self.shard_fp32_from_float16_groups) + copy_group_grads(self.model_fp32_groups, self.shard_fp32_groups) + + def _copy_main_params_to_model_params(self): + """ + Copy main params to model params. + + Since this step is followed by an all-gather through the DDP's grad + buffer, this method is responsible for copying the updated params + from the main shards into the correct position in the grad buffer. + """ + + # Utility method for copying group params. + def copy_group_params(shard_main_groups, model_groups): + for shard_main_group, model_group in zip(shard_main_groups, model_groups): + for shard_main_param, model_param in zip(shard_main_group, model_group): + + param_range_map = self._get_model_param_range_map(model_param) + world_range = param_range_map["gbuf_world_in_bucket"] + + assert world_range.size == shard_main_param.nelement() + + gbuf_index, _, bucket_id = self.model_param_gbuf_map[model_param] + model_param_buffer = self.buffers[gbuf_index].buckets[bucket_id].param_data + + shard_model_param = model_param_buffer.view(-1)[ + world_range.start : world_range.end + ] + + if is_float8tensor(model_param): + # 1. When "--fp8-param-gather" is disabled, the main param is first cast to + # BF16/FP16, and then cast to FP8, so the amax_history is calculated + # using BF16/FP16 param. + # 2. When "--fp8-param-gather" is enabled, we can cast the FP32 main param + # to FP8 directly, which results in slightly different results with + # higher speed. In theory, this does not affect convergence. 
+ # TODO: The following code maintains the logic of the point-1 above. It can + # be deleted if it is not necessary. + shard_main_param = shard_main_param.to(model_param.dtype) + + cast_to_fp8( + shard_main_param.view(1, -1), + model_param._fp8_meta['scaling_fwd'], + model_param._fp8_meta_index, + model_param._fp8_dtype, + out=shard_model_param.view(1, -1), + ) + else: + shard_model_param.data.copy_(shard_main_param) + + # Copy shard groups to model groups. + copy_group_params(self.shard_fp32_from_float16_groups, self.model_float16_groups) + copy_group_params(self.shard_fp32_groups, self.model_fp32_groups) + + def _copy_model_params_to_main_params(self): + """ + Copy model params to main params. + + During finetuning, this method is used to reload the main params from + the model params. This copy does not make use of the grad buffer as + an intermediary. + """ + + # Utility method for copying group params. + def copy_group_params(model_groups, shard_main_groups): + for model_group, shard_main_group in zip(model_groups, shard_main_groups): + for model_param, shard_main_param in zip(model_group, shard_main_group): + + param_range_map = self._get_model_param_range_map(model_param) + param_range = param_range_map["param"] + assert param_range.size == shard_main_param.nelement() + + shard_model_param = model_param.view(-1)[param_range.start : param_range.end] + shard_main_param.data.copy_(shard_model_param) + + # Copy model groups to shard groups. + copy_group_params(self.model_float16_groups, self.shard_fp32_from_float16_groups) + copy_group_params(self.model_fp32_groups, self.shard_fp32_groups) + + def _update_fp8_scale_inv_and_amax(self): + """ + If detect FP8 parameters, update their `_scale_inv` and do reduce-max for their + `amax_history`. + """ + amaxes = [] + scales = [] + scale_invs = [] + # Iterate over all parameters inside this optimizer to find FP8 parameters. + for buffer in self.buffers: + for bucket in buffer.buckets: + for param in bucket.params_list: + if is_float8tensor(param): + fp8_meta = param._fp8_meta['scaling_fwd'] + fp8_meta_index = param._fp8_meta_index + amaxes.append(fp8_meta.amax_history[0][fp8_meta_index].view(1)) + scales.append(fp8_meta.scale[fp8_meta_index].view(1)) + scale_invs.append(param._scale_inv.view(1)) + # Reset transpose cache + param._reset_caches() + + # If there is no FP8 parameters, skip all operations. + if len(scales) > 0: + dummy_overflow_buf = torch.tensor([0], dtype=torch.int, device='cuda') + + # Update scaling factors. + packed_scales = torch.empty(len(scales), dtype=torch.float32, device=scales[0].device) + packed_scale_views = [packed_scales[i].view(1) for i in range(len(scales))] + _multi_tensor_copy_this_to_that(scales, packed_scale_views, dummy_overflow_buf) + torch.reciprocal(packed_scales, out=packed_scales) + _multi_tensor_copy_this_to_that(packed_scale_views, scale_invs, dummy_overflow_buf) + + # Reduce amaxes. + # Note: Assume each param has a separate amax. + packed_amaxes = torch.empty(len(amaxes), dtype=torch.float32, device=amaxes[0].device) + packed_amax_views = [packed_amaxes[i].view(1) for i in range(len(amaxes))] + _multi_tensor_copy_this_to_that(amaxes, packed_amax_views, dummy_overflow_buf) + torch.distributed.all_reduce( + packed_amaxes, op=torch.distributed.ReduceOp.MAX, group=self.data_parallel_group + ) + _multi_tensor_copy_this_to_that(packed_amax_views, amaxes, dummy_overflow_buf) + + @torch.no_grad() + def step_with_ready_grads(self) -> bool: + """Step the optimizer with ready gradients, return successful. 
+ Under the hood, either launch synchronous param all-gathers or get ready to launch + asynchorous all-gathers that get overlapped with the next forward pass. + """ + update_successful = super().step_with_ready_grads() + + # If there is no FP8 parameters, this will do nothing. + self._update_fp8_scale_inv_and_amax() + + timers = self.config.timers + if timers is not None: + timers('params-all-gather', log_level=1).start(barrier=self.config.barrier_with_L1_time) + # If not overlapping all-gather for parameters, launch synchronous all-gather + # communication calls here. If overlapping all-gather for parameters, the following + # the first all-gather is launched asynchronously in the next optimizer.zero_grad() + # call and subsequent all-gathers are launched in the forward pre-hook. + if not self.ddp_config.overlap_param_gather: + for model_chunk in self.model_chunks: + model_chunk.start_param_sync() + if timers is not None: + timers('params-all-gather').stop() + + return update_successful diff --git a/megatron/optimizer/grad_scaler.py b/megatron/core/optimizer/grad_scaler.py similarity index 56% rename from megatron/optimizer/grad_scaler.py rename to megatron/core/optimizer/grad_scaler.py index 66f7c907a4..abdd1e7b60 100644 --- a/megatron/optimizer/grad_scaler.py +++ b/megatron/core/optimizer/grad_scaler.py @@ -1,19 +1,18 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. """Megatron grad scaler.""" -from abc import ABC -from abc import abstractmethod +from abc import ABC, abstractmethod +from typing import Dict import torch class MegatronGradScaler(ABC): - - def __init__(self, initial_scale): + def __init__(self, initial_scale: float): """Initialize scale value with the input initial scale.""" assert initial_scale > 0.0 - self._scale = torch.cuda.FloatTensor([initial_scale]) + self._scale = torch.tensor([initial_scale], dtype=torch.float, device='cuda') @property def scale(self): @@ -24,7 +23,7 @@ def inv_scale(self): return self._scale.double().reciprocal().float() @abstractmethod - def update(self, found_inf): + def update(self, found_inf: bool): pass @abstractmethod @@ -32,14 +31,16 @@ def state_dict(self): pass @abstractmethod - def load_state_dict(self, state_dict): + def load_state_dict(self, state_dict: Dict): pass - class ConstantGradScaler(MegatronGradScaler): + """ + Constant grad scaler (loss scale is never adjusted regardless of NaNs seen in gradients). + """ - def update(self, found_inf): + def update(self, found_inf: bool): pass def state_dict(self): @@ -49,26 +50,48 @@ def load_state_dict(self, state_dict): pass - class DynamicGradScaler(MegatronGradScaler): - - def __init__(self, initial_scale, min_scale, - growth_factor, backoff_factor, - growth_interval, hysteresis): - """"Grad scaler with dynamic scale that gets adjusted - during training.""" + """ + Grad scaler with dynamic scale that gets adjusted during training. + + Reduces loss scale by `backoff_factor` if `hysteresis` number of NaNs are seen in a row. Increases + loss scale by `growth_factor` if NaNs are not seen for `growth_interval` iterations. + """ + + def __init__( + self, + initial_scale: float, + min_scale: float, + growth_factor: float, + backoff_factor: float, + growth_interval: int, + hysteresis: int, + ): + """ + Grad scaler with dynamic scale that gets adjusted during training. + + Args: + initial_scale (float): Initial loss scale value. + min_scale (float): Minimum loss scale value. 
+ growth_factor (float): Factor to grow loss scale by if NaNs are not seen in `growth_interval` + training iterations. Must be greater than 1. + backoff_factor (float): Factor to decrease loss scale by if NaNs are seen in `hysteresis` + consecutive training iterations. Must be between 0 and 1. + growth_interval (int): Number of training iterations of no NaNs before loss scale is increased. + hysteresis (int): Number of training iterations of consecutive NaNs before loss scale is decreased. + """ super(DynamicGradScaler, self).__init__(initial_scale) # Lower bound on the scale. assert min_scale > 0.0 assert min_scale <= initial_scale - self.min_scale = torch.cuda.FloatTensor([min_scale]) + self.min_scale = torch.tensor([min_scale], dtype=torch.float, device='cuda') # Growth and backoff factors for the scale. assert growth_factor > 1.0 - self.growth_factor = torch.cuda.FloatTensor([growth_factor]) + self.growth_factor = torch.tensor([growth_factor], dtype=torch.float, device='cuda') assert backoff_factor < 1.0 assert backoff_factor > 0.0 - self.backoff_factor = torch.cuda.FloatTensor([backoff_factor]) + self.backoff_factor = torch.tensor([backoff_factor], dtype=torch.float, device='cuda') # Interval over which if we don't see any inf/nan, # we will scale the grad scale by the growth factor. assert growth_interval > 0 @@ -82,8 +105,10 @@ def __init__(self, initial_scale, min_scale, self._growth_tracker = 0 self._hysteresis_tracker = self.hysteresis - - def update(self, found_inf): + def update(self, found_inf: bool): + """ + Updates internal state in grad scaler based on whether NaNs are seen in grads or not. + """ # If we have an inf/nan, growth tracker is set to 0 # and hysterisis tracker is reduced by 1. @@ -92,8 +117,7 @@ def update(self, found_inf): self._hysteresis_tracker -= 1 # Now if we are out of hysteresis count, scale down the loss. if self._hysteresis_tracker <= 0: - self._scale = torch.max(self._scale * self.backoff_factor, - self.min_scale) + self._scale = torch.max(self._scale * self.backoff_factor, self.min_scale) else: # If there is no nan/inf, increment the growth tracker. self._growth_tracker += 1 @@ -105,7 +129,6 @@ def update(self, found_inf): # and scale up the loss scale. self._scale = self._scale * self.growth_factor - def state_dict(self): state_dict = {} state_dict['scale'] = self._scale @@ -113,8 +136,7 @@ def state_dict(self): state_dict['hysteresis_tracker'] = self._hysteresis_tracker return state_dict - - def load_state_dict(self, state_dict): + def load_state_dict(self, state_dict: Dict): self._scale = state_dict['scale'].cuda(torch.cuda.current_device()) self._growth_tracker = state_dict['growth_tracker'] self._hysteresis_tracker = state_dict['hysteresis_tracker'] diff --git a/megatron/core/optimizer/optimizer.py b/megatron/core/optimizer/optimizer.py new file mode 100644 index 0000000000..7f2bbc0832 --- /dev/null +++ b/megatron/core/optimizer/optimizer.py @@ -0,0 +1,1069 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
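A minimal usage sketch of the `DynamicGradScaler` behaviour documented in `grad_scaler.py` above: the loss scale is backed off after `hysteresis` consecutive inf/nan updates and grown after `growth_interval` clean updates. This is illustrative only (it is not part of the patch) and assumes a CUDA device is available, since the scaler allocates its state tensors on `'cuda'`.

```python
# Illustrative sketch of DynamicGradScaler's backoff/growth behaviour (assumes CUDA).
from megatron.core.optimizer.grad_scaler import DynamicGradScaler

scaler = DynamicGradScaler(
    initial_scale=2**32,
    min_scale=1.0,
    growth_factor=2.0,
    backoff_factor=0.5,
    growth_interval=1000,
    hysteresis=2,
)

scaler.update(found_inf=True)   # 1st consecutive inf/nan: scale unchanged (hysteresis == 2)
scaler.update(found_inf=True)   # 2nd consecutive inf/nan: scale *= backoff_factor
for _ in range(1000):
    scaler.update(found_inf=False)  # 1000 clean steps: scale *= growth_factor
print(scaler.scale, scaler.inv_scale)
```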
+ +"""Megatron optimizer.""" + +import copy +import math +import warnings +from abc import ABC, abstractmethod +from itertools import chain +from logging import getLogger +from typing import Any, Callable, Dict, List, Optional, Tuple, Union + +import torch + +try: + from transformer_engine.pytorch.optimizers import multi_tensor_applier, multi_tensor_scale + + multi_tensor_scale_impl = multi_tensor_scale +except ImportError: + try: + from apex.multi_tensor_apply import multi_tensor_applier + except ImportError: + from megatron.core.utils import local_multi_tensor_applier + + multi_tensor_applier = local_multi_tensor_applier + try: + import amp_C + + l2_norm_impl = amp_C.multi_tensor_l2norm + multi_tensor_scale_impl = amp_C.multi_tensor_scale + except ImportError: + from megatron.core.utils import local_multi_tensor_l2_norm, local_multi_tensor_scale + + l2_norm_impl = local_multi_tensor_l2_norm + multi_tensor_scale_impl = local_multi_tensor_scale + +from .. import parallel_state, tensor_parallel +from ..config_logger import has_config_logger_enabled, log_config_to_disk +from ..dist_checkpointing.mapping import ShardedStateDict +from ..dist_checkpointing.optimizer import ( + get_param_id_to_sharded_param_map, + make_sharded_optimizer_tensor, + optim_state_to_sharding_state, +) +from ..dist_checkpointing.utils import add_prefix_for_sharding +from ..transformer.module import param_is_not_shared +from .clip_grads import clip_grad_by_total_norm_fp32, count_zeros_fp32, get_grad_norm_fp32 +from .grad_scaler import MegatronGradScaler +from .optimizer_config import OptimizerConfig + +logger = getLogger(__name__) + + +def _zero_grad_group_helper(group: List[torch.nn.Parameter], set_to_none: bool): + """ + Zero out the gradient for a group of parameters. + Note: copied from torch.optim.optimizer. + """ + for param in group: + if param.grad is not None: + if set_to_none: + param.grad = None + else: + if param.grad.grad_fn is not None: + param.grad.detach_() + else: + param.grad.requires_grad_(False) + param.grad.zero_() + + +def _multi_tensor_copy_this_to_that( + this: List[torch.Tensor], that: List[torch.Tensor], overflow_buf: Optional[torch.Tensor] = None +): + """ + Use multi-tensor-applier to copy values from one list to another. + We don't have a bfloat16 implementation so for now if the overflow_buf + is not provided, we default back to simple loop copy to be compatible + with bfloat16. + """ + if overflow_buf: + overflow_buf.fill_(0) + # Scaling with factor `1.0` is equivalent to copy. + multi_tensor_applier(multi_tensor_scale_impl, overflow_buf, [this, that], 1.0) + else: + for this_, that_ in zip(this, that): + that_.copy_(this_) + + +class MegatronOptimizer(ABC): + """ + Base class for all Megatron optimizers. + + Args: + optimizer (torch.optim.Optimizer): base optimizer such as Adam or SGD. + config (OptimizerConfig): configuration object for optimizer. + init_state_fn (Callable, optional): function to initialize state in the optimizer. + """ + + def __init__( + self, + optimizer: torch.optim.Optimizer, + config: OptimizerConfig, + init_state_fn: Callable = lambda x: None, + ): + """Input optimizer is the base optimizer (e.g., Adam).""" + self.optimizer = optimizer + assert self.optimizer, 'no optimizer is provided.' + self.config = config + self.init_state_fn = init_state_fn + + def get_parameters(self) -> List[torch.nn.Parameter]: + """ + Get list of parameters wrapped in optimizer. 
+ """ + params = [] + for param_group in self.optimizer.param_groups: + for param in param_group['params']: + params.append(param) + return params + + def get_main_grads_for_grad_norm(self) -> List[torch.Tensor]: + """ + Get main_grads that should be taken into account to compute the grad norm. + Filter parameters based on: + - grad should not be None. + - parameter should not be shared (i.e., grads shouldn't be double counted while + computing norms). + - should not be a replica due to tensor model parallelism. + """ + params = self.get_parameters() + grads_for_norm = [] + for param in params: + grad = param.grad + grad_not_none = grad is not None + is_not_shared = param_is_not_shared(param) + is_not_tp_duplicate = tensor_parallel.param_is_not_tensor_parallel_duplicate(param) + if grad_not_none and is_not_shared and is_not_tp_duplicate: + grads_for_norm.append(grad) + + return grads_for_norm + + def get_model_parallel_group(self) -> torch.distributed.ProcessGroup: + """Default returned here, but the distributed optimizer overrides this.""" + if hasattr(self, 'model_parallel_group'): + return self.model_parallel_group + return parallel_state.get_model_parallel_group() + + @abstractmethod + def prepare_grads(self) -> bool: + """Pre-processing gradients before the optimizer step, returns whether inf/nan is found.""" + return False + + @abstractmethod + def step_with_ready_grads(self) -> bool: + """Step the optimizer with ready gradients, return successful.""" + return True + + @torch.no_grad() + def get_grad_norm(self): + """Compute and return grad norm.""" + grads_for_norm = self.get_main_grads_for_grad_norm() + total_norm = get_grad_norm_fp32( + grads_for_norm, model_parallel_group=self.get_model_parallel_group() + ) + return total_norm + + def clip_grad_norm(self, clip_grad: float) -> float: + """Compute and return grad norm, also clip grads.""" + params = self.get_parameters() + grads_for_norm = self.get_main_grads_for_grad_norm() + grad_norm = get_grad_norm_fp32( + grads_for_norm, model_parallel_group=self.get_model_parallel_group() + ) + clip_grad_by_total_norm_fp32(params, clip_grad, grad_norm) + return grad_norm + + def count_zeros(self) -> float: + """Count number of zeros in model's gradients.""" + params = self.get_parameters() + return count_zeros_fp32(params, model_parallel_group=self.get_model_parallel_group()) + + @abstractmethod + def zero_grad(self, set_to_none: bool = True): + """Zero gradients and prepare for next forward pass.""" + pass + + @abstractmethod + def get_loss_scale(self) -> torch.Tensor: + """ + Get current loss scale factor. + NOTE: The output should be a CUDA tensor of size 1. + """ + pass + + def scale_loss(self, loss: torch.Tensor) -> torch.Tensor: + """Simple scaling.""" + return self.get_loss_scale() * loss + + def start_param_sync(self, model_index: int, *unused): + """ + Start parameter synchronization for all optimizers. + This is a no-op for all non-distributed optimizers. + """ + pass + + @abstractmethod + def reload_model_params(self): + """Refreshes any internal state from the current model parameters. + Call whenever the parameters are changed outside of the optimizer. 
+ For example, when we load a model from a checkpoint without loading + the optimizer, the model parameters are updated but for fp16 optimizer + with main parameters, the main parameters need to also be updated.""" + pass + + @abstractmethod + def state_dict(self): + """Return state_dict.""" + pass + + @abstractmethod + def load_state_dict(self, state_dict): + """Load pass-in `state_dict`.""" + pass + + # Promote state so it can be retrieved or set via + # "optimizer_instance.state" + def _get_state(self): + return self.optimizer.state + + def _set_state(self, value): + self.optimizer.state = value + + state = property(_get_state, _set_state) + + # Promote param_groups so it can be retrieved or set via + # "optimizer_instance.param_groups" + # (for example, to adjust the learning rate) + def _get_param_groups(self): + return self.optimizer.param_groups + + def _set_param_groups(self, value): + self.optimizer.param_groups = value + + param_groups = property(_get_param_groups, _set_param_groups) + + @abstractmethod + def step(self): + """Step the optimizer.""" + pass + + @abstractmethod + def sharded_state_dict( + self, model_sharded_state_dict: ShardedStateDict, is_loading: bool = False + ) -> ShardedStateDict: + """Builds sharded state dict for the optimizer, based on model's sharded state dict. + + Args: + model_sharded_state_dict (ShardedStateDict): sharded state dict of the model + is_loading (bool, optional): flag indicating whether the state dict will be + used to save or load the optimizer state. Defaults to False. + + Returns: optimizer sharded state dict + """ + + @staticmethod + def _extract_common_per_param_step(state_dict) -> Union[int, torch.Tensor]: + common_step = None + for param_idx, param_state in state_dict['state'].items(): + param_step = param_state.get('step', None) + if param_step is not None: + if common_step is None: + common_step = param_step + elif common_step != param_step: + raise ValueError( + "The optimizer step differs per parameter. Mcore only supports " + "optimizers whose step is shared across all parameters." + ) + return common_step + + @staticmethod + def _restore_common_per_param_step(state_dict: Dict, step: Union[int, torch.Tensor]): + for param_idx, param_state in state_dict['state'].items(): + param_state['step'] = copy.deepcopy(step) + + +class MixedPrecisionOptimizer(MegatronOptimizer): + """Base class for both the float-16 and the distributed optimizer. + + Args: + optimizer (torch.optim.Optimizer): base optimizer such as Adam or SGD. + config (OptimizerConfig): configuration object for optimizer. + grad_scaler (MegatronGradScaler): used for scaling gradients. Note that + this can be None. This case happens when `bf16 = True` and we don't + use any loss scale. Note that for `bf16 = True`, we can have + a constant gradient scaler. Also for `bf16 = False`, we + always require a grad scaler. + init_state_fn (Callable, optional): function to initialize state in the optimizer. + """ + + def __init__( + self, + optimizer: torch.optim.Optimizer, + config: OptimizerConfig, + grad_scaler: Optional[MegatronGradScaler], + init_state_fn: Callable, + ): + if has_config_logger_enabled(config): + log_config_to_disk(config, locals(), prefix=type(self).__name__) + + super().__init__(optimizer, config, init_state_fn) + self.grad_scaler = grad_scaler + + # None grad scaler is only supported for bf16. + if self.grad_scaler is None: + assert not self.config.fp16, 'fp16 expects a grad scaler.' + + # Tensor used to determine if a nan/if has happend. 
+ # Any non-zero value indicates inf/nan. + # Note that we keep this for the cases that grad scaler is none. + # We still record nan/inf if we have a bfloat16 with a grad scaler. + if self.grad_scaler: + self.found_inf = torch.tensor([0.0], dtype=torch.float, device='cuda') + + # Dummy tensor needed for apex multi-apply tensor. + # For bfloat, we don't have multi-tensor apply and for now + # we set it to none so the multi-tensor apply gets ignored. + if self.config.bf16: + self._dummy_overflow_buf = None + else: + self._dummy_overflow_buf = torch.tensor([0], dtype=torch.int, device='cuda') + + # In case grad scaler is not passed, define the unity scale. + if self.grad_scaler is None: + self._scale_one = torch.tensor([1.0], dtype=torch.float, device='cuda') + + def get_loss_scale(self): + if self.grad_scaler is None: + return self._scale_one + return self.grad_scaler.scale + + def reload_model_params(self): + self._copy_model_params_to_main_params() + + def _unscale_main_grads_and_check_for_nan(self): + + # Collect main grads. + main_grads = self._collect_main_grad_data_for_unscaling() + + # Reset found inf. + self.found_inf.fill_(0.0) + + # Unscale and set found inf/nan + torch._amp_foreach_non_finite_check_and_unscale_( + main_grads, self.found_inf, self.grad_scaler.inv_scale + ) + + # Update across all model parallel instances. + torch.distributed.all_reduce( + self.found_inf, op=torch.distributed.ReduceOp.MAX, group=self.get_model_parallel_group() + ) + + # Check for nan. + found_inf_flag = self.found_inf.item() > 0 + + return found_inf_flag + + @torch.no_grad() + def prepare_grads(self) -> bool: + """Pre-processing gradients before the optimizer step, returns whether inf/nan is found.""" + timers = self.config.timers + + # Copy gradients from model params to main params. + if timers is not None: + timers('optimizer-copy-to-main-grad', log_level=1).start( + barrier=self.config.barrier_with_L1_time + ) + self._copy_model_grads_to_main_grads() + if timers is not None: + timers('optimizer-copy-to-main-grad').stop() + + # Do unscale, check for inf, and update grad scaler only for + # the case that grad scaler is provided. + if self.grad_scaler: + + # Unscale and check for inf/nan. + if timers is not None: + timers('optimizer-unscale-and-check-inf', log_level=1).start( + barrier=self.config.barrier_with_L1_time + ) + found_inf_flag = self._unscale_main_grads_and_check_for_nan() + if timers is not None: + timers('optimizer-unscale-and-check-inf').stop() + + # We are done with scaling gradients + # so we can update the loss scale. + self.grad_scaler.update(found_inf_flag) + + return found_inf_flag + + return False + + @torch.no_grad() + def step_with_ready_grads(self) -> bool: + """Step the optimizer with ready gradients, return successful.""" + timers = self.config.timers + # Step the optimizer. + if timers is not None: + timers('optimizer-inner-step', log_level=1).start( + barrier=self.config.barrier_with_L1_time + ) + self.optimizer.step() + if timers is not None: + timers('optimizer-inner-step').stop() + + # Update params from main params. 
+ if timers is not None: + timers('optimizer-copy-main-to-model-params', log_level=1).start( + barrier=self.config.barrier_with_L1_time + ) + self._copy_main_params_to_model_params() + if timers is not None: + timers('optimizer-copy-main-to-model-params').stop() + + return True + + @torch.no_grad() + def step(self): + timers = self.config.timers + + found_inf_flag = self.prepare_grads() + if found_inf_flag: + return False, None, None + + # Clip the main gradients. + if timers is not None: + timers('optimizer-clip-main-grad', log_level=1).start( + barrier=self.config.barrier_with_L1_time + ) + grad_norm = None + if self.config.clip_grad > 0.0: + grad_norm = self.clip_grad_norm(self.config.clip_grad) + if timers is not None: + timers('optimizer-clip-main-grad').stop() + + # Count the zeros in the grads. + if timers is not None: + timers('optimizer-count-zeros', log_level=1).start( + barrier=self.config.barrier_with_L1_time + ) + num_zeros_in_grad = self.count_zeros() if self.config.log_num_zeros_in_grad else None + if timers is not None: + timers('optimizer-count-zeros').stop() + + success = self.step_with_ready_grads() + + # Successful update. + return success, grad_norm, num_zeros_in_grad + + +class Float16OptimizerWithFloat16Params(MixedPrecisionOptimizer): + """Float16 optimizer for fp16 and bf16 data types. + + Args: + optimizer (torch.optim.Optimizer): base optimizer such as Adam or SGD. + config (OptimizerConfig): configuration object for optimizer. + grad_scaler (MegatronGradScaler): used for scaling gradients. Note that + this can be None. This case happens when `bf16 = True` and we don't + use any loss scale. Note that for `bf16 = True`, we can have + a constant gradient scaler. Also for `bf16 = False`, we + always require a grad scaler. + init_state_fn (Callable, optional): function to initialize state in the optimizer. + """ + + def __init__( + self, + optimizer: torch.optim.Optimizer, + config: OptimizerConfig, + grad_scaler: MegatronGradScaler, + init_state_fn: Callable, + ): + + super().__init__(optimizer, config, grad_scaler, init_state_fn) + + # Handle main parameters. + + # Three groups of parameters: + # float16_groups: original float16 parameters + # fp32_from_float16_groups: fp32 copy of float16 parameters + # fp32_from_fp32_groups: original fp32 parameters + self.float16_groups = [] + self.fp32_from_float16_groups = [] + self.fp32_from_fp32_groups = [] + + # For all the groups in the original optimizer: + for param_group in self.optimizer.param_groups: + float16_params_this_group = [] + fp32_params_this_group = [] + fp32_from_float16_params_this_group = [] + # For all the parameters in this group: + for i, param in enumerate(param_group['params']): + if param.requires_grad: + + # float16 params: + if param.type() in ['torch.cuda.HalfTensor', 'torch.cuda.BFloat16Tensor']: + float16_params_this_group.append(param) + # Create a copy + main_param = param.detach().clone().float() + # Copy tensor model parallel attributes. + tensor_parallel.copy_tensor_model_parallel_attributes(main_param, param) + if hasattr(param, 'shared'): + main_param.shared = param.shared + # Replace the optimizer params with the new fp32 copy. + param_group['params'][i] = main_param + + fp32_from_float16_params_this_group.append(main_param) + # Reset existing state dict key to the new main param. + if param in self.optimizer.state: + self.optimizer.state[main_param] = self.optimizer.state.pop(param) + # fp32 params. 
+ elif param.type() == 'torch.cuda.FloatTensor': + fp32_params_this_group.append(param) + param_group['params'][i] = param + + else: + raise TypeError( + 'Wrapped parameters must be one of ' + 'torch.cuda.FloatTensor, ' + 'torch.cuda.HalfTensor, or ' + 'torch.cuda.BFloat16Tensor. ' + 'Received {}'.format(param.type()) + ) + + self.float16_groups.append(float16_params_this_group) + self.fp32_from_float16_groups.append(fp32_from_float16_params_this_group) + self.fp32_from_fp32_groups.append(fp32_params_this_group) + + def zero_grad(self, set_to_none=True): + """We only need to zero the model related parameters, i.e., + float16_groups & fp32_from_fp32_groups. We additionally zero + fp32_from_float16_groups as a memory optimization to reduce + fragmentation; in the case of set_to_none==True, the space + used by this field can be safely deallocated at this point.""" + for group in self.float16_groups: + _zero_grad_group_helper(group, set_to_none) + for group in self.fp32_from_float16_groups: + _zero_grad_group_helper(group, set_to_none) + for group in self.fp32_from_fp32_groups: + _zero_grad_group_helper(group, set_to_none) + + def _collect_main_grad_data_for_unscaling(self): + + main_grads = [] + + # fp32 params from float16 ones. + for main_group in self.fp32_from_float16_groups: + for main_param in main_group: + if main_param.grad is not None: + main_grads.append(main_param.grad.data) + + # Append fp32 parameters. + for main_group in self.fp32_from_fp32_groups: + for main_param in main_group: + if main_param.grad is not None: + main_grads.append(main_param.grad.data) + + return main_grads + + def _get_model_and_main_params_data_float16(self): + model_data = [] + main_data = [] + for model_group, main_group in zip(self.float16_groups, self.fp32_from_float16_groups): + for model_param, main_param in zip(model_group, main_group): + model_data.append(model_param.data) + main_data.append(main_param.data) + return model_data, main_data + + def _copy_model_grads_to_main_grads(self): + # This only needs to be done for the float16 group. + for model_group, main_group in zip(self.float16_groups, self.fp32_from_float16_groups): + for model_param, main_param in zip(model_group, main_group): + if hasattr(model_param, 'main_grad'): + main_param.grad = model_param.main_grad.float() + else: + if model_param.grad is not None: + main_param.grad = model_param.grad.float() + + # Safe to deallocate model's grad/main_grad after copying. + # (If using contiguous buffers, main_grad's memory should + # persist and therefore should not be deallocated.) + model_param.grad = None + + # For fp32 grads, we need to reset the grads to main grad. + for model_group in self.fp32_from_fp32_groups: + for model_param in model_group: + model_param.grad = model_param.main_grad + + def _copy_main_params_to_model_params(self): + # Only needed for the float16 params. + model_data, main_data = self._get_model_and_main_params_data_float16() + _multi_tensor_copy_this_to_that( + this=main_data, that=model_data, overflow_buf=self._dummy_overflow_buf + ) + + def _copy_model_params_to_main_params(self): + # Only needed for the float16 params. 
+ model_data, main_data = self._get_model_and_main_params_data_float16() + _multi_tensor_copy_this_to_that( + this=model_data, that=main_data, overflow_buf=self._dummy_overflow_buf + ) + + def state_dict(self): + state_dict = {} + state_dict['optimizer'] = self.optimizer.state_dict() + if self.grad_scaler: + state_dict['grad_scaler'] = self.grad_scaler.state_dict() + state_dict['fp32_from_fp16_params'] = self.fp32_from_float16_groups + return state_dict + + def sharded_state_dict( + self, model_sharded_state_dict: ShardedStateDict, is_loading: bool = False + ): + + if is_loading: + self.init_state_fn(self.optimizer) + + state_dict = self.state_dict() + + id_to_sharded_param_map = get_param_id_to_sharded_param_map( + model_sharded_state_dict, chain.from_iterable(g for g in self.float16_groups) + ) + + # Convert fp32_from_fp16_params + assert len(state_dict['fp32_from_fp16_params']) == len( + state_dict['optimizer']['param_groups'] + ) + state_dict['fp32_from_fp16_params'] = [ + [ + make_sharded_optimizer_tensor( + id_to_sharded_param_map[param_id], + fp32_param, + prefix=f'optimizer.state.fp32_param', + ) + for param_id, fp32_param in zip(state_group['params'], fp32_group) + ] + for fp32_group, state_group in zip( + state_dict['fp32_from_fp16_params'], state_dict['optimizer']['param_groups'] + ) + ] + + step = self._extract_common_per_param_step(state_dict['optimizer']) + + # Convert regular optimizer state + # all optimizer parameters passed to optim_state_to_sharding_state are + # expected to have the same shape as the model parameters, + # so we save the step separately and ignore it here + optim_state_to_sharding_state( + state_dict['optimizer'], id_to_sharded_param_map, exclude_keys="step" + ) + # save step as a shared step among all parameters. Separate per-parameter + # steps are not supported + state_dict['optimizer']['state']['common_step'] = step + return state_dict + + def load_state_dict(self, state_dict): + pipeline_parallel_size = parallel_state.get_pipeline_model_parallel_world_size() + # Optimizer. + optimizer_key = 'optimizer' + if optimizer_key not in state_dict: + optimizer_key = 'optimizer_state_dict' + logger.info('***WARNING*** loading optimizer from ' 'an old checkpoint ...') + if 'common_step' in state_dict[optimizer_key]['state']: + common_step = state_dict[optimizer_key]['state'].pop('common_step') + self._restore_common_per_param_step(state_dict[optimizer_key], common_step) + self.optimizer.load_state_dict(state_dict[optimizer_key]) + + # Grad scaler. + if 'grad_scaler' not in state_dict: + if self.config.fp16: + logger.info( + '***WARNING*** found an old checkpoint, will not ' 'load grad scaler ...' + ) + else: + if self.grad_scaler: + self.grad_scaler.load_state_dict(state_dict['grad_scaler']) + else: + logger.info( + '***WARNING*** fould the grad scaler in the ' + 'checkpoint but it is None in the class. ' + 'Skipping loading grad scaler ...' + ) + + # Copy data for the main params. + fp32_from_float16_params_key = 'fp32_from_fp16_params' + if fp32_from_float16_params_key not in state_dict: + fp32_from_float16_params_key = 'fp32_from_fp16' + for current_group, saved_group in zip( + self.fp32_from_float16_groups, state_dict[fp32_from_float16_params_key] + ): + for current_param, saved_param in zip(current_group, saved_group): + current_param.data.copy_(saved_param.data) + + +class FP32Optimizer(MegatronOptimizer): + """Float32 optimizer. + + Args: + optimizer (torch.optim.Optimizer): base optimizer such as Adam or SGD. 
+ config (OptimizerConfig): configuration object for optimizer. + init_state_fn (Callable, optional): function to initialize state in the optimizer. + """ + + def __init__( + self, optimizer: torch.optim.Optimizer, config: OptimizerConfig, init_state_fn: Callable + ): + if has_config_logger_enabled(config): + log_config_to_disk(config, locals(), prefix=type(self).__name__) + + super(FP32Optimizer, self).__init__(optimizer, config, init_state_fn) + + self._scale = torch.tensor([1.0], dtype=torch.float, device='cuda') + + def zero_grad(self, set_to_none=True): + """Copied from torch.optim.optimizer""" + for group in self.optimizer.param_groups: + _zero_grad_group_helper(group['params'], set_to_none) + + def get_loss_scale(self): + """FP32 optimizer does not do any scaling.""" + return self._scale + + @torch.no_grad() + def prepare_grads(self) -> bool: + """Pre-processing gradients before the optimizer step, returns whether inf/nan is found.""" + timers = self.config.timers + + # Copy main_grads to grads. + if timers is not None: + timers('optimizer-copy-to-main-grad', log_level=1).start( + barrier=self.config.barrier_with_L1_time + ) + for param_group in self.optimizer.param_groups: + for param in param_group['params']: + param.grad = param.main_grad + if timers is not None: + timers('optimizer-copy-to-main-grad').stop() + + return False + + @torch.no_grad() + def step_with_ready_grads(self) -> bool: + """Step the optimizer with ready gradients, return successful.""" + timers = self.config.timers + + # Update parameters. + if timers is not None: + timers('optimizer-inner-step', log_level=1).start( + barrier=self.config.barrier_with_L1_time + ) + self.optimizer.step() + if timers is not None: + timers('optimizer-inner-step').stop() + + return True + + @torch.no_grad() + def step(self): + """Clip gradients (if needed) and step the base optimizer. + Always return successful since there is no overflow.""" + timers = self.config.timers + + found_inf_flag = self.prepare_grads() + if found_inf_flag: + return False, None, None + + # Clip gradients. + if timers is not None: + timers('optimizer-clip-main-grad', log_level=1).start( + barrier=self.config.barrier_with_L1_time + ) + grad_norm = None + if self.config.clip_grad > 0.0: + grad_norm = self.clip_grad_norm(self.config.clip_grad) + if timers is not None: + timers('optimizer-clip-main-grad').stop() + + # Count the zeros in the grads. + if timers is not None: + timers('optimizer-count-zeros', log_level=1).start( + barrier=self.config.barrier_with_L1_time + ) + num_zeros_in_grad = self.count_zeros() if self.config.log_num_zeros_in_grad else None + if timers is not None: + timers('optimizer-count-zeros').stop() + + success = self.step_with_ready_grads() + + # No overflow for FP32 optimizer. 
+ return success, grad_norm, num_zeros_in_grad + + def reload_model_params(self): + pass + + def state_dict(self): + return self.optimizer.state_dict() + + def load_state_dict(self, state_dict): + pipeline_parallel_size = parallel_state.get_pipeline_model_parallel_world_size() + if 'common_step' in state_dict['state']: + common_step = state_dict['state'].pop('common_step') + self._restore_common_per_param_step(state_dict, common_step) + self.optimizer.load_state_dict(state_dict) + + def sharded_state_dict( + self, model_sharded_state_dict: ShardedStateDict, is_loading: bool = False + ): + if is_loading: + self.init_state_fn(self.optimizer) + + state_dict = self.state_dict() + id_to_sharded_param_map = get_param_id_to_sharded_param_map( + model_sharded_state_dict, self.get_parameters() + ) + step = self._extract_common_per_param_step(state_dict) + + # all optimizer parameters passed to optim_state_to_sharding_state are + # expected to have the same shape as the model parameters, + # so we save the step separately and ignore it here + optim_state_to_sharding_state(state_dict, id_to_sharded_param_map, exclude_keys="step") + # save step as a shared step among all parameters. Separate per-parameter + # steps are not supported + state_dict['state']['common_step'] = step + return state_dict + + +class ProxyDict: + """ + A dictionary-like object that proxies to a list of dictionaries. + + e.g., ProxyDict([{'a': 1}, {'b': 2}]) behaves like: + { + (0, 'a'): 1, + (1, 'b'): 2, + } + We use tuples as keys to avoid ambiguity with the keys of the inner dicts. + """ + + def __init__(self, inner_dicts: List[dict]): + self._inner_dicts = inner_dicts + + def __getitem__(self, key: Tuple[int, str]): + idx, inner_key = key + return self._inner_dicts[idx].get(inner_key) + + def __setitem__(self, key: Tuple[int, str], value: Any): + idx, inner_key = key + self._inner_dicts[idx][inner_key] = value + + def __len__(self) -> int: + return sum([len(inner_dict) for inner_dict in self._inner_dicts]) + + def __iter__(self): + for idx, inner_dict in enumerate(self._inner_dicts): + for inner_key in inner_dict: + yield (idx, inner_key) + + def items(self): + """Return generator over underlying items.""" + for idx, inner_dict in enumerate(self._inner_dicts): + for inner_key, value in inner_dict.items(): + yield (idx, inner_key), value + + +class ChainedOptimizer(MegatronOptimizer): + """ChainedOptimizer is designed for a collection of optimizers. + + These optimizers are responsible for different parts of multiple models for + a training task and will be executed one-by-one when the model is updated. + + Args: + chained_optimizers: a list of optimizers. 
+ """ + + def __init__(self, chained_optimizers: List[MegatronOptimizer]): + self.model_chunks = [] + self.config = getattr(chained_optimizers[0], 'config', None) + for optimizer in chained_optimizers: + if hasattr(optimizer, 'model_chunks'): + for model_chunk in optimizer.model_chunks: + if model_chunk not in self.model_chunks: + self.model_chunks.append(model_chunk) + assert self.config == getattr(optimizer, 'config', None) + self.chained_optimizers = chained_optimizers + + @property + def param_groups(self) -> List[dict]: + """Get param_groups aggregated over underlying optimizers.""" + param_groups = [] + for optimizer in self.chained_optimizers: + param_groups += optimizer.param_groups + return param_groups + + @property + def state(self) -> ProxyDict: + """ + Return optimizer state with tuple keys, where the first element is the + index of the optimizer in the list of chained optimizers. + """ + return ProxyDict([opt.state for opt in self.chained_optimizers]) + + def zero_grad(self, set_to_none=True): + for optimizer in self.chained_optimizers: + optimizer.zero_grad(set_to_none) + + def get_loss_scale(self): + return self.chained_optimizers[0].get_loss_scale() + + def reload_model_params(self): + for optimizer in self.chained_optimizers: + optimizer.reload_model_params() + + def state_dict(self): + return [optimizer.state_dict() for optimizer in self.chained_optimizers] + + def sharded_state_dict( + self, model_sharded_state_dict: ShardedStateDict, is_loading: bool = False, **kwargs + ): + sharded_state_dict = {} + for optimizer_idx, optimizer in enumerate(self.chained_optimizers): + optim_state_dict = optimizer.sharded_state_dict( + model_sharded_state_dict, is_loading, **kwargs + ) + add_prefix_for_sharding(optim_state_dict, f'chained_{optimizer_idx}.') + sharded_state_dict[optimizer_idx] = optim_state_dict + return sharded_state_dict + + def load_state_dict(self, state_dict): + if len(self.chained_optimizers) != len(state_dict): + raise RuntimeError( + f'Expected {len(self.chained_optimizers)} entries' + f' in state dict, but got {len(state_dict)}.' + ) + if isinstance(state_dict, dict): + state_dict = (v for k, v in sorted(state_dict.items())) + for optimizer, state in zip(self.chained_optimizers, state_dict): + optimizer.load_state_dict(state) + + @torch.no_grad() + def prepare_grads(self) -> bool: + """Pre-processing gradients before the optimizer step, returns whether inf/nan is found.""" + found_inf_flag = False + for optimizer in self.chained_optimizers: + found_inf_flag |= optimizer.prepare_grads() + + return found_inf_flag + + @torch.no_grad() + def step_with_ready_grads(self) -> bool: + """Step the optimizer with ready gradients, return successful.""" + success = True + for optimizer_idx, optimizer in enumerate(self.chained_optimizers): + success &= optimizer.step_with_ready_grads() + if self.config.overlap_param_gather_with_optimizer_step and optimizer_idx == 0: + assert success + assert len(optimizer.model_chunks) == 1 + optimizer.model_chunks[0].start_param_sync(force_dispatch=True) + + return success + + def disable_pre_hook(self): + """Disable pre-hooks for underlying distributed optimizers.""" + warnings.warn( + "`ChainedOptimizer.disable_pre_hook` will be deprecated in a future release. " + "Use `DistributedDataParallel.disable_forward_pre_hook` directly." 
+ ) + for model_chunk in self.model_chunks: + model_chunk.disable_forward_pre_hook() + + def enable_pre_hook(self): + """Enable pre-hooks for underlying distributed optimizers.""" + warnings.warn( + "`ChainedOptimizer.enable_pre_hook` will be deprecated in a future release. " + "Use `DistributedDataParallel.enable_forward_pre_hook` directly." + ) + for model_chunk in self.model_chunks: + model_chunk.enable_forward_pre_hook() + + @torch.no_grad() + def step(self): + """ChainedOptimizer will step all optimizers one by one.""" + found_inf_flag = self.prepare_grads() + if found_inf_flag: + return False, None, None + + # Get grad norm. + grad_norms = [] + for optimizer in self.chained_optimizers: + _grad_norm = optimizer.get_grad_norm() + grad_norms += [_grad_norm if _grad_norm else 0.0] + grad_norm = math.sqrt(sum([x**2 for x in grad_norms])) + + # Clip gradients. + for optimizer in self.chained_optimizers: + if optimizer.config.clip_grad > 0.0: + clip_grad_by_total_norm_fp32( + optimizer.get_parameters(), + max_norm=optimizer.config.clip_grad, + total_norm=grad_norm, + ) + + # Count the zeros in the grads. + num_zeros_in_grad = 0 + for optimizer in self.chained_optimizers: + num_zeros_in_grad += ( + optimizer.count_zeros() if optimizer.config.log_num_zeros_in_grad else 0 + ) + + update_successful = self.step_with_ready_grads() + + return update_successful, grad_norm, num_zeros_in_grad + + def save_parameter_state(self, filename: str): + """Save the distributed parameter states of all optimizers to a file. + + Args: + filename (str): path to save parameter state to. + """ + save_states = False + states = [] + for optimizer in self.chained_optimizers: + if hasattr(optimizer, 'get_parameter_state_dp_zero'): + state_dict = optimizer.get_parameter_state_dp_zero() + + # Save checkpoint economically, only when DP rank = 0, state dict + # needs to be saved. + if torch.distributed.get_rank(optimizer.data_parallel_group) == 0: + states.append(state_dict) + save_states = True + else: + states.append(None) + else: + states.append(None) + + if save_states: + torch.save(states, filename) + + def load_parameter_state(self, filename: str, *, update_legacy_format: bool = False): + """Load the distributed parameter states of all optimizers from a file. + + Args: + filename (str): path to load parameter state from. + """ + states = None + for idx, optimizer in enumerate(self.chained_optimizers): + if not hasattr(optimizer, 'load_parameter_state_from_dp_zero'): + continue + + # Lazy loading checkpoint, state dict is needed only when DP rank = 0. + if torch.distributed.get_rank(optimizer.data_parallel_group) == 0 and states is None: + states = torch.load(filename) + + state_dict = states[idx] if states else None + optimizer.load_parameter_state_from_dp_zero( + state_dict, update_legacy_format=update_legacy_format + ) + + def start_param_sync(self, model_index: int, *unused): + """Start parameter synchronization for all optimizers.""" + for optimizer in self.chained_optimizers: + optimizer.start_param_sync(model_index, *unused) diff --git a/megatron/core/optimizer/optimizer_config.py b/megatron/core/optimizer/optimizer_config.py new file mode 100644 index 0000000000..8876d925cb --- /dev/null +++ b/megatron/core/optimizer/optimizer_config.py @@ -0,0 +1,116 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
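As documented in `ChainedOptimizer.step()` above, each chained optimizer reports its own gradient norm, and these are reduced into a single global L2 norm that is then used to clip every optimizer's parameters. A small, self-contained illustration of that reduction (the numbers are invented):

```python
# Illustrative only: how ChainedOptimizer.step() combines per-optimizer grad norms.
import math

per_optimizer_norms = [1.5, 2.0, 0.0]  # one entry per chained optimizer
global_norm = math.sqrt(sum(n ** 2 for n in per_optimizer_norms))
assert abs(global_norm - 2.5) < 1e-12  # sqrt(1.5^2 + 2.0^2 + 0.0^2) == 2.5
```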
+ +from dataclasses import dataclass +from typing import Callable, Optional + +import torch + + +@dataclass +class OptimizerConfig: + """Configuration for optimizer.""" + + ############## + # General + ############## + optimizer: str = 'adam' + """Optimizer to use (one of Adam or SGD).""" + + lr: Optional[float] = None + """Initial learning rate. Depending on decay style and initial warmup, the learning rate at each + iteration would be different. + """ + + min_lr: Optional[float] = None + """Minumum value for learning rate. The scheduler clip values below this threshold.""" + + decoupled_lr: Optional[float] = None + """Separate learning rate for the input and output layer.""" + + decoupled_min_lr: Optional[float] = None + """Minimum value for learning rate for the input and output layer. The scheduler clip values + below this threshold. + """ + + weight_decay: float = 0.01 + """Weight decay coefficient for L2 regularization.""" + + ############## + # Precision + ############## + fp16: bool = False + """If true, train with fp16 mixed precision training. Defaults to False.""" + + bf16: bool = False + """If true, train with bf16 mixed precision training. Defaults to False.""" + + params_dtype: torch.dtype = torch.float32 + """dtype used when intializing the weights. Defaults to torch.float32.""" + + ############### + # Loss scaling + ############### + loss_scale: Optional[float] = None + """Static loss scaling, positive power of 2 values can improve fp16 convergence. If None, + dynamic loss scaling is used. + """ + + initial_loss_scale: float = 2**32 + """Initial loss-scale for dynamic loss scaling.""" + + min_loss_scale: float = 1.0 + """Minimum loss scale for dynamic loss scaling.""" + + loss_scale_window: float = 1000 + """Window over which to raise/lower dynamic scale.""" + + hysteresis: int = 2 + """Hysteresis for dynamic loss scaling.""" + + ############## + # Optimizer + ############## + # Adam + adam_beta1: float = 0.9 + """First coefficient for computing running averages of gradient and its square in Adam + optimizer. + """ + + adam_beta2: float = 0.999 + """Second coefficient for computing running averages of gradient and its square in Adam + optimizer. + """ + + adam_eps: float = 1e-08 + """Term added to the denominator to improve numerical stability in Adam optimizer.""" + + # SGD. + sgd_momentum: float = 0.9 + """Momentum factor for SGD optimizer.""" + + ####################### + # Distributed optimizer + ####################### + use_distributed_optimizer: bool = False + """Distribute optimizer state over data-parallel replicas.""" + + overlap_param_gather_with_optimizer_step: bool = False + """If true, overlap param all-gather of first bucket with optimizer step.""" + + ################ + # Miscellaneous + ################ + clip_grad: float = 1.0 + """Gradient clipping based on global L2 norm.""" + + log_num_zeros_in_grad: bool = False + """If true, calculate and log the number of zeros in gradient.""" + + barrier_with_L1_time: bool = False + """If true, use barrier with level 1 time measurements.""" + + timers: Callable = None + """Function to get timers.""" + + config_logger_dir: str = "" + """When non-empty, dumps entry-point configs to config_logger_dir""" diff --git a/megatron/core/optimizer_param_scheduler.py b/megatron/core/optimizer_param_scheduler.py new file mode 100644 index 0000000000..43c106f4f5 --- /dev/null +++ b/megatron/core/optimizer_param_scheduler.py @@ -0,0 +1,297 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
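To make the `OptimizerConfig` fields above concrete, a hedged example of instantiating the config for bf16 training with the distributed optimizer follows; the values are arbitrary examples rather than recommendations, and only field names defined in the dataclass above are used.

```python
# Illustrative OptimizerConfig for bf16 training with the distributed optimizer.
import torch

from megatron.core.optimizer.optimizer_config import OptimizerConfig

config = OptimizerConfig(
    optimizer='adam',
    lr=3.0e-4,
    min_lr=3.0e-5,
    weight_decay=0.1,
    bf16=True,
    params_dtype=torch.bfloat16,
    adam_beta1=0.9,
    adam_beta2=0.95,
    use_distributed_optimizer=True,
    clip_grad=1.0,
)
```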
+ +"""Learning rate decay and weight decay incr functions.""" +import logging +import math +from typing import Optional + +from megatron.core.optimizer import MegatronOptimizer +from megatron.core.utils import log_single_rank + +logger = logging.getLogger(__name__) + + +class OptimizerParamScheduler: + """Anneals learning rate and weight decay + + Args: + optimizer (MegatronOptimizer): the optimizer to be used + init_lr (float): initial learning rate + max_lr (float): maximum learning rate + min_lr (float): minimum learning rate + lr_warmup_steps (int): number of warmup steps + lr_decay_steps (int): number of decay steps + lr_decay_style (str): decay style for learning rate + start_wd (float): initial weight decay + end_wd (float): final weight decay + wd_incr_steps (int): number of weight decay increment steps + wd_incr_style (str): weight decay increment style + use_checkpoint_opt_param_scheduler (bool, optional): whether to use the checkpoint values + for the optimizer param scheduler + override_opt_param_scheduler (bool, optional): whether to override the optimizer param + scheduler values with the class values + wsd_decay_steps (int, optional): number of weight decay decay steps + lr_wsd_decay_style (str, optional): decay style for learning rate during weight decay decay + steps + + """ + + def __init__( + self, + optimizer: MegatronOptimizer, + init_lr: float, + max_lr: float, + min_lr: float, + lr_warmup_steps: int, + lr_decay_steps: int, + lr_decay_style: str, + start_wd: float, + end_wd: float, + wd_incr_steps: int, + wd_incr_style: str, + use_checkpoint_opt_param_scheduler: Optional[bool] = True, + override_opt_param_scheduler: Optional[bool] = False, + wsd_decay_steps: Optional[int] = None, + lr_wsd_decay_style: Optional[str] = None, + ) -> None: + + # Class values. + self.optimizer = optimizer + + self.init_lr = init_lr + self.max_lr = float(max_lr) + self.min_lr = min_lr + assert self.min_lr >= 0.0 + assert self.max_lr >= self.min_lr + assert self.init_lr <= self.max_lr + + self.lr_warmup_steps = lr_warmup_steps + self.num_steps = 0 + self.lr_decay_steps = lr_decay_steps + self.wsd_decay_steps = wsd_decay_steps + self.lr_wsd_decay_style = lr_wsd_decay_style + assert self.lr_decay_steps > 0 + assert self.lr_warmup_steps < self.lr_decay_steps + + self.lr_decay_style = lr_decay_style + if self.lr_decay_style == "WSD": + assert self.wsd_decay_steps is not None + + self.start_wd = start_wd + self.end_wd = end_wd + assert self.start_wd >= 0.0 + assert self.end_wd >= self.start_wd + self.wd_incr_steps = wd_incr_steps + self.wd_incr_style = wd_incr_style + + self.override_opt_param_scheduler = override_opt_param_scheduler + self.use_checkpoint_opt_param_scheduler = use_checkpoint_opt_param_scheduler + if self.override_opt_param_scheduler: + assert not self.use_checkpoint_opt_param_scheduler, ( + 'both override and ' 'use-checkpoint are set.' 
+ ) + + # Set the learning rate + self.step(0) + log_single_rank(logger, logging.INFO, f"> learning rate decay style: {self.lr_decay_style}") + + def get_wd(self) -> float: + """Weight decay incr functions""" + if self.num_steps > self.wd_incr_steps: + return self.end_wd + + if self.wd_incr_style == 'constant': + assert self.start_wd == self.end_wd + return self.end_wd + + incr_ratio = float(self.num_steps) / float(self.wd_incr_steps) + assert incr_ratio >= 0.0 + assert incr_ratio <= 1.0 + delta_wd = self.end_wd - self.start_wd + + if self.wd_incr_style == 'linear': + coeff = incr_ratio + elif self.wd_incr_style == 'cosine': + coeff = 0.5 * (math.cos(math.pi * (1 - incr_ratio)) + 1.0) + else: + raise Exception(f'{self.wd_incr_style} weight decay increment style is not supported.') + + return self.start_wd + coeff * delta_wd + + def get_lr(self, param_group: dict) -> float: + """Learning rate decay functions from: + https://openreview.net/pdf?id=BJYwwY9ll pg. 4 + + Args: + param_group (dict): parameter group from the optimizer. + """ + + max_lr = param_group.get('max_lr', self.max_lr) + min_lr = param_group.get('min_lr', self.min_lr) + + # Use linear warmup for the initial part. + if self.lr_warmup_steps > 0 and self.num_steps <= self.lr_warmup_steps: + return self.init_lr + ( + (max_lr - self.init_lr) * float(self.num_steps) / float(self.lr_warmup_steps) + ) + + # If the learning rate is constant, just return the initial value. + if self.lr_decay_style == 'constant': + return max_lr + + # For any steps larger than `self.lr_decay_steps`, use `min_lr`. + if self.num_steps > self.lr_decay_steps: + return min_lr + + # If we are done with the warmup period, use the decay style. + if self.lr_decay_style == 'inverse-square-root': + warmup_steps = max(self.lr_warmup_steps, 1) + num_steps = max(self.num_steps, 1) + lr = max_lr * warmup_steps**0.5 / (num_steps**0.5) + return max(min_lr, lr) + + num_steps_ = self.num_steps - self.lr_warmup_steps + decay_steps_ = self.lr_decay_steps - self.lr_warmup_steps + decay_ratio = float(num_steps_) / float(decay_steps_) + assert decay_ratio >= 0.0 + assert decay_ratio <= 1.0 + delta_lr = max_lr - min_lr + + if self.lr_decay_style == 'linear': + coeff = 1.0 - decay_ratio + elif self.lr_decay_style == 'cosine': + coeff = 0.5 * (math.cos(math.pi * decay_ratio) + 1.0) + elif self.lr_decay_style == 'WSD': + wsd_anneal_start_ = self.lr_decay_steps - self.wsd_decay_steps + if self.num_steps <= wsd_anneal_start_: + coeff = 1.0 + else: + wsd_steps = self.num_steps - wsd_anneal_start_ + wsd_decay_ratio = float(wsd_steps) / float(self.wsd_decay_steps) + if self.lr_wsd_decay_style == "linear": + coeff = 1.0 - wsd_decay_ratio + elif self.lr_wsd_decay_style == "cosine": + coeff = 0.5 * (math.cos(math.pi * wsd_decay_ratio) + 1.0) + elif self.lr_wsd_decay_style == "exponential": + coeff = (2.0 * math.pow(0.5, wsd_decay_ratio)) - 1.0 + else: + raise Exception(f'{self.lr_decay_style} decay style is not supported.') + + return min_lr + coeff * delta_lr + + def step(self, increment: int) -> None: + """Set lr for all parameters groups. 
+    def step(self, increment: int) -> None:
+        """Set the learning rate and weight decay for all parameter groups.
+
+        Args:
+            increment (int): number of steps to increment
+        """
+        self.num_steps += increment
+        new_wd = self.get_wd()
+        for param_group in self.optimizer.param_groups:
+            new_lr = self.get_lr(param_group)
+            param_group['lr'] = new_lr * param_group.get('lr_mult', 1.0)
+            param_group['weight_decay'] = new_wd * param_group.get('wd_mult', 1.0)
+
+    def state_dict(self) -> dict:
+        """Return the state dict."""
+        state_dict = {
+            'max_lr': self.max_lr,
+            'lr_warmup_steps': self.lr_warmup_steps,
+            'num_steps': self.num_steps,
+            'lr_decay_style': self.lr_decay_style,
+            'lr_decay_steps': self.lr_decay_steps,
+            'min_lr': self.min_lr,
+            'start_wd': self.start_wd,
+            'end_wd': self.end_wd,
+            'wd_incr_style': self.wd_incr_style,
+            'wd_incr_steps': self.wd_incr_steps,
+        }
+        return state_dict
+
+    def _check_and_set(self, cls_value: float, sd_value: float, name: str) -> float:
+        """Auxiliary function for checking the values in the checkpoint and
+        setting them.
+
+        Args:
+            cls_value (float): class value
+            sd_value (float): checkpoint value
+            name (str): name of the parameter
+        """
+
+        if self.override_opt_param_scheduler:
+            log_single_rank(logger, logging.INFO, f" > overriding {name} value to {cls_value}")
+            return cls_value
+
+        if not self.use_checkpoint_opt_param_scheduler:
+            assert cls_value == sd_value, (
+                f'OptimizerParamScheduler: class input value {cls_value} and checkpoint '
+                f'value {sd_value} for {name} do not match'
+            )
+
+        log_single_rank(logger, logging.INFO, f" > using checkpoint value {sd_value} for {name}")
+        return sd_value
+
+    def load_state_dict(self, state_dict: dict) -> None:
+        """Load the state dict.
+
+        Args:
+            state_dict (dict): state dict to be loaded
+        """
+
+        if 'start_lr' in state_dict:
+            max_lr_ = state_dict['start_lr']
+        else:
+            max_lr_ = state_dict['max_lr']
+        self.max_lr = self._check_and_set(self.max_lr, max_lr_, 'learning rate')
+
+        self.min_lr = self._check_and_set(
+            self.min_lr, state_dict['min_lr'], 'minimum learning rate'
+        )
+
+        if 'warmup_iter' in state_dict:
+            lr_warmup_steps_ = state_dict['warmup_iter']
+        elif 'warmup_steps' in state_dict:
+            lr_warmup_steps_ = state_dict['warmup_steps']
+        else:
+            lr_warmup_steps_ = state_dict['lr_warmup_steps']
+        self.lr_warmup_steps = self._check_and_set(
+            self.lr_warmup_steps, lr_warmup_steps_, 'warmup iterations'
+        )
+
+        if 'end_iter' in state_dict:
+            lr_decay_steps_ = state_dict['end_iter']
+        elif 'decay_steps' in state_dict:
+            lr_decay_steps_ = state_dict['decay_steps']
+        else:
+            lr_decay_steps_ = state_dict['lr_decay_steps']
+        self.lr_decay_steps = self._check_and_set(
+            self.lr_decay_steps, lr_decay_steps_, 'total number of iterations'
+        )
+
+        if 'decay_style' in state_dict:
+            lr_decay_style_ = state_dict['decay_style']
+        else:
+            lr_decay_style_ = state_dict['lr_decay_style']
+        self.lr_decay_style = self._check_and_set(
+            self.lr_decay_style, lr_decay_style_, 'learning rate decay style'
+        )
+
+        if 'num_iters' in state_dict:
+            num_steps = state_dict['num_iters']
+        else:
+            num_steps = state_dict['num_steps']
+        self.step(increment=num_steps)
+
+        if 'start_wd' in state_dict:
+            self.start_wd = self._check_and_set(
+                self.start_wd, state_dict['start_wd'], "start weight decay"
+            )
+            self.end_wd = self._check_and_set(self.end_wd, state_dict['end_wd'], "end weight decay")
+            self.wd_incr_steps = self._check_and_set(
+                self.wd_incr_steps,
+                state_dict['wd_incr_steps'],
+                "total number of weight decay iterations",
+            )
+            self.wd_incr_style = self._check_and_set(
+                self.wd_incr_style, state_dict['wd_incr_style'], "weight decay incr style"
) diff --git a/megatron/core/package_info.py b/megatron/core/package_info.py index 6f53034623..6135dc52c8 100644 --- a/megatron/core/package_info.py +++ b/megatron/core/package_info.py @@ -2,9 +2,9 @@ MAJOR = 0 -MINOR = 1 +MINOR = 10 PATCH = 0 -PRE_RELEASE = '' +PRE_RELEASE = 'rc0' # Use the following formatting: (major, minor, patch, pre-release) VERSION = (MAJOR, MINOR, PATCH, PRE_RELEASE) @@ -14,10 +14,16 @@ __package_name__ = 'megatron_core' __contact_names__ = 'NVIDIA' -__contact_emails__ = 'nemo-toolkit@nvidia.com' # use NeMo Email -__homepage__ = 'https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/' # use NeMo homepage +__contact_emails__ = 'nemo-toolkit@nvidia.com' # use NeMo Email +__homepage__ = ( + 'https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/' # use NeMo homepage +) __repository_url__ = 'https://github.com/NVIDIA/Megatron-LM/megatron/core' __download_url__ = 'https://github.com/NVIDIA/Megatron-LM/releases' -__description__ = 'Megatron Core - a library for efficient and scalable training of transformer based models' +__description__ = ( + 'Megatron Core - a library for efficient and scalable training of transformer based models' +) __license__ = 'BSD-3' -__keywords__ = 'deep learning, machine learning, gpu, NLP, NLU, language, transformer, nvidia, pytorch, torch' +__keywords__ = ( + 'deep learning, machine learning, gpu, NLP, NLU, language, transformer, nvidia, pytorch, torch' +) diff --git a/megatron/core/packed_seq_params.py b/megatron/core/packed_seq_params.py new file mode 100644 index 0000000000..dff0cc5992 --- /dev/null +++ b/megatron/core/packed_seq_params.py @@ -0,0 +1,20 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +from dataclasses import dataclass + +from torch import Tensor + + +@dataclass +class PackedSeqParams: + ''' + parameters to TEDotProductAttention and fused rope kernels for the + `thd` (packed) sequence format + ''' + + qkv_format: str = None + cu_seqlens_q: Tensor = None + cu_seqlens_kv: Tensor = None + cu_seqlens_q_padded: Tensor = None + cu_seqlens_kv_padded: Tensor = None + max_seqlen_q: Tensor = None + max_seqlen_kv: Tensor = None diff --git a/megatron/core/parallel_state.py b/megatron/core/parallel_state.py index fcc1b6b9cf..e9043b647c 100644 --- a/megatron/core/parallel_state.py +++ b/megatron/core/parallel_state.py @@ -2,8 +2,14 @@ """Model and data parallel groups.""" +import os +import warnings +from datetime import timedelta +from functools import partial +from itertools import cycle +from typing import Callable, List, Optional + import torch -from typing import Optional from .utils import GlobalMemoryBuffer @@ -13,6 +19,8 @@ _PIPELINE_MODEL_PARALLEL_GROUP = None # Model parallel group (both intra- and pipeline) that the current rank belongs to. _MODEL_PARALLEL_GROUP = None +# Model parallel group (both intra-, pipeline, and expert) that the current rank belongs to. +_MODEL_AND_EXPERT_PARALLEL_GROUP = None # Embedding group. _EMBEDDING_GROUP = None # Position embedding group. @@ -20,18 +28,33 @@ # Data parallel group that the current rank belongs to. _DATA_PARALLEL_GROUP = None _DATA_PARALLEL_GROUP_GLOO = None -# FP8 amax reduction group. -_AMAX_REDUCTION_GROUP = None +# tensor model parallel group and data parallel group combined +# used for fp8 and moe training +_TENSOR_AND_DATA_PARALLEL_GROUP = None +# Expert parallel group that the current rank belongs to. 
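# Editor's sketch (not part of the patch): building a PackedSeqParams, the new
# dataclass added above in megatron/core/packed_seq_params.py, for two sequences
# of lengths 3 and 5 packed back-to-back in the `thd` format. cu_seqlens holds the
# cumulative sequence boundaries; whether a backend also needs the *_padded
# variants depends on the attention kernel in use, so they are left unset here.
import torch

from megatron.core.packed_seq_params import PackedSeqParams

seq_lens = torch.tensor([3, 5], dtype=torch.int32)
cu_seqlens = torch.cat(
    [torch.zeros(1, dtype=torch.int32), torch.cumsum(seq_lens, dim=0, dtype=torch.int32)]
)  # tensor([0, 3, 8], dtype=torch.int32)

packed = PackedSeqParams(
    qkv_format='thd',
    cu_seqlens_q=cu_seqlens,
    cu_seqlens_kv=cu_seqlens,
    max_seqlen_q=seq_lens.max(),
    max_seqlen_kv=seq_lens.max(),
)
print(packed.cu_seqlens_q, packed.max_seqlen_q)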
+_EXPERT_MODEL_PARALLEL_GROUP = None +_TENSOR_AND_EXPERT_PARALLEL_GROUP = None +_DATA_MODULO_EXPERT_PARALLEL_GROUP = None +_DATA_MODULO_EXPERT_PARALLEL_GROUP_GLOO = None +_DATA_MODULO_EXPERT_PARALLEL_GROUP_WITH_CP = None +_DATA_MODULO_EXPERT_PARALLEL_GROUP_WITH_CP_GLOO = None + _VIRTUAL_PIPELINE_MODEL_PARALLEL_RANK = None _VIRTUAL_PIPELINE_MODEL_PARALLEL_WORLD_SIZE = None _PIPELINE_MODEL_PARALLEL_SPLIT_RANK = None +_PIPELINE_MODEL_PARALLEL_DECODER_START = None + # These values enable us to change the mpu sizes on the fly. _MPU_TENSOR_MODEL_PARALLEL_WORLD_SIZE = None _MPU_PIPELINE_MODEL_PARALLEL_WORLD_SIZE = None +_MPU_EXPERT_MODEL_PARALLEL_WORLD_SIZE = None +_MPU_DATA_PARALLEL_WORLD_SIZE = None +_MPU_DATA_PARALLEL_RANK = None _MPU_TENSOR_MODEL_PARALLEL_RANK = None _MPU_PIPELINE_MODEL_PARALLEL_RANK = None +_MPU_EXPERT_MODEL_PARALLEL_RANK = None # A list of ranks that have a copy of the embedding. _EMBEDDING_GLOBAL_RANKS = None @@ -47,20 +70,305 @@ # rank when broadcasting weights from src to all other data parallel ranks _DATA_PARALLEL_GLOBAL_RANKS = None +# A list of global ranks for each tensor model parallel group to ease calculation of +# the first local rank in the tensor model parallel group +_TENSOR_MODEL_PARALLEL_GLOBAL_RANKS = None + +# Context parallel group that the current rank belongs to +_CONTEXT_PARALLEL_GROUP = None +# A list of global ranks for each context parallel group to ease calculation of the +# destination rank when exchanging KV/dKV between context parallel_ranks +_CONTEXT_PARALLEL_GLOBAL_RANKS = None + +# Data parallel group information with context parallel combined. +_DATA_PARALLEL_GROUP_WITH_CP = None +_DATA_PARALLEL_GROUP_WITH_CP_GLOO = None +_DATA_PARALLEL_GLOBAL_RANKS_WITH_CP = None + +# combined parallel group of TP and CP +_TENSOR_AND_CONTEXT_PARALLEL_GROUP = None + +# combined parallel group of TP, DP, and CP used for fp8 +_TENSOR_AND_DATA_PARALLEL_GROUP_WITH_CP = None + # Memory buffers to avoid dynamic memory allocation _GLOBAL_MEMORY_BUFFER = None +# MOE logging +_MOE_LAYER_WISE_LOGGING_TRACKER = {} + + +def get_nccl_options(pg_name, nccl_comm_cfgs): + """Set the NCCL process group options. + + Args: + pg_name (str): process group name + nccl_comm_cfgs (dict): nccl communicator configurations + + When an option (e.g., max_ctas) is not found in the config, use the NCCL default setting. + """ + if pg_name in nccl_comm_cfgs: + nccl_options = torch.distributed.ProcessGroupNCCL.Options() + nccl_options.config.cga_cluster_size = nccl_comm_cfgs[pg_name].get('cga_cluster_size', 4) + nccl_options.config.max_ctas = nccl_comm_cfgs[pg_name].get('max_ctas', 32) + nccl_options.config.min_ctas = nccl_comm_cfgs[pg_name].get('min_ctas', 1) + return nccl_options + else: + return None + + +def generate_masked_orthogonal_rank_groups( + world_size: int, parallel_size: List[int], mask: List[bool] +) -> List[List[int]]: + """Generate orthogonal parallel groups based on the parallel size and mask. + + Arguments: + world_size (int): world size + + parallel_size (List[int]): + The parallel size of each orthogonal parallel type. For example, if + tensor_parallel_size = 2, pipeline_model_parallel_group = 3, data_parallel_size = 4, + and the parallel mapping order is tp-pp-dp, then the parallel_size = [2, 3, 4]. + + mask (List[bool]): + The mask controls which parallel methods the generated groups represent. If mask[i] is + True, it means the generated group contains the i-th parallelism method. 
For example,
+        if parallel_size = [tp_size, pp_size, dp_size] and mask = [True, False, True], then
+        the generated group is the `tp-dp` group; if the mask = [False, True, False], then the
+        generated group is the `pp` group.
+
+    Algorithm:
+        For orthogonal parallelism, such as tp/dp/pp/cp, the global_rank and
+        local_rank satisfy the following equation:
+            global_rank = tp_rank + dp_rank * tp_size + pp_rank * tp_size * dp_size  (1)
+                tp_rank \in [0, tp_size)
+                dp_rank \in [0, dp_size)
+                pp_rank \in [0, pp_size)
+
+        Suppose we want the `dp_group` (tp_size * pp_size groups, each of dp_size ranks;
+        for example, if the gpu size is 8, the order is 'tp-pp-dp', and the size is '2-2-2',
+        the dp_group here is [[0, 4], [1, 5], [2, 6], [3, 7]]).
+        The tp_rank and pp_rank are combined to form the `dp_group_index`:
+            dp_group_index = tp_rank + pp_rank * tp_size  (2)
+
+        So, given tp_rank and pp_rank satisfying equation (2), and dp_rank in
+        range(0, dp_size), the ranks in dp_group[dp_group_index] satisfy
+        equation (1).
+
+        This function solves this problem.
+
+        For example, if the parallel_size = [tp_size, dp_size, pp_size] = [2, 3, 4],
+        and the mask = [False, True, False], then
+            dp_group_index(0) = tp_rank(0) + pp_rank(0) * 2
+            dp_group_index(1) = tp_rank(1) + pp_rank(0) * 2
+            ...
+            dp_group_index(7) = tp_rank(1) + pp_rank(3) * 2
+
+            dp_group[0] = 0 + range(0, 3) * 2 + 0 = [0, 2, 4]
+            dp_group[1] = 1 + range(0, 3) * 2 + 0 = [1, 3, 5]
+            ...
+            dp_group[7] = 1 + range(0, 3) * 2 + 3 * 2 * 3 = [19, 21, 23]
+    """
+
+    def prefix_product(a: List[int], init=1) -> List[int]:
+        r = [init]
+        for v in a:
+            init = init * v
+            r.append(init)
+        return r
+
+    def inner_product(a: List[int], b: List[int]) -> int:
+        return sum([x * y for x, y in zip(a, b)])
+
+    def decompose(index, shape, stride=None):
+        '''
+        This function solves the math problem below:
+            There is an equation:
+                index = sum(idx[i] * stride[i])
+            Given the values of index and stride, return idx.
+        It is used to recover the tp/dp/pp rank
+        from group_index and rank_in_group.
+        '''
+        if stride is None:
+            stride = prefix_product(shape)
+        idx = [(index // d) % s for s, d in zip(shape, stride)]
+        # stride is a prefix_product result, and the value of stride[-1]
+        # is not used.
+        assert (
+            sum([x * y for x, y in zip(idx, stride[:-1])]) == index
+        ), "index {} with shape {} does not match the decomposed idx {}".format(index, shape, idx)
+        return idx
+
+    masked_shape = [s for s, m in zip(parallel_size, mask) if m]
+    unmasked_shape = [s for s, m in zip(parallel_size, mask) if not m]
+
+    global_stride = prefix_product(parallel_size)
+    masked_stride = [d for d, m in zip(global_stride, mask) if m]
+    unmasked_stride = [d for d, m in zip(global_stride, mask) if not m]
+
+    group_size = prefix_product(masked_shape)[-1]
+    num_of_group = world_size // group_size
+
+    ranks = []
+    for group_index in range(num_of_group):
+        # get indices from unmasked for group_index.
+        decomposed_group_idx = decompose(group_index, unmasked_shape)
+        rank = []
+        for rank_in_group in range(group_size):
+            # get indices from masked for rank_in_group.
+ decomposed_rank_idx = decompose(rank_in_group, masked_shape) + rank.append( + inner_product(decomposed_rank_idx, masked_stride) + + inner_product(decomposed_group_idx, unmasked_stride) + ) + ranks.append(rank) + return ranks + + +class RankGenerator(object): + """A class for generating rank groups for different modes of parallelism.""" + + def __init__( + self, tp: int, ep: int, dp: int, pp: int, cp: int, order: str, rank_offset: int = 0 + ) -> None: + self.tp = tp + self.ep = ep + self.dp = dp + self.pp = pp + self.cp = cp + self.rank_offset = rank_offset + self.world_size = tp * dp * pp * cp + + self.name_to_size = { + "tp": self.tp, + "pp": self.pp, + "dp": self.dp, + "ep": self.ep, + "cp": self.cp, + } + self.order = order + order = order.lower() + + if 'ep' in order: + if 'ep-dp' not in order and 'dp-ep' not in order: + raise RuntimeError(f"The ep and dp must be adjacent in order ({self.order}).") + + for name in self.name_to_size.keys(): + if name not in order and self.name_to_size[name] != 1: + raise RuntimeError( + f"The size of ({name}) is ({self.name_to_size[name]}), but you haven't" + f"specified the order ({self.order})." + ) + elif name not in order: + order = order + '-' + name + + self.order_w_ep = order + self.order_wo_ep = '-'.join([token for token in order.split('-') if token != 'ep']) + self.ordered_size_wo_ep = [] + self.ordered_size_w_ep = [] + + for token in order.split('-'): + if token == 'dp': + self.ordered_size_w_ep.append(self.dp // self.ep) + self.ordered_size_wo_ep.append(self.dp) + elif token == 'ep': + self.ordered_size_w_ep.append(self.ep) + else: + self.ordered_size_w_ep.append(self.name_to_size[token]) + self.ordered_size_wo_ep.append(self.name_to_size[token]) + + def get_mask(self, order: str, token: str): + """Create a mask for the specified tokens based on the given order. + + Args: + order (str): The order of parallelism types (e.g., 'tp-dp-pp'). + token (str): The specific parallelism types to include in the mask, + separated by hyphens (e.g., 'tp-dp'). + """ + ordered_token = order.split('-') + token = token.split('-') + mask = [False] * len(ordered_token) + for t in token: + mask[ordered_token.index(t)] = True + return mask + + def get_ranks(self, token, independent_ep=False): + """Get rank group by input token. + + Args: + token (str): + Specify the ranks type that want to get. If we want + to obtain multiple parallel types, we can use a hyphen + '-' to separate them. For example, if we want to obtain + the TP_DP group, the token should be 'tp-dp'. + + independent_ep (bool: True): + This flag controls whether we treat EP and DP independently. + EP shares ranks with DP, if we want to get ranks related to + EP, we should set the flag. For example, get_ranks('dp', True) + will get DP modulo EP group, and get_ranks('dp', False) will + get full DP group. + """ + if independent_ep: + parallel_size = self.ordered_size_w_ep + order = self.order_w_ep + else: + parallel_size = self.ordered_size_wo_ep + order = self.order_wo_ep + mask = self.get_mask(order, token) + ranks = generate_masked_orthogonal_rank_groups(self.world_size, parallel_size, mask) + if self.rank_offset > 0: + for rank_group in ranks: + for i in range(len(rank_group)): + rank_group[i] += self.rank_offset + return ranks + + +def default_embedding_ranks(pp_ranks, split_rank=None): + """Return the default ranks that constitute the stages on which the word embeddings live. + For most models, these are the first and last pipeline stages. 
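# Editor's illustration (not part of the patch): generate_masked_orthogonal_rank_groups
# defined above is pure Python, so it can be exercised without initializing
# torch.distributed (only megatron-core needs to be installed). This reproduces the
# docstring example: sizes [tp=2, dp=3, pp=4] in tp-dp-pp order, with the mask
# selecting only the dp axis.
from megatron.core.parallel_state import generate_masked_orthogonal_rank_groups

dp_groups = generate_masked_orthogonal_rank_groups(
    world_size=24, parallel_size=[2, 3, 4], mask=[False, True, False]
)
print(dp_groups[0])    # [0, 2, 4]
print(dp_groups[-1])   # [19, 21, 23]
print(len(dp_groups))  # 8 groups, one per (tp_rank, pp_rank) pair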
+ + We also support the deprecated split rank argument for backwards compatibility.""" + if len(pp_ranks) == 1: + return [pp_ranks[0]] + elif split_rank is not None and pp_ranks[split_rank] not in (pp_ranks[0], pp_ranks[-1]): + return [pp_ranks[0], pp_ranks[split_rank], pp_ranks[-1]] + else: + return [pp_ranks[0], pp_ranks[-1]] + + +def default_position_embedding_ranks(pp_ranks, split_rank=None): + """Return the default ranks that constitute the stages on which the position embeddings live. + For most models, this is only the first pipeline stage. + + We also support the deprecated split rank argument for backwards compatibility.""" + if split_rank is not None and pp_ranks[0] != pp_ranks[split_rank]: + return [pp_ranks[0], pp_ranks[split_rank]] + else: + return [pp_ranks[0]] + def initialize_model_parallel( tensor_model_parallel_size: int = 1, pipeline_model_parallel_size: int = 1, virtual_pipeline_model_parallel_size: Optional[int] = None, pipeline_model_parallel_split_rank: Optional[int] = None, - use_fp8: bool = False, + use_sharp: bool = False, + context_parallel_size: int = 1, + expert_model_parallel_size: int = 1, + nccl_communicator_config_path: Optional[str] = None, + distributed_timeout_minutes: int = 30, + order: str = "tp-cp-ep-dp-pp", + encoder_tensor_model_parallel_size: Optional[int] = 0, + encoder_pipeline_model_parallel_size: Optional[int] = 0, + get_embedding_ranks: Optional[Callable[[List[int], Optional[int]], List[int]]] = None, + get_position_embedding_ranks: Optional[Callable[[List[int], Optional[int]], List[int]]] = None, ) -> None: + # pylint: disable=line-too-long """Initialize model data parallel groups. - Arguments: + Args: tensor_model_parallel_size (int, default = 1): The number of GPUs to split individual tensors across. @@ -87,7 +395,7 @@ def initialize_model_parallel( GPU 3: [7, 8] [15, 16] pipeline_model_parallel_split_rank (int, optional): - For models with both an encoder and decoder, the rank in + DEPRECATED. For models with both an encoder and decoder, the rank in pipeline to switch between encoder and decoder (i.e. the first rank of the decoder). This allows the user to set the pipeline parallel size of the encoder and decoder @@ -96,10 +404,72 @@ def initialize_model_parallel( pipeline_model_parallel_split_rank is 3, then ranks 0-2 will be the encoder and ranks 3-7 will be the decoder. - use_fp8 (bool, default = False): - Construct GPU groups needed for FP8 training, namely for - amax reduction across the product of the data-parallel and - tensor-parallel groups. + use_sharp (bool, default = False): + Set the use of SHARP for the collective communications of + data-parallel process groups. When `True`, run barrier + within each data-parallel process group, which specifies + the SHARP application target groups. + + context_parallel_size (int, default = 1): + The number of tensor parallel GPU groups to split the + network input sequence length across. Compute of attention + module requires tokens of full sequence length, so GPUs + in a context parallel group need to communicate with each + other to exchange information of other sequence chunks. + Each GPU and its counterparts in other tensor parallel + groups compose a context parallel group. + + For example, assume we have 8 GPUs, if tensor model parallel + size is 4 and context parallel size is 2, the network input + will be split into two sequence chunks, which are processed + by 2 different groups of 4 GPUs. One chunk is processed by + GPU0-3, the other chunk is processed by GPU4-7. 
Four groups + are build to do context parallel communications: [GPU0, GPU4], + [GPU1, GPU5], [GPU2, GPU6], and [GPU3, GPU7]. + + Context parallelism partitions sequence length, so it has no + impact on weights, which means weights are duplicated among + GPUs in a context parallel group. Hence, weight gradients + all-reduce is required in backward. For simplicity, we piggyback + GPUs of context parallelism on data parallel group for + weight gradient all-reduce. + + expert_model_parallel_size (int, default = 1): + The number of Mixture of Experts parallel GPUs in each expert + parallel group. + + nccl_communicator_config_path (str, default = None): + Path to the yaml file of NCCL communicator configurations. + `min_ctas`, `max_ctas`, and `cga_cluster_size` can be set + for each communicator. + + distributed_timeout_minutes (int, default = 30): Timeout, in + minutes,for operations executed against distributed + process groups. See PyTorch documentation at + https://pytorch.org/docs/stable/distributed.html for + caveats. + + order (str, default=tp-dp-pp): + The rank initialization order of parallelism. Now we support + tp-dp-pp and tp-pp-dp orders. + + encoder_tensor_model_parallel_size (int, default = 0): + The number of GPUs to split individual tensors across in the encoder. If 0, + then we use the default, decoder's tensor model parallel size. + + encoder_pipeline_model_parallel_size (int, default = 0): + The number of tensor parallel GPU groups to allocate to the encoder. As an example, + if pipeline_model_parallel_size is 4 and encoder_pipeline_model_parallel_size is 2, + then the encoder will use the first two pipeline stages for its layers, and the total + amount of pipelineing is 6. + + get_embedding_ranks (Callable[[List[int], Optional[int]], List[int]], optional, default=None): + A function that takes in a list of ranks for a pipeline group and returns + those ranks that should have embeddings. + + get_position_embedding_ranks (Callable[[List[int], Optional[int]], List[int]], optional, default=None): + A function that takes in a list of ranks for a pipeline group, and returns + those ranks that should have position embeddings. Let's say we have a total of 16 GPUs denoted by g0 ... g15 and we use 2 GPUs to parallelize the model tensor, and 4 GPUs to parallelize @@ -118,27 +488,69 @@ def initialize_model_parallel( ranks 8 to 15 belong to the second box. """ + if encoder_pipeline_model_parallel_size is None: + encoder_pipeline_model_parallel_size = 0 + + if encoder_tensor_model_parallel_size == 0 and encoder_pipeline_model_parallel_size > 0: + encoder_tensor_model_parallel_size = tensor_model_parallel_size + + if get_embedding_ranks is None: + get_embedding_ranks = partial( + default_embedding_ranks, split_rank=pipeline_model_parallel_split_rank + ) + + if get_position_embedding_ranks is None: + get_position_embedding_ranks = partial( + default_position_embedding_ranks, split_rank=pipeline_model_parallel_split_rank + ) + + if encoder_pipeline_model_parallel_size > 0: + global _PIPELINE_MODEL_PARALLEL_DECODER_START + _PIPELINE_MODEL_PARALLEL_DECODER_START = encoder_pipeline_model_parallel_size + # Get world size and rank. Ensure some consistencies. 
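# Editor's sketch (not part of the patch): a minimal single-process call to
# initialize_model_parallel, the kind of setup used in unit tests. Real jobs start
# one process per GPU (for example with torchrun) and pass the parallel sizes that
# match the job; the backend, port, and all sizes below are illustrative assumptions.
import os

import torch

from megatron.core import parallel_state

os.environ.setdefault('MASTER_ADDR', 'localhost')
os.environ.setdefault('MASTER_PORT', '29500')
torch.distributed.init_process_group(backend='gloo', rank=0, world_size=1)

parallel_state.initialize_model_parallel(
    tensor_model_parallel_size=1,
    pipeline_model_parallel_size=1,
    context_parallel_size=1,
    expert_model_parallel_size=1,
)
assert parallel_state.model_parallel_is_initialized()
print(parallel_state.get_data_parallel_world_size())  # 1

parallel_state.destroy_model_parallel()
torch.distributed.destroy_process_group()

# An optional nccl_communicator_config_path YAML uses the per-group keys consumed by
# get_nccl_options above, e.g. (values are hypothetical):
#
#   tp:
#     cga_cluster_size: 4
#     max_ctas: 32
#   dp:
#     min_ctas: 1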
assert torch.distributed.is_initialized() world_size: int = torch.distributed.get_world_size() - if world_size % (tensor_model_parallel_size * pipeline_model_parallel_size) != 0: + if encoder_tensor_model_parallel_size > 0: + assert encoder_pipeline_model_parallel_size > 0 + assert ( + encoder_tensor_model_parallel_size <= tensor_model_parallel_size + ), "We do not support encoders with more TP than the decoder." + + encoder_model_size = ( + encoder_tensor_model_parallel_size + * encoder_pipeline_model_parallel_size + * context_parallel_size + ) + decoder_model_size = ( + tensor_model_parallel_size * pipeline_model_parallel_size * context_parallel_size + ) + total_model_size = encoder_model_size + decoder_model_size + + if world_size % total_model_size != 0: + raise RuntimeError(f"world_size ({world_size}) is not divisible by {total_model_size}") + + data_parallel_size: int = world_size // total_model_size + + if data_parallel_size % expert_model_parallel_size != 0: raise RuntimeError( - f"world_size ({world_size}) is not divisible by tensor_model_parallel_size " - f"({tensor_model_parallel_size}) x pipeline_model_parallel_size ({pipeline_model_parallel_size})" + f"data_parallel_size ({data_parallel_size}) is not divisible by " + "expert_model_parallel_size " ) - data_parallel_size: int = world_size // (tensor_model_parallel_size * - pipeline_model_parallel_size) + encoder_world_size = encoder_model_size * data_parallel_size + decoder_world_size = decoder_model_size * data_parallel_size - num_tensor_model_parallel_groups: int = world_size // tensor_model_parallel_size - num_pipeline_model_parallel_groups: int = world_size // pipeline_model_parallel_size - num_data_parallel_groups: int = world_size // data_parallel_size + assert ( + encoder_world_size + decoder_world_size == world_size + ), f"{encoder_world_size=} + {decoder_world_size=} != {world_size=}" if virtual_pipeline_model_parallel_size is not None: - if not pipeline_model_parallel_size > 2: - raise RuntimeError("pipeline-model-parallel size should be greater than 2 with " - "interleaved schedule") + if not pipeline_model_parallel_size > 1: + raise RuntimeError( + "pipeline-model-parallel size should be greater than 1 with interleaved schedule" + ) global _VIRTUAL_PIPELINE_MODEL_PARALLEL_RANK global _VIRTUAL_PIPELINE_MODEL_PARALLEL_WORLD_SIZE _VIRTUAL_PIPELINE_MODEL_PARALLEL_RANK = 0 @@ -150,108 +562,304 @@ def initialize_model_parallel( rank = torch.distributed.get_rank() + nccl_comm_cfgs = {} + if nccl_communicator_config_path is not None: + try: + import yaml + except ImportError: + raise RuntimeError( + "Cannot import `yaml`. Setting custom nccl communicator configs " + "requires the yaml package." + ) + + with open(nccl_communicator_config_path, "r") as stream: + nccl_comm_cfgs = yaml.safe_load(stream) + + if encoder_world_size > 0: + encoder_rank_generator = RankGenerator( + tp=encoder_tensor_model_parallel_size, + ep=1, + dp=data_parallel_size, + pp=encoder_pipeline_model_parallel_size, + cp=context_parallel_size, + order=order, + rank_offset=0, + ) + else: + encoder_rank_generator = None + + decoder_rank_generator = RankGenerator( + tp=tensor_model_parallel_size, + ep=expert_model_parallel_size, + dp=data_parallel_size, + pp=pipeline_model_parallel_size, + cp=context_parallel_size, + order=order, + rank_offset=encoder_world_size, + ) + + def generator_wrapper(group_type, **kwargs): + """The `RankGenerator` class produces a hyper-rectangle for a given set of + tensor, pipeline, data, expert, and context parallelism. 
If we have an encoder, + in addition to the default decoder, we essentially instantiate two `RankGenerator` + classes to construct the parallelism for each module separately, and we then have + to stitch them together for the right groups. For now, this means pp and tp-pp.""" + d_ranks = decoder_rank_generator.get_ranks(group_type, **kwargs) + if encoder_rank_generator is None: + for x in d_ranks: + yield x + return + e_ranks = encoder_rank_generator.get_ranks(group_type, **kwargs) + if group_type == 'pp': + # Map 1 encoder tp rank to several decoder tp ranks, because + # these won't be the same size. + for x, y in zip(cycle(e_ranks), d_ranks): + yield x + y + elif group_type == 'tp-pp': + # For this group, we can just return the concatenated + # groups together, because their sizes are the same. + assert len(e_ranks) == len(d_ranks) + for x, y in zip(e_ranks, d_ranks): + yield x + y + else: + for x in e_ranks: + yield x + for x in d_ranks: + yield x + + timeout = timedelta(minutes=distributed_timeout_minutes) + # Build the data-parallel groups. global _DATA_PARALLEL_GROUP global _DATA_PARALLEL_GROUP_GLOO global _DATA_PARALLEL_GLOBAL_RANKS + global _DATA_PARALLEL_GROUP_WITH_CP + global _DATA_PARALLEL_GROUP_WITH_CP_GLOO + global _DATA_PARALLEL_GLOBAL_RANKS_WITH_CP assert _DATA_PARALLEL_GROUP is None, 'data parallel group is already initialized' - all_data_parallel_group_ranks = [] - for i in range(pipeline_model_parallel_size): - start_rank = i * num_pipeline_model_parallel_groups - end_rank = (i + 1) * num_pipeline_model_parallel_groups - for j in range(tensor_model_parallel_size): - ranks = range(start_rank + j, end_rank, tensor_model_parallel_size) - all_data_parallel_group_ranks.append(list(ranks)) - group = torch.distributed.new_group(ranks) - group_gloo = torch.distributed.new_group(ranks, backend="gloo") - if rank in ranks: - _DATA_PARALLEL_GROUP = group - _DATA_PARALLEL_GROUP_GLOO = group_gloo - _DATA_PARALLEL_GLOBAL_RANKS = ranks + + for ranks in generator_wrapper('dp'): + group = torch.distributed.new_group( + ranks, timeout=timeout, pg_options=get_nccl_options('dp', nccl_comm_cfgs) + ) + group_gloo = torch.distributed.new_group(ranks, timeout=timeout, backend="gloo") + if rank in ranks: + _DATA_PARALLEL_GROUP = group + _DATA_PARALLEL_GROUP_GLOO = group_gloo + _DATA_PARALLEL_GLOBAL_RANKS = ranks + + for ranks_with_cp in generator_wrapper('dp-cp'): + group_with_cp = torch.distributed.new_group( + ranks_with_cp, timeout=timeout, pg_options=get_nccl_options('dp_cp', nccl_comm_cfgs) + ) + group_with_cp_gloo = torch.distributed.new_group( + ranks_with_cp, timeout=timeout, backend="gloo" + ) + if rank in ranks_with_cp: + _DATA_PARALLEL_GROUP_WITH_CP = group_with_cp + _DATA_PARALLEL_GROUP_WITH_CP_GLOO = group_with_cp_gloo + _DATA_PARALLEL_GLOBAL_RANKS_WITH_CP = ranks_with_cp + + # Apply SHARP to DP process groups + if use_sharp: + if rank == 0: + print( + "The number of process groups to use SHARP with depends on the type " + "of the network switch. Nvidia QM1 switch supports SAHRP up to 8 " + "process groups and QM2 supports up to 256 process groups. We apply " + "SHARP to the communications of the data-parallel domain. If the " + "number of data-parallel process groups is larger than the max " + "process groups that the network switch supports, the communication " + "will fall back to non-SHARP operators. To enable SHARP, " + "`#SBATCH_NETWORK=sharp` should be set in the sbatch script." 
+ ) + torch.distributed.barrier( + group=get_data_parallel_group(with_context_parallel=True), + device_ids=[torch.cuda.current_device()], + ) + # Set `NCCL_COLLNET_ENABLE=0` to restrict SHARP application to DP process groups + os.environ["NCCL_COLLNET_ENABLE"] = "0" + + # Build the context-parallel groups. + global _CONTEXT_PARALLEL_GROUP + global _CONTEXT_PARALLEL_GLOBAL_RANKS + assert _CONTEXT_PARALLEL_GROUP is None, 'context parallel group is already initialized' + for ranks in generator_wrapper('cp'): + group = torch.distributed.new_group( + ranks, timeout=timeout, pg_options=get_nccl_options('cp', nccl_comm_cfgs) + ) + if rank in ranks: + _CONTEXT_PARALLEL_GROUP = group + _CONTEXT_PARALLEL_GLOBAL_RANKS = ranks # Build the model-parallel groups. global _MODEL_PARALLEL_GROUP assert _MODEL_PARALLEL_GROUP is None, 'model parallel group is already initialized' - for i in range(data_parallel_size): - ranks = [data_parallel_group_ranks[i] - for data_parallel_group_ranks in all_data_parallel_group_ranks] - group = torch.distributed.new_group(ranks) + for ranks in generator_wrapper('tp-pp'): + group = torch.distributed.new_group( + ranks, timeout=timeout, pg_options=get_nccl_options('mp', nccl_comm_cfgs) + ) if rank in ranks: _MODEL_PARALLEL_GROUP = group + # Build the model-parallel groups with expert parallel + global _MODEL_AND_EXPERT_PARALLEL_GROUP + assert ( + _MODEL_AND_EXPERT_PARALLEL_GROUP is None + ), 'model and expert parallel group is already initialized' + for ranks in generator_wrapper('tp-ep-pp', independent_ep=True): + group = torch.distributed.new_group( + ranks, timeout=timeout, pg_options=get_nccl_options('mp_exp', nccl_comm_cfgs) + ) + if rank in ranks: + _MODEL_AND_EXPERT_PARALLEL_GROUP = group + # Build the tensor model-parallel groups. global _TENSOR_MODEL_PARALLEL_GROUP - assert _TENSOR_MODEL_PARALLEL_GROUP is None, \ - 'tensor model parallel group is already initialized' - for i in range(num_tensor_model_parallel_groups): - ranks = range(i * tensor_model_parallel_size, - (i + 1) * tensor_model_parallel_size) - group = torch.distributed.new_group(ranks) + global _TENSOR_MODEL_PARALLEL_GLOBAL_RANKS + assert ( + _TENSOR_MODEL_PARALLEL_GROUP is None + ), 'tensor model parallel group is already initialized' + for ranks in generator_wrapper('tp'): + group = torch.distributed.new_group( + ranks, timeout=timeout, pg_options=get_nccl_options('tp', nccl_comm_cfgs) + ) if rank in ranks: _TENSOR_MODEL_PARALLEL_GROUP = group + _TENSOR_MODEL_PARALLEL_GLOBAL_RANKS = ranks # Build the pipeline model-parallel groups and embedding groups # (first and last rank in each pipeline model-parallel group). 
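# Editor's illustration (not part of the patch): the embedding / position-embedding
# rank selection used by the pipeline-group construction that follows. The default
# helpers defined earlier in this file are pure functions, so their behavior can be
# checked without any distributed setup; the 16-GPU pipeline group below is illustrative.
from megatron.core.parallel_state import (
    default_embedding_ranks,
    default_position_embedding_ranks,
)

pp_ranks = [0, 4, 8, 12]  # one pipeline-model-parallel group in a 16-GPU job
print(default_embedding_ranks(pp_ranks))                # [0, 12]  (first and last stage)
print(default_position_embedding_ranks(pp_ranks))       # [0]      (first stage only)
print(default_embedding_ranks(pp_ranks, split_rank=2))  # [0, 8, 12] (deprecated split rank)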
global _PIPELINE_MODEL_PARALLEL_GROUP global _PIPELINE_GLOBAL_RANKS - assert _PIPELINE_MODEL_PARALLEL_GROUP is None, \ - 'pipeline model parallel group is already initialized' + assert ( + _PIPELINE_MODEL_PARALLEL_GROUP is None + ), 'pipeline model parallel group is already initialized' global _EMBEDDING_GROUP global _EMBEDDING_GLOBAL_RANKS assert _EMBEDDING_GROUP is None, 'embedding group is already initialized' global _POSITION_EMBEDDING_GROUP global _POSITION_EMBEDDING_GLOBAL_RANKS - assert _POSITION_EMBEDDING_GROUP is None, \ - 'position embedding group is already initialized' - for i in range(num_pipeline_model_parallel_groups): - ranks = range(i, world_size, num_pipeline_model_parallel_groups) - group = torch.distributed.new_group(ranks) + assert _POSITION_EMBEDDING_GROUP is None, 'position embedding group is already initialized' + for ranks in generator_wrapper('pp'): + group = torch.distributed.new_group( + ranks, timeout=timeout, pg_options=get_nccl_options('pp', nccl_comm_cfgs) + ) if rank in ranks: - _PIPELINE_MODEL_PARALLEL_GROUP = group - _PIPELINE_GLOBAL_RANKS = ranks - # Setup embedding group (to exchange gradients between - # first and last stages). - if len(ranks) > 1: - embedding_ranks = [ranks[0], ranks[-1]] - position_embedding_ranks = [ranks[0]] - if pipeline_model_parallel_split_rank is not None: - if ranks[pipeline_model_parallel_split_rank] not in embedding_ranks: - embedding_ranks = [ranks[0], - ranks[pipeline_model_parallel_split_rank], - ranks[-1]] - if ranks[pipeline_model_parallel_split_rank] not in position_embedding_ranks: - position_embedding_ranks = [ranks[0], - ranks[pipeline_model_parallel_split_rank]] - else: - embedding_ranks = ranks - position_embedding_ranks = ranks - - group = torch.distributed.new_group(embedding_ranks) + if _PIPELINE_MODEL_PARALLEL_GROUP is None: + _PIPELINE_MODEL_PARALLEL_GROUP = group + _PIPELINE_GLOBAL_RANKS = ranks + elif isinstance(_PIPELINE_GLOBAL_RANKS[0], list): + _PIPELINE_MODEL_PARALLEL_GROUP.append(group) + _PIPELINE_GLOBAL_RANKS.append(ranks) + else: + _PIPELINE_MODEL_PARALLEL_GROUP = [_PIPELINE_MODEL_PARALLEL_GROUP, group] + _PIPELINE_GLOBAL_RANKS = [_PIPELINE_GLOBAL_RANKS, ranks] + + embedding_ranks = get_embedding_ranks(ranks) + group = torch.distributed.new_group( + embedding_ranks, timeout=timeout, pg_options=get_nccl_options('embd', nccl_comm_cfgs) + ) if rank in embedding_ranks: _EMBEDDING_GROUP = group - if rank in ranks: _EMBEDDING_GLOBAL_RANKS = embedding_ranks - group = torch.distributed.new_group(position_embedding_ranks) + position_embedding_ranks = get_position_embedding_ranks(ranks) + group = torch.distributed.new_group( + position_embedding_ranks, + timeout=timeout, + pg_options=get_nccl_options('embd', nccl_comm_cfgs), + ) if rank in position_embedding_ranks: _POSITION_EMBEDDING_GROUP = group - if rank in ranks: _POSITION_EMBEDDING_GLOBAL_RANKS = position_embedding_ranks - # Build the FP8 groups. - global _AMAX_REDUCTION_GROUP - assert _AMAX_REDUCTION_GROUP is None, \ - 'FP8 amax reduction group is already initialized' - if use_fp8: - amax_group_size: int = tensor_model_parallel_size * data_parallel_size - num_amax_groups: int = world_size // amax_group_size - for i in range(num_amax_groups): - start_rank = i * amax_group_size - end_rank = (i + 1) * amax_group_size - ranks = range(start_rank, end_rank) - group = torch.distributed.new_group(ranks) - if rank in ranks: - _AMAX_REDUCTION_GROUP = group + # Build the tensor + data parallel groups. 
+ global _TENSOR_AND_DATA_PARALLEL_GROUP + global _TENSOR_AND_DATA_PARALLEL_GROUP_WITH_CP + assert ( + _TENSOR_AND_DATA_PARALLEL_GROUP is None + ), 'Tensor + data parallel group is already initialized' + for ranks in generator_wrapper('tp-dp-cp'): + group = torch.distributed.new_group( + ranks, timeout=timeout, pg_options=get_nccl_options('tp_dp_cp', nccl_comm_cfgs) + ) + if rank in ranks: + _TENSOR_AND_DATA_PARALLEL_GROUP_WITH_CP = group + for ranks in generator_wrapper('tp-dp'): + group = torch.distributed.new_group( + ranks, timeout=timeout, pg_options=get_nccl_options('tp_dp', nccl_comm_cfgs) + ) + if rank in ranks: + _TENSOR_AND_DATA_PARALLEL_GROUP = group + + global _TENSOR_AND_CONTEXT_PARALLEL_GROUP + assert ( + _TENSOR_AND_CONTEXT_PARALLEL_GROUP is None + ), 'Tensor + context parallel group is already initialized' + for ranks in generator_wrapper('tp-cp'): + group = torch.distributed.new_group( + ranks, timeout=timeout, pg_options=get_nccl_options('tp_cp', nccl_comm_cfgs) + ) + if rank in ranks: + _TENSOR_AND_CONTEXT_PARALLEL_GROUP = group + + # Build the tensor + expert parallel groups + global _EXPERT_MODEL_PARALLEL_GROUP + assert _EXPERT_MODEL_PARALLEL_GROUP is None, 'Expert parallel group is already initialized' + global _TENSOR_AND_EXPERT_PARALLEL_GROUP + assert ( + _TENSOR_AND_EXPERT_PARALLEL_GROUP is None + ), 'Tensor + expert parallel group is already initialized' + global _DATA_MODULO_EXPERT_PARALLEL_GROUP + assert ( + _DATA_MODULO_EXPERT_PARALLEL_GROUP is None + ), 'Data modulo expert group is already initialized' + global _DATA_MODULO_EXPERT_PARALLEL_GROUP_WITH_CP + assert ( + _DATA_MODULO_EXPERT_PARALLEL_GROUP_WITH_CP is None + ), 'Data modulo expert group with context parallel is already initialized' + global _DATA_MODULO_EXPERT_PARALLEL_GROUP_GLOO + global _DATA_MODULO_EXPERT_PARALLEL_GROUP_WITH_CP_GLOO + + for ranks in generator_wrapper('tp-ep', independent_ep=True): + group = torch.distributed.new_group( + ranks, timeout=timeout, pg_options=get_nccl_options('tp_exp', nccl_comm_cfgs) + ) + if rank in ranks: + _TENSOR_AND_EXPERT_PARALLEL_GROUP = group + + for ranks in generator_wrapper('ep', independent_ep=True): + group = torch.distributed.new_group( + ranks, pg_options=get_nccl_options('exp', nccl_comm_cfgs) + ) + if rank in ranks: + _EXPERT_MODEL_PARALLEL_GROUP = group + + for ranks in generator_wrapper('dp', independent_ep=True): + group = torch.distributed.new_group( + ranks, timeout=timeout, pg_options=get_nccl_options('dp_modulo_exp', nccl_comm_cfgs) + ) + group_gloo = torch.distributed.new_group(ranks, backend="gloo") + if rank in ranks: + _DATA_MODULO_EXPERT_PARALLEL_GROUP = group + _DATA_MODULO_EXPERT_PARALLEL_GROUP_GLOO = group_gloo + + for ranks in generator_wrapper('dp-cp', independent_ep=True): + # Lazy initialization of the group + if get_context_parallel_world_size() > 1: + group = torch.distributed.new_group( + ranks, + timeout=timeout, + pg_options=get_nccl_options('dp_modulo_exp_cp', nccl_comm_cfgs), + ) + group_gloo = torch.distributed.new_group(ranks, backend="gloo") + else: + group = _DATA_MODULO_EXPERT_PARALLEL_GROUP + group_gloo = _DATA_MODULO_EXPERT_PARALLEL_GROUP_GLOO + if rank in ranks: + _DATA_MODULO_EXPERT_PARALLEL_GROUP_WITH_CP = group + _DATA_MODULO_EXPERT_PARALLEL_GROUP_WITH_CP_GLOO = group_gloo # Initialize global memory buffer # This isn't really "parallel state" but there isn't another good place to @@ -260,94 +868,230 @@ def initialize_model_parallel( _set_global_memory_buffer() -def is_unitialized(): +def is_initialized(): 
"""Useful for code segments that may be accessed with or without mpu initialization""" - return _DATA_PARALLEL_GROUP is None + return _DATA_PARALLEL_GROUP is not None + + +def is_unitialized() -> bool: + """Check if parallel state has been initialized + + Deprecated. Use is_initialized instead. + + """ + warnings.warn("is_unitialized is deprecated, use is_initialized instead", DeprecationWarning) + return not is_initialized() def model_parallel_is_initialized(): - """Check if model and data parallel groups are initialized.""" - if _TENSOR_MODEL_PARALLEL_GROUP is None or \ - _PIPELINE_MODEL_PARALLEL_GROUP is None or \ - _DATA_PARALLEL_GROUP is None: + """Check if model- and data-parallel groups are initialized.""" + if ( + _TENSOR_MODEL_PARALLEL_GROUP is None + or _PIPELINE_MODEL_PARALLEL_GROUP is None + or _DATA_PARALLEL_GROUP is None + ): return False return True -def get_model_parallel_group(): - """Get the model parallel group the caller rank belongs to.""" - assert _MODEL_PARALLEL_GROUP is not None, \ - 'model parallel group is not initialized' +def get_model_parallel_group(with_expert_parallel=False): + """Get the model-parallel group the caller rank belongs to.""" + if with_expert_parallel: + assert ( + _MODEL_AND_EXPERT_PARALLEL_GROUP is not None + ), 'model parallel group is not initialized' + return _MODEL_AND_EXPERT_PARALLEL_GROUP + assert _MODEL_PARALLEL_GROUP is not None, 'model parallel group is not initialized' return _MODEL_PARALLEL_GROUP -def get_tensor_model_parallel_group(): - """Get the tensor model parallel group the caller rank belongs to.""" - assert _TENSOR_MODEL_PARALLEL_GROUP is not None, \ - 'intra_layer_model parallel group is not initialized' +def get_tensor_model_parallel_group(check_initialized=True): + """Get the tensor-model-parallel group the caller rank belongs to.""" + if check_initialized: + assert ( + _TENSOR_MODEL_PARALLEL_GROUP is not None + ), 'tensor model parallel group is not initialized' return _TENSOR_MODEL_PARALLEL_GROUP def get_pipeline_model_parallel_group(): - """Get the pipeline model parallel group the caller rank belongs to.""" - assert _PIPELINE_MODEL_PARALLEL_GROUP is not None, \ - 'pipeline_model parallel group is not initialized' + """Get the pipeline-model-parallel group the caller rank belongs to.""" + assert ( + _PIPELINE_MODEL_PARALLEL_GROUP is not None + ), 'pipeline_model parallel group is not initialized' return _PIPELINE_MODEL_PARALLEL_GROUP -def get_data_parallel_group(): - """Get the data parallel group the caller rank belongs to.""" - assert _DATA_PARALLEL_GROUP is not None, \ - 'data parallel group is not initialized' - return _DATA_PARALLEL_GROUP +def get_data_parallel_group(with_context_parallel=False): + """Get the data-parallel group the caller rank belongs to.""" + if with_context_parallel: + assert ( + _DATA_PARALLEL_GROUP_WITH_CP is not None + ), 'data parallel group with context parallel combined is not initialized' + return _DATA_PARALLEL_GROUP_WITH_CP + else: + assert _DATA_PARALLEL_GROUP is not None, 'data parallel group is not initialized' + return _DATA_PARALLEL_GROUP + + +def get_data_parallel_group_gloo(with_context_parallel=False): + """Get the Gloo data-parallel group the caller rank belongs to.""" + if with_context_parallel: + assert ( + _DATA_PARALLEL_GROUP_WITH_CP_GLOO is not None + ), 'data parallel group-gloo with context parallel combined is not initialized' + return _DATA_PARALLEL_GROUP_WITH_CP_GLOO + else: + assert _DATA_PARALLEL_GROUP_GLOO is not None, 'data parallel group-gloo is not initialized' 
+ return _DATA_PARALLEL_GROUP_GLOO -def get_data_parallel_group_gloo(): - """Get the data parallel group-gloo the caller rank belongs to.""" - assert _DATA_PARALLEL_GROUP_GLOO is not None, \ - 'data parallel group-gloo is not initialized' - return _DATA_PARALLEL_GROUP_GLOO +def get_context_parallel_group(check_initialized=True): + """Get the context-parallel group the caller rank belongs to.""" + if check_initialized: + assert _CONTEXT_PARALLEL_GROUP is not None, 'context parallel group is not initialized' + return _CONTEXT_PARALLEL_GROUP + + +def get_context_parallel_global_ranks(check_initialized=True): + """Get all global ranks of the context-parallel group that the caller rank belongs to.""" + if check_initialized: + assert ( + _CONTEXT_PARALLEL_GLOBAL_RANKS is not None + ), 'context parallel group is not initialized' + return _CONTEXT_PARALLEL_GLOBAL_RANKS def get_embedding_group(): """Get the embedding group the caller rank belongs to.""" - assert _EMBEDDING_GROUP is not None, \ - 'embedding group is not initialized' + assert _EMBEDDING_GROUP is not None, 'embedding group is not initialized' return _EMBEDDING_GROUP def get_position_embedding_group(): """Get the position embedding group the caller rank belongs to.""" - assert _POSITION_EMBEDDING_GROUP is not None, \ - 'position embedding group is not initialized' + assert _POSITION_EMBEDDING_GROUP is not None, 'position embedding group is not initialized' return _POSITION_EMBEDDING_GROUP -def get_amax_reduction_group(): +def get_amax_reduction_group(with_context_parallel=False, tp_only_amax_red=False): """Get the FP8 amax reduction group the caller rank belongs to.""" - assert _AMAX_REDUCTION_GROUP is not None, \ - 'FP8 amax reduction group is not initialized' - return _AMAX_REDUCTION_GROUP + if with_context_parallel: + if not tp_only_amax_red: + assert ( + _TENSOR_AND_DATA_PARALLEL_GROUP_WITH_CP is not None + ), 'FP8 amax reduction group is not initialized' + return _TENSOR_AND_DATA_PARALLEL_GROUP_WITH_CP + else: + assert ( + _TENSOR_AND_CONTEXT_PARALLEL_GROUP is not None + ), 'FP8 amax reduction group is not initialized' + return _TENSOR_AND_CONTEXT_PARALLEL_GROUP + else: + if not tp_only_amax_red: + assert ( + _TENSOR_AND_DATA_PARALLEL_GROUP is not None + ), 'FP8 amax reduction group is not initialized' + return _TENSOR_AND_DATA_PARALLEL_GROUP + else: + assert ( + _TENSOR_MODEL_PARALLEL_GROUP is not None + ), 'FP8 amax reduction group is not initialized' + return _TENSOR_MODEL_PARALLEL_GROUP + + +def get_tensor_and_data_parallel_group(with_context_parallel=False): + """Get the tensor- and data-parallel group the caller rank belongs to.""" + if with_context_parallel: + assert ( + _TENSOR_AND_DATA_PARALLEL_GROUP_WITH_CP is not None + ), 'tensor and data parallel group is not initialized' + return _TENSOR_AND_DATA_PARALLEL_GROUP_WITH_CP + else: + assert ( + _TENSOR_AND_DATA_PARALLEL_GROUP is not None + ), 'tensor and data parallel group is not initialized' + return _TENSOR_AND_DATA_PARALLEL_GROUP + + +def get_tensor_and_context_parallel_group(): + """Get the tensor- and context-parallel group the caller rank belongs to.""" + assert ( + _TENSOR_AND_CONTEXT_PARALLEL_GROUP is not None + ), 'tensor and context parallel group is not initialized' + return _TENSOR_AND_CONTEXT_PARALLEL_GROUP + + +def get_expert_model_parallel_group(): + """Get the expert-model-parallel group the caller rank belongs to.""" + assert ( + _EXPERT_MODEL_PARALLEL_GROUP is not None + ), 'expert model parallel group is not initialized' + return 
_EXPERT_MODEL_PARALLEL_GROUP + + +def get_tensor_and_expert_parallel_group(): + """Get the tensor- and expert-parallel group the caller rank belongs to.""" + assert ( + _TENSOR_AND_EXPERT_PARALLEL_GROUP is not None + ), 'tensor and expert parallel group is not initialized' + return _TENSOR_AND_EXPERT_PARALLEL_GROUP + + +def get_data_modulo_expert_parallel_group(with_context_parallel=False): + """Get the data-modulo-expert-parallel group the caller rank belongs to.""" + if with_context_parallel: + assert ( + _DATA_MODULO_EXPERT_PARALLEL_GROUP_WITH_CP is not None + ), 'data modulo expert parallel group with context parallel is not initialized' + return _DATA_MODULO_EXPERT_PARALLEL_GROUP_WITH_CP + else: + assert ( + _DATA_MODULO_EXPERT_PARALLEL_GROUP is not None + ), 'data modulo expert parallel group is not initialized' + return _DATA_MODULO_EXPERT_PARALLEL_GROUP + + +def get_data_modulo_expert_parallel_group_gloo(with_context_parallel=False): + """Get the Gloo data-modulo-expert-parallel group the caller rank belongs to.""" + if with_context_parallel: + assert ( + _DATA_MODULO_EXPERT_PARALLEL_GROUP_WITH_CP_GLOO is not None + ), 'data modulo expert parallel group-gloo with context parallel is not initialized' + return _DATA_MODULO_EXPERT_PARALLEL_GROUP_WITH_CP_GLOO + else: + assert ( + _DATA_MODULO_EXPERT_PARALLEL_GROUP_GLOO is not None + ), 'data modulo expert parallel group-gloo is not initialized' + return _DATA_MODULO_EXPERT_PARALLEL_GROUP_GLOO + + +def set_expert_model_parallel_world_size(world_size): + """Sets the expert-model-parallel world size.""" + global _MPU_EXPERT_MODEL_PARALLEL_WORLD_SIZE + _MPU_EXPERT_MODEL_PARALLEL_WORLD_SIZE = world_size def set_tensor_model_parallel_world_size(world_size): - """Set the tensor model parallel size""" + """Set the tensor-model-parallel size""" global _MPU_TENSOR_MODEL_PARALLEL_WORLD_SIZE _MPU_TENSOR_MODEL_PARALLEL_WORLD_SIZE = world_size def set_pipeline_model_parallel_world_size(world_size): - """Set the pipeline model parallel size""" + """Set the pipeline-model-parallel size""" global _MPU_PIPELINE_MODEL_PARALLEL_WORLD_SIZE _MPU_PIPELINE_MODEL_PARALLEL_WORLD_SIZE = world_size + def set_virtual_pipeline_model_parallel_world_size(world_size): - """Set the pipeline model parallel size""" + """Set the pipeline-model-parallel size""" global _VIRTUAL_PIPELINE_MODEL_PARALLEL_WORLD_SIZE _VIRTUAL_PIPELINE_MODEL_PARALLEL_WORLD_SIZE = world_size + def get_tensor_model_parallel_world_size(): - """Return world size for the tensor model parallel group.""" + """Return world size for the tensor-model-parallel group.""" global _MPU_TENSOR_MODEL_PARALLEL_WORLD_SIZE if _MPU_TENSOR_MODEL_PARALLEL_WORLD_SIZE is not None: return _MPU_TENSOR_MODEL_PARALLEL_WORLD_SIZE @@ -355,33 +1099,49 @@ def get_tensor_model_parallel_world_size(): def get_pipeline_model_parallel_world_size(): - """Return world size for the pipeline model parallel group.""" + """Return world size for the pipeline-model-parallel group.""" global _MPU_PIPELINE_MODEL_PARALLEL_WORLD_SIZE if _MPU_PIPELINE_MODEL_PARALLEL_WORLD_SIZE is not None: return _MPU_PIPELINE_MODEL_PARALLEL_WORLD_SIZE - return torch.distributed.get_world_size(group=get_pipeline_model_parallel_group()) + + pp_group = get_pipeline_model_parallel_group() + if isinstance(pp_group, list): + # Implicit assumption that each PP group is the same size. 
+ sizes = [] + for group in _PIPELINE_GLOBAL_RANKS: + sizes.append(len(group)) + assert all(x == sizes[0] for x in sizes) + return torch.distributed.get_world_size(group=pp_group[0]) + else: + return torch.distributed.get_world_size(group=pp_group) + + +def set_expert_model_parallel_rank(rank): + """Set expert-model-parallel rank.""" + global _MPU_EXPERT_MODEL_PARALLEL_RANK + _MPU_EXPERT_MODEL_PARALLEL_RANK = rank def set_tensor_model_parallel_rank(rank): - """Set tensor model parallel rank.""" + """Set tensor-model-parallel rank.""" global _MPU_TENSOR_MODEL_PARALLEL_RANK _MPU_TENSOR_MODEL_PARALLEL_RANK = rank def set_pipeline_model_parallel_rank(rank): - """Set pipeline model parallel rank.""" + """Set pipeline-model-parallel rank.""" global _MPU_PIPELINE_MODEL_PARALLEL_RANK _MPU_PIPELINE_MODEL_PARALLEL_RANK = rank def set_pipeline_model_parallel_split_rank(rank): - """Set pipeline model parallel split rank.""" + """Set pipeline-model-parallel split rank. DEPRECATED.""" global _PIPELINE_MODEL_PARALLEL_SPLIT_RANK _PIPELINE_MODEL_PARALLEL_SPLIT_RANK = rank def get_tensor_model_parallel_rank(): - """Return my rank for the tensor model parallel group.""" + """Return caller's rank for the tensor-model-parallel group.""" global _MPU_TENSOR_MODEL_PARALLEL_RANK if _MPU_TENSOR_MODEL_PARALLEL_RANK is not None: return _MPU_TENSOR_MODEL_PARALLEL_RANK @@ -389,15 +1149,27 @@ def get_tensor_model_parallel_rank(): def get_pipeline_model_parallel_rank(): - """Return my rank for the pipeline model parallel group.""" + """Return caller's rank for the pipeline-model-parallel group.""" global _MPU_PIPELINE_MODEL_PARALLEL_RANK if _MPU_PIPELINE_MODEL_PARALLEL_RANK is not None: return _MPU_PIPELINE_MODEL_PARALLEL_RANK - return torch.distributed.get_rank(group=get_pipeline_model_parallel_group()) + rank = torch.distributed.get_rank() + pp_group = get_pipeline_model_parallel_group() + if isinstance(pp_group, list): + # Assume that if the caller exist in multiple PP groups, then it has the same index. 
+ indices = [] + for group in _PIPELINE_GLOBAL_RANKS: + for i, r in enumerate(group): + if r == rank: + indices.append(i) + assert all(x == indices[0] for x in indices) + return torch.distributed.get_rank(group=pp_group[0]) + else: + return torch.distributed.get_rank(group=pp_group) def get_pipeline_model_parallel_split_rank(): - """Return pipeline model parallel split rank.""" + """Return pipeline-model-parallel split rank.""" global _PIPELINE_MODEL_PARALLEL_SPLIT_RANK return _PIPELINE_MODEL_PARALLEL_SPLIT_RANK @@ -405,29 +1177,35 @@ def get_pipeline_model_parallel_split_rank(): def is_pipeline_first_stage(ignore_virtual=False): """Return True if in the first pipeline model-parallel stage, False otherwise.""" if not ignore_virtual: - if get_virtual_pipeline_model_parallel_world_size() is not None and \ - get_virtual_pipeline_model_parallel_rank() != 0: + if ( + get_virtual_pipeline_model_parallel_world_size() is not None + and get_virtual_pipeline_model_parallel_rank() != 0 + ): return False return get_pipeline_model_parallel_rank() == 0 def is_pipeline_last_stage(ignore_virtual=False): - """Return True if in the last pipeline model-parallel stage, False otherwise.""" + """Return True if in the last pipeline-model-parallel stage, False otherwise.""" if not ignore_virtual: - virtual_pipeline_model_parallel_world_size = \ + virtual_pipeline_model_parallel_world_size = ( get_virtual_pipeline_model_parallel_world_size() - if virtual_pipeline_model_parallel_world_size is not None and \ - get_virtual_pipeline_model_parallel_rank() != ( - virtual_pipeline_model_parallel_world_size - 1): + ) + if ( + virtual_pipeline_model_parallel_world_size is not None + and get_virtual_pipeline_model_parallel_rank() + != (virtual_pipeline_model_parallel_world_size - 1) + ): return False - return get_pipeline_model_parallel_rank() == ( - get_pipeline_model_parallel_world_size() - 1) + return get_pipeline_model_parallel_rank() == (get_pipeline_model_parallel_world_size() - 1) def is_rank_in_embedding_group(ignore_virtual=False): """Return true if current rank is in embedding group, False otherwise.""" rank = torch.distributed.get_rank() global _EMBEDDING_GLOBAL_RANKS + if _EMBEDDING_GLOBAL_RANKS is None: + return False if ignore_virtual: return rank in _EMBEDDING_GLOBAL_RANKS if rank in _EMBEDDING_GLOBAL_RANKS: @@ -444,7 +1222,7 @@ def is_rank_in_position_embedding_group(): """Return true if current rank is in position embedding group, False otherwise.""" rank = torch.distributed.get_rank() global _POSITION_EMBEDDING_GLOBAL_RANKS - return rank in _POSITION_EMBEDDING_GLOBAL_RANKS + return _POSITION_EMBEDDING_GLOBAL_RANKS is not None and rank in _POSITION_EMBEDDING_GLOBAL_RANKS def is_pipeline_stage_before_split(rank=None): @@ -477,13 +1255,42 @@ def is_pipeline_stage_after_split(rank=None): return False +def is_inside_encoder(rank=None): + """Return True if pipeline stage executes encoder block for a model + with both encoder and decoder.""" + if get_pipeline_model_parallel_world_size() == 1: + return True + if rank is None: + rank = get_pipeline_model_parallel_rank() + global _PIPELINE_MODEL_PARALLEL_DECODER_START + if _PIPELINE_MODEL_PARALLEL_DECODER_START is None: + return True + if rank < _PIPELINE_MODEL_PARALLEL_DECODER_START: + return True + return False + + +def is_inside_decoder(rank=None): + """Return True if pipeline stage executes decoder block for a model + with both encoder and decoder.""" + if get_pipeline_model_parallel_world_size() == 1: + return True + if rank is None: + rank = 
get_pipeline_model_parallel_rank() + global _PIPELINE_MODEL_PARALLEL_DECODER_START + if _PIPELINE_MODEL_PARALLEL_DECODER_START is None: + return True + if rank >= _PIPELINE_MODEL_PARALLEL_DECODER_START: + return True + return False + + def is_pipeline_stage_at_split(): """Return true if pipeline stage executes decoder block and next stage executes encoder block for a model with both encoder and decoder.""" rank = get_pipeline_model_parallel_rank() - return is_pipeline_stage_before_split(rank) and \ - is_pipeline_stage_after_split(rank+1) + return is_pipeline_stage_before_split(rank) and is_pipeline_stage_after_split(rank + 1) def get_virtual_pipeline_model_parallel_rank(): @@ -504,110 +1311,337 @@ def get_virtual_pipeline_model_parallel_world_size(): return _VIRTUAL_PIPELINE_MODEL_PARALLEL_WORLD_SIZE -def set_virtual_pipeline_model_parallel_world_size(world_size): - """Set the virtual pipeline-parallel world size""" - global _VIRTUAL_PIPELINE_MODEL_PARALLEL_WORLD_SIZE - _VIRTUAL_PIPELINE_MODEL_PARALLEL_WORLD_SIZE = world_size - - def get_tensor_model_parallel_src_rank(): """Calculate the global rank corresponding to the first local rank in the tensor model parallel group.""" - global_rank = torch.distributed.get_rank() - local_world_size = get_tensor_model_parallel_world_size() - return (global_rank // local_world_size) * local_world_size + assert ( + _TENSOR_MODEL_PARALLEL_GLOBAL_RANKS is not None + ), "Tensor model parallel group is not initialized" + return _TENSOR_MODEL_PARALLEL_GLOBAL_RANKS[0] -def get_data_parallel_src_rank(): +def get_data_parallel_src_rank(with_context_parallel=False): """Calculate the global rank corresponding to the first local rank in the data parallel group.""" - assert _DATA_PARALLEL_GLOBAL_RANKS is not None, \ - "Data parallel group is not initialized" - return _DATA_PARALLEL_GLOBAL_RANKS[0] + if with_context_parallel: + assert ( + _DATA_PARALLEL_GLOBAL_RANKS_WITH_CP is not None + ), "Data parallel group with context parallel combined is not initialized" + return _DATA_PARALLEL_GLOBAL_RANKS_WITH_CP[0] + else: + assert _DATA_PARALLEL_GLOBAL_RANKS is not None, "Data parallel group is not initialized" + return _DATA_PARALLEL_GLOBAL_RANKS[0] def get_pipeline_model_parallel_first_rank(): - """Return the global rank of the first process in the pipeline for the - current tensor parallel group""" - assert _PIPELINE_GLOBAL_RANKS is not None, \ - "Pipeline parallel group is not initialized" - return _PIPELINE_GLOBAL_RANKS[0] + """Return the global rank of the first stage in the current rank's pipeline.""" + assert _PIPELINE_GLOBAL_RANKS is not None, "Pipeline parallel group is not initialized" + if isinstance(_PIPELINE_GLOBAL_RANKS[0], list): + # I assume the first rank is the same for all pp groups right now. 
+ for rank_group in _PIPELINE_GLOBAL_RANKS: + assert rank_group[0] == _PIPELINE_GLOBAL_RANKS[0][0] + return _PIPELINE_GLOBAL_RANKS[0][0] + else: + return _PIPELINE_GLOBAL_RANKS[0] def get_pipeline_model_parallel_last_rank(): - """Return the global rank of the last process in the pipeline for the - current tensor parallel group""" - assert _PIPELINE_GLOBAL_RANKS is not None, \ - "Pipeline parallel group is not initialized" + """Return the global rank of the last stage in the current rank's pipeline.""" + assert _PIPELINE_GLOBAL_RANKS is not None, "Pipeline parallel group is not initialized" last_rank_local = get_pipeline_model_parallel_world_size() - 1 return _PIPELINE_GLOBAL_RANKS[last_rank_local] + def get_pipeline_model_parallel_next_rank(): - """Return the global rank that follows the caller in the pipeline""" - assert _PIPELINE_GLOBAL_RANKS is not None, \ - "Pipeline parallel group is not initialized" + """Return the global rank that follows the caller in the pipeline, for each + pipeline-parallel group that the rank is part of. + + If it is just part of one group, an int is returned, otherwise a list of ints. + """ + assert _PIPELINE_GLOBAL_RANKS is not None, "Pipeline parallel group is not initialized" rank_in_pipeline = get_pipeline_model_parallel_rank() world_size = get_pipeline_model_parallel_world_size() - return _PIPELINE_GLOBAL_RANKS[(rank_in_pipeline + 1) % world_size] + if isinstance(_PIPELINE_GLOBAL_RANKS[0], list): + to_return = [] + for group in _PIPELINE_GLOBAL_RANKS: + to_return.append(group[(rank_in_pipeline + 1) % world_size]) + return to_return + else: + return _PIPELINE_GLOBAL_RANKS[(rank_in_pipeline + 1) % world_size] def get_pipeline_model_parallel_prev_rank(): - """Return the global rank that preceeds the caller in the pipeline""" - assert _PIPELINE_GLOBAL_RANKS is not None, \ - "Pipeline parallel group is not initialized" + """Return the global rank that precedes the caller in the pipeline, for each + pipeline-parallel group that the rank is part of. + + If it is just part of one group, an int is returned, otherwise a list of ints. 
+ """ + assert _PIPELINE_GLOBAL_RANKS is not None, "Pipeline parallel group is not initialized" rank_in_pipeline = get_pipeline_model_parallel_rank() world_size = get_pipeline_model_parallel_world_size() - return _PIPELINE_GLOBAL_RANKS[(rank_in_pipeline - 1) % world_size] + if isinstance(_PIPELINE_GLOBAL_RANKS[0], list): + to_return = [] + for group in _PIPELINE_GLOBAL_RANKS: + to_return.append(group[(rank_in_pipeline - 1) % world_size]) + return to_return + else: + return _PIPELINE_GLOBAL_RANKS[(rank_in_pipeline - 1) % world_size] -def get_data_parallel_world_size(): +def get_data_parallel_world_size(with_context_parallel=False): """Return world size for the data parallel group.""" - return torch.distributed.get_world_size(group=get_data_parallel_group()) + global _MPU_DATA_PARALLEL_WORLD_SIZE + if _MPU_DATA_PARALLEL_WORLD_SIZE is not None: + return _MPU_DATA_PARALLEL_WORLD_SIZE + if torch.distributed.is_available() and torch.distributed.is_initialized(): + return torch.distributed.get_world_size( + group=get_data_parallel_group(with_context_parallel=with_context_parallel) + ) + else: + return 0 + + +def set_data_parallel_rank(rank): + """Return world size for the data parallel group.""" + global _MPU_DATA_PARALLEL_RANK + _MPU_DATA_PARALLEL_RANK = rank + + +def get_data_parallel_rank(with_context_parallel=False): + """Return caller's rank in the data-parallel group.""" + global _MPU_DATA_PARALLEL_RANK + if _MPU_DATA_PARALLEL_RANK is not None: + return _MPU_DATA_PARALLEL_RANK + if torch.distributed.is_available() and torch.distributed.is_initialized(): + return torch.distributed.get_rank( + group=get_data_parallel_group(with_context_parallel=with_context_parallel) + ) + else: + return 0 + + +def get_context_parallel_world_size(): + """Return world size for the context parallel group.""" + if torch.distributed.is_available() and torch.distributed.is_initialized(): + return torch.distributed.get_world_size(group=get_context_parallel_group()) + else: + return 0 + + +def get_context_parallel_rank(): + """Return caller's rank in the context-parallel group.""" + if torch.distributed.is_available() and torch.distributed.is_initialized(): + return torch.distributed.get_rank(group=get_context_parallel_group()) + else: + return 0 + + +def get_tensor_and_context_parallel_world_size(): + """Return world size for the tensor and context-parallel group.""" + if torch.distributed.is_available() and torch.distributed.is_initialized(): + return torch.distributed.get_world_size(group=get_tensor_and_context_parallel_group()) + else: + return 0 + + +def get_tensor_and_context_parallel_rank(): + """Return caller's rank in the joint tensor-model-parallel and context-parallel group.""" + if torch.distributed.is_available() and torch.distributed.is_initialized(): + return torch.distributed.get_rank(group=get_tensor_and_context_parallel_group()) + else: + return 0 + + +def get_expert_model_parallel_world_size(): + """Return world size for the expert-model-parallel group.""" + if _MPU_EXPERT_MODEL_PARALLEL_WORLD_SIZE is not None: + return _MPU_EXPERT_MODEL_PARALLEL_WORLD_SIZE + if torch.distributed.is_available() and torch.distributed.is_initialized(): + tensor_and_expert_parallel_world_size = torch.distributed.get_world_size( + group=get_tensor_and_expert_parallel_group() + ) + return tensor_and_expert_parallel_world_size // get_tensor_model_parallel_world_size() + else: + return 0 -def get_data_parallel_rank(): - """Return my rank for the data parallel group.""" - return 
torch.distributed.get_rank(group=get_data_parallel_group()) +def get_tensor_and_expert_parallel_world_size(): + """Return world size for the expert model parallel group times model parallel group. + Currently, each expert will also be distributed across TP group by default. + """ + if torch.distributed.is_available() and torch.distributed.is_initialized(): + tensor_and_expert_parallel_world_size = torch.distributed.get_world_size( + group=get_tensor_and_expert_parallel_group() + ) + return tensor_and_expert_parallel_world_size + else: + return 0 + + +def get_expert_model_parallel_rank(): + """Return caller's rank in the expert-model-parallel group.""" + if _MPU_EXPERT_MODEL_PARALLEL_RANK is not None: + return _MPU_EXPERT_MODEL_PARALLEL_RANK + if torch.distributed.is_available() and torch.distributed.is_initialized(): + tensor_and_expert_parallel_rank = torch.distributed.get_rank( + group=get_tensor_and_expert_parallel_group() + ) + return tensor_and_expert_parallel_rank // get_tensor_model_parallel_world_size() + else: + return 0 + + +def get_data_modulo_expert_parallel_rank(with_context_parallel=False): + """Return caller's rank in the context-parallel group.""" + if torch.distributed.is_available() and torch.distributed.is_initialized(): + return torch.distributed.get_rank( + group=get_data_modulo_expert_parallel_group(with_context_parallel=with_context_parallel) + ) + else: + return 0 + + +def get_tensor_and_expert_parallel_rank(): + """Return caller's rank in the joint tensor- and expert-model-parallel group.""" + if torch.distributed.is_available() and torch.distributed.is_initialized(): + return torch.distributed.get_rank(group=get_tensor_and_expert_parallel_group()) + else: + return 0 + def _set_global_memory_buffer(): - """Initialize global buffer""" + """Initialize global buffer.""" global _GLOBAL_MEMORY_BUFFER assert _GLOBAL_MEMORY_BUFFER is None, 'global memory buffer is already initialized' _GLOBAL_MEMORY_BUFFER = GlobalMemoryBuffer() + def get_global_memory_buffer(): """Return the global GlobalMemoryBuffer object""" assert _GLOBAL_MEMORY_BUFFER is not None, 'global memory buffer is not initialized' return _GLOBAL_MEMORY_BUFFER +def destroy_global_memory_buffer(): + """Sets the global memory buffer to None""" + global _GLOBAL_MEMORY_BUFFER + _GLOBAL_MEMORY_BUFFER = None + + +def get_all_ranks(): + """Get caller's rank in tensor-model-parallel, data-parallel, context-parallel, + pipeline-model-parallel and expert-model-parallel groups.""" + ranks = [ + get_tensor_model_parallel_rank(), + get_data_parallel_rank(), + get_context_parallel_rank(), + get_pipeline_model_parallel_rank(), + get_expert_model_parallel_rank(), + ] + return '_'.join(map(lambda x: str(x or 0), ranks)) + + +def get_moe_layer_wise_logging_tracker(): + """Return the moe layer wise tracker.""" + global _MOE_LAYER_WISE_LOGGING_TRACKER + return _MOE_LAYER_WISE_LOGGING_TRACKER + + def destroy_model_parallel(): """Set the groups to none.""" global _MODEL_PARALLEL_GROUP _MODEL_PARALLEL_GROUP = None + + global _MODEL_AND_EXPERT_PARALLEL_GROUP + _MODEL_AND_EXPERT_PARALLEL_GROUP = None + global _TENSOR_MODEL_PARALLEL_GROUP _TENSOR_MODEL_PARALLEL_GROUP = None + global _PIPELINE_MODEL_PARALLEL_GROUP _PIPELINE_MODEL_PARALLEL_GROUP = None + global _DATA_PARALLEL_GROUP _DATA_PARALLEL_GROUP = None + + global _DATA_PARALLEL_GROUP_WITH_CP + _DATA_PARALLEL_GROUP_WITH_CP = None + + global _CONTEXT_PARALLEL_GROUP + _CONTEXT_PARALLEL_GROUP = None + + global _CONTEXT_PARALLEL_GLOBAL_RANKS + _CONTEXT_PARALLEL_GLOBAL_RANKS = None + 
global _EMBEDDING_GROUP _EMBEDDING_GROUP = None + global _POSITION_EMBEDDING_GROUP _POSITION_EMBEDDING_GROUP = None - global _AMAX_REDUCTION_GROUP - _AMAX_REDUCTION_GROUP = None + + global _TENSOR_AND_DATA_PARALLEL_GROUP + _TENSOR_AND_DATA_PARALLEL_GROUP = None + + global _TENSOR_AND_DATA_PARALLEL_GROUP_WITH_CP + _TENSOR_AND_DATA_PARALLEL_GROUP_WITH_CP = None + + global _TENSOR_AND_CONTEXT_PARALLEL_GROUP + _TENSOR_AND_CONTEXT_PARALLEL_GROUP = None + + global _EXPERT_MODEL_PARALLEL_GROUP + _EXPERT_MODEL_PARALLEL_GROUP = None + + global _TENSOR_AND_EXPERT_PARALLEL_GROUP + _TENSOR_AND_EXPERT_PARALLEL_GROUP = None + + global _DATA_MODULO_EXPERT_PARALLEL_GROUP + _DATA_MODULO_EXPERT_PARALLEL_GROUP = None + + global _DATA_MODULO_EXPERT_PARALLEL_GROUP_WITH_CP + _DATA_MODULO_EXPERT_PARALLEL_GROUP_WITH_CP = None + global _VIRTUAL_PIPELINE_MODEL_PARALLEL_RANK _VIRTUAL_PIPELINE_MODEL_PARALLEL_RANK = None + global _VIRTUAL_PIPELINE_MODEL_PARALLEL_WORLD_SIZE _VIRTUAL_PIPELINE_MODEL_PARALLEL_WORLD_SIZE = None + global _MPU_TENSOR_MODEL_PARALLEL_WORLD_SIZE _MPU_TENSOR_MODEL_PARALLEL_WORLD_SIZE = None + global _MPU_PIPELINE_MODEL_PARALLEL_WORLD_SIZE _MPU_PIPELINE_MODEL_PARALLEL_WORLD_SIZE = None + global _MPU_TENSOR_MODEL_PARALLEL_RANK _MPU_TENSOR_MODEL_PARALLEL_RANK = None + global _MPU_PIPELINE_MODEL_PARALLEL_RANK _MPU_PIPELINE_MODEL_PARALLEL_RANK = None + global _GLOBAL_MEMORY_BUFFER _GLOBAL_MEMORY_BUFFER = None + + global _MPU_EXPERT_MODEL_PARALLEL_WORLD_SIZE + _MPU_EXPERT_MODEL_PARALLEL_WORLD_SIZE = None + + global _MPU_EXPERT_MODEL_PARALLEL_RANK + _MPU_EXPERT_MODEL_PARALLEL_RANK = None + + global _DATA_PARALLEL_GROUP_GLOO + if _DATA_PARALLEL_GROUP_GLOO is not None: + torch.distributed.destroy_process_group(_DATA_PARALLEL_GROUP_GLOO) + _DATA_PARALLEL_GROUP_GLOO = None + + global _DATA_PARALLEL_GROUP_WITH_CP_GLOO + _DATA_PARALLEL_GROUP_WITH_CP_GLOO = None + + global _DATA_MODULO_EXPERT_PARALLEL_GROUP_GLOO + if _DATA_MODULO_EXPERT_PARALLEL_GROUP_GLOO is not None: + torch.distributed.destroy_process_group(_DATA_MODULO_EXPERT_PARALLEL_GROUP_GLOO) + _DATA_MODULO_EXPERT_PARALLEL_GROUP_GLOO = None + + global _DATA_MODULO_EXPERT_PARALLEL_GROUP_WITH_CP_GLOO + _DATA_MODULO_EXPERT_PARALLEL_GROUP_WITH_CP_GLOO = None + + global _MOE_LAYER_WISE_LOGGING_TRACKER + _MOE_LAYER_WISE_LOGGING_TRACKER = {} diff --git a/megatron/core/pipeline_parallel/__init__.py b/megatron/core/pipeline_parallel/__init__.py index 00cd1ff382..37b3a5a972 100644 --- a/megatron/core/pipeline_parallel/__init__.py +++ b/megatron/core/pipeline_parallel/__init__.py @@ -1 +1,2 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. from .schedules import get_forward_backward_func diff --git a/megatron/core/pipeline_parallel/p2p_communication.py b/megatron/core/pipeline_parallel/p2p_communication.py index 301583132a..3e33e7c2f8 100644 --- a/megatron/core/pipeline_parallel/p2p_communication.py +++ b/megatron/core/pipeline_parallel/p2p_communication.py @@ -1,30 +1,32 @@ # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
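+# This module implements the point-to-point sends and receives between adjacent
+# pipeline stages; the public helpers below (recv_forward, send_forward, etc.) all
+# delegate to the shared _communicate routine defined further down.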
-from functools import reduce import operator -from typing import Optional, List, Union, Callable, Tuple +from functools import reduce +from typing import Callable, List, Optional, Tuple, Union import torch from megatron import core +from megatron.core import ModelParallelConfig from megatron.core.parallel_state import ( get_pipeline_model_parallel_group, - get_pipeline_model_parallel_prev_rank, get_pipeline_model_parallel_next_rank, + get_pipeline_model_parallel_prev_rank, + get_pipeline_model_parallel_rank, + get_pipeline_model_parallel_world_size, ) # Types Shape = Union[List[int], torch.Size] -def _communicate_shapes(tensor_send_next, tensor_send_prev, - recv_prev, recv_next, - use_ring_exchange_p2p): + +def _communicate_shapes(tensor_send_next, tensor_send_prev, recv_prev, recv_next, config): """Communicate tensor shapes between stages. Used to communicate tensor shapes before the actual tensor communication happens. This is required when the sequence lengths across micro batches are not uniform. - Takes the following arguments: + Args: tensor_send_next: tensor to send to next rank (no tensor sent if set to None). tensor_send_prev: tensor to send to prev rank (no tensor sent if @@ -42,49 +44,59 @@ def _communicate_shapes(tensor_send_next, tensor_send_prev, send_prev_shape_tensor = None send_next_shape_tensor = None if recv_prev: - recv_prev_shape_tensor = torch.empty((3), - device=torch.cuda.current_device(), - dtype=torch.int64) + recv_prev_shape_tensor = torch.empty( + (3), device=torch.cuda.current_device(), dtype=torch.int64 + ) if recv_next: - recv_next_shape_tensor = torch.empty((3), - device=torch.cuda.current_device(), - dtype=torch.int64) + recv_next_shape_tensor = torch.empty( + (3), device=torch.cuda.current_device(), dtype=torch.int64 + ) if tensor_send_prev is not None: - send_prev_shape_tensor = torch.tensor(tensor_send_prev.size(), - device=torch.cuda.current_device(), - dtype=torch.int64) + send_prev_shape_tensor = torch.tensor( + tensor_send_prev.size(), device=torch.cuda.current_device(), dtype=torch.int64 + ) if tensor_send_next is not None: - send_next_shape_tensor = torch.tensor(tensor_send_next.size(), - device=torch.cuda.current_device(), - dtype=torch.int64) - - if use_ring_exchange_p2p: - torch.distributed.ring_exchange(tensor_send_prev=send_prev_shape_tensor, - tensor_recv_prev=recv_prev_shape_tensor, - tensor_send_next=send_next_shape_tensor, - tensor_recv_next=recv_next_shape_tensor, - group=mpu.get_pipeline_model_parallel_group()) + send_next_shape_tensor = torch.tensor( + tensor_send_next.size(), device=torch.cuda.current_device(), dtype=torch.int64 + ) + + if config.use_ring_exchange_p2p: + torch.distributed.ring_exchange( + tensor_send_prev=send_prev_shape_tensor, + tensor_recv_prev=recv_prev_shape_tensor, + tensor_send_next=send_next_shape_tensor, + tensor_recv_next=recv_next_shape_tensor, + group=get_pipeline_model_parallel_group(), + ) else: ops = [] if send_prev_shape_tensor is not None: send_prev_op = torch.distributed.P2POp( - torch.distributed.isend, send_prev_shape_tensor, - mpu.get_pipeline_model_parallel_prev_rank()) + torch.distributed.isend, + send_prev_shape_tensor, + get_pipeline_model_parallel_prev_rank(), + ) ops.append(send_prev_op) if recv_prev_shape_tensor is not None: recv_prev_op = torch.distributed.P2POp( - torch.distributed.irecv, recv_prev_shape_tensor, - mpu.get_pipeline_model_parallel_prev_rank()) + torch.distributed.irecv, + recv_prev_shape_tensor, + get_pipeline_model_parallel_prev_rank(), + ) ops.append(recv_prev_op) if 
send_next_shape_tensor is not None: send_next_op = torch.distributed.P2POp( - torch.distributed.isend, send_next_shape_tensor, - mpu.get_pipeline_model_parallel_next_rank()) + torch.distributed.isend, + send_next_shape_tensor, + get_pipeline_model_parallel_next_rank(), + ) ops.append(send_next_op) if recv_next_shape_tensor is not None: recv_next_op = torch.distributed.P2POp( - torch.distributed.irecv, recv_next_shape_tensor, - mpu.get_pipeline_model_parallel_next_rank()) + torch.distributed.irecv, + recv_next_shape_tensor, + get_pipeline_model_parallel_next_rank(), + ) ops.append(recv_next_op) if len(ops) > 0: reqs = torch.distributed.batch_isend_irecv(ops) @@ -106,19 +118,132 @@ def _communicate_shapes(tensor_send_next, tensor_send_prev, return recv_prev_shape, recv_next_shape -def _communicate(*, tensor_send_next: Optional[torch.Tensor], - tensor_send_prev: Optional[torch.Tensor], - recv_prev: bool, - recv_next: bool, - tensor_shape: Shape, - dtype: Optional[torch.dtype], - variable_seq_lengths: bool = False, - use_ring_exchange_p2p: bool = False, - ) -> Tuple[torch.Tensor, torch.Tensor]: +def _batched_p2p_ops( + *, + tensor_send_prev: Optional[torch.Tensor], + tensor_recv_prev: Optional[torch.Tensor], + tensor_send_next: Optional[torch.Tensor], + tensor_recv_next: Optional[torch.Tensor], + group: torch.distributed.ProcessGroup, + prev_pipeline_rank: int, + next_pipeline_rank: int, +): + ops = [] + if tensor_send_prev is not None: + send_prev_op = torch.distributed.P2POp( + torch.distributed.isend, tensor_send_prev, prev_pipeline_rank, group + ) + ops.append(send_prev_op) + if tensor_recv_prev is not None: + recv_prev_op = torch.distributed.P2POp( + torch.distributed.irecv, tensor_recv_prev, prev_pipeline_rank, group + ) + ops.append(recv_prev_op) + if tensor_send_next is not None: + send_next_op = torch.distributed.P2POp( + torch.distributed.isend, tensor_send_next, next_pipeline_rank, group + ) + ops.append(send_next_op) + if tensor_recv_next is not None: + recv_next_op = torch.distributed.P2POp( + torch.distributed.irecv, tensor_recv_next, next_pipeline_rank, group + ) + ops.append(recv_next_op) + if len(ops) > 0: + reqs = torch.distributed.batch_isend_irecv(ops) + else: + reqs = [] + return reqs + + +def _p2p_ops( + *, + tensor_send_prev: Optional[torch.Tensor], + tensor_recv_prev: Optional[torch.Tensor], + tensor_send_next: Optional[torch.Tensor], + tensor_recv_next: Optional[torch.Tensor], + group: torch.distributed.ProcessGroup, + prev_pipeline_rank: int, + next_pipeline_rank: int, +): + reqs = [] + rank = get_pipeline_model_parallel_rank() + even_send_odd_recv_group = group + if get_pipeline_model_parallel_world_size() == 2: + # Use the global process group for one of the two p2p communications + # to allow the overlap of the independent communications. + # Using the global process group is compatible because the pipeline-parallel + # communications set the source and destination by global rank. 
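+            # The parity-ordered calls below (even pipeline ranks post send-to-next
+            # before recv-from-prev, odd ranks post recv-from-prev before
+            # send-to-next) presumably keep each isend matched against the
+            # neighbour's irecv in the same order on both ends of a link, e.g.
+            # rank 2 posts isend->3 first while rank 3 posts irecv<-2 first.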
+ even_recv_odd_send_group = torch.distributed.group.WORLD + else: + even_recv_odd_send_group = group + + if get_pipeline_model_parallel_rank() % 2 == 0: + if tensor_send_next is not None: + send_next_req = torch.distributed.isend( + tensor=tensor_send_next, dst=next_pipeline_rank, group=even_send_odd_recv_group + ) + reqs.append(send_next_req) + + if tensor_recv_prev is not None: + recv_prev_req = torch.distributed.irecv( + tensor=tensor_recv_prev, src=prev_pipeline_rank, group=even_recv_odd_send_group + ) + reqs.append(recv_prev_req) + + if tensor_send_prev is not None: + send_prev_req = torch.distributed.isend( + tensor=tensor_send_prev, dst=prev_pipeline_rank, group=even_send_odd_recv_group + ) + reqs.append(send_prev_req) + + if tensor_recv_next is not None: + recv_next_req = torch.distributed.irecv( + tensor=tensor_recv_next, src=next_pipeline_rank, group=even_recv_odd_send_group + ) + reqs.append(recv_next_req) + + else: + if tensor_recv_prev is not None: + recv_prev_req = torch.distributed.irecv( + tensor=tensor_recv_prev, src=prev_pipeline_rank, group=even_send_odd_recv_group + ) + reqs.append(recv_prev_req) + + if tensor_send_next is not None: + send_next_req = torch.distributed.isend( + tensor=tensor_send_next, dst=next_pipeline_rank, group=even_recv_odd_send_group + ) + reqs.append(send_next_req) + + if tensor_recv_next is not None: + recv_next_req = torch.distributed.irecv( + tensor=tensor_recv_next, src=next_pipeline_rank, group=even_send_odd_recv_group + ) + reqs.append(recv_next_req) + + if tensor_send_prev is not None: + send_prev_req = torch.distributed.isend( + tensor=tensor_send_prev, dst=prev_pipeline_rank, group=even_recv_odd_send_group + ) + reqs.append(send_prev_req) + return reqs + + +def _communicate( + *, + tensor_send_next: Optional[torch.Tensor], + tensor_send_prev: Optional[torch.Tensor], + recv_prev: bool, + recv_next: bool, + tensor_shape: Shape, + config: ModelParallelConfig, + wait_on_reqs: bool = True, +) -> Tuple[torch.Tensor, torch.Tensor]: """Communicate tensors between stages. Used as helper method in other communication methods that are used in megatron/schedules.py. - Arguments: + Args: tensor_send_next (torch.Tensor, optional): Tensor to send to next rank (no tensor sent if None) @@ -136,23 +261,9 @@ def _communicate(*, tensor_send_next: Optional[torch.Tensor], tensors sent and received in a single function call are the same shape). - dtype (torch.dtype, required if either recv_{prev,next} is True): - this must be the type of the tensors that will be - received, will typically be params_dtype, but in the case - of fp32 residual connections might be torch.float. - - variable_seq_lengths (bool, optional, default=False): - Support for variable sequence lengths across - microbatches. Setting this communicates the size of - tensors during pipeline parallelism communication, because - of this extra overhead it should only be set if the - sequence length is not constant during training. - - use_ring_exchange_p2p (bool, optional, default = False): - Use custom ring_exchange kernel instead of - torch.distributed.batch_isend_irecv(). Requires custom - built torch with torch.distributed.ring_exchange. - + wait_on_reqs (boolean, optional, default=False): + For non-batched p2p communication, wait on each request + before returning. Returns: tuple containing @@ -162,91 +273,142 @@ def _communicate(*, tensor_send_next: Optional[torch.Tensor], """ - # Create placeholder tensors for receive in forward and backward directions - # if needed. 
- tensor_recv_prev = None - tensor_recv_next = None + tensor_recv_prev_func = None + tensor_recv_next_func = None - if not variable_seq_lengths: + if not config.variable_seq_lengths: recv_prev_shape = tensor_shape recv_next_shape = tensor_shape else: - recv_prev_shape, recv_next_shape = \ - _communicate_shapes(tensor_send_next, - tensor_send_prev, - recv_prev, - recv_next) + recv_prev_shape, recv_next_shape = _communicate_shapes( + tensor_send_next, tensor_send_prev, recv_prev, recv_next, config + ) + + def create_tensor_recv_prev(): + return torch.empty( + recv_prev_shape, + requires_grad=True, + device=torch.cuda.current_device(), + dtype=config.pipeline_dtype, + ) + + def create_tensor_recv_next(): + return torch.empty( + recv_next_shape, + requires_grad=True, + device=torch.cuda.current_device(), + dtype=config.pipeline_dtype, + ) if recv_prev: - if dtype is None: - raise RuntimeError("dtype must be provided if recv_prev is True") + if config.pipeline_dtype is None: + raise RuntimeError("pipeline_dtype must be provided if recv_prev is True") if tensor_shape is None: raise RuntimeError( "tensor_shape must be specified if recv_prev is True. " "Common tensor_shape is (seq_length, micro_batch_size, hidden_size)" ) - tensor_recv_prev = torch.empty(recv_prev_shape, - requires_grad=True, - device=torch.cuda.current_device(), - dtype=dtype) + tensor_recv_prev_func = create_tensor_recv_prev + if recv_next: - if dtype is None: + if config.pipeline_dtype is None: raise RuntimeError("dtype must be provided if recv_next is True") if tensor_shape is None: raise RuntimeError( "tensor_shape must be specified if recv_next is True. " "Common tensor_shape is (seq_length, micro_batch_size, hidden_size)" ) - tensor_recv_next = torch.empty(recv_next_shape, - requires_grad=True, - device=torch.cuda.current_device(), - dtype=dtype) + tensor_recv_next_func = create_tensor_recv_next # Send tensors in both the forward and backward directions as appropriate. 
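+    # Three transport paths are selected below: a custom ring_exchange kernel when
+    # config.use_ring_exchange_p2p is set, torch.distributed.batch_isend_irecv when
+    # config.batch_p2p_comm is set (this path requires waiting on the requests), and
+    # otherwise individual isend/irecv calls via _p2p_ops, which is the only path
+    # that supports returning un-waited requests for overlapped communication.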
- if use_ring_exchange_p2p: - torch.distributed.ring_exchange(tensor_send_prev=tensor_send_prev, - tensor_recv_prev=tensor_recv_prev, - tensor_send_next=tensor_send_next, - tensor_recv_next=tensor_recv_next, - group=get_pipeline_model_parallel_group()) + if config.use_ring_exchange_p2p: + + def _ring_exchange_wrapper(**kwargs): + torch.distributed.ring_exchange(**kwargs) + return [] + + p2p_func = _ring_exchange_wrapper + elif config.batch_p2p_comm: + assert wait_on_reqs + p2p_func = _batched_p2p_ops else: - ops = [] - if tensor_send_prev is not None: - send_prev_op = torch.distributed.P2POp( - torch.distributed.isend, tensor_send_prev, - get_pipeline_model_parallel_prev_rank()) - ops.append(send_prev_op) - if tensor_recv_prev is not None: - recv_prev_op = torch.distributed.P2POp( - torch.distributed.irecv, tensor_recv_prev, - get_pipeline_model_parallel_prev_rank()) - ops.append(recv_prev_op) - if tensor_send_next is not None: - send_next_op = torch.distributed.P2POp( - torch.distributed.isend, tensor_send_next, - get_pipeline_model_parallel_next_rank()) - ops.append(send_next_op) - if tensor_recv_next is not None: - recv_next_op = torch.distributed.P2POp( - torch.distributed.irecv, tensor_recv_next, - get_pipeline_model_parallel_next_rank()) - ops.append(recv_next_op) - if len(ops) > 0: - reqs = torch.distributed.batch_isend_irecv(ops) - for req in reqs: - req.wait() + p2p_func = _p2p_ops + + # Each rank can now be part of several different pipeline parallel groups + # (specifically, this can occur when encoder tensor parallelism != decoder + # tensor parallelism, and hence a rank in the encoder is going to feed + # several different decoder ranks. We therefore have to receive or send tensors + # from several groups. For convenience, I wrap everything into lists. + pp_group = get_pipeline_model_parallel_group() + next_rank = get_pipeline_model_parallel_next_rank() + prev_rank = get_pipeline_model_parallel_prev_rank() + if not isinstance(pp_group, list): + pp_group = [pp_group] + assert not isinstance(next_rank, list) + next_rank = [next_rank] + assert not isinstance(prev_rank, list) + prev_rank = [prev_rank] + + reqs = [] + tensor_recv_prev_list = [] + tensor_recv_next_list = [] + + for group, nr, pr in zip(pp_group, next_rank, prev_rank): + if tensor_recv_prev_func is not None: + tensor_recv_prev = tensor_recv_prev_func() + tensor_recv_prev_list.append(tensor_recv_prev) + else: + tensor_recv_prev = None + + if tensor_recv_next_func is not None: + tensor_recv_next = tensor_recv_next_func() + tensor_recv_next_list.append(tensor_recv_next) + else: + tensor_recv_next = None + + reqs.extend( + p2p_func( + tensor_send_prev=tensor_send_prev, + tensor_recv_prev=tensor_recv_prev, + tensor_send_next=tensor_send_next, + tensor_recv_next=tensor_recv_next, + group=group, + prev_pipeline_rank=pr, + next_pipeline_rank=nr, + ) + ) + + if wait_on_reqs and len(reqs) > 0: + for req in reqs: + req.wait() + reqs = None + + if config.batch_p2p_comm and config.batch_p2p_sync: # To protect against race condition when using batch_isend_irecv(). # User should assert that we have a modern enough PyTorch to not need this torch.cuda.synchronize() - return tensor_recv_prev, tensor_recv_next + def _handle_tensor_list(x): + """This basically handles all the cases that we expect to see. 
Either the list None, + or it's a singleton (the usual cases, since most ranks only belong to one pipeline group), + or everything returned is None, or everything returned is not None, and it has to be summed + together.""" + if len(x) == 0: + return None + if len(x) == 1: + return x[0] + if all(xx is None for xx in x): + return None + return torch.stack(x, dim=0).sum(dim=0, dtype=torch.float32).to(x[0].dtype) + + tensor_recv_prev = _handle_tensor_list(tensor_recv_prev_list) + tensor_recv_next = _handle_tensor_list(tensor_recv_next_list) + return tensor_recv_prev, tensor_recv_next, reqs -def recv_forward(tensor_shape: Shape, - dtype: torch.dtype, - timers: Callable = None) -> torch.Tensor: - """ Receive tensor from previous rank in pipeline (forward receive). +def recv_forward(tensor_shape: Shape, config: ModelParallelConfig) -> torch.Tensor: + """Receive tensor from previous rank in pipeline (forward receive). See _communicate for argument details. """ @@ -254,23 +416,22 @@ def recv_forward(tensor_shape: Shape, if core.parallel_state.is_pipeline_first_stage(): input_tensor = None else: - if timers is not None: - timers('forward-recv', log_level=2).start() - input_tensor, _ = _communicate( + if config.timers is not None: + config.timers('forward-recv', log_level=2).start() + input_tensor, _, _ = _communicate( tensor_send_next=None, tensor_send_prev=None, recv_prev=True, recv_next=False, tensor_shape=tensor_shape, - dtype=dtype) - if timers is not None: - timers('forward-recv').stop() + config=config, + ) + if config.timers is not None: + config.timers('forward-recv').stop() return input_tensor -def recv_backward(tensor_shape: Shape, - dtype: torch.dtype, - timers: Callable = None) -> torch.Tensor: +def recv_backward(tensor_shape: Shape, config: ModelParallelConfig) -> torch.Tensor: """Receive tensor from next rank in pipeline (backward receive). See _communicate for argument details. @@ -278,65 +439,65 @@ def recv_backward(tensor_shape: Shape, if core.parallel_state.is_pipeline_last_stage(): output_tensor_grad = None else: - if timers is not None: - timers('backward-recv', log_level=2).start() - _, output_tensor_grad = _communicate( + if config.timers is not None: + config.timers('backward-recv', log_level=2).start() + _, output_tensor_grad, _ = _communicate( tensor_send_next=None, tensor_send_prev=None, recv_prev=False, recv_next=True, tensor_shape=tensor_shape, - dtype=dtype) - if timers is not None: - timers('backward-recv').stop() + config=config, + ) + if config.timers is not None: + config.timers('backward-recv').stop() return output_tensor_grad -def send_forward(output_tensor: torch.Tensor, - timers: Callable = None) -> None: +def send_forward(output_tensor: torch.Tensor, config: ModelParallelConfig) -> None: """Send tensor to next rank in pipeline (forward send). See _communicate for argument details. 
""" if not core.parallel_state.is_pipeline_last_stage(): - if timers is not None: - timers('forward-send', log_level=2).start() + if config.timers is not None: + config.timers('forward-send', log_level=2).start() _communicate( tensor_send_next=output_tensor, tensor_send_prev=None, recv_prev=False, recv_next=False, tensor_shape=None, - dtype=None) - if timers is not None: - timers('forward-send').stop() + config=config, + ) + if config.timers is not None: + config.timers('forward-send').stop() -def send_backward(input_tensor_grad: torch.Tensor, - timers: Callable = None) -> None: +def send_backward(input_tensor_grad: torch.Tensor, config: ModelParallelConfig) -> None: """Send tensor to previous rank in pipeline (backward send). See _communicate for argument details. """ if not core.parallel_state.is_pipeline_first_stage(): - if timers is not None: - timers('backward-send', log_level=2).start() + if config.timers is not None: + config.timers('backward-send', log_level=2).start() _communicate( tensor_send_next=None, tensor_send_prev=input_tensor_grad, recv_prev=False, recv_next=False, tensor_shape=None, - dtype=None) - if timers is not None: - timers('backward-send').stop() + config=config, + ) + if config.timers is not None: + config.timers('backward-send').stop() -def send_forward_recv_backward(output_tensor: torch.Tensor, - tensor_shape: Shape, - dtype: torch.dtype, - timers: Callable = None) -> torch.Tensor: +def send_forward_recv_backward( + output_tensor: torch.Tensor, tensor_shape: Shape, config: ModelParallelConfig +) -> torch.Tensor: """Batched send and recv with next rank in pipeline. See _communicate for argument details. @@ -344,24 +505,24 @@ def send_forward_recv_backward(output_tensor: torch.Tensor, if core.parallel_state.is_pipeline_last_stage(): output_tensor_grad = None else: - if timers is not None: - timers('forward-send-backward-recv', log_level=2).start() - _, output_tensor_grad = _communicate( + if config.timers is not None: + config.timers('forward-send-backward-recv', log_level=2).start() + _, output_tensor_grad, _ = _communicate( tensor_send_next=output_tensor, tensor_send_prev=None, recv_prev=False, recv_next=True, tensor_shape=tensor_shape, - dtype=dtype) - if timers is not None: - timers('forward-send-backward-recv').stop() + config=config, + ) + if config.timers is not None: + config.timers('forward-send-backward-recv').stop() return output_tensor_grad -def send_backward_recv_forward(input_tensor_grad: torch.Tensor, - tensor_shape: Shape, - dtype: torch.dtype, - timers: Callable = None) -> torch.Tensor: +def send_backward_recv_forward( + input_tensor_grad: torch.Tensor, tensor_shape: Shape, config: ModelParallelConfig +) -> torch.Tensor: """Batched send and recv with previous rank in pipeline. See _communicate for argument details. 
@@ -369,88 +530,101 @@ def send_backward_recv_forward(input_tensor_grad: torch.Tensor, if core.parallel_state.is_pipeline_first_stage(): input_tensor = None else: - if timers is not None: - timers('backward-send-forward-recv', log_level=2).start() - input_tensor, _ = _communicate( + if config.timers is not None: + config.timers('backward-send-forward-recv', log_level=2).start() + input_tensor, _, _ = _communicate( tensor_send_next=None, tensor_send_prev=input_tensor_grad, recv_prev=True, recv_next=False, tensor_shape=tensor_shape, - dtype=dtype) - if timers is not None: - timers('backward-send-forward-recv').stop() + config=config, + ) + if config.timers is not None: + config.timers('backward-send-forward-recv').stop() return input_tensor -def send_forward_recv_forward(output_tensor: torch.Tensor, - recv_prev: bool, - tensor_shape: Shape, - dtype: torch.dtype, - timers: Callable = None) -> torch.Tensor: +def send_forward_recv_forward( + output_tensor: torch.Tensor, + recv_prev: bool, + tensor_shape: Shape, + config: ModelParallelConfig, + overlap_p2p_comm: bool = False, +) -> torch.Tensor: """Batched recv from previous rank and send to next rank in pipeline. See _communicate for argument details. """ - if timers is not None: - timers('forward-send-forward-recv', log_level=2).start() - input_tensor, _ = _communicate( + if config.timers is not None: + config.timers('forward-send-forward-recv', log_level=2).start() + input_tensor, _, wait_handles = _communicate( tensor_send_next=output_tensor, tensor_send_prev=None, recv_prev=recv_prev, recv_next=False, tensor_shape=tensor_shape, - dtype=dtype) - if timers is not None: - timers('forward-send-forward-recv').stop() + wait_on_reqs=(not overlap_p2p_comm), + config=config, + ) + if config.timers is not None: + config.timers('forward-send-forward-recv').stop() + if overlap_p2p_comm: + return input_tensor, wait_handles return input_tensor -def send_backward_recv_backward(input_tensor_grad: torch.Tensor, - recv_next: bool, - tensor_shape: Shape, - dtype: torch.dtype, - timers: Callable = None) -> torch.Tensor: +def send_backward_recv_backward( + input_tensor_grad: torch.Tensor, + recv_next: bool, + tensor_shape: Shape, + config: ModelParallelConfig, + overlap_p2p_comm: bool = False, +) -> torch.Tensor: """Batched recv from next rank and send to previous rank in pipeline. See _communicate for argument details. 
""" - if timers is not None: - timers('backward-send-backward-recv', log_level=2).start() - _, output_tensor_grad = _communicate( + if config.timers is not None: + config.timers('backward-send-backward-recv', log_level=2).start() + _, output_tensor_grad, wait_handles = _communicate( tensor_send_next=None, tensor_send_prev=input_tensor_grad, recv_prev=False, recv_next=recv_next, tensor_shape=tensor_shape, - dtype=dtype) - if timers is not None: - timers('backward-send-backward-recv').stop() + wait_on_reqs=(not overlap_p2p_comm), + config=config, + ) + if config.timers is not None: + config.timers('backward-send-backward-recv').stop() + if overlap_p2p_comm: + return output_tensor_grad, wait_handles return output_tensor_grad def send_forward_backward_recv_forward_backward( - output_tensor: torch.Tensor, - input_tensor_grad: torch.Tensor, - recv_prev: bool, - recv_next: bool, - tensor_shape: Shape, - dtype: torch.dtype, - timers: Callable = None) -> Tuple[torch.Tensor, torch.Tensor]: + output_tensor: torch.Tensor, + input_tensor_grad: torch.Tensor, + recv_prev: bool, + recv_next: bool, + tensor_shape: Shape, + config: ModelParallelConfig, +) -> torch.Tensor: """Batched send and recv with previous and next ranks in pipeline. See _communicate for argument details. """ - if timers is not None: - timers('forward-backward-send-forward-backward-recv', - log_level=2).start() - input_tensor, output_tensor_grad = _communicate( + if config.timers is not None: + config.timers('forward-backward-send-forward-backward-recv', log_level=2).start() + input_tensor, output_tensor_grad, _ = _communicate( tensor_send_next=output_tensor, tensor_send_prev=input_tensor_grad, recv_prev=recv_prev, recv_next=recv_next, tensor_shape=tensor_shape, - dtype=dtype) - if timers is not None: - timers('forward-backward-send-forward-backward-recv').stop() + config=config, + ) + if config.timers is not None: + config.timers('forward-backward-send-forward-backward-recv').stop() return input_tensor, output_tensor_grad diff --git a/megatron/core/pipeline_parallel/schedules.py b/megatron/core/pipeline_parallel/schedules.py index 5007a44cd2..f082dbc6df 100644 --- a/megatron/core/pipeline_parallel/schedules.py +++ b/megatron/core/pipeline_parallel/schedules.py @@ -1,20 +1,27 @@ # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. import contextlib -from typing import Callable, Iterator, List, Optional, Union +from typing import Iterator, List, Union import torch from torch.autograd.variable import Variable -from torch.nn.parallel.distributed import DistributedDataParallel as torchDDP from megatron.core import parallel_state -from megatron.core.pipeline_parallel import p2p_communication from megatron.core.enums import ModelType -from megatron.core.utils import get_attr_wrapped_model, get_model_type +from megatron.core.pipeline_parallel import p2p_communication +from megatron.core.transformer.moe.router import MoEAuxLossAutoScaler +from megatron.core.utils import ( + drain_embedding_wgrad_compute, + get_attr_wrapped_model, + get_model_config, + get_model_type, + get_model_xattn, +) # Types Shape = Union[List[int], torch.Size] + def get_forward_backward_func(): """Retrieves the appropriate forward_backward function given the configuration of parallel_state. @@ -24,6 +31,10 @@ def get_forward_backward_func(): world size and virtual pipeline model parallel world size in the global parallel_state. 
+ Note that if using sequence parallelism, the sequence length component of + the tensor shape is updated to original_sequence_length / + tensor_model_parallel_world_size. + The function returned takes the following arguments: forward_step_func (required): A function that takes a data @@ -32,6 +43,13 @@ def get_forward_backward_func(): take one torch.Tensor and return a torch.Tensor of loss and a dictionary of string -> torch.Tensor. + A third argument, checkpoint_activations_microbatch, indicates + that the activations for this microbatch should be + checkpointed. A None value for this argument indicates that + the default from the configuration should be used. This is + used when the + num_microbatches_with_partial_activation_checkpoints is used. + For example: def loss_func(loss_mask, output_tensor): @@ -57,62 +75,29 @@ def forward_step(data_iterator, model): passed as is to forward_step_func. Expected to be a list of iterators in the case of interleaved pipeline parallelism. - model (required): the actual model. Expected to be a list of - modules in the case of interleaved pipeline parallelism. + model (required): the actual model. Expected to be a list of modules in the case of interleaved + pipeline parallelism. Must be a (potentially wrapped) megatron.core.models.MegatronModule. num_microbatches (int, required): The number of microbatches to go through - dtype (required when using pipeline parallelism): dtype used in - p2p communication, usually params_dtype - - tensor_shape (required when using pipeline parallelism): Shape of - tensor. The tensor is expected to be 3D and its order of - dimension is supposed to be ``(sequence, batch, hidden)``. - - decoder_seq_length (int, required for ModelType.encoder_and_decoder models): - Sequence length of the decoder portion, used to determine tensor shapes. - - grad_scaler (optional, default=None): If using loss scaling, - this function should take the loss and return the scaled - loss. If None, no function is called on the loss. - - sequence_parallel (optional, default=False): - Set to :obj:`True` for this function to handle sequence - length. When :obj:`True`, the sequence length on each tensor - model parallel rank is updated to - :math:`original\_sequence\_length / - tensor\_model\_parallel\_world\_size`. - TODO: Do we need this? Just roll into tensor_shape arg? - - forward_only (optional, default=False): Perform only the forward step + seq_length (int, required): Sequence length of the current global batch. If this is a dual-stack + transformer, this is the encoder's sequence length. This is ignored if variable_seq_lengths + in the config is True. Otherwise, each microbatch in the current global batch size must use + this sequence length. - timers (optional, default=None): TODO + micro_batch_size (int, required): The number of sequences in a microbatch. - collect_non_loss_data: TODO + decoder_seq_length (int, optional): The sequence length for the decoder in a dual-stack + transformer. This is ignored for a single-stack transformer. - enable_autocast (optional, default=False): If True, runs the - forward_step_func call inside torch.autocast context + forward_only (optional, default = False): Perform only the forward step - deallocate_pipeline_outputs (optional, default=False): If True, output data - is deallocated after the tensor is sent to the next pipeline stage. - Helps with saving memory, does nothing when pipeline parallel is - not used. 
- - no_sync_func (optional): Function that creates a context that - suppresses asynchronous data-parallel communication. If the - model is an instance of torch.nn.DistributedDataParallel, the - default is to use torch.nn.DistributedDataParallel.no_sync. + collect_non_loss_data (optional, bool, default=False): TODO - grad_sync_func (optional): Function that launches asynchronous - gradient reductions (e.g. distributed optimizer gradient - reduce-scatters). The function should take one argument: an - iterable of parameters whose gradients are to be synchronized. - - param_sync_func (optional): Function that launches asynchronous - parameter synchronizations (e.g. distributed optimizer - parameter all-gathers). The function should take one argument: - an iterable of parameters to be synchronized. + first_val_step (bool, optional): Is the first step of the validation phase. Used by + Transformer Engine modules to only update their fp8 weights only on the first validation + step. """ pipeline_model_parallel_size = parallel_state.get_pipeline_model_parallel_world_size() @@ -125,6 +110,7 @@ def forward_step(data_iterator, model): forward_backward_func = forward_backward_no_pipelining return forward_backward_func + def deallocate_output_tensor(out, deallocate_pipeline_outputs=False): '''Pseudo-deallocate (i.e., set to scalar) the output tensor's '.data' field. @@ -134,15 +120,10 @@ def deallocate_output_tensor(out, deallocate_pipeline_outputs=False): ''' if (out is None) or (not deallocate_pipeline_outputs): return - assert isinstance(out, torch.Tensor), \ - "expected Tensor, found %s." % type(out).__name__ - assert out._base is None, \ - "counter-productive to free a view of another tensor." - out.data = torch.empty( - (1,), - device = out.device, - dtype = out.dtype, - ) + assert isinstance(out, torch.Tensor), "expected Tensor, found %s." % type(out).__name__ + assert out._base is None, "counter-productive to free a view of another tensor." + out.data = torch.empty((1,), device=out.device, dtype=out.dtype) + def custom_backward(output, grad_output): '''Directly call C++ autograd engine. @@ -153,54 +134,127 @@ def custom_backward(output, grad_output): grad have the same shape, while C++'s 'backward' does not. ''' - assert output.numel() == 1, \ - "output should be pseudo-'freed' in schedule, to optimize memory" - assert isinstance(output, torch.Tensor), \ - "output == '%s'." % type(output).__name__ - assert isinstance(grad_output, (torch.Tensor, type(None))), \ + assert output.numel() == 1, "output should be pseudo-'freed' in schedule, to optimize memory" + assert isinstance(output, torch.Tensor), "output == '%s'." % type(output).__name__ + assert isinstance(grad_output, (torch.Tensor, type(None))), ( "grad_output == '%s'." % type(grad_output).__name__ + ) # Handle scalar output if grad_output is None: assert output.numel() == 1, "implicit grad requires scalar output." 
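+        # A scalar output with no explicit grad_output gets an implicit gradient of
+        # ones, mirroring torch.autograd.backward's behaviour for scalar losses.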
- grad_output = torch.ones_like( - output, - memory_format = torch.preserve_format, - ) + grad_output = torch.ones_like(output, memory_format=torch.preserve_format) # Call c++ engine [ see torch/csrc/autograd/python_engine.cpp ] Variable._execution_engine.run_backward( - tensors = (output,), - grad_tensors = (grad_output,), - keep_graph = False, - create_graph = False, - inputs = tuple(), + tensors=(output,), + grad_tensors=(grad_output,), + keep_graph=False, + create_graph=False, + inputs=tuple(), allow_unreachable=True, accumulate_grad=True, ) - - - -def forward_step(forward_step_func, - data_iterator, - model, - num_microbatches, - input_tensor, - forward_data_store, - timers, - collect_non_loss_data=False, - autocast_dtype=torch.float, - enable_autocast=False): +def set_current_microbatch(model, microbatch_id): + decoder_exists = True + decoder = None + try: + decoder = get_attr_wrapped_model(model, "decoder") + except RuntimeError: + decoder_exists = False + if decoder_exists and decoder is not None: + decoder.current_microbatch = microbatch_id + + +def forward_step( + forward_step_func, + data_iterator, + model, + num_microbatches, + input_tensor, + forward_data_store, + config, + collect_non_loss_data=False, + checkpoint_activations_microbatch=None, + is_first_microbatch=False, + current_microbatch=None, + encoder_decoder_xattn=False, +): """Forward step for passed-in model. - If first stage, input tensor is obtained from data_iterator, otherwise - passed-in input_tensor is used. + If it is the first stage, the input tensor is obtained from the data_iterator. + Otherwise, the passed-in input_tensor is used. + + Args: + forward_step_func (callable): + The forward step function for the model that takes the + data iterator as the first argument, and model as the second. + This user's forward step is expected to output a tuple of two elements: + + 1. The output object from the forward step. This output object needs to be a + tensor or some kind of collection of tensors. The only hard requirement + for this object is that it needs to be acceptible as input into the second + function. + 2. A function to reduce (optionally) the output from the forward step. This + could be a reduction over the loss from the model, it could be a function that + grabs the output from the model and reformats, it could be a function that just + passes through the model output. This function must have one of the following + patterns, and depending on the pattern different things happen internally: + + a. A tuple of reduced loss and some other data. Note that in this case + the first argument is divided by the number of global microbatches, + assuming it is a loss, so that the loss is stable as a function of + the number of devices the step is split across. + b. A triple of reduced loss, number of tokens, and some other data. This + is similar to case (a), but the loss is further averaged across the + number of tokens in the batch. If the user is not already averaging + across the number of tokens, this pattern is useful to use. + c. Any arbitrary data the user wants (eg a dictionary of tensors, a list + of tensors, etc in the case of inference). To trigger case 3 you need + to specify `collect_non_loss_data=True` and you may also want to + specify `forward_only=True` in the call to the parent forward_backward + function. + data_iterator (iterator): + The data iterator. + model (nn.Module): + The model to perform the forward step on. + num_microbatches (int): + The number of microbatches. 
+ input_tensor (Tensor or list[Tensor]): + The input tensor(s) for the forward step. + forward_data_store (list): + The list to store the forward data. If you go down path 2.a or + 2.b for the return of your forward reduction function then this will store only the + final dimension of the output, for example the metadata output by the loss function. + If you go down the path of 2.c then this will store the entire output of the forward + reduction function applied to the model output. + config (object): + The configuration object. + collect_non_loss_data (bool, optional): + Whether to collect non-loss data. Defaults to False. + This is the path to use if you want to collect arbitrary output from the model forward, + such as with inference use cases. Defaults to False. + checkpoint_activations_microbatch (int, optional): + The microbatch to checkpoint activations. + Defaults to None. + is_first_microbatch (bool, optional): + Whether it is the first microbatch. Defaults to False. + current_microbatch (int, optional): + The current microbatch. Defaults to None. + + Returns: + Tensor or list[Tensor]: The output object(s) from the forward step. + Tensor: The number of tokens. + """ + if config.timers is not None: + config.timers('forward-compute', log_level=2).start() - Returns output tensor.""" - if timers is not None: - timers('forward-compute', log_level=2).start() + if is_first_microbatch and hasattr(model, 'set_is_first_microbatch'): + model.set_is_first_microbatch() + if current_microbatch is not None: + set_current_microbatch(model, current_microbatch) unwrap_output_tensor = False if not isinstance(input_tensor, list): @@ -210,41 +264,69 @@ def forward_step(forward_step_func, set_input_tensor = get_attr_wrapped_model(model, "set_input_tensor") set_input_tensor(input_tensor) - if enable_autocast: - context_manager = torch.autocast("cuda", dtype=autocast_dtype) + if config.enable_autocast: + context_manager = torch.autocast("cuda", dtype=config.autocast_dtype) else: context_manager = contextlib.nullcontext() with context_manager: - output_tensor, loss_func = forward_step_func(data_iterator, model) + if checkpoint_activations_microbatch is None: + output_tensor, loss_func = forward_step_func(data_iterator, model) + else: + output_tensor, loss_func = forward_step_func( + data_iterator, model, checkpoint_activations_microbatch + ) + num_tokens = torch.tensor(0, dtype=torch.int) if parallel_state.is_pipeline_last_stage(): if not collect_non_loss_data: - output_tensor = loss_func(output_tensor) - loss, loss_reduced = output_tensor - output_tensor = loss / num_microbatches + outputs = loss_func(output_tensor) + if len(outputs) == 3: + output_tensor, num_tokens, loss_reduced = outputs + if not config.calculate_per_token_loss: + output_tensor /= num_tokens + output_tensor /= num_microbatches + else: + # preserve legacy loss averaging behavior (ie, over the number of microbatches) + assert len(outputs) == 2 + output_tensor, loss_reduced = outputs + output_tensor /= num_microbatches forward_data_store.append(loss_reduced) else: data = loss_func(output_tensor, non_loss_data=True) forward_data_store.append(data) - if timers is not None: - timers('forward-compute').stop() + if config.timers is not None: + config.timers('forward-compute').stop() + + # Set the loss scale for the auxiliary loss of the MoE layer. + # Since we use a trick to do backward on the auxiliary loss, we need to set the scale + # explicitly. 
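+    # Because the auxiliary loss is back-propagated through a separate path, it would
+    # otherwise miss both the grad scaler and the per-microbatch averaging applied to
+    # the main loss; the scale set below (current grad scale divided by
+    # num_microbatches) mirrors both.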
+ if hasattr(config, 'num_moe_experts') and config.num_moe_experts is not None: + # Calculate the loss scale based on the grad_scale_func if available, else default to 1. + loss_scale = ( + config.grad_scale_func(torch.ones(1, device=output_tensor.device)) + if config.grad_scale_func is not None + else torch.tensor(1.0) + ) + # Set the loss scale + MoEAuxLossAutoScaler.set_loss_scale(loss_scale / num_microbatches) - # If T5 model (or other model with encoder and decoder) - # and in decoder stack, then send encoder_hidden_state + # If T5 model and in decoder stack, then send encoder_hidden_state # downstream as well. model_type = get_model_type(model) + if ( + model_type == ModelType.encoder_and_decoder + and encoder_decoder_xattn + and parallel_state.is_inside_decoder() + ): + return [output_tensor, input_tensor[-1]], num_tokens - if parallel_state.is_pipeline_stage_after_split() and \ - model_type == ModelType.encoder_and_decoder: - return [output_tensor, input_tensor[-1]] if unwrap_output_tensor: - return output_tensor - return [output_tensor] + return output_tensor, num_tokens + return [output_tensor], num_tokens -def backward_step(grad_scaler, input_tensor, output_tensor, - output_tensor_grad, model_type, timers, deallocate_pipeline_outputs=False): +def backward_step(input_tensor, output_tensor, output_tensor_grad, model_type, config): """Backward step through passed-in output tensor. If last stage, output_tensor_grad is None, otherwise gradient of loss @@ -257,8 +339,8 @@ def backward_step(grad_scaler, input_tensor, output_tensor, # needs to be modified slightly to support arbitrary numbers of skip # connections. - if timers is not None: - timers('backward-compute', log_level=2).start() + if config.timers is not None: + config.timers('backward-compute', log_level=2).start() # Retain the grad on the input_tensor. unwrap_input_tensor_grad = False @@ -275,10 +357,10 @@ def backward_step(grad_scaler, input_tensor, output_tensor, output_tensor_grad = [output_tensor_grad] # Backward pass. - if output_tensor_grad[0] is None and grad_scaler is not None: - output_tensor = grad_scaler(output_tensor[0]) - - if deallocate_pipeline_outputs: + if output_tensor_grad[0] is None and config.grad_scale_func is not None: + output_tensor[0] = config.grad_scale_func(output_tensor[0]) + + if config.deallocate_pipeline_outputs: custom_backward(output_tensor[0], output_tensor_grad[0]) else: torch.autograd.backward(output_tensor[0], grad_tensors=output_tensor_grad[0]) @@ -295,39 +377,43 @@ def backward_step(grad_scaler, input_tensor, output_tensor, # Handle single skip connection if it exists (encoder_hidden_state in # model with encoder and decoder). - if parallel_state.get_pipeline_model_parallel_world_size() > 1 and \ - parallel_state.is_pipeline_stage_after_split() and \ - model_type == ModelType.encoder_and_decoder: + if ( + parallel_state.get_pipeline_model_parallel_world_size() > 1 + and model_type == ModelType.encoder_and_decoder + and len(output_tensor_grad) > 1 # excludes models that lack a skip connection. 
+ ): if output_tensor_grad[1] is not None: + assert input_tensor_grad[-1] is not None input_tensor_grad[-1].add_(output_tensor_grad[1]) if unwrap_input_tensor_grad: input_tensor_grad = input_tensor_grad[0] - if timers is not None: - timers('backward-compute').stop() + if config.timers is not None: + config.timers('backward-compute').stop() return input_tensor_grad -def forward_backward_no_pipelining(*, - forward_step_func, - data_iterator: Union[Iterator, List[Iterator]], - model: Union[torch.nn.Module, List[torch.nn.Module]], - num_microbatches: int, - dtype: Optional[torch.dtype] = None, - tensor_shape: Optional[Shape] = None, # unused - decoder_seq_length: Optional[int] = None, # unused - grad_scaler: Callable = None, - sequence_parallel: bool = False, # unused - forward_only: bool = False, - timers: Callable = None, - collect_non_loss_data: bool = False, - enable_autocast: bool = False, - deallocate_pipeline_outputs: bool = False, - no_sync_func: Optional[Callable] = None, - grad_sync_func: Optional[Callable] = None, # unused - param_sync_func: Optional[Callable] = None, # unused - ): +def check_first_val_step(first_val_step, forward_only, cond): + if (first_val_step is not None) and forward_only: + return first_val_step and cond + else: + return cond + + +def forward_backward_no_pipelining( + *, + forward_step_func, + data_iterator: Union[Iterator, List[Iterator]], + model: Union[torch.nn.Module, List[torch.nn.Module]], + num_microbatches: int, + seq_length: int, # unused + micro_batch_size: int, # unused + decoder_seq_length: int = None, # unused + forward_only: bool = False, + collect_non_loss_data: bool = False, + first_val_step: bool = None, +): """Run forward and backward passes with no pipeline parallelism (no inter-stage communication). 
@@ -338,16 +424,19 @@ def forward_backward_no_pipelining(*, """ if isinstance(model, list): - assert len(model) == 1, \ - "non-pipeline-parallel schedule does not support model chunking" + assert len(model) == 1, "non-pipeline-parallel schedule does not support model chunking" model = model[0] if isinstance(data_iterator, list): - assert len(data_iterator) == 1, \ - "non-pipeline-parallel schedule does not support model chunking" + assert ( + len(data_iterator) == 1 + ), "non-pipeline-parallel schedule does not support model chunking" data_iterator = data_iterator[0] - if no_sync_func is None and isinstance(model, torchDDP): - no_sync_func = model.no_sync + config = get_model_config(model) + if config.timers is not None: + config.timers('forward-backward', log_level=1).start(barrier=config.barrier_with_L1_time) + + no_sync_func = config.no_sync_func if no_sync_func is None: no_sync_func = contextlib.nullcontext @@ -355,81 +444,174 @@ def forward_backward_no_pipelining(*, forward_data_store = [] input_tensor, output_tensor_grad = None, None + total_num_tokens = torch.zeros([], dtype=torch.int, device="cuda") with no_sync_func(): for i in range(num_microbatches - 1): - output_tensor = forward_step(forward_step_func, data_iterator, - model, num_microbatches, input_tensor, forward_data_store, - timers, collect_non_loss_data, dtype, enable_autocast) + output_tensor, num_tokens = forward_step( + forward_step_func, + data_iterator, + model, + num_microbatches, + input_tensor, + forward_data_store, + config, + collect_non_loss_data, + is_first_microbatch=check_first_val_step(first_val_step, forward_only, i == 0), + current_microbatch=i, + ) + total_num_tokens += num_tokens.item() if not forward_only: - backward_step(grad_scaler, input_tensor, output_tensor, - output_tensor_grad, model_type, timers, deallocate_pipeline_outputs) + backward_step(input_tensor, output_tensor, output_tensor_grad, model_type, config) # Run computation for last microbatch out of context handler (want to # synchronize gradients). - output_tensor = forward_step(forward_step_func, data_iterator, - model, num_microbatches, input_tensor, forward_data_store, - timers, collect_non_loss_data, dtype, enable_autocast) + output_tensor, num_tokens = forward_step( + forward_step_func, + data_iterator, + model, + num_microbatches, + input_tensor, + forward_data_store, + config, + collect_non_loss_data, + is_first_microbatch=check_first_val_step( + first_val_step, forward_only, num_microbatches == 1 + ), + current_microbatch=num_microbatches - 1, + ) + total_num_tokens += num_tokens.item() if not forward_only: - backward_step(grad_scaler, input_tensor, output_tensor, - output_tensor_grad, model_type, timers, deallocate_pipeline_outputs) + backward_step(input_tensor, output_tensor, output_tensor_grad, model_type, config) + + if config.finalize_model_grads_func is not None and not forward_only: + # Finalize model grads (perform full grad all-reduce / reduce-scatter for + # data parallelism and layernorm all-reduce for sequence parallelism). 
+ config.finalize_model_grads_func( + [model], total_num_tokens if config.calculate_per_token_loss else None + ) + + if config.timers is not None: + config.timers('forward-backward').stop() return forward_data_store -def forward_backward_pipelining_with_interleaving(*, - forward_step_func, - data_iterator: Union[Iterator, List[Iterator]], - model: Union[torch.nn.Module, List[torch.nn.Module]], - num_microbatches: int, - dtype: torch.dtype, - tensor_shape: Shape, - decoder_seq_length: Optional[int] = None, - grad_scaler: Callable = None, - sequence_parallel: bool = False, - forward_only: bool = False, - timers: Callable = None, - collect_non_loss_data: bool = False, - enable_autocast: bool = False, - deallocate_pipeline_outputs: bool = False, - no_sync_func: Optional[Callable] = None, - grad_sync_func: Optional[Callable] = None, - param_sync_func: Optional[Callable] = None, - ): +def clear_embedding_activation_buffer(config, model): + + if ( + parallel_state.is_pipeline_last_stage(ignore_virtual=True) + and config.defer_embedding_wgrad_compute + ): + if isinstance(model, list): + embedding_module = get_attr_wrapped_model( + model[-1], 'post_process', return_model_obj=True + ) + else: + embedding_module = get_attr_wrapped_model(model, 'post_process', return_model_obj=True) + + # Need to ensure no stray activations exists in this buffer + embedding_module.embedding_activation_buffer.clear() + + return embedding_module + else: + return None + + +def finish_embedding_wgrad_compute(config, embedding_module): + if ( + parallel_state.is_pipeline_last_stage(ignore_virtual=True) + and config.defer_embedding_wgrad_compute + ): + embedding_activation_buffer = embedding_module.embedding_activation_buffer + grad_output_buffer = embedding_module.grad_output_buffer + weight = ( + embedding_module.output_layer.weight + if embedding_module.share_embeddings_and_output_weights + else embedding_module.shared_embedding_or_output_weight() + ) + + drain_embedding_wgrad_compute( + config, embedding_activation_buffer, grad_output_buffer, weight + ) + + +def forward_backward_pipelining_with_interleaving( + *, + forward_step_func, + data_iterator: Union[Iterator, List[Iterator]], + model: Union[torch.nn.Module, List[torch.nn.Module]], + num_microbatches: int, + seq_length: int, + micro_batch_size: int, + decoder_seq_length: int = None, + forward_only: bool = False, + collect_non_loss_data: bool = False, + first_val_step: bool = None, +): """Run interleaved 1F1B schedule (model split into model chunks), with communication between pipeline stages as needed. 
Returns dictionary with losses if the last stage, empty dict otherwise.""" - assert isinstance(model, list), \ - "interleaved pipeline parallelism expected model chunking" - assert all(isinstance(chunk, torch.nn.Module) for chunk in model), \ - "invalid model chunking" - assert isinstance(data_iterator, list), \ - "interleaved pipeline parallelism expected each model chunk to have a data iterator" + assert isinstance(model, list), "interleaved pipeline parallelism expected model chunking" + assert all(isinstance(chunk, torch.nn.Module) for chunk in model), "invalid model chunking" + assert isinstance( + data_iterator, list + ), "interleaved pipeline parallelism expected each model chunk to have a data iterator" + + config = get_model_config(model[0]) + if config.overlap_p2p_comm and config.batch_p2p_comm: + raise ValueError("Can not use both overlap_p2p_comm and batch_p2p_comm") + + # Needed only when gradients are finalized in M-Core + if config.finalize_model_grads_func is not None and not forward_only: + embedding_module = clear_embedding_activation_buffer(config, model) + + if config.timers is not None: + config.timers('forward-backward', log_level=1).start(barrier=config.barrier_with_L1_time) # Disable async grad reductions - if no_sync_func is None and all(isinstance(chunk, torchDDP) for chunk in model): + no_sync_func = config.no_sync_func + if isinstance(no_sync_func, list): + def multi_no_sync(): stack = contextlib.ExitStack() - for chunk in model: - stack.enter_context(chunk.no_sync()) + for model_chunk_no_sync_func in config.no_sync_func: + stack.enter_context(model_chunk_no_sync_func()) return stack + no_sync_func = multi_no_sync if no_sync_func is None: no_sync_func = contextlib.nullcontext no_sync_context = None + + if config.grad_sync_func is not None and not isinstance(config.grad_sync_func, list): + config.grad_sync_func = [config.grad_sync_func for _ in model] + + if config.param_sync_func is not None and not isinstance(config.param_sync_func, list): + config.param_sync_func = [config.param_sync_func for _ in model] + + # Disable config.grad_sync_func and config.param_sync_func if only running forward passes. + # They will be re-enabled at the end of this function. 
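A minimal sketch of the multi_no_sync pattern set up above, assuming config.no_sync_func is a list with one context factory per model chunk; ExitStack lets all of them be entered and exited as one context. The chunk_no_sync stand-in below is illustrative, not the real DDP no_sync().

import contextlib

@contextlib.contextmanager
def chunk_no_sync():
    # Stand-in for a DistributedDataParallel-style no_sync() that suppresses
    # gradient all-reduce while it is active.
    yield

per_chunk_no_sync_funcs = [chunk_no_sync, chunk_no_sync]   # one per model chunk

def multi_no_sync():
    stack = contextlib.ExitStack()
    for no_sync in per_chunk_no_sync_funcs:
        stack.enter_context(no_sync())
    return stack

context = multi_no_sync()
context.__enter__()                   # grad sync now disabled for every chunk
# ... run all but the last microbatch ...
context.__exit__(None, None, None)    # re-enable sync before the final microbatch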
+ grad_sync_func, param_sync_func = None, None + if forward_only: + grad_sync_func, param_sync_func = config.grad_sync_func, config.param_sync_func + config.grad_sync_func, config.param_sync_func = None, None + def disable_grad_sync(): """Disable asynchronous grad reductions""" nonlocal no_sync_context if no_sync_context is None: no_sync_context = no_sync_func() no_sync_context.__enter__() + def enable_grad_sync(): """Enable asynchronous grad reductions""" nonlocal no_sync_context if no_sync_context is not None: no_sync_context.__exit__(None, None, None) no_sync_context = None + disable_grad_sync() # Model chunk IDs with synchronized grads @@ -437,6 +619,8 @@ def enable_grad_sync(): input_tensors = [[] for _ in range(len(model))] output_tensors = [[] for _ in range(len(model))] + total_num_tokens = torch.tensor(0, dtype=torch.int).cuda() + forward_data_store = [] if not forward_only: output_tensor_grads = [[] for _ in range(len(model))] @@ -454,17 +638,16 @@ def enable_grad_sync(): if model_type == ModelType.encoder_and_decoder: raise RuntimeError("Interleaving is not supported with an encoder and decoder model.") - if decoder_seq_length is not None and decoder_seq_length != tensor_shape[0]: - raise RuntimeError("Interleaving is not supported with a different decoder sequence length.") - - if sequence_parallel: - seq_length, batch_size, hidden = tensor_shape - tensor_shape = ( - seq_length // parallel_state.get_tensor_model_parallel_world_size(), - batch_size, - hidden, + if decoder_seq_length is not None and decoder_seq_length != seq_length: + raise RuntimeError( + "Interleaving is not supported with a different decoder sequence length." ) + tensor_shape = [seq_length, micro_batch_size, config.hidden_size] + tensor_shape[0] = tensor_shape[0] // parallel_state.get_context_parallel_world_size() + if config.sequence_parallel: + tensor_shape[0] = tensor_shape[0] // parallel_state.get_tensor_model_parallel_world_size() + # Compute number of warmup and remaining microbatches. num_model_chunks = len(model) total_num_microbatches = num_microbatches * num_model_chunks @@ -482,32 +665,48 @@ def enable_grad_sync(): num_warmup_microbatches = total_num_microbatches all_warmup_microbatches = True else: - num_warmup_microbatches = \ - (pipeline_parallel_size - pipeline_parallel_rank - 1) * 2 - num_warmup_microbatches += ( - num_model_chunks - 1) * pipeline_parallel_size - num_warmup_microbatches = min(num_warmup_microbatches, - total_num_microbatches) - num_microbatches_remaining = \ - total_num_microbatches - num_warmup_microbatches + num_warmup_microbatches = (pipeline_parallel_size - pipeline_parallel_rank - 1) * 2 + num_warmup_microbatches += (num_model_chunks - 1) * pipeline_parallel_size + num_warmup_microbatches = min(num_warmup_microbatches, total_num_microbatches) + num_microbatches_remaining = total_num_microbatches - num_warmup_microbatches + + # Checkpoint the activations of partial Transformer layers in a number of micro-batches + # within the maximum outstanding micro-batch backpropagations. + # Micro-batches with the ids less than 'num_microbatches_with_partial_activation_checkpoints' + # checkpoint partial Transformer layers (or skip checkpointing) and + # the rest of micro-batches within a window of micro-batches checkpoint + # all Transformer layers. The window of micro-batches is set by the maximum + # outstanding backpropagations and becomes smaller at later pipeline stages. 
+ # Please refer the appendix C in https://arxiv.org/pdf/2205.05198.pdf + max_outstanding_backprops = None + if config.num_microbatches_with_partial_activation_checkpoints is not None: + max_outstanding_backprops = num_warmup_microbatches + 1 # Synchronize params for first two model chunks - if param_sync_func is not None: - param_sync_func(model[0].parameters()) - param_sync_func(model[1].parameters()) + if config.param_sync_func is not None: + config.param_sync_func[0](model[0].parameters()) + config.param_sync_func[1](model[1].parameters()) def get_model_chunk_id(microbatch_id, forward): """Helper method to get the model chunk ID given the iteration number.""" microbatch_id_in_group = microbatch_id % (pipeline_parallel_size * num_model_chunks) model_chunk_id = microbatch_id_in_group // pipeline_parallel_size if not forward: - model_chunk_id = (num_model_chunks - model_chunk_id - 1) + model_chunk_id = num_model_chunks - model_chunk_id - 1 return model_chunk_id + def get_microbatch_id_in_model_chunk(iteration_id, forward): + """Helper method to get the microbatch_id within model chunk given the iteration number.""" + assert forward + iteration_group_id = iteration_id // (pipeline_parallel_size * num_model_chunks) + microbatch_id_in_model_chunk = (iteration_group_id * pipeline_parallel_size) + ( + iteration_id % pipeline_parallel_size + ) + return microbatch_id_in_model_chunk + def is_first_microbatch_for_model_chunk(microbatch_id: int) -> bool: """Check if an iteration is the first for a model chunk.""" microbatch_group_size = pipeline_parallel_size * num_model_chunks - num_microbatch_groups = num_microbatches // microbatch_group_size microbatch_group_id = microbatch_id // microbatch_group_size microbatch_id_in_group = microbatch_id % microbatch_group_size if microbatch_group_id == 0: @@ -518,7 +717,7 @@ def is_first_microbatch_for_model_chunk(microbatch_id: int) -> bool: def is_last_microbatch_for_model_chunk(microbatch_id: int) -> bool: """Check if an iteration is the last for a model chunk.""" microbatch_group_size = pipeline_parallel_size * num_model_chunks - num_microbatch_groups = num_microbatches // microbatch_group_size + num_microbatch_groups = total_num_microbatches // microbatch_group_size microbatch_group_id = microbatch_id // microbatch_group_size microbatch_id_in_group = microbatch_id % microbatch_group_size if microbatch_group_id == num_microbatch_groups - 1: @@ -526,8 +725,7 @@ def is_last_microbatch_for_model_chunk(microbatch_id: int) -> bool: else: return False - - def forward_step_helper(microbatch_id): + def forward_step_helper(microbatch_id, current_microbatch, checkpoint_activations_microbatch): """Helper method to run forward step with model split into chunks (run set_virtual_pipeline_model_parallel_rank() before calling forward_step()).""" @@ -539,31 +737,44 @@ def forward_step_helper(microbatch_id): # To reduce idling from mismatched microbatch times, we launch # asynchronous communication at the same time across the # pipeline-parallel group. 
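A worked example of the scheduling helpers defined just above and of the warmup formula earlier in this function, using illustrative numbers: 4 pipeline stages, 2 model chunks per stage, 8 microbatches (16 virtual microbatches in total).

pipeline_parallel_size, num_model_chunks, num_microbatches = 4, 2, 8
total_num_microbatches = num_microbatches * num_model_chunks     # 16

def chunk_id(mb, forward=True):
    in_group = mb % (pipeline_parallel_size * num_model_chunks)
    cid = in_group // pipeline_parallel_size
    return cid if forward else num_model_chunks - cid - 1

def microbatch_in_chunk(it):
    group = it // (pipeline_parallel_size * num_model_chunks)
    return group * pipeline_parallel_size + it % pipeline_parallel_size

print([chunk_id(mb) for mb in range(8)])            # [0, 0, 0, 0, 1, 1, 1, 1]
print([microbatch_in_chunk(it) for it in range(8)]) # [0, 1, 2, 3, 0, 1, 2, 3]

# Warmup length on each rank (capped at total_num_microbatches):
for rank in range(pipeline_parallel_size):
    warmup = (pipeline_parallel_size - rank - 1) * 2
    warmup += (num_model_chunks - 1) * pipeline_parallel_size
    print(rank, min(warmup, total_num_microbatches))
# rank 0 -> 10, rank 1 -> 8, rank 2 -> 6, rank 3 -> 4: earlier stages must keep
# more microbatches in flight before the 1F1B steady state can begin.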
- if param_sync_func is not None: + if config.param_sync_func is not None: param_sync_microbatch_id = microbatch_id + pipeline_parallel_rank - if param_sync_microbatch_id < num_microbatches and is_first_microbatch_for_model_chunk(param_sync_microbatch_id): + if ( + param_sync_microbatch_id < total_num_microbatches + and is_first_microbatch_for_model_chunk(param_sync_microbatch_id) + ): param_sync_chunk_id = get_model_chunk_id(param_sync_microbatch_id, forward=True) + 1 if 1 < param_sync_chunk_id < num_model_chunks: - param_sync_func(model[param_sync_chunk_id].parameters()) + config.param_sync_func[param_sync_chunk_id]( + model[param_sync_chunk_id].parameters() + ) # forward step if parallel_state.is_pipeline_first_stage(): - if len(input_tensors[model_chunk_id]) == \ - len(output_tensors[model_chunk_id]): + if len(input_tensors[model_chunk_id]) == len(output_tensors[model_chunk_id]): input_tensors[model_chunk_id].append(None) input_tensor = input_tensors[model_chunk_id][-1] - output_tensor = forward_step(forward_step_func, - data_iterator[model_chunk_id], - model[model_chunk_id], - num_microbatches, - input_tensor, - forward_data_store, - timers, - collect_non_loss_data, - dtype, - enable_autocast) + + output_tensor, num_tokens = forward_step( + forward_step_func, + data_iterator[model_chunk_id], + model[model_chunk_id], + num_microbatches, + input_tensor, + forward_data_store, + config, + collect_non_loss_data, + checkpoint_activations_microbatch, + check_first_val_step( + first_val_step, forward_only, is_first_microbatch_for_model_chunk(microbatch_id) + ), + current_microbatch=current_microbatch, + ) output_tensors[model_chunk_id].append(output_tensor) + nonlocal total_num_tokens + total_num_tokens += num_tokens.item() + # if forward-only, no need to save tensors for a backward pass if forward_only: input_tensors[model_chunk_id].pop() @@ -579,7 +790,7 @@ def backward_step_helper(microbatch_id): parallel_state.set_virtual_pipeline_model_parallel_rank(model_chunk_id) # launch grad synchronization (default) - if grad_sync_func is None and is_last_microbatch_for_model_chunk(microbatch_id): + if config.grad_sync_func is None and is_last_microbatch_for_model_chunk(microbatch_id): enable_grad_sync() synchronized_model_chunks.add(model_chunk_id) @@ -589,26 +800,23 @@ def backward_step_helper(microbatch_id): input_tensor = input_tensors[model_chunk_id].pop(0) output_tensor = output_tensors[model_chunk_id].pop(0) output_tensor_grad = output_tensor_grads[model_chunk_id].pop(0) - input_tensor_grad = \ - backward_step(grad_scaler, - input_tensor, - output_tensor, - output_tensor_grad, - model_type, - timers, - deallocate_pipeline_outputs) + input_tensor_grad = backward_step( + input_tensor, output_tensor, output_tensor_grad, model_type, config + ) # launch grad synchronization (custom grad sync) # Note: Asynchronous communication tends to slow down compute. # To reduce idling from mismatched microbatch times, we launch # asynchronous communication at the same time across the # pipeline-parallel group. 
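A small arithmetic sketch of the +rank / -rank offsets used when launching the chunk-level param and grad collectives above and below, with illustrative numbers (2 pipeline stages, trigger at microbatch 2). The intent of the offsets, per the comments, is that every rank issues the same collective at roughly the same wall-clock time.

pipeline_parallel_size = 2
trigger_microbatch = 2   # e.g. the first microbatch of model chunk 1

for rank in range(pipeline_parallel_size):
    # param sync fires when microbatch_id + rank == trigger_microbatch,
    # grad sync fires when microbatch_id - rank == trigger_microbatch.
    param_sync_local_iter = trigger_microbatch - rank
    grad_sync_local_iter = trigger_microbatch + rank
    print(rank, param_sync_local_iter, grad_sync_local_iter)
# rank 0 -> (2, 2), rank 1 -> (1, 3): later ranks prefetch params one local
# iteration earlier and reduce grads one iteration later, compensating for the
# pipeline shift so the launches roughly coincide across the group.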
- if grad_sync_func is not None: + if config.grad_sync_func is not None: grad_sync_microbatch_id = microbatch_id - pipeline_parallel_rank - if grad_sync_microbatch_id >= 0 and is_last_microbatch_for_model_chunk(grad_sync_microbatch_id): + if grad_sync_microbatch_id >= 0 and is_last_microbatch_for_model_chunk( + grad_sync_microbatch_id + ): grad_sync_chunk_id = get_model_chunk_id(grad_sync_microbatch_id, forward=False) enable_grad_sync() - grad_sync_func(model[grad_sync_chunk_id].parameters()) + config.grad_sync_func[grad_sync_chunk_id](model[grad_sync_chunk_id].parameters()) synchronized_model_chunks.add(grad_sync_chunk_id) disable_grad_sync() @@ -616,13 +824,33 @@ def backward_step_helper(microbatch_id): # Run warmup forward passes. parallel_state.set_virtual_pipeline_model_parallel_rank(0) - input_tensors[0].append( - p2p_communication.recv_forward(tensor_shape, dtype, timers=timers)) + input_tensors[0].append(p2p_communication.recv_forward(tensor_shape, config)) + + fwd_wait_handles = None + bwd_wait_handles = None + for k in range(num_warmup_microbatches): - output_tensor = forward_step_helper(k) + + if fwd_wait_handles is not None: + for req in fwd_wait_handles: + req.wait() + + # Decide to checkpoint all layers' activations of the current micro-batch + if max_outstanding_backprops is not None: + checkpoint_activations_microbatch = ( + k % max_outstanding_backprops + >= config.num_microbatches_with_partial_activation_checkpoints + ) + else: + checkpoint_activations_microbatch = None + + current_microbatch = get_microbatch_id_in_model_chunk(k, forward=True) + output_tensor = forward_step_helper( + k, current_microbatch, checkpoint_activations_microbatch + ) # Determine if tensor should be received from previous stage. - next_forward_model_chunk_id = get_model_chunk_id(k+1, forward=True) + next_forward_model_chunk_id = get_model_chunk_id(k + 1, forward=True) recv_prev = True if parallel_state.is_pipeline_first_stage(ignore_virtual=True): if next_forward_model_chunk_id == 0: @@ -636,108 +864,257 @@ def backward_step_helper(microbatch_id): # Send and receive tensors as appropriate (send tensors computed # in this iteration; receive tensors for next iteration). 
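A worked example of the checkpoint_activations_microbatch decision used in the warmup and steady-state loops, with illustrative numbers: 3 warmup microbatches (so a window of 4 outstanding backprops) and 2 microbatches per window allowed partial (or no) activation checkpointing.

num_warmup_microbatches = 3
max_outstanding_backprops = num_warmup_microbatches + 1          # 4
num_microbatches_with_partial_activation_checkpoints = 2

for k in range(8):
    checkpoint_all_layers = (
        k % max_outstanding_backprops
        >= num_microbatches_with_partial_activation_checkpoints
    )
    print(k, checkpoint_all_layers)
# k = 0, 1, 4, 5 -> False (partial or no checkpointing); k = 2, 3, 6, 7 -> True
# (checkpoint every layer). The first microbatches of each window get the
# cheaper treatment because their backward passes, and hence their activation
# memory, are outstanding for the shortest time.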
- if k == (num_warmup_microbatches - 1) and not forward_only and \ - not all_warmup_microbatches: - input_tensor_grad = None - recv_next = True - if parallel_state.is_pipeline_last_stage(ignore_virtual=True): - recv_next = False - input_tensor, output_tensor_grad = \ - p2p_communication.send_forward_backward_recv_forward_backward( - output_tensor, input_tensor_grad, - recv_prev=recv_prev, recv_next=recv_next, - tensor_shape=tensor_shape, dtype=dtype, - timers=timers) - output_tensor_grads[num_model_chunks-1].append(output_tensor_grad) + if not config.overlap_p2p_comm: + if ( + k == (num_warmup_microbatches - 1) + and not forward_only + and not all_warmup_microbatches + ): + input_tensor_grad = None + recv_next = True + if parallel_state.is_pipeline_last_stage(ignore_virtual=True): + recv_next = False + (input_tensor, output_tensor_grad) = ( + p2p_communication.send_forward_backward_recv_forward_backward( + output_tensor, + input_tensor_grad, + recv_prev=recv_prev, + recv_next=recv_next, + tensor_shape=tensor_shape, + config=config, + ) + ) + output_tensor_grads[num_model_chunks - 1].append(output_tensor_grad) + else: + input_tensor = p2p_communication.send_forward_recv_forward( + output_tensor, recv_prev=recv_prev, tensor_shape=tensor_shape, config=config + ) + input_tensors[next_forward_model_chunk_id].append(input_tensor) else: - input_tensor = \ - p2p_communication.send_forward_recv_forward( - output_tensor, recv_prev=recv_prev, - tensor_shape=tensor_shape, dtype=dtype, - timers=timers) - input_tensors[next_forward_model_chunk_id].append(input_tensor) - deallocate_output_tensor(output_tensor, deallocate_pipeline_outputs) + input_tensor, fwd_wait_handles = p2p_communication.send_forward_recv_forward( + output_tensor, + recv_prev=recv_prev, + tensor_shape=tensor_shape, + config=config, + overlap_p2p_comm=True, + ) + + if ( + k == (num_warmup_microbatches - 1) + and not forward_only + and not all_warmup_microbatches + ): + input_tensor_grad = None + recv_next = True + if parallel_state.is_pipeline_last_stage(ignore_virtual=True): + recv_next = False + + (output_tensor_grad, bwd_wait_handles) = ( + p2p_communication.send_backward_recv_backward( + input_tensor_grad, + recv_next=recv_next, + tensor_shape=tensor_shape, + config=config, + overlap_p2p_comm=True, + ) + ) + + output_tensor_grads[num_model_chunks - 1].append(output_tensor_grad) + input_tensors[next_forward_model_chunk_id].append(input_tensor) + + deallocate_output_tensor(output_tensor, config.deallocate_pipeline_outputs) # Run 1F1B in steady state. for k in range(num_microbatches_remaining): # Forward pass. forward_k = k + num_warmup_microbatches - output_tensor = forward_step_helper(forward_k) - # Backward pass. 
- backward_k = k - input_tensor_grad = backward_step_helper(backward_k) + # Decide to checkpoint all layers' activations of the current micro-batch + if max_outstanding_backprops is not None: + checkpoint_activations_microbatch = ( + forward_k % max_outstanding_backprops + >= config.num_microbatches_with_partial_activation_checkpoints + ) + else: + checkpoint_activations_microbatch = None + + current_microbatch = get_microbatch_id_in_model_chunk(forward_k, forward=True) + if config.overlap_p2p_comm: + if fwd_wait_handles is not None: + for req in fwd_wait_handles: + req.wait() + + deallocate_output_tensor(output_tensor, config.deallocate_pipeline_outputs) + + output_tensor = forward_step_helper( + forward_k, current_microbatch, checkpoint_activations_microbatch + ) + + # Determine if current stage has anything to send in either direction, + # otherwise set tensor to None. + forward_model_chunk_id = get_model_chunk_id(forward_k, forward=True) + parallel_state.set_virtual_pipeline_model_parallel_rank(forward_model_chunk_id) + + # Last virtual stage no activation tensor to send + if parallel_state.is_pipeline_last_stage(): + output_tensor = None + + # Determine if peers are sending, and where in data structure to put + # received tensors. + recv_prev = True + if parallel_state.is_pipeline_first_stage(ignore_virtual=True): + # First stage is ahead of last stage by (pipeline_parallel_size - 1). + next_forward_model_chunk_id = get_model_chunk_id( + forward_k - (pipeline_parallel_size - 1), forward=True + ) + if next_forward_model_chunk_id == (num_model_chunks - 1): + recv_prev = False + next_forward_model_chunk_id += 1 + else: + next_forward_model_chunk_id = get_model_chunk_id(forward_k + 1, forward=True) - # Send output_tensor and input_tensor_grad, receive input_tensor - # and output_tensor_grad. + # If last iteration, don't receive; we already received one extra + # before the start of the for loop. + if k == (num_microbatches_remaining - 1): + recv_prev = False - # Determine if current stage has anything to send in either direction, - # otherwise set tensor to None. - forward_model_chunk_id = get_model_chunk_id(forward_k, forward=True) - parallel_state.set_virtual_pipeline_model_parallel_rank(forward_model_chunk_id) - if parallel_state.is_pipeline_last_stage(): - output_tensor = None + # Send activation tensor to the next stage and receive activation tensor from the + # previous stage + input_tensor, fwd_wait_handles = p2p_communication.send_forward_recv_forward( + output_tensor, + recv_prev=recv_prev, + tensor_shape=tensor_shape, + config=config, + overlap_p2p_comm=True, + ) + # assert fwd_wait_handles is not None + + if bwd_wait_handles is not None: + for req in bwd_wait_handles: + req.wait() + + # Backward pass. + backward_k = k + input_tensor_grad = backward_step_helper(backward_k) + + backward_model_chunk_id = get_model_chunk_id(backward_k, forward=False) + parallel_state.set_virtual_pipeline_model_parallel_rank(backward_model_chunk_id) + + # First virtual stage no activation gradient tensor to send + if parallel_state.is_pipeline_first_stage(): + input_tensor_grad = None + + # Determine if the current virtual stage has an activation gradient tensor to receive + recv_next = True + if parallel_state.is_pipeline_last_stage(ignore_virtual=True): + # Last stage is ahead of first stage by (pipeline_parallel_size - 1). 
+ next_backward_model_chunk_id = get_model_chunk_id( + backward_k - (pipeline_parallel_size - 1), forward=False + ) + if next_backward_model_chunk_id == 0: + recv_next = False + next_backward_model_chunk_id -= 1 + else: + next_backward_model_chunk_id = get_model_chunk_id(backward_k + 1, forward=False) + + output_tensor_grad, bwd_wait_handles = p2p_communication.send_backward_recv_backward( + input_tensor_grad, + recv_next=recv_next, + tensor_shape=tensor_shape, + config=config, + overlap_p2p_comm=True, + ) + + else: # no p2p overlap + output_tensor = forward_step_helper( + forward_k, current_microbatch, checkpoint_activations_microbatch + ) + + # Backward pass. + backward_k = k + input_tensor_grad = backward_step_helper(backward_k) + + # Send output_tensor and input_tensor_grad, receive input_tensor + # and output_tensor_grad. + + # Determine if current stage has anything to send in either direction, + # otherwise set tensor to None. + forward_model_chunk_id = get_model_chunk_id(forward_k, forward=True) + parallel_state.set_virtual_pipeline_model_parallel_rank(forward_model_chunk_id) + if parallel_state.is_pipeline_last_stage(): + output_tensor = None + + backward_model_chunk_id = get_model_chunk_id(backward_k, forward=False) + parallel_state.set_virtual_pipeline_model_parallel_rank(backward_model_chunk_id) + if parallel_state.is_pipeline_first_stage(): + input_tensor_grad = None + + # Determine if peers are sending, and where in data structure to put + # received tensors. + recv_prev = True + if parallel_state.is_pipeline_first_stage(ignore_virtual=True): + # First stage is ahead of last stage by (pipeline_parallel_size - 1). + next_forward_model_chunk_id = get_model_chunk_id( + forward_k - (pipeline_parallel_size - 1), forward=True + ) + if next_forward_model_chunk_id == (num_model_chunks - 1): + recv_prev = False + next_forward_model_chunk_id += 1 + else: + next_forward_model_chunk_id = get_model_chunk_id(forward_k + 1, forward=True) - backward_model_chunk_id = get_model_chunk_id(backward_k, forward=False) - parallel_state.set_virtual_pipeline_model_parallel_rank(backward_model_chunk_id) - if parallel_state.is_pipeline_first_stage(): - input_tensor_grad = None + recv_next = True + if parallel_state.is_pipeline_last_stage(ignore_virtual=True): + # Last stage is ahead of first stage by (pipeline_parallel_size - 1). + next_backward_model_chunk_id = get_model_chunk_id( + backward_k - (pipeline_parallel_size - 1), forward=False + ) + if next_backward_model_chunk_id == 0: + recv_next = False + next_backward_model_chunk_id -= 1 + else: + next_backward_model_chunk_id = get_model_chunk_id(backward_k + 1, forward=False) - # Determine if peers are sending, and where in data structure to put - # received tensors. - recv_prev = True - if parallel_state.is_pipeline_first_stage(ignore_virtual=True): - # First stage is ahead of last stage by (pipeline_parallel_size - 1). - next_forward_model_chunk_id = get_model_chunk_id( - forward_k - (pipeline_parallel_size - 1), forward=True) - if next_forward_model_chunk_id == (num_model_chunks - 1): + # If last iteration, don't receive; we already received one extra + # before the start of the for loop. + if k == (num_microbatches_remaining - 1): recv_prev = False - next_forward_model_chunk_id += 1 - else: - next_forward_model_chunk_id = get_model_chunk_id(forward_k + 1, - forward=True) - - recv_next = True - if parallel_state.is_pipeline_last_stage(ignore_virtual=True): - # Last stage is ahead of first stage by (pipeline_parallel_size - 1). 
- next_backward_model_chunk_id = get_model_chunk_id( - backward_k - (pipeline_parallel_size - 1), forward=False) - if next_backward_model_chunk_id == 0: - recv_next = False - next_backward_model_chunk_id -= 1 - else: - next_backward_model_chunk_id = get_model_chunk_id(backward_k + 1, - forward=False) - - # If last iteration, don't receive; we already received one extra - # before the start of the for loop. - if k == (num_microbatches_remaining - 1): - recv_prev = False - # Communicate tensors. - input_tensor, output_tensor_grad = \ - p2p_communication.send_forward_backward_recv_forward_backward( - output_tensor, input_tensor_grad, - recv_prev=recv_prev, recv_next=recv_next, - tensor_shape=tensor_shape, dtype=dtype, timers=timers) - deallocate_output_tensor(output_tensor, deallocate_pipeline_outputs) + # Communicate tensors. + (input_tensor, output_tensor_grad) = ( + p2p_communication.send_forward_backward_recv_forward_backward( + output_tensor, + input_tensor_grad, + recv_prev=recv_prev, + recv_next=recv_next, + tensor_shape=tensor_shape, + config=config, + ) + ) + deallocate_output_tensor(output_tensor, config.deallocate_pipeline_outputs) # Put input_tensor and output_tensor_grad in data structures in the # right location. if recv_prev: input_tensors[next_forward_model_chunk_id].append(input_tensor) if recv_next: - output_tensor_grads[next_backward_model_chunk_id].append( - output_tensor_grad) + output_tensor_grads[next_backward_model_chunk_id].append(output_tensor_grad) + + deallocate_output_tensor(output_tensor, config.deallocate_pipeline_outputs) # Run cooldown backward passes (flush out pipeline). if not forward_only: + if config.overlap_p2p_comm and bwd_wait_handles is not None: + for wait_handle in bwd_wait_handles: + wait_handle.wait() + if all_warmup_microbatches: - output_tensor_grads[num_model_chunks-1].append( - p2p_communication.recv_backward(tensor_shape, dtype=dtype, timers=timers)) + output_tensor_grads[num_model_chunks - 1].append( + p2p_communication.recv_backward(tensor_shape, config=config) + ) for k in range(num_microbatches_remaining, total_num_microbatches): input_tensor_grad = backward_step_helper(k) - next_backward_model_chunk_id = get_model_chunk_id(k+1, forward=False) + next_backward_model_chunk_id = get_model_chunk_id(k + 1, forward=False) recv_next = True if parallel_state.is_pipeline_last_stage(ignore_virtual=True): if next_backward_model_chunk_id == (num_model_chunks - 1): @@ -746,211 +1123,265 @@ def backward_step_helper(microbatch_id): recv_next = False output_tensor_grads[next_backward_model_chunk_id].append( p2p_communication.send_backward_recv_backward( - input_tensor_grad, recv_next=recv_next, - tensor_shape=tensor_shape, dtype=dtype, - timers=timers)) - - # Launch any remaining grad reductions - enable_grad_sync() - if grad_sync_func is not None: - params = [] - for model_chunk_id in range(num_model_chunks): - if model_chunk_id not in synchronized_model_chunks: - params.extend(model[model_chunk_id].parameters()) - synchronized_model_chunks.add(model_chunk_id) - if params: - grad_sync_func(params) + input_tensor_grad, recv_next=recv_next, tensor_shape=tensor_shape, config=config + ) + ) + + # Launch any remaining grad reductions. 
+ enable_grad_sync() + if config.grad_sync_func is not None: + for model_chunk_id in range(num_model_chunks): + if model_chunk_id not in synchronized_model_chunks: + config.grad_sync_func[model_chunk_id](model[model_chunk_id].parameters()) + synchronized_model_chunks.add(model_chunk_id) + + if config.finalize_model_grads_func is not None and not forward_only: + + # If defer_embedding_wgrad_compute is enabled we need to do the + # weight gradient GEMM's here. + finish_embedding_wgrad_compute(config, embedding_module) + + # Finalize model grads (perform full grad all-reduce / reduce-scatter for + # data parallelism, layernorm all-reduce for sequence parallelism, and + # embedding all-reduce for pipeline parallelism). + config.finalize_model_grads_func( + model, total_num_tokens if config.calculate_per_token_loss else None + ) + + # Restore config.grad_sync_func and config.param_sync_func. + if forward_only: + config.grad_sync_func, config.param_sync_func = grad_sync_func, param_sync_func + + if config.timers is not None: + config.timers('forward-backward').stop() return forward_data_store -def get_tensor_shapes(*, - rank: int, - model_type: ModelType, - tensor_shape: Shape, - decoder_seq_length: int, - sequence_parallel: bool): - # Determine right tensor sizes (based on position of rank with respect to split - # rank) and model size. - # Send two tensors if model is T5 and rank is in decoder stage: - # first tensor is decoder (pre-transpose), - # second tensor is encoder (post-transpose). - # If model is T5 and rank is at the boundary: - # send one tensor (post-transpose from encoder). - # Otherwise, send one tensor (pre-transpose). - tensor_shapes = [] - assert ( - len(tensor_shape) == 3 - ), f"`tensor_shape` should be [sequence_length, micro_batch_size, hidden_size] but {tensor_shape}" +def get_tensor_shapes( + *, + rank: int, + model_type: ModelType, + seq_length: int, + micro_batch_size: int, + decoder_seq_length: int, + config, + encoder_decoder_xattn: bool, +): + # Determine right tensor sizes (based on position of rank with + # respect to split rank) and model size. + # Send two tensors if model decoder requires the encoder's output + # (via cross-attention) and rank is in decoder stage. + # first tensor is decoder. + # second tensor is encoder. + # If model has an encoder & decoder and rank is at the boundary: + # send one tensor. + # Otherwise, send one tensor. 
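A worked example of the shape rules described in the comments above (the implementation follows), assuming an illustrative configuration: seq_length 4096, decoder_seq_length 1024, micro_batch_size 2, hidden_size 4096, context-parallel size 2, tensor-parallel size 4 with sequence parallelism enabled.

seq_length, decoder_seq_length = 4096, 1024
micro_batch_size, hidden_size = 2, 4096
context_parallel_size, tensor_parallel_size = 2, 4

seq_length //= context_parallel_size              # 2048
decoder_seq_length //= context_parallel_size      # 512
seq_length //= tensor_parallel_size               # 512  (sequence parallelism)
decoder_seq_length //= tensor_parallel_size       # 128

# Encoder-stage ranks communicate one (512, 2, 4096) tensor; a decoder-stage
# rank whose layers cross-attend to the encoder communicates both
# (128, 2, 4096) and (512, 2, 4096).
print((seq_length, micro_batch_size, hidden_size))
print((decoder_seq_length, micro_batch_size, hidden_size))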
+ tensor_shapes = [] - seq_length, micro_batch_size, hidden_size = tensor_shape + seq_length = seq_length // parallel_state.get_context_parallel_world_size() + if model_type == ModelType.encoder_and_decoder: + decoder_seq_length = decoder_seq_length // parallel_state.get_context_parallel_world_size() - if sequence_parallel: + if config.sequence_parallel: seq_length = seq_length // parallel_state.get_tensor_model_parallel_world_size() + if model_type == ModelType.encoder_and_decoder: + decoder_seq_length = ( + decoder_seq_length // parallel_state.get_tensor_model_parallel_world_size() + ) if model_type == ModelType.encoder_and_decoder: - if sequence_parallel: - decoder_seq_length = decoder_seq_length // parallel_state.get_tensor_model_parallel_world_size() - - if parallel_state.is_pipeline_stage_before_split(rank): - tensor_shapes.append((seq_length, micro_batch_size, hidden_size)) + if parallel_state.is_inside_encoder(rank): + tensor_shapes.append((seq_length, micro_batch_size, config.hidden_size)) + elif encoder_decoder_xattn: + tensor_shapes.append((decoder_seq_length, micro_batch_size, config.hidden_size)) + tensor_shapes.append((seq_length, micro_batch_size, config.hidden_size)) else: - tensor_shapes.append((decoder_seq_length, micro_batch_size, hidden_size)) - tensor_shapes.append((seq_length, micro_batch_size, hidden_size)) - else: - tensor_shapes.append((seq_length, micro_batch_size, hidden_size)) + tensor_shapes.append((decoder_seq_length, micro_batch_size, config.hidden_size)) + else: # model_type == ModelType.encoder_or_decoder + tensor_shapes.append((seq_length, micro_batch_size, config.hidden_size)) return tensor_shapes - -def recv_forward(tensor_shapes, dtype, timers): +def recv_forward(tensor_shapes, config): input_tensors = [] for tensor_shape in tensor_shapes: if tensor_shape is None: input_tensors.append(None) else: - input_tensors.append(p2p_communication.recv_forward(tensor_shape, dtype, - timers=timers)) + input_tensors.append(p2p_communication.recv_forward(tensor_shape, config)) return input_tensors -def recv_backward(tensor_shapes, dtype, timers): +def recv_backward(tensor_shapes, config): output_tensor_grads = [] for tensor_shape in tensor_shapes: if tensor_shape is None: output_tensor_grads.append(None) else: - output_tensor_grads.append(p2p_communication.recv_backward(tensor_shape, dtype, - timers=timers)) + output_tensor_grads.append(p2p_communication.recv_backward(tensor_shape, config)) return output_tensor_grads -def send_forward(output_tensors, tensor_shapes, timers): +def send_forward(output_tensors, tensor_shapes, config): if not isinstance(output_tensors, list): output_tensors = [output_tensors] - for (output_tensor, tensor_shape) in zip(output_tensors, tensor_shapes): + for output_tensor, tensor_shape in zip(output_tensors, tensor_shapes): if tensor_shape is None: continue - p2p_communication.send_forward(output_tensor, timers=timers) + p2p_communication.send_forward(output_tensor, config) -def send_backward(input_tensor_grads, tensor_shapes, timers): +def send_backward(input_tensor_grads, tensor_shapes, config): if not isinstance(input_tensor_grads, list): input_tensor_grads = [input_tensor_grads] - for (input_tensor_grad, tensor_shape) in zip(input_tensor_grads, tensor_shapes): + for input_tensor_grad, tensor_shape in zip(input_tensor_grads, tensor_shapes): if tensor_shape is None: continue - p2p_communication.send_backward(input_tensor_grad, timers=timers) + p2p_communication.send_backward(input_tensor_grad, config) -def 
send_forward_recv_backward(output_tensors, tensor_shapes, dtype, timers): +def send_forward_recv_backward(output_tensors, tensor_shapes, config): if not isinstance(output_tensors, list): output_tensors = [output_tensors] output_tensor_grads = [] - for (output_tensor, tensor_shape) in zip(output_tensors, tensor_shapes): + for output_tensor, tensor_shape in zip(output_tensors, tensor_shapes): if tensor_shape is None: output_tensor_grads.append(None) continue output_tensor_grad = p2p_communication.send_forward_recv_backward( - output_tensor, tensor_shape, dtype, timers=timers) + output_tensor, tensor_shape, config + ) output_tensor_grads.append(output_tensor_grad) return output_tensor_grads -def send_backward_recv_forward(input_tensor_grads, tensor_shapes, dtype, timers): +def send_backward_recv_forward(input_tensor_grads, tensor_shapes, config): if not isinstance(input_tensor_grads, list): input_tensor_grads = [input_tensor_grads] input_tensors = [] - for (input_tensor_grad, tensor_shape) in zip(input_tensor_grads, tensor_shapes): + for input_tensor_grad, tensor_shape in zip(input_tensor_grads, tensor_shapes): if tensor_shape is None: input_tensors.append(None) continue input_tensor = p2p_communication.send_backward_recv_forward( - input_tensor_grad, tensor_shape, dtype, timers=timers) + input_tensor_grad, tensor_shape, config + ) input_tensors.append(input_tensor) return input_tensors -def forward_backward_pipelining_without_interleaving(*, - forward_step_func, - data_iterator: Union[Iterator, List[Iterator]], - model: Union[torch.nn.Module, List[torch.nn.Module]], - num_microbatches: int, - dtype: torch.dtype, - tensor_shape: Shape, - decoder_seq_length: Optional[int] = None, - grad_scaler: Callable = None, - sequence_parallel: bool = False, - forward_only: bool = False, - timers: Callable = None, - collect_non_loss_data: bool = False, - enable_autocast: bool = False, - deallocate_pipeline_outputs: bool = False, - no_sync_func: Optional[Callable] = None, - grad_sync_func: Optional[Callable] = None, - param_sync_func: Optional[Callable] = None, # unused - ): +def forward_backward_pipelining_without_interleaving( + *, + forward_step_func, + data_iterator: Union[Iterator, List[Iterator]], + model: Union[torch.nn.Module, List[torch.nn.Module]], + num_microbatches: int, + seq_length: int, + micro_batch_size: int, + decoder_seq_length: int = None, + forward_only: bool = False, + collect_non_loss_data: bool = False, + first_val_step: bool = None, +): """Run non-interleaved 1F1B schedule, with communication between pipeline - stages. - - Returns dictionary with losses if the last stage, empty dict otherwise.""" + stages. 
Returns dictionary with losses if the last stage, empty dict otherwise.""" if isinstance(model, list): - assert len(model) == 1, \ - "non-interleaved pipeline parallelism does not support model chunking" + assert ( + len(model) == 1 + ), "non-interleaved pipeline parallelism does not support model chunking" model = model[0] if isinstance(data_iterator, list): - assert len(data_iterator) == 1, \ - "non-pipeline-parallel schedule does not support model chunking" + assert ( + len(data_iterator) == 1 + ), "non-pipeline-parallel schedule does not support model chunking" data_iterator = data_iterator[0] + config = get_model_config(model) + if config.overlap_p2p_comm: + raise ValueError( + "Non-interleaved pipeline parallelism does not support overlapping p2p communication" + ) + + # Needed only when gradients are finalized in M-Core + if config.finalize_model_grads_func is not None and not forward_only: + embedding_module = clear_embedding_activation_buffer(config, model) + + if config.timers is not None: + config.timers('forward-backward', log_level=1).start(barrier=config.barrier_with_L1_time) + # Disable async grad reductions - if no_sync_func is None and isinstance(model, torchDDP): - no_sync_func = model.no_sync + no_sync_func = config.no_sync_func if no_sync_func is None: no_sync_func = contextlib.nullcontext no_sync_context = None + def disable_grad_sync(): """Disable asynchronous grad reductions""" nonlocal no_sync_context if no_sync_context is None: no_sync_context = no_sync_func() no_sync_context.__enter__() + def enable_grad_sync(): """Enable asynchronous grad reductions""" nonlocal no_sync_context if no_sync_context is not None: no_sync_context.__exit__(None, None, None) no_sync_context = None + disable_grad_sync() # Compute number of warmup microbatches. - num_warmup_microbatches = \ - (parallel_state.get_pipeline_model_parallel_world_size() - - parallel_state.get_pipeline_model_parallel_rank() - 1) - num_warmup_microbatches = min( - num_warmup_microbatches, - num_microbatches) - num_microbatches_remaining = \ - num_microbatches - num_warmup_microbatches + num_warmup_microbatches = ( + parallel_state.get_pipeline_model_parallel_world_size() + - parallel_state.get_pipeline_model_parallel_rank() + - 1 + ) + num_warmup_microbatches = min(num_warmup_microbatches, num_microbatches) + num_microbatches_remaining = num_microbatches - num_warmup_microbatches + + # Checkpoint the activations of partial Transformer layers in a number of micro-batches + # within the maximum outstanding micro-batch backpropagations. + # Micro-batches with the ids less than 'num_microbatches_with_partial_activation_checkpoints' + # checkpoint partial Transformer layers (or skip checkpointing) and + # the rest of micro-batches within a window of micro-batches checkpoint + # all Transformer layers. The window of micro-batches is set by the maximum + # outstanding backpropagations and becomes smaller at later pipeline stages. 
+ # Please refer the appendix C in https://arxiv.org/pdf/2205.05198.pdf + max_outstanding_backprops = None + if config.num_microbatches_with_partial_activation_checkpoints is not None: + max_outstanding_backprops = num_warmup_microbatches + 1 model_type = get_model_type(model) + encoder_decoder_xattn = get_model_xattn(model) rank = parallel_state.get_pipeline_model_parallel_rank() - recv_tensor_shapes = get_tensor_shapes(rank=rank-1, - model_type=model_type, - tensor_shape=tensor_shape, - decoder_seq_length=decoder_seq_length, - sequence_parallel=sequence_parallel) - send_tensor_shapes = get_tensor_shapes(rank=rank, - model_type=model_type, - tensor_shape=tensor_shape, - decoder_seq_length=decoder_seq_length, - sequence_parallel=sequence_parallel) + recv_tensor_shapes = get_tensor_shapes( + rank=rank - 1, + model_type=model_type, + seq_length=seq_length, + micro_batch_size=micro_batch_size, + decoder_seq_length=decoder_seq_length, + config=config, + encoder_decoder_xattn=encoder_decoder_xattn, + ) + send_tensor_shapes = get_tensor_shapes( + rank=rank, + model_type=model_type, + seq_length=seq_length, + micro_batch_size=micro_batch_size, + decoder_seq_length=decoder_seq_length, + config=config, + encoder_decoder_xattn=encoder_decoder_xattn, + ) # Input, output tensors only need to be saved when doing backward passes input_tensors = None output_tensors = None + total_num_tokens = torch.tensor(0, dtype=torch.int).cuda() + if not forward_only: input_tensors = [] output_tensors = [] @@ -958,64 +1389,112 @@ def enable_grad_sync(): # Run warmup forward passes. for i in range(num_warmup_microbatches): - input_tensor = recv_forward(recv_tensor_shapes, dtype, timers=timers) - output_tensor = forward_step(forward_step_func, data_iterator, model, num_microbatches, - input_tensor, forward_data_store, - timers, collect_non_loss_data, dtype, enable_autocast) - send_forward(output_tensor, send_tensor_shapes, timers=timers) + # Decide to checkpoint all layers' activations of the current micro-batch + if max_outstanding_backprops is not None: + checkpoint_activations_microbatch = ( + i % max_outstanding_backprops + >= config.num_microbatches_with_partial_activation_checkpoints + ) + else: + checkpoint_activations_microbatch = None + + input_tensor = recv_forward(recv_tensor_shapes, config) + output_tensor, num_tokens = forward_step( + forward_step_func, + data_iterator, + model, + num_microbatches, + input_tensor, + forward_data_store, + config, + collect_non_loss_data, + checkpoint_activations_microbatch, + check_first_val_step(first_val_step, forward_only, i == 0), + current_microbatch=i, + encoder_decoder_xattn=encoder_decoder_xattn, + ) + send_forward(output_tensor, send_tensor_shapes, config) + total_num_tokens += num_tokens.item() if not forward_only: input_tensors.append(input_tensor) output_tensors.append(output_tensor) - deallocate_output_tensor(output_tensor[0], deallocate_pipeline_outputs) + deallocate_output_tensor(output_tensor[0], config.deallocate_pipeline_outputs) # Before running 1F1B, need to receive first forward tensor. # If all microbatches are run in warmup / cooldown phase, then no need to # receive this tensor here. if num_microbatches_remaining > 0: - input_tensor = recv_forward(recv_tensor_shapes, dtype, timers=timers) + input_tensor = recv_forward(recv_tensor_shapes, config) # Run 1F1B in steady state. 
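A worked example of the warmup / steady-state / cooldown split computed earlier in this function, assuming 4 pipeline stages and 8 microbatches (illustrative numbers).

pipeline_parallel_size, num_microbatches = 4, 8

for rank in range(pipeline_parallel_size):
    num_warmup = min(pipeline_parallel_size - rank - 1, num_microbatches)
    remaining = num_microbatches - num_warmup
    # `num_warmup` forward-only passes, then `remaining` 1F1B iterations,
    # then `num_warmup` cooldown backward passes to flush the pipeline.
    print(rank, num_warmup, remaining, num_warmup)
# rank 0 -> (3, 5, 3), rank 1 -> (2, 6, 2), rank 2 -> (1, 7, 1), rank 3 -> (0, 8, 0):
# the last stage runs pure 1F1B, while earlier stages pay a deeper pipeline bubble.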
for i in range(num_microbatches_remaining): - last_iteration = (i == (num_microbatches_remaining - 1)) + last_iteration = i == (num_microbatches_remaining - 1) - output_tensor = forward_step(forward_step_func, data_iterator, model, num_microbatches, - input_tensor, forward_data_store, - timers, collect_non_loss_data, dtype, enable_autocast) + # Decide to checkpoint all layers' activations of the current micro-batch + if max_outstanding_backprops is not None: + checkpoint_activations_microbatch = ( + (i + num_warmup_microbatches) % max_outstanding_backprops + ) >= config.num_microbatches_with_partial_activation_checkpoints + else: + checkpoint_activations_microbatch = None + + output_tensor, num_tokens = forward_step( + forward_step_func, + data_iterator, + model, + num_microbatches, + input_tensor, + forward_data_store, + config, + collect_non_loss_data, + checkpoint_activations_microbatch, + check_first_val_step( + first_val_step, forward_only, (i == 0) and (num_warmup_microbatches == 0) + ), + current_microbatch=i + num_warmup_microbatches, + encoder_decoder_xattn=encoder_decoder_xattn, + ) + total_num_tokens += num_tokens.item() if forward_only: - send_forward(output_tensor, send_tensor_shapes, timers=timers) + send_forward(output_tensor, send_tensor_shapes, config) if not last_iteration: - input_tensor = recv_forward(recv_tensor_shapes, dtype, timers=timers) + input_tensor = recv_forward(recv_tensor_shapes, config) else: - output_tensor_grad = \ - send_forward_recv_backward(output_tensor, - send_tensor_shapes, dtype, - timers=timers) + output_tensor_grad = send_forward_recv_backward( + output_tensor, send_tensor_shapes, config + ) # Add input_tensor and output_tensor to end of list. input_tensors.append(input_tensor) output_tensors.append(output_tensor) - deallocate_output_tensor(output_tensor[0], deallocate_pipeline_outputs) + deallocate_output_tensor(output_tensor[0], config.deallocate_pipeline_outputs) # Pop input_tensor and output_tensor from the start of the list for # the backward pass. input_tensor = input_tensors.pop(0) output_tensor = output_tensors.pop(0) - input_tensor_grad = \ - backward_step(grad_scaler, input_tensor, output_tensor, - output_tensor_grad, model_type, timers, deallocate_pipeline_outputs) + # Enable grad sync for the last microbatch in the batch if the full + # backward pass completes in the 1F1B stage. + if num_warmup_microbatches == 0 and last_iteration: + if config.grad_sync_func is None or rank == 0: + enable_grad_sync() + + input_tensor_grad = backward_step( + input_tensor, output_tensor, output_tensor_grad, model_type, config + ) if last_iteration: input_tensor = None - send_backward(input_tensor_grad, recv_tensor_shapes, timers=timers) + send_backward(input_tensor_grad, recv_tensor_shapes, config) else: - input_tensor = \ - send_backward_recv_forward( - input_tensor_grad, recv_tensor_shapes, dtype, timers=timers) + input_tensor = send_backward_recv_forward( + input_tensor_grad, recv_tensor_shapes, config + ) # Run cooldown backward passes. if not forward_only: @@ -1026,25 +1505,41 @@ def enable_grad_sync(): # async grad reduction in first pipeline stage. Other # pipeline stages do grad reduction during pipeline # bubble. 
- if i == num_warmup_microbatches-1: - if grad_sync_func is None or rank == 0: + if i == num_warmup_microbatches - 1: + if config.grad_sync_func is None or rank == 0: enable_grad_sync() input_tensor = input_tensors.pop(0) output_tensor = output_tensors.pop(0) - output_tensor_grad = recv_backward(send_tensor_shapes, dtype, timers=timers) + output_tensor_grad = recv_backward(send_tensor_shapes, config) - input_tensor_grad = \ - backward_step(grad_scaler, input_tensor, output_tensor, - output_tensor_grad, model_type, timers, deallocate_pipeline_outputs) + input_tensor_grad = backward_step( + input_tensor, output_tensor, output_tensor_grad, model_type, config + ) - send_backward(input_tensor_grad, recv_tensor_shapes, timers=timers) + send_backward(input_tensor_grad, recv_tensor_shapes, config) - # Launch any remaining grad reductions - if no_sync_context is not None: - enable_grad_sync() - if grad_sync_func is not None: - grad_sync_func(model.parameters()) + # Launch any remaining grad reductions. + if no_sync_context is not None: + enable_grad_sync() + if config.grad_sync_func is not None: + config.grad_sync_func(model.parameters()) + + if config.finalize_model_grads_func is not None and not forward_only: + + # If defer_embedding_wgrad_compute is enabled we need to do the + # weight gradient GEMM's here. + finish_embedding_wgrad_compute(config, embedding_module) + + # Finalize model grads (perform full grad all-reduce / reduce-scatter for + # data parallelism, layernorm all-reduce for sequence parallelism, and + # embedding all-reduce for pipeline parallelism). + config.finalize_model_grads_func( + [model], total_num_tokens if config.calculate_per_token_loss else None + ) + + if config.timers is not None: + config.timers('forward-backward').stop() return forward_data_store diff --git a/megatron/core/requirements.txt b/megatron/core/requirements.txt index 08ed5eeb4b..a03ef133e7 100644 --- a/megatron/core/requirements.txt +++ b/megatron/core/requirements.txt @@ -1 +1,2 @@ -torch \ No newline at end of file +torch +packaging diff --git a/megatron/core/ssm/__init__.py b/megatron/core/ssm/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/megatron/core/ssm/mamba_block.py b/megatron/core/ssm/mamba_block.py new file mode 100644 index 0000000000..20754b5c25 --- /dev/null +++ b/megatron/core/ssm/mamba_block.py @@ -0,0 +1,336 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2024, Tri Dao, Albert Gu. + +# Some of this code was adopted from https://github.com/state-spaces/mamba/ +# This source code is licensed under the Apache license found in the +# LICENSE file in the root directory of this source tree. 
+ +import math +from dataclasses import dataclass +from functools import partial +from typing import Union + +from torch import Tensor, nn + +from megatron.core import parallel_state +from megatron.core.dist_checkpointing.mapping import ShardedStateDict +from megatron.core.dist_checkpointing.utils import replace_prefix_for_sharding +from megatron.core.extensions.transformer_engine import TENorm +from megatron.core.ssm.mamba_hybrid_layer_allocation import Symbols as LayerSymbols +from megatron.core.ssm.mamba_hybrid_layer_allocation import allocate_layers +from megatron.core.tensor_parallel import get_cuda_rng_tracker +from megatron.core.transformer.identity_op import IdentityOp +from megatron.core.transformer.module import MegatronModule +from megatron.core.transformer.spec_utils import ModuleSpec, build_module +from megatron.core.transformer.transformer_config import TransformerConfig +from megatron.core.transformer.utils import sharded_state_dict_default +from megatron.core.utils import make_viewless_tensor + + +# https://github.com/huggingface/transformers/blob/c28d04e9e252a1a099944e325685f14d242ecdcd/src/transformers/models/gpt2/modeling_gpt2.py#L454 +def _init_weights( + module, + n_layer, + initializer_range=0.02, # Now only used for embedding layer. + rescale_prenorm_residual=True, + n_residuals_per_layer=1, # Change to 2 if we have MLP +): + with get_cuda_rng_tracker().fork(): + if isinstance(module, nn.Linear): + if not getattr(module.weight, "_no_reinit", False): + nn.init.normal_(module.weight, std=initializer_range) + if module.bias is not None: + if not getattr(module.bias, "_no_reinit", False): + nn.init.zeros_(module.bias) + elif isinstance(module, nn.Embedding): + nn.init.normal_(module.weight, std=initializer_range) + + for name, p in module.named_parameters(): + if name in ["in_proj.weight", "x_proj.weight", "conv1d.weight", "out_proj.weight"]: + nn.init.kaiming_uniform_(p, a=math.sqrt(5)) + + if rescale_prenorm_residual: + # Reinitialize selected weights subject to the OpenAI GPT-2 Paper Scheme: + # > A modified initialization which accounts for the accumulation on the + # > residual path with model depth. Scale + # > the weights of residual layers at initialization by a factor of + # > 1/√N where N is the # of residual layers. + # > -- GPT-2 :: https://openai.com/blog/better-language-models/ + # + # Reference (Megatron-LM): + # https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/gpt_model.py + for name, p in module.named_parameters(): + if name in ["out_proj.weight", "fc2.weight"]: + # Special Scaled Initialization + nn.init.normal_( + p, + mean=0.0, + std=initializer_range / math.sqrt(n_residuals_per_layer * n_layer), + ) + + +@dataclass +class MambaStackSubmodules: + """ + A class for the module specs for the MambaStack. + """ + + mamba_layer: Union[ModuleSpec, type] = IdentityOp + attention_layer: Union[ModuleSpec, type] = IdentityOp + mlp_layer: Union[ModuleSpec, type] = IdentityOp + + +class MambaStack(MegatronModule): + """ + Constructor for the MambaStack class. + + Args: + config (TransformerConfig): the transformer configuration + submodules (MambaStackSubmodules): the submodules for the stack + mamba_ssm_ngroups (int, optional): the number of groups for the + MAMBA SSM. Defaults to 8. + residual_in_fp32 (bool, optional): whether to do residual connections + in fp32. Defaults to False. + pre_process (bool, optional): whether to include an embedding layer. + Defaults to True. 
+ hybrid_attention_ratio (float, optional): the target ratio of attention layers to + total layers. Defaults to 0.0. + hybrid_mlp_ratio (float, optional): the target ratio of mlp layers to total + layers. Defaults to 0.0. + hybrid_override_pattern (str, optional): the hybrid layer pattern to override + with. Defaults to None. + post_layer_norm (bool, optional): whether to include a final layer norm. + Defaults to True. + post_process (bool, optional): whether to include an output layer. + Defaults to True. + device (optional): the device to use. Defaults to None. + dtype (optional): the data type to use. Defaults to None. + """ + + def __init__( + self, + config: TransformerConfig, + submodules: MambaStackSubmodules, + mamba_ssm_ngroups: int = 8, + residual_in_fp32=False, + pre_process: bool = True, + hybrid_attention_ratio: float = 0.0, + hybrid_mlp_ratio: float = 0.0, + hybrid_override_pattern: str = None, + post_layer_norm: bool = True, + post_process: bool = True, + device=None, + dtype=None, + ) -> None: + super().__init__(config=config) + self.residual_in_fp32 = residual_in_fp32 + self.pre_process = pre_process + self.post_layer_norm = post_layer_norm + self.post_process = post_process + + # Required for pipeline parallel schedules + self.input_tensor = None + + self.hybrid_attention_ratio = hybrid_attention_ratio + self.hybrid_mlp_ratio = hybrid_mlp_ratio + self.hybrid_override_pattern = hybrid_override_pattern + + layer_type_list = allocate_layers( + self.config.num_layers, + self.hybrid_attention_ratio, + self.hybrid_mlp_ratio, + self.hybrid_override_pattern, + ) + + pp_layer_offset = 0 + if parallel_state.get_pipeline_model_parallel_world_size() > 1: + pp_layer_offset, layer_type_list = self._select_layers_for_pipeline_parallel( + layer_type_list + ) + + self.layers = nn.ModuleList() + for i, layer_type in enumerate(layer_type_list): + if layer_type == LayerSymbols.MAMBA: + layer = build_module( + submodules.mamba_layer, + config=self.config, + mamba_ssm_ngroups=mamba_ssm_ngroups, + residual_in_fp32=residual_in_fp32, + layer_number=i + 1 + pp_layer_offset, + ) + elif layer_type == LayerSymbols.ATTENTION: + # Transformer layers apply their own pp_layer_offset + layer = build_module( + submodules.attention_layer, config=self.config, layer_number=i + 1 + ) + elif layer_type == LayerSymbols.MLP: + # Transformer layers apply their own pp_layer_offset + layer = build_module(submodules.mlp_layer, config=self.config, layer_number=i + 1) + else: + assert True, "unexpected layer_type" + self.layers.append(layer) + + # Required for activation recomputation + self.num_layers_per_pipeline_rank = len(self.layers) + + if self.post_process and self.post_layer_norm: + # Final layer norm before output. 
+ self.final_norm = TENorm( + config=self.config, + hidden_size=self.config.hidden_size, + eps=self.config.layernorm_epsilon, + ) + + self.apply(partial(_init_weights, n_layer=self.config.num_layers)) + + def _select_layers_for_pipeline_parallel(self, layer_type_list): + pipeline_rank = parallel_state.get_pipeline_model_parallel_rank() + num_layers_per_pipeline_rank = ( + self.config.num_layers // parallel_state.get_pipeline_model_parallel_world_size() + ) + + assert parallel_state.get_virtual_pipeline_model_parallel_world_size() is None, ( + "The Mamba hybrid model does not currently support " + "virtual/interleaved pipeline parallelism" + ) + + offset = pipeline_rank * num_layers_per_pipeline_rank + selected_list = layer_type_list[offset : offset + num_layers_per_pipeline_rank] + + return offset, selected_list + + def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None): + """ + Allocate inference cache for each layer. + + Args: + batch_size (int): The batch size to use for inference. + max_seqlen (int): The maximum sequence length to use + for inference. + dtype (optional): The data type to use for allocation. + Defaults to the data type of the model. + """ + return { + i: layer.allocate_inference_cache(batch_size, max_seqlen, dtype=dtype) + for i, layer in enumerate(self.layers) + } + + def set_input_tensor(self, input_tensor: Tensor): + """Set input tensor to be used instead of forward()'s input. + + When doing pipeline parallelism the input from the previous + stage comes from communication, not from the input, so the + model's forward_step_func won't have it. This function is thus + used by internal code to bypass the input provided by the + forward_step_func""" + self.input_tensor = input_tensor + + def forward( + self, + hidden_states: Tensor, + attention_mask: Tensor, + inference_params=None, + rotary_pos_emb: Tensor = None, + ): + """ + Forward function of the MambaStack class. + + It either returns the Loss values if labels are given or the + final hidden units + + Args: + hidden_states (Tensor): the input tensor. + attention_mask (Tensor): the attention mask. + inference_params (InferenceParams): the inference parameters. + rotary_pos_emb (Tensor, optional): the rotary positional embeddings. + Defaults to None. + Returns: + Tensor: the output tensor. + """ + if not self.pre_process: + # See set_input_tensor() + hidden_states = self.input_tensor + + if inference_params: + # NOTE(bnorick): match InferenceParams attributes for + # mamba_ssm.utils.generation.InferenceParams, + # this hack supports eval + inference_params.max_seqlen = inference_params.max_sequence_length + inference_params.seqlen_offset = inference_params.sequence_len_offset + + for layer in self.layers: + hidden_states = layer( + hidden_states, + attention_mask, + inference_params=inference_params, + rotary_pos_emb=rotary_pos_emb, + ) + + # The attention layer (currently a simplified transformer layer) + # outputs a tuple of (hidden_states, context). Context is intended + # for cross-attention, and is not needed in our model. + if isinstance(hidden_states, tuple): + hidden_states = hidden_states[0] + + # Final layer norm. + if self.post_process and self.post_layer_norm: + hidden_states = self.final_norm(hidden_states) + + # Ensure that the tensor passed between pipeline parallel stages is + # viewless. 
See related notes in TransformerBlock and TransformerLayer
+        output = make_viewless_tensor(
+            inp=hidden_states, requires_grad=hidden_states.requires_grad, keep_graph=True
+        )
+
+        return output
+
+    def sharded_state_dict(
+        self, prefix: str = '', sharded_offsets: tuple = (), metadata: dict = None
+    ) -> ShardedStateDict:
+        """
+        Returns a sharded state dictionary for the current object.
+
+        This function constructs a sharded state dictionary by iterating over the layers
+        in the current object, computing the sharded state dictionary for each layer,
+        and combining the results into a single dictionary.
+
+        Args:
+            prefix (str): The prefix to use for the state dictionary keys.
+            sharded_offsets (tuple): The sharded offsets to use for the state dictionary.
+            metadata (dict): Additional metadata to use when computing the sharded state dictionary.
+
+        Returns:
+            dict: The sharded state dictionary for the current object.
+        """
+
+        sharded_state_dict = {}
+        layer_prefix = f'{prefix}layers.'
+
+        for local_layer_idx, layer in enumerate(self.layers):
+
+            global_layer_offset = layer.layer_number - 1  # self.layer_number starts at 1
+            state_dict_prefix = (
+                f'{layer_prefix}{local_layer_idx}.'  # module list index in MambaBlock
+            )
+
+            sharded_prefix = f'{layer_prefix}{global_layer_offset}.'
+            sharded_pp_offset = []
+
+            layer_sharded_state_dict = layer.sharded_state_dict(
+                state_dict_prefix, sharded_pp_offset, metadata
+            )
+
+            replace_prefix_for_sharding(layer_sharded_state_dict, state_dict_prefix, sharded_prefix)
+
+            sharded_state_dict.update(layer_sharded_state_dict)
+
+        # Add modules other than self.layers
+        for name, module in self.named_children():
+            if module is not self.layers:
+                sharded_state_dict.update(
+                    sharded_state_dict_default(
+                        module, f'{prefix}{name}.', sharded_offsets, metadata
+                    )
+                )
+
+        return sharded_state_dict
diff --git a/megatron/core/ssm/mamba_hybrid_layer_allocation.py b/megatron/core/ssm/mamba_hybrid_layer_allocation.py
new file mode 100644
index 0000000000..abfa2ae305
--- /dev/null
+++ b/megatron/core/ssm/mamba_hybrid_layer_allocation.py
@@ -0,0 +1,191 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
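For orientation, here is a minimal usage sketch of the allocation helper defined in the file below (illustrative only, not part of the patch; it assumes Megatron-Core is importable and uses only names introduced in this diff). MambaStack calls the same helper from its constructor to decide the layer type at each depth.

from megatron.core.ssm.mamba_hybrid_layer_allocation import Symbols, allocate_layers

# Ratio-driven allocation: roughly 20% attention and 10% MLP layers in a
# 10-layer hybrid stack; the remaining layers stay Mamba.
pattern = allocate_layers(
    total_layers_count=10, target_attention_ratio=0.2, target_mlp_ratio=0.1
)
print(''.join(pattern))  # a 10-symbol string over Symbols.MAMBA ('M'),
                         # Symbols.ATTENTION ('*') and Symbols.MLP ('-')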
+ +import logging + +if __name__ != "__main__": + from megatron.core.utils import log_single_rank +else: + from typing import Any + + def log_single_rank(logger: logging.Logger, *args: Any, rank: int = 0, **kwargs: Any): + print(*args[1:], **kwargs) + + +logger = logging.getLogger(__name__) + + +class Symbols: + MAMBA = 'M' + ATTENTION = '*' + MLP = '-' + VALID = {MAMBA, ATTENTION, MLP} + + +def _allocate_auto( + total_layers_count: int, target_attention_ratio: float, target_mlp_ratio: float +) -> list: + # First, allocate attention (evenly spaced, starting and ending with mamba) + attention_layers_count: int = round(total_layers_count * target_attention_ratio) + mamba_layers_count: int = total_layers_count - attention_layers_count + mamba_sections_count: int = attention_layers_count + 1 + mamba_section_length: float = mamba_layers_count / mamba_sections_count + + layer_type_list = [Symbols.MAMBA] * total_layers_count + x: float = mamba_section_length + for l in range(total_layers_count): + if x < 0.5: + layer_type_list[l] = Symbols.ATTENTION + x += mamba_section_length + else: + x -= 1 + + # Next, allocate mlp + # (evenly distributed, but right-justified, not replacing attention) + mlp_layers_count: int = round(total_layers_count * target_mlp_ratio) + if mlp_layers_count > 0: + mamba_layers_count -= mlp_layers_count + mamba_to_mlp_ratio: float = mamba_layers_count / mlp_layers_count + + x: float = mamba_to_mlp_ratio + for l in range(total_layers_count): + if layer_type_list[l] == Symbols.MAMBA: + if x < 0.5: + layer_type_list[l] = Symbols.MLP + x += mamba_to_mlp_ratio + else: + x -= 1 + + return layer_type_list + + +def _allocate_override(total_layers_count: int, override_pattern: str) -> list: + layer_type_list = list(override_pattern) + override_pattern_length = len(layer_type_list) + if override_pattern_length != total_layers_count: + raise ValueError( + "The hybrid override pattern is the wrong " + f"length: got {override_pattern_length}, expected " + f"{total_layers_count}" + ) + for l in layer_type_list: + if l not in Symbols.VALID: + raise ValueError(f"In hybrid override pattern, '{l}' is not " f"one of {Symbols.VALID}") + + return layer_type_list + + +def _layer_counts_match(a: list, b: list) -> bool: + for s in Symbols.VALID: + if a.count(s) != b.count(s): + return False + return True + + +def allocate_layers( + total_layers_count: int, + target_attention_ratio: float, + target_mlp_ratio: float, + override_pattern: str = None, +) -> list: + assert total_layers_count > 0 + assert target_attention_ratio >= 0.0 and target_attention_ratio <= 1.0 + assert target_mlp_ratio >= 0.0 and target_mlp_ratio <= 1.0 + assert target_attention_ratio + target_mlp_ratio <= 1.0 + # Note: target_mamba_ratio = 1.0 - target_attention_ratio - target_mlp_ratio + + layer_type_list = _allocate_auto(total_layers_count, target_attention_ratio, target_mlp_ratio) + + if override_pattern is not None: + layer_type_list_override = _allocate_override(total_layers_count, override_pattern) + log_single_rank(logger, logging.INFO, "Using hybrid override pattern") + if (target_attention_ratio > 0.0 or target_mlp_ratio > 0.0) and not _layer_counts_match( + layer_type_list_override, layer_type_list + ): + raise ValueError( + "The number of each type of layer in the override " + "pattern must match the number in the overridden " + "pattern." 
+ ) + if layer_type_list_override == layer_type_list: + log_single_rank( + logger, logging.INFO, "The override pattern matches the overridden pattern" + ) + else: + log_single_rank(logger, logging.INFO, "Warning: overriding pattern A with pattern B") + log_single_rank(logger, logging.INFO, f"A: {''.join(layer_type_list)}") + log_single_rank(logger, logging.INFO, f"B: {''.join(layer_type_list_override)}") + layer_type_list = layer_type_list_override + + if target_attention_ratio > 0.0 or target_mlp_ratio > 0.0 or override_pattern is not None: + actual_attention_layers_count = layer_type_list.count(Symbols.ATTENTION) + actual_attention_ratio = actual_attention_layers_count / total_layers_count + actual_mlp_layers_count = layer_type_list.count(Symbols.MLP) + actual_mlp_ratio = actual_mlp_layers_count / total_layers_count + allocation_string = ''.join(layer_type_list) + log_single_rank( + logger, + logging.INFO, + f"Hybrid allocation ({Symbols.MAMBA} is mamba, " + f"{Symbols.ATTENTION} is attention, " + f"{Symbols.MLP} is mlp):", + ) + log_single_rank(logger, logging.INFO, allocation_string) + log_single_rank( + logger, + logging.INFO, + f"{actual_attention_layers_count} attention layers in " + f"{total_layers_count} total layers.", + ) + log_single_rank( + logger, + logging.INFO, + f"Target attention ratio: {target_attention_ratio:.2f}. " + f"Actual attention ratio: {actual_attention_ratio:.2f}.", + ) + log_single_rank( + logger, + logging.INFO, + f"{actual_mlp_layers_count} mlp layers in " f"{total_layers_count} total layers.", + ) + log_single_rank( + logger, + logging.INFO, + f"Target mlp ratio: {target_mlp_ratio:.2f}. " + f"Actual mlp ratio: {actual_mlp_ratio:.2f}.", + ) + return layer_type_list + + +if __name__ == "__main__": + test_cases = [ + # (10, 0.2, 0.0), + # (48, 0.0, 0.0), # will not print anything + # (48, 0.1, 0.0), + # 48, 0.3, 0.0), + # (48, 0.5, 0.0), + # (48, 0.6, 0.0), + # (48, 0.7, 0.0), + # (10, 0.0, 0.1), + # (10, 0.0, 0.3), + # (10, 0.0, 0.5), + # (10, 0.1, 0.1), + # (10, 0.2, 0.2), + # (10, 0.3, 0.3), + # (10, 0.5, 0.5), + # (48, 0.2, 0.3), + # (48, 0.5, 0.2), + # (48, 0.5, 0.2, "MM*-MM*-MM*-MM*-MM*-MM*-MM*-MM*-MM*-MM*-MM*-MM*-"), + # (48, 0.25, 0.25, "MM*-MM*-MM*-MM*-MM*-MM*-MM*-MM*-MM*-MM*-MM*-MM*-"), + # (48, 0.25, 0.25, "MM-*MM-*MM*-MM*-MM*-MM*-M*M-M*M-M*M-M*M-*MM-*MM-"), + # (48, 0.0, 0.2, "MM*-MM*-MM*-MM*-MM*-MM*-MM*-MM*-MM*-MM*-MM*-MM*-"), + # (48, 0.2, 0.0, "MM*-MM*-MM*-MM*-MM*-MM*-MM*-MM*-MM*-MM*-MM*-MM*-"), + # (48, 0.0, 0.0, "MM*-MM*-MM*-MM*-MM*-MM*-MM*-MM*-MM*-MM*-MM*-MM*-"), + # (48, 0.5, 0.5), + # (10, 0.3, 0.2, "MMM*-*M*M-"), + # (10, 0.3, 0.2, "MM*M-*M*M-"), + (9, 0.0, 0.0, "M*-M*-M*-"), + (9, 0.0, 0.0, "MMMMMMMMM"), + ] + for t in test_cases: + print("") + allocate_layers(*t) diff --git a/megatron/core/ssm/mamba_layer.py b/megatron/core/ssm/mamba_layer.py new file mode 100644 index 0000000000..f0776746dd --- /dev/null +++ b/megatron/core/ssm/mamba_layer.py @@ -0,0 +1,116 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2024, Tri Dao, Albert Gu. + +# Some of this code was adopted from https://github.com/state-spaces/mamba/ +# This source code is licensed under the Apache license found in the +# LICENSE file in the root directory of this source tree. 
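To make the override rules above concrete, a short hedged sketch (illustrative only, not part of the patch; it relies only on the validation behaviour of _allocate_override and the self-test in the file above):

from megatron.core.ssm.mamba_hybrid_layer_allocation import allocate_layers

# An override pattern must supply exactly one valid symbol per layer;
# a wrong length or an unknown symbol raises ValueError.
allocate_layers(9, 0.0, 0.0, "M*-M*-M*-")   # OK: 9 symbols for 9 layers
# allocate_layers(9, 0.0, 0.0, "M*-M*-")    # would raise ValueError (wrong length)

# The module can also be executed directly to print allocations for the
# sample cases listed in `test_cases`:
#   python megatron/core/ssm/mamba_hybrid_layer_allocation.py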
+ +from dataclasses import dataclass +from typing import Union + +import torch +from torch import Tensor + +from megatron.core.transformer.identity_op import IdentityOp +from megatron.core.transformer.module import MegatronModule +from megatron.core.transformer.spec_utils import ModuleSpec, build_module +from megatron.core.transformer.transformer_config import TransformerConfig + + +@dataclass +class MambaLayerSubmodules: + """ + Configuration class for specifying the submodules of a Mamba layer. + + This class defines the structure and default implementations for various + components of a Mamba layer, allowing for flexible customization of the + layer's architecture. + + Args: + norm (Union[ModuleSpec, type]): Specification for the input layer normalization. + mixer (Union[ModuleSpec, type]): Specification for the along-sequence mixing mechanism. + mamba_bda (Union[ModuleSpec, type]): Specification for the bias-dropout-add operation + after the mixer. + """ + + norm: Union[ModuleSpec, type] = IdentityOp + mixer: Union[ModuleSpec, type] = IdentityOp + mamba_bda: Union[ModuleSpec, type] = IdentityOp + + +class MambaLayer(MegatronModule): + """ + A single Mamba layer. + + Mamba layer takes input with size [s, b, h] and returns an + output of the same size. + """ + + def __init__( + self, + config: TransformerConfig, + submodules: MambaLayerSubmodules, + mamba_ssm_ngroups=8, + layer_number: int = 1, + residual_in_fp32=False, + ): + """Initialize Mamba Layer.""" + super().__init__(config) + self.config = config + self.layer_number = layer_number + self.residual_in_fp32 = residual_in_fp32 + self.hidden_dropout = config.hidden_dropout + self.mixer = build_module( + submodules.mixer, + self.config, + d_model=self.config.hidden_size, + ngroups=mamba_ssm_ngroups, + layer_number=layer_number, + ) + self.norm = build_module(submodules.norm, self.config, self.config.hidden_size) + self.mamba_bda = build_module(submodules.mamba_bda) + self.bias_dropout_add_exec_handler = torch.enable_grad + + def forward( + self, + hidden_states: Tensor, + attention_mask: Tensor, # Not used in MambaLayer + inference_params=None, + rotary_pos_emb: Tensor = None, # Not used in MambaLayer + ): + """ + Perform a forward pass through the Mamba layer. + + This method implements the core computation of a Mamba layer, including + the convolution and the selective SSM/SSD. + + Args: + hidden_states (Tensor): Input tensor of shape [s, b, h] where s is sequence length, + b is batch size, and h is hidden size. + attention_mask (Tensor): Mask tensor for self-attention. Not used by this layer. + inference_params (object, optional): Parameters for inference-time optimizations. + rotary_pos_emb (Tensor, optional): Rotary positional embeddings. + + Returns: + output (Tensor): Transformed hidden states of shape [s, b, h]. 
+ """ + + residual = hidden_states + if self.residual_in_fp32: + residual = residual.to(torch.float32) + + hidden_states = hidden_states.to(dtype=self.config.params_dtype) + hidden_states = self.norm(hidden_states) + + mixer_out_with_bias = self.mixer(hidden_states, inference_params=inference_params) + + with self.bias_dropout_add_exec_handler(): + hidden_states = self.mamba_bda(self.training, self.config.bias_dropout_fusion)( + mixer_out_with_bias, residual, self.hidden_dropout + ) + + return hidden_states + + def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None): + """Allocate the inference cache.""" + return self.mixer.allocate_inference_cache(batch_size, max_seqlen, dtype=dtype) diff --git a/megatron/core/ssm/mamba_mixer.py b/megatron/core/ssm/mamba_mixer.py new file mode 100644 index 0000000000..6448f30d9c --- /dev/null +++ b/megatron/core/ssm/mamba_mixer.py @@ -0,0 +1,718 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2024, Tri Dao, Albert Gu. + +# Some of this code was adopted from https://github.com/state-spaces/mamba/ +# This source code is licensed under the Apache license found in the +# LICENSE file in the root directory of this source tree. + +import math +from dataclasses import dataclass, replace +from typing import List, Optional, Union + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from megatron.core.dist_checkpointing import ShardedTensor +from megatron.core.dist_checkpointing.mapping import ReplicaId, ShardedTensorFactory +from megatron.core.parallel_state import get_tensor_model_parallel_world_size +from megatron.core.tensor_parallel import get_cuda_rng_tracker +from megatron.core.transformer.module import MegatronModule +from megatron.core.transformer.spec_utils import ModuleSpec, build_module +from megatron.core.transformer.transformer_config import TransformerConfig +from megatron.core.transformer.utils import ( + make_sharded_tensors_for_checkpoint, + sharded_state_dict_default, +) + +try: + from mamba_ssm.ops.triton.selective_state_update import selective_state_update +except ImportError: + selective_state_update = None + +try: + from causal_conv1d import causal_conv1d_fn, causal_conv1d_update +except ImportError: + causal_conv1d_fn = None + causal_conv1d_update = None + +try: + from mamba_ssm.ops.triton.layernorm_gated import RMSNorm as RMSNormGated + from mamba_ssm.ops.triton.ssd_combined import ( + mamba_chunk_scan_combined, + mamba_split_conv1d_scan_combined, + ) +except ImportError: + raise ImportError("mamba-ssm is required by the Mamba model but cannot be imported") + +try: + from einops import rearrange, repeat +except ImportError: + raise ImportError("einops is required by the Mamba model but cannot be imported") + + +class ExtendedRMSNorm(RMSNormGated): + """ + RMSNormGated with sharded state dict. + """ + + def sharded_state_dict(self, prefix='', sharded_offsets=(), metadata=None): + """Sharding along axis 0, bias not sharded""" + state_dict = self.state_dict(prefix='', keep_vars=True) + return make_sharded_tensors_for_checkpoint( + state_dict, prefix, {'weight': 0}, sharded_offsets + ) + + +@dataclass +class MambaMixerSubmodules: + """ + Contains the module specs for the input and output linear layers. + """ + + in_proj: Union[ModuleSpec, type] = None + out_proj: Union[ModuleSpec, type] = None + + +class MambaMixer(MegatronModule): + """ + Args: + config: The config of the model. + submodules: Contains the module specs for the input and output linear layers. 
+        d_model: The hidden size of the model.
+        d_state: The state size of the SSM.
+        d_conv: The kernel width of the causal convolution.
+        conv_init: The initialization range for the causal convolution weights.
+        expand: The expansion factor for the SSM.
+        headdim: The dimension of each SSM head.
+        ngroups: The number of SSM groups (heads within a group share the same B and C).
+        A_init_range: The initialization range for the A state-transition parameter.
+        D_has_hdim: Whether the D parameter has the same number of dimensions as the hidden
+            state.
+        rmsnorm: Whether to use root mean square normalization.
+        norm_before_gate: Whether to apply normalization before the gating mechanism.
+        dt_min: The minimum value of the dt parameter.
+        dt_max: The maximum value of the dt parameter.
+        dt_init: The initialization method for the dt parameter.
+        dt_scale: The scaling factor for the dt parameter.
+        dt_init_floor: The minimum value of the dt parameter after initialization.
+        bias: Whether to use bias in the linear layers.
+        conv_bias: Whether to use bias in the causal convolution.
+        chunk_size: The chunk size for the fused kernel.
+        use_mem_eff_path: Whether to use the memory-efficient path for the Mamba model.
+        layer_number: The layer number of this Mamba layer.
+    """
+
+    def __init__(
+        self,
+        config: TransformerConfig,
+        submodules: MambaMixerSubmodules,
+        d_model,
+        d_state=128,
+        d_conv=4,
+        conv_init=None,
+        expand=2,
+        headdim=64,
+        ngroups=8,
+        A_init_range=(1, 16),
+        D_has_hdim=False,
+        rmsnorm=True,
+        norm_before_gate=False,
+        dt_min=0.001,
+        dt_max=0.1,
+        dt_init="random",
+        dt_scale=1.0,
+        dt_init_floor=1e-4,
+        bias=False,
+        conv_bias=True,
+        # Fused kernel and sharding options
+        chunk_size=128,
+        use_mem_eff_path=True,
+        layer_number=None,
+    ):
+        super().__init__(config)
+        self.config = config
+        self.d_model = d_model
+        self.d_state = d_state
+        self.d_conv = d_conv
+        self.conv_init = conv_init
+        self.expand = expand
+        self.d_inner = int(self.expand * self.d_model)
+        self.headdim = headdim
+        self.ngroups = ngroups
+        assert self.d_inner % self.headdim == 0
+        self.nheads = self.d_inner // self.headdim
+        self.D_has_hdim = D_has_hdim
+        self.rmsnorm = rmsnorm
+        self.norm_before_gate = norm_before_gate
+        self.chunk_size = chunk_size
+        self.use_mem_eff_path = use_mem_eff_path
+        self.layer_number = layer_number
+
+        self.tensor_model_parallel_size = get_tensor_model_parallel_world_size()
+        assert self.d_inner % self.tensor_model_parallel_size == 0
+        assert self.ngroups % self.tensor_model_parallel_size == 0
+        assert self.nheads % self.tensor_model_parallel_size == 0
+        assert not bias
+        assert not self.norm_before_gate
+
+        self.d_inner_local = self.d_inner // self.tensor_model_parallel_size
+        self.ngroups_local = self.ngroups // self.tensor_model_parallel_size
+        self.nheads_local = self.nheads // self.tensor_model_parallel_size
+
+        assert self.d_inner_local % self.ngroups_local == 0
+
+        # Assume sequence parallelism: input is already partitioned along the
+        # sequence dimension
+        self.in_proj = build_module(
+            submodules.in_proj,
+            self.d_model,
+            self.d_inner * 2 + 2 * self.ngroups * self.d_state + self.nheads,  # AB CD E
+            config=self.config,
+            init_method=self.config.init_method,
+            gather_output=False,
+            bias=bias,
+            skip_bias_add=False,
+            is_expert=False,
+            tp_comm_buffer_name='fc1',
+        )
+
+        conv_dim = self.d_inner_local + 2 * self.ngroups_local * self.d_state  # A CD
+        with get_cuda_rng_tracker().fork():
+            # weight dim: [conv_dim, 1, d_conv] (depthwise conv, groups=conv_dim)
+            self.conv1d = nn.Conv1d(
+                in_channels=conv_dim,
+                out_channels=conv_dim,
+
bias=conv_bias, + kernel_size=d_conv, + groups=conv_dim, + padding=d_conv - 1, + device=torch.cuda.current_device(), + dtype=config.params_dtype, + ) + setattr(self.conv1d.weight, 'tensor_model_parallel', True) + setattr(self.conv1d.bias, 'tensor_model_parallel', True) + + if self.conv_init is not None: + nn.init.uniform_(self.conv1d.weight, -self.conv_init, self.conv_init) + + self.activation = "silu" + self.act = nn.SiLU() + + with get_cuda_rng_tracker().fork(): + # Initialize dt bias so that F.softplus(dt_bias) is between dt_min and dt_max + dt = torch.exp( + torch.rand( + self.nheads_local, device=torch.cuda.current_device(), dtype=config.params_dtype + ) + * (math.log(dt_max) - math.log(dt_min)) + + math.log(dt_min) + ).clamp(min=dt_init_floor) + # Inverse of softplus: https://github.com/pytorch/pytorch/issues/72759 + inv_dt = dt + torch.log(-torch.expm1(-dt)) + with torch.no_grad(): + self.dt_bias = nn.Parameter(inv_dt) + # Our initialization would set all Linear.bias to zero, + # need to mark this one as _no_reinit + self.dt_bias._no_reinit = True + # Just to be explicit. Without this we already don't + # put wd on dt_bias because of the check + + # name.endswith("bias") in param_grouping.py + self.dt_bias._no_weight_decay = True + + assert A_init_range[0] > 0 and A_init_range[1] >= A_init_range[0] + A = torch.empty( + self.nheads_local, dtype=torch.float32, device=torch.cuda.current_device() + ).uniform_(*A_init_range) + A_log = torch.log(A) # Keep A_log in fp32 + self.A_log = nn.Parameter(A_log) + self.A_log._no_weight_decay = True + setattr(self.A_log, 'tensor_model_parallel', True) + + # D "skip" parameter + self.D = nn.Parameter( + torch.ones( + self.d_inner_local if self.D_has_hdim else self.nheads_local, + device=torch.cuda.current_device(), + ) + ) # Keep in fp32 + self.D._no_weight_decay = True + setattr(self.D, 'tensor_model_parallel', True) + + if self.rmsnorm: + assert RMSNormGated is not None + self.norm = ExtendedRMSNorm( + self.d_inner_local, + eps=1e-5, + group_size=self.d_inner_local // self.ngroups_local, + norm_before_gate=self.norm_before_gate, + device=torch.cuda.current_device(), + dtype=config.params_dtype, + ) + + # Assume sequence parallelism: input is partitioned along d_inner and + # output is partitioned along the sequence dimension + self.out_proj = build_module( + submodules.out_proj, + self.d_inner, + self.d_model, + config=self.config, + init_method=self.config.output_layer_init_method, + bias=bias, + input_is_parallel=True, + skip_bias_add=True, + is_expert=False, + tp_comm_buffer_name='fc2', + ) + + def forward(self, hidden_states, inference_params=None): + """ + hidden_states: (nL, B, D) / (L B D) + Returns: same shape as hidden_states + """ + _, batch, dim = hidden_states.shape + + conv_state, ssm_state = None, None + if inference_params is not None: + assert not self.config.sequence_parallel + conv_state, ssm_state = self._get_states_from_cache(inference_params, batch) + if inference_params.seqlen_offset > 0: + # The states are updated inplace + out, out_bias, _, _ = self.step(hidden_states, conv_state, ssm_state) + return out, out_bias + + # (nheads_local) + A = -torch.exp(self.A_log.float()) + + xz, _ = self.in_proj(hidden_states) + + # transpose: l b pd --> b l pd + xz = rearrange(xz, "l b d -> b l d").contiguous() + + if self.use_mem_eff_path and inference_params is None: + assert ssm_state is None + + if self.conv1d.bias is not None: + self.conv1d.bias.data_ptr() + + y = mamba_split_conv1d_scan_combined( + xz, + rearrange(self.conv1d.weight, 
"d 1 w -> d w"), + self.conv1d.bias, + self.dt_bias.float(), + A, + D=( + rearrange(self.D.float(), "(h p) -> h p", p=self.headdim) + if self.D_has_hdim + else self.D + ), + chunk_size=self.chunk_size, + activation=self.activation, + headdim=None if self.D_has_hdim else self.headdim, + ngroups=self.ngroups_local, + norm_before_gate=self.norm_before_gate, + ) + + if self.rmsnorm: + y = self.norm(y) + else: + z, xBC, dt = torch.split( + xz, + [ + self.d_inner_local, + self.d_inner_local + 2 * self.ngroups_local * self.d_state, + self.nheads_local, + ], + dim=-1, + ) + + # transpose: b l pd --> b pd l + xBC = rearrange(xBC, "b l d -> b d l").contiguous() + + # Compute short convolution + if conv_state is not None: + # If we just take x[:, :, -self.d_conv :], it will error if seqlen < self.d_conv + # Instead F.pad will pad with zeros if seqlen < self.d_conv, and truncate otherwise. + conv_state.copy_( + F.pad(xBC, (self.d_conv - xBC.shape[-1], 0)) + ) # Update state (B D W) + + seqlen = xBC.size(2) + if causal_conv1d_fn is None: + xBC = self.act(self.conv1d(xBC)[..., :seqlen]) + else: + assert self.activation in ["silu", "swish"] + xBC = causal_conv1d_fn( + x=xBC, + weight=rearrange(self.conv1d.weight, "d 1 w -> d w"), + bias=self.conv1d.bias, + activation=self.activation, + ) + + # transpose b pd l --> b l pd + xBC = rearrange(xBC, "b d l -> b l d").contiguous() + + x, B, C = torch.split( + xBC, + [ + self.d_inner_local, + self.ngroups_local * self.d_state, + self.ngroups_local * self.d_state, + ], + dim=-1, + ) + + # TODO Vijay: fuse most of the transposes with the GEMMS + x = rearrange(x, "b l (h p) -> b l h p", p=self.headdim).contiguous() + dt = dt.contiguous() + B = rearrange(B, "b l (g n) -> b l g n", n=self.d_state).contiguous() + C = rearrange(C, "b l (g n) -> b l g n", n=self.d_state).contiguous() + z = rearrange(z, "b l (h p) -> b l h p", p=self.headdim).contiguous() + y = mamba_chunk_scan_combined( + x, + dt, + A, + B, + C, + self.chunk_size, + D=( + rearrange(self.D.float(), "(h p) -> h p", p=self.headdim) + if self.D_has_hdim + else self.D + ), + z=z if not self.rmsnorm else None, + dt_bias=self.dt_bias.float(), + dt_softplus=True, + return_final_states=ssm_state is not None, + ) + + if ssm_state is not None: + y, last_state = y + ssm_state.copy_(last_state) + + if self.rmsnorm: + y = rearrange(y, "b l h p -> b l (h p)").contiguous() + z = rearrange(z, "b l h p -> b l (h p)").contiguous() + y = self.norm(y, z) + else: + y = rearrange(y, "b l h p -> b l (h p)").contiguous() + + y = rearrange(y, "b l d -> l b d").contiguous() + out, out_bias = self.out_proj(y) + + return out, out_bias + + def step(self, hidden_states, conv_state, ssm_state): + """ + Performs inference step for decoding + """ + # assert self.ngroups_local == 1, "Only support ngroups=1 for inference for now" + dtype = hidden_states.dtype + assert hidden_states.shape[0] == 1, "Only support decoding with 1 token at a time for now" + + # l b d --> b d + hidden_states = hidden_states.squeeze(0) + + # b d_model --> b p(2d) + xz, _ = self.in_proj(hidden_states) + + z, xBC, dt = torch.split( + xz, + [ + self.d_inner_local, + self.d_inner_local + 2 * self.ngroups_local * self.d_state, + self.nheads_local, + ], + dim=-1, + ) + + # Conv step + if causal_conv1d_update is None: + conv_state.copy_(torch.roll(conv_state, shifts=-1, dims=-1)) # Update state (B D W) + conv_state[:, :, -1] = xBC + xBC = torch.sum( + conv_state * rearrange(self.conv1d.weight, "d 1 w -> d w"), dim=-1 + ) # (B D) + if self.conv1d.bias is not None: + xBC 
= xBC + self.conv1d.bias + xBC = self.act(xBC).to(dtype=dtype) + else: + xBC = causal_conv1d_update( + xBC, + conv_state, + rearrange(self.conv1d.weight, "d 1 w -> d w"), + self.conv1d.bias, + self.activation, + ) + + x, B, C = torch.split( + xBC, + [ + self.d_inner_local, + self.ngroups_local * self.d_state, + self.ngroups_local * self.d_state, + ], + dim=-1, + ) + A = -torch.exp(self.A_log.float()) + + # SSM step + if selective_state_update is None: + if self.ngroups_local > 1: + B = rearrange(B, "b (g n) -> b g n", n=self.d_state) + C = rearrange(C, "b (g n) -> b g n", n=self.d_state) + B = repeat(B, "b g n -> b (g h) n", h=self.d_inner_local // self.ngroups_local) + C = repeat(C, "b g n -> b (g h) n", h=self.d_inner_local // self.ngroups_local) + + dt = repeat(dt, "b h -> b (h p)", p=self.headdim) + dt_bias = repeat(self.dt_bias, "h -> (h p)", p=self.headdim) + A = repeat(A, "h -> (h p) n", p=self.headdim, n=self.d_state) + D = repeat(self.D, "h -> (h p)", p=self.headdim) + + dt = F.softplus(dt + dt_bias.to(dtype=dt.dtype)) + dA = torch.exp(torch.einsum("bd,dn->bdn", dt, A)) + + dB_x = torch.einsum('bd,bdn,bd->bdn', dt, B, x) + ssm_state.copy_( + ssm_state * rearrange(dA, "b (h p) n -> b h p n", p=self.headdim) + + rearrange(dB_x, "b (h p) n -> b h p n", p=self.headdim) + ) + + y = torch.einsum( + "bdn,bdn->bd", + rearrange(ssm_state.to(dtype), "b h p n -> b (h p) n", p=self.headdim), + C, + ) + y = y + D.to(dtype) * x + if not self.rmsnorm: + y = y * self.act(z) # (B D) + else: + # Discretize A and B (b (g n)) + dt = F.softplus(dt + self.dt_bias.to(dtype=dt.dtype)) # (batch, nheads) + dA = torch.exp(dt * A) + x = rearrange(x, "b (h p) -> b h p", p=self.headdim) + dBx = torch.einsum("bh,bn,bhp->bhpn", dt, B, x) + ssm_state.copy_(ssm_state * rearrange(dA, "b h -> b h 1 1") + dBx) + y = torch.einsum("bhpn,bn->bhp", ssm_state.to(dtype), C) + y = y + rearrange(self.D.to(dtype), "h -> h 1") * x + y = rearrange(y, "b h p -> b (h p)") + if not self.rmsnorm: + y = y * self.act(z) # (B D) + else: + A = repeat(A, "h -> h p n", p=self.headdim, n=self.d_state).to(dtype=torch.float32) + dt = repeat(dt, "b h -> b h p", p=self.headdim) + dt_bias = repeat(self.dt_bias, "h -> h p", p=self.headdim) + D = repeat(self.D, "h -> h p", p=self.headdim) + B = rearrange(B, "b (g n) -> b g n", g=self.ngroups_local) + C = rearrange(C, "b (g n) -> b g n", g=self.ngroups_local) + x_reshaped = rearrange(x, "b (h p) -> b h p", p=self.headdim) + if not self.rmsnorm: + z = rearrange(z, "b (h p) -> b h p", p=self.headdim) + y = selective_state_update( + ssm_state, + x_reshaped, + dt, + A, + B, + C, + D, + z=z if not self.rmsnorm else None, + dt_bias=dt_bias, + dt_softplus=True, + ) + y = rearrange(y, "b h p -> b (h p)") + + if self.rmsnorm: + y = self.norm(y, z) + + # b pd --> b d + out, out_bias = self.out_proj(y) + return out.unsqueeze(0), out_bias, conv_state, ssm_state + + def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None): + """ + allocate inference cache + """ + device = self.out_proj.weight.device + conv_dtype = self.conv1d.weight.dtype if dtype is None else dtype + conv_state = torch.zeros( + batch_size, self.conv1d.weight.shape[0], self.d_conv, device=device, dtype=conv_dtype + ) + ssm_dtype = self.in_proj.weight.dtype if dtype is None else dtype + # ssm_dtype = torch.float32 + ssm_state = torch.zeros( + batch_size, + self.nheads_local, + self.headdim, + self.d_state, + device=device, + dtype=ssm_dtype, + ) + return conv_state, ssm_state + + def _get_states_from_cache(self, inference_params, 
batch_size, initialize_states=False): + assert self.layer_number is not None + if self.layer_number not in inference_params.key_value_memory_dict: + conv_state = torch.zeros( + batch_size, + self.conv1d.weight.shape[0], + self.d_conv, + device=self.conv1d.weight.device, + dtype=self.conv1d.weight.dtype, + ) + ssm_state = torch.zeros( + batch_size, + self.nheads_local, + self.headdim, + self.d_state, + device=self.in_proj.weight.device, + dtype=self.in_proj.weight.dtype, + ) + inference_params.key_value_memory_dict[self.layer_number] = (conv_state, ssm_state) + else: + conv_state, ssm_state = inference_params.key_value_memory_dict[self.layer_number] + # TODO: What if batch size changes between generation, and we reuse the same states? + if initialize_states: + conv_state.zero_() + ssm_state.zero_() + return conv_state, ssm_state + + def sharded_state_dict(self, prefix='', sharded_offsets=(), metadata=None): + """Provide a sharded state dictionary for distributed checkpointing.""" + sharded_state_dict = {} + # Parameters + self._save_to_state_dict(sharded_state_dict, '', keep_vars=True) + sharded_state_dict = make_sharded_tensors_for_checkpoint( + sharded_state_dict, + prefix, + tensor_parallel_layers_axis_map={ + 'A_log': 0, + 'dt_bias': 0, + 'D': 0, + }, # parameters sharded across TP + sharded_offsets=sharded_offsets, + ) + # Submodules + for name, module in self.named_children(): + if name == 'conv1d': + # Add TP sharding for Conv1d + module_sd = module.state_dict(prefix='', keep_vars=True) + module_sharded_sd = make_sharded_tensors_for_checkpoint( + module_sd, f'{prefix}{name}.', {f'weight': 0, f'bias': 0}, sharded_offsets + ) + + else: + module_sharded_sd = sharded_state_dict_default( + module, f'{prefix}{name}.', sharded_offsets, metadata + ) + + sharded_state_dict.update(module_sharded_sd) + + # At this point the TP sharding is correctly defined fo each tensor, but some of the tensors + # must be additionally split into separate parts + # in_proj + in_proj_dim = ( + self.d_inner_local * 2 + 2 * self.ngroups_local * self.d_state + self.nheads_local + ) + assert sharded_state_dict[f'{prefix}in_proj.weight'].data.size(0) == in_proj_dim, ( + in_proj_dim, + sharded_state_dict[f'{prefix}in_proj.weight'], + ) + + sharded_state_dict[f'{prefix}in_proj.weight'] = _split_tensor_factory( + sharded_state_dict[f'{prefix}in_proj.weight'], + [ + self.d_inner_local, + self.d_inner_local, + self.ngroups_local * self.d_state, + self.ngroups_local * self.d_state, + self.nheads_local, + ], + ['z', 'x', 'B', 'C', 'dt'], + 0, + ) + + conv_dim = self.d_inner_local + 2 * self.ngroups_local * self.d_state + assert sharded_state_dict[f'{prefix}conv1d.weight'].data.size(0) == conv_dim, ( + conv_dim, + sharded_state_dict[f'{prefix}conv1d.weight'], + ) + assert sharded_state_dict[f'{prefix}conv1d.bias'].data.size(0) == conv_dim, ( + conv_dim, + sharded_state_dict[f'{prefix}conv1d.bias'], + ) + + for conv_layer_name in ['conv1d.weight', 'conv1d.bias']: + sharded_state_dict[f'{prefix}{conv_layer_name}'] = _split_tensor_factory( + sharded_state_dict[f'{prefix}{conv_layer_name}'], + [ + self.d_inner_local, + self.ngroups_local * self.d_state, + self.ngroups_local * self.d_state, + ], + ['x', 'B', 'C'], + 0, + ) + + return sharded_state_dict + + +def _split_tensor_factory( + orig_sh_ten: ShardedTensor, split_sections: List[int], split_names: List[str], split_dim: int +) -> ShardedTensorFactory: + """Builds a factory that splits a given ShardedTensor into several independent chunks.""" + assert isinstance(orig_sh_ten, 
ShardedTensor), type(orig_sh_ten) + orig_sh_ten_no_data = orig_sh_ten.without_data() # remove `data` reference + + if sum(split_sections) != orig_sh_ten_no_data.local_shape[split_dim]: + raise ValueError( + f'Split sections must cover the whole dimension size, ' + f'got {split_sections=} vs dimensions size ' + f'{orig_sh_ten_no_data.local_shape[split_dim]}' + ) + + assert not isinstance( + split_sections, int + ), 'Splitting into predefined section sizes is supported (`split_sections` must be a list)' + assert len(split_sections) == len(split_names), (len(split_sections), len(split_names)) + + @torch.no_grad() + def sh_ten_build_fn( + key: str, t: torch.Tensor, replica_id: ReplicaId, flattened_range: Optional[slice] + ): + factory_sh_ten = replace( + orig_sh_ten_no_data, + key=key, + data=t, + dtype=t.dtype, + replica_id=replica_id, + flattened_range=flattened_range, + ) + + chunk_sh_tens = [] + split_start = 0 + for split_size, split_name in zip(split_sections, split_names): + split_chunks = factory_sh_ten.narrow(split_dim, split_start, split_size) + for sh_ten in split_chunks: + sh_ten.key = f'{sh_ten.key}.{split_name}' + chunk_sh_tens.extend(split_chunks) + split_start += split_size + + assert split_start == orig_sh_ten_no_data.local_shape[split_dim], ( + split_start, + orig_sh_ten_no_data.local_shape[split_dim], + ) + assert sum(sh_ten.data.numel() for sh_ten in chunk_sh_tens) == t.numel(), ( + chunk_sh_tens, + t.shape, + ) + return chunk_sh_tens + + @torch.no_grad() + def sh_ten_merge_fn(sub_state_dict): + return torch.cat(sub_state_dict) + + return ShardedTensorFactory( + orig_sh_ten.key, orig_sh_ten.data, sh_ten_build_fn, sh_ten_merge_fn, orig_sh_ten.replica_id + ) diff --git a/megatron/core/ssm/triton_cache_manager.py b/megatron/core/ssm/triton_cache_manager.py new file mode 100644 index 0000000000..781f17d32c --- /dev/null +++ b/megatron/core/ssm/triton_cache_manager.py @@ -0,0 +1,81 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# Copyright 2018-2020 Philippe Tillet +# Copyright 2020-2022 OpenAI + +# Some of this code was adopted from https://github.com/triton-lang/triton +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import os +import uuid +from pathlib import Path + +try: + from triton import __version__ as triton_version + from triton.runtime.cache import FileCacheManager +except ImportError: + raise ImportError("triton is required by the Mamba model but cannot be imported") + + +def _version_no_greater_than(version, version_limit): + major, minor, _ = map(int, version.split('.')) + limit_major, limit_minor = map(int, version_limit.split('.')) + return major < limit_major or (major == limit_major and minor <= limit_minor) + + +def default_cache_dir(): + """Provides a default path for the Triton cache directory.""" + return os.path.join(Path.home(), ".triton", "cache") + + +class ParallelFileCacheManager(FileCacheManager): + """ + This patched version of ParallelFileCacheManager prevents errors related + to the builing of the Triton compiler cache when the number of model + parallel ranks is greater than one, including when certain types of file + system are used (such as Lustre). + + Usage: + export TRITON_CACHE_DIR= + export TRITON_CACHE_MANAGER=megatron.core.ssm.triton_cache_manager:ParallelFileCacheManager + + This patch implements the changes in the following two Triton project pull + requests: + 1. https://github.com/triton-lang/triton/pull/3544 + 2. 
https://github.com/triton-lang/triton/pull/4295 + + The above changes will probably be included in Triton release version 3.1, + making this patch no longer necessary. + """ + + def put(self, data, filename, binary=True) -> str: + """A patched version of put, implementing PR 3544 and PR 4295.""" + patch_limit = '3.0' + assert _version_no_greater_than(triton_version, patch_limit), ( + "Assertion failed: ParallelFileCacheManager patch should not be " + f"used beyond Triton version {patch_limit}." + ) + if not self.cache_dir: + raise RuntimeError("Could not create or locate cache dir") + binary = isinstance(data, bytes) + if not binary: + data = str(data) + assert self.lock_path is not None + filepath = self._make_path(filename) + # Random ID to avoid any collisions + rnd_id = str(uuid.uuid4()) + # we use the PID in case a bunch of these around so we can see what PID made it + pid = os.getpid() + # use temp dir to be robust against program interruptions + temp_dir = os.path.join(self.cache_dir, f"tmp.pid_{pid}_{rnd_id}") + os.makedirs(temp_dir, exist_ok=True) + temp_path = os.path.join(temp_dir, filename) + + mode = "wb" if binary else "w" + with open(temp_path, mode) as f: + f.write(data) + # Replace is guaranteed to be atomic on POSIX systems if it succeeds + # so filepath cannot see a partial write + os.replace(temp_path, filepath) + os.removedirs(temp_dir) + return filepath diff --git a/megatron/core/tensor_parallel/__init__.py b/megatron/core/tensor_parallel/__init__.py index 4abec79c16..41d87431fe 100644 --- a/megatron/core/tensor_parallel/__init__.py +++ b/megatron/core/tensor_parallel/__init__.py @@ -1,36 +1,43 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. from .cross_entropy import vocab_parallel_cross_entropy from .data import broadcast_data - from .layers import ( ColumnParallelLinear, RowParallelLinear, VocabParallelEmbedding, - set_tensor_model_parallel_attributes, - set_defaults_if_not_set_tensor_model_parallel_attributes, copy_tensor_model_parallel_attributes, + linear_with_grad_accumulation_and_async_allreduce, param_is_not_tensor_parallel_duplicate, - linear_with_grad_accumulation_and_async_allreduce - + set_defaults_if_not_set_tensor_model_parallel_attributes, + set_tensor_model_parallel_attributes, ) - from .mappings import ( + all_gather_last_dim_from_tensor_parallel_region, + all_to_all, + all_to_all_hp2sp, + all_to_all_sp2hp, copy_to_tensor_model_parallel_region, - gather_from_tensor_model_parallel_region, gather_from_sequence_parallel_region, - scatter_to_tensor_model_parallel_region, + gather_from_sequence_parallel_region_to_moe, + gather_from_tensor_model_parallel_region, + reduce_from_tensor_model_parallel_region, + reduce_scatter_last_dim_to_tensor_parallel_region, + reduce_scatter_to_sequence_parallel_region, + reduce_scatter_to_sequence_parallel_region_from_moe, scatter_to_sequence_parallel_region, + scatter_to_tensor_model_parallel_region, ) - from .random import ( checkpoint, get_cuda_rng_tracker, + get_data_parallel_rng_tracker_name, + get_expert_parallel_rng_tracker_name, model_parallel_cuda_manual_seed, ) - from .utils import ( + gather_split_1d_tensor, split_tensor_along_last_dim, split_tensor_into_1d_equal_chunks, - gather_split_1d_tensor, ) __all__ = [ @@ -38,7 +45,7 @@ "vocab_parallel_cross_entropy", # data.py "broadcast_data", - #layers.py + # layers.py "ColumnParallelLinear", "RowParallelLinear", "VocabParallelEmbedding", @@ -51,15 +58,19 @@ "copy_to_tensor_model_parallel_region", "gather_from_tensor_model_parallel_region", 
"gather_from_sequence_parallel_region", -# "reduce_from_tensor_model_parallel_region", + "reduce_from_tensor_model_parallel_region", + "reduce_scatter_to_sequence_parallel_region", "scatter_to_tensor_model_parallel_region", "scatter_to_sequence_parallel_region", # random.py "checkpoint", "get_cuda_rng_tracker", "model_parallel_cuda_manual_seed", + "get_expert_parallel_rng_tracker_name", # utils.py "split_tensor_along_last_dim", "split_tensor_into_1d_equal_chunks", "gather_split_1d_tensor", + "gather_from_sequence_parallel_region_to_moe", + "reduce_scatter_to_sequence_parallel_region_from_moe", ] diff --git a/megatron/core/tensor_parallel/cross_entropy.py b/megatron/core/tensor_parallel/cross_entropy.py index 9147dbbadd..0066d126fd 100644 --- a/megatron/core/tensor_parallel/cross_entropy.py +++ b/megatron/core/tensor_parallel/cross_entropy.py @@ -1,36 +1,46 @@ # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +from typing import Tuple + import torch from megatron.core.parallel_state import ( get_tensor_model_parallel_group, get_tensor_model_parallel_rank, - get_tensor_model_parallel_world_size + get_tensor_model_parallel_world_size, ) from .utils import VocabUtility -class _VocabParallelCrossEntropy(torch.autograd.Function): +class VocabParallelCrossEntropy: + """ + Computes the Cross Entropy Loss splitting the Vocab size across tensor parallel + ranks. This implementation is used in both fused and unfused cross entropy implementations + """ @staticmethod - def forward(ctx, vocab_parallel_logits, target, label_smoothing=0.0): + def calculate_logits_max( + vocab_parallel_logits: torch.Tensor, + ) -> Tuple[torch.Tensor, torch.Tensor]: + vocab_parallel_logits = vocab_parallel_logits.float() # Maximum value along vocab dimension across all GPUs. logits_max = torch.max(vocab_parallel_logits, dim=-1)[0] - torch.distributed.all_reduce(logits_max, - op=torch.distributed.ReduceOp.MAX, - group=get_tensor_model_parallel_group()) - # Subtract the maximum value. - vocab_parallel_logits = vocab_parallel_logits - logits_max.unsqueeze(dim=-1) - # Get the partition's vocab indecies - get_vocab_range = VocabUtility.vocab_range_from_per_partition_vocab_size - partition_vocab_size = vocab_parallel_logits.size()[-1] - rank = get_tensor_model_parallel_rank() - world_size = get_tensor_model_parallel_world_size() - vocab_start_index, vocab_end_index = get_vocab_range( - partition_vocab_size, rank, world_size) + return vocab_parallel_logits, logits_max + + @staticmethod + def calculate_predicted_logits( + vocab_parallel_logits: torch.Tensor, + target: torch.Tensor, + logits_max: torch.Tensor, + vocab_start_index: int, + vocab_end_index: int, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + + # In-place subtraction reduces memory pressure. + vocab_parallel_logits -= logits_max.unsqueeze(dim=-1) # Create a mask of valid vocab ids (1 means it needs to be masked). target_mask = (target < vocab_start_index) | (target >= vocab_end_index) @@ -40,26 +50,25 @@ def forward(ctx, vocab_parallel_logits, target, label_smoothing=0.0): # Get predicted-logits = logits[target]. # For Simplicity, we convert logits to a 2-D tensor with size # [*, partition-vocab-size] and target to a 1-D tensor of size [*]. 
+ partition_vocab_size = vocab_parallel_logits.size()[-1] logits_2d = vocab_parallel_logits.view(-1, partition_vocab_size) masked_target_1d = masked_target.view(-1) - arange_1d = torch.arange(start=0, end=logits_2d.size()[0], - device=logits_2d.device) + arange_1d = torch.arange(start=0, end=logits_2d.size()[0], device=logits_2d.device) predicted_logits_1d = logits_2d[arange_1d, masked_target_1d] predicted_logits_1d = predicted_logits_1d.clone().contiguous() predicted_logits = predicted_logits_1d.view_as(target) predicted_logits[target_mask] = 0.0 - # All reduce is needed to get the chunks from other GPUs. - torch.distributed.all_reduce(predicted_logits, - op=torch.distributed.ReduceOp.SUM, - group=get_tensor_model_parallel_group()) - # Sum of exponential of logits along vocab dimension across all GPUs. exp_logits = vocab_parallel_logits torch.exp(vocab_parallel_logits, out=exp_logits) sum_exp_logits = exp_logits.sum(dim=-1) - torch.distributed.all_reduce(sum_exp_logits, - op=torch.distributed.ReduceOp.SUM, - group=get_tensor_model_parallel_group()) + + return target_mask, masked_target_1d, predicted_logits, sum_exp_logits, exp_logits + + @staticmethod + def calculate_cross_entropy_loss( + exp_logits: torch.Tensor, predicted_logits: torch.Tensor, sum_exp_logits: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor]: # Loss = log(sum(exp(logits))) - predicted-logit. loss = torch.log(sum_exp_logits) - predicted_logits @@ -67,6 +76,85 @@ def forward(ctx, vocab_parallel_logits, target, label_smoothing=0.0): # Normalize and optionally smooth logits exp_logits.div_(sum_exp_logits.unsqueeze(dim=-1)) + return exp_logits, loss + + @staticmethod + def prepare_gradient_calculation_operands( + softmax: torch.Tensor, target_mask: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + + # All the inputs have softmax as thier gradient. + grad_input = softmax + # For simplicity, work with the 2D gradient. + partition_vocab_size = softmax.size()[-1] + grad_2d = grad_input.view(-1, partition_vocab_size) + + # Add the gradient from matching classes. + arange_1d = torch.arange(start=0, end=grad_2d.size()[0], device=grad_2d.device) + + softmax_update = 1.0 - target_mask.view(-1).float() + + return grad_2d, arange_1d, softmax_update, grad_input + + @staticmethod + def calculate_gradients( + grad_2d: torch.Tensor, + arange_1d: torch.Tensor, + masked_target_1d: torch.Tensor, + softmax_update: torch.Tensor, + grad_input: torch.Tensor, + grad_output: torch.Tensor, + ) -> torch.Tensor: + + grad_2d[arange_1d, masked_target_1d] -= softmax_update + + # Finally elementwise multiplication with the output gradients. 
+ grad_input.mul_(grad_output.unsqueeze(dim=-1)) + + return grad_input + + +class _VocabParallelCrossEntropy(torch.autograd.Function): + @staticmethod + def forward(ctx, vocab_parallel_logits, target, label_smoothing=0.0): + + vocab_parallel_logits, logits_max = VocabParallelCrossEntropy.calculate_logits_max( + vocab_parallel_logits + ) + torch.distributed.all_reduce( + logits_max, op=torch.distributed.ReduceOp.MAX, group=get_tensor_model_parallel_group() + ) + + # Get the partition's vocab indices + get_vocab_range = VocabUtility.vocab_range_from_per_partition_vocab_size + partition_vocab_size = vocab_parallel_logits.size()[-1] + rank = get_tensor_model_parallel_rank() + world_size = get_tensor_model_parallel_world_size() + vocab_start_index, vocab_end_index = get_vocab_range(partition_vocab_size, rank, world_size) + + (target_mask, masked_target_1d, predicted_logits, sum_exp_logits, exp_logits) = ( + VocabParallelCrossEntropy.calculate_predicted_logits( + vocab_parallel_logits, target, logits_max, vocab_start_index, vocab_end_index + ) + ) + + # All reduce is needed to get the chunks from other GPUs. + torch.distributed.all_reduce( + predicted_logits, + op=torch.distributed.ReduceOp.SUM, + group=get_tensor_model_parallel_group(), + ) + + torch.distributed.all_reduce( + sum_exp_logits, + op=torch.distributed.ReduceOp.SUM, + group=get_tensor_model_parallel_group(), + ) + + exp_logits, loss = VocabParallelCrossEntropy.calculate_cross_entropy_loss( + exp_logits, predicted_logits, sum_exp_logits + ) + vocab_size = exp_logits.size(-1) if label_smoothing > 0: """ @@ -87,7 +175,6 @@ def forward(ctx, vocab_parallel_logits, target, label_smoothing=0.0): loss = (1.0 - smoothing) * loss - smoothing * mean_log_probs ctx.label_smoothing, ctx.vocab_size = label_smoothing, vocab_size - ctx.save_for_backward(exp_logits, target_mask, masked_target_1d) # Store softmax, target-mask and masked-target for backward pass. ctx.save_for_backward(exp_logits, target_mask, masked_target_1d) @@ -101,28 +188,22 @@ def backward(ctx, grad_output): softmax, target_mask, masked_target_1d = ctx.saved_tensors label_smoothing, vocab_size = ctx.label_smoothing, ctx.vocab_size - # All the inputs have softmax as thier gradient. - grad_input = softmax - # For simplicity, work with the 2D gradient. - partition_vocab_size = softmax.size()[-1] - grad_2d = grad_input.view(-1, partition_vocab_size) - - # Add the gradient from matching classes. - arange_1d = torch.arange(start=0, end=grad_2d.size()[0], - device=grad_2d.device) - - softmax_update = 1.0 - target_mask.view(-1).float() + (grad_2d, arange_1d, softmax_update, grad_input) = ( + VocabParallelCrossEntropy.prepare_gradient_calculation_operands(softmax, target_mask) + ) if label_smoothing > 0: smoothing = label_smoothing * vocab_size / (vocab_size - 1) grad_2d[arange_1d, masked_target_1d] -= (1.0 - smoothing) * softmax_update average_grad = 1 / vocab_size grad_2d[arange_1d, :] -= smoothing * average_grad - else: - grad_2d[arange_1d, masked_target_1d] -= softmax_update - # Finally elementwise multiplication with the output gradients. - grad_input.mul_(grad_output.unsqueeze(dim=-1)) + # Finally elementwise multiplication with the output gradients. 
+ grad_input.mul_(grad_output.unsqueeze(dim=-1)) + else: + grad_input = VocabParallelCrossEntropy.calculate_gradients( + grad_2d, arange_1d, masked_target_1d, softmax_update, grad_input, grad_output + ) return grad_input, None, None @@ -131,9 +212,9 @@ def vocab_parallel_cross_entropy(vocab_parallel_logits, target, label_smoothing= """ Performs cross entropy loss when logits are split across tensor parallel ranks - Arguments: + Args: vocab_parallel_logits: logits split across tensor parallel ranks - dimension is [sequence_length, batch_size, hidden_size] + dimension is [sequence_length, batch_size, vocab_size/num_parallel_ranks] target: correct vocab ids of dimseion [sequence_length, micro_batch_size] diff --git a/megatron/core/tensor_parallel/data.py b/megatron/core/tensor_parallel/data.py index b911790dae..c549f74d73 100644 --- a/megatron/core/tensor_parallel/data.py +++ b/megatron/core/tensor_parallel/data.py @@ -8,15 +8,17 @@ get_tensor_model_parallel_src_rank, ) - _MAX_DATA_DIM = 5 def _check_data_types(keys, data, target_dtype): """Check that all the keys have the same target data type.""" for key in keys: - assert data[key].dtype == target_dtype, '{} has data type {} which '\ - 'is different than {}'.format(key, data[key].dtype, target_dtype) + assert ( + data[key].dtype == target_dtype + ), '{} has data type {} which ' 'is different than {}'.format( + key, data[key].dtype, target_dtype + ) def _build_key_size_numel_dictionaries(keys, data): @@ -35,9 +37,10 @@ def _build_key_size_numel_dictionaries(keys, data): offset += max_dim # Move to GPU and broadcast. - sizes_cuda = torch.cuda.LongTensor(sizes) - torch.distributed.broadcast(sizes_cuda, get_tensor_model_parallel_src_rank(), - group=get_tensor_model_parallel_group()) + sizes_cuda = torch.tensor(sizes, dtype=torch.long, device='cuda') + torch.distributed.broadcast( + sizes_cuda, get_tensor_model_parallel_src_rank(), group=get_tensor_model_parallel_group() + ) # Move back to cpu and unpack. sizes_cpu = sizes_cuda.cpu() @@ -66,7 +69,7 @@ def broadcast_data(keys, data, datatype): """Broadcast data from rank zero of each model parallel group to the members of the same model parallel group. - Arguments: + Args: keys: list of keys in the data disctionary to be broadcasted data: data dictionary of string keys and cpu tensor values. datatype: torch data type of all tensors in data associated @@ -74,24 +77,21 @@ def broadcast_data(keys, data, datatype): """ # Build (key, size) and (key, number of elements) dictionaries along # with the total number of elements on all ranks. - key_size, key_numel, total_numel = _build_key_size_numel_dictionaries(keys, - data) + key_size, key_numel, total_numel = _build_key_size_numel_dictionaries(keys, data) # Pack on rank zero. if get_tensor_model_parallel_rank() == 0: # Check that all keys have the same data type. 
_check_data_types(keys, data, datatype) # Flatten the data associated with the keys - flatten_data = torch.cat( - [data[key].contiguous().view(-1) for key in keys], dim=0).cuda() + flatten_data = torch.cat([data[key].contiguous().view(-1) for key in keys], dim=0).cuda() else: - flatten_data = torch.empty(total_numel, - device=torch.cuda.current_device(), - dtype=datatype) + flatten_data = torch.empty(total_numel, device=torch.cuda.current_device(), dtype=datatype) # Broadcast - torch.distributed.broadcast(flatten_data, get_tensor_model_parallel_src_rank(), - group=get_tensor_model_parallel_group()) + torch.distributed.broadcast( + flatten_data, get_tensor_model_parallel_src_rank(), group=get_tensor_model_parallel_group() + ) # Unpack output = {} diff --git a/megatron/core/tensor_parallel/layers.py b/megatron/core/tensor_parallel/layers.py index 15e0fbb025..903b4ed873 100644 --- a/megatron/core/tensor_parallel/layers.py +++ b/megatron/core/tensor_parallel/layers.py @@ -1,41 +1,40 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. # Parts of the code here are adapted from PyTorch # repo: https://github.com/pytorch/pytorch -import math import os -from typing import Optional import warnings +from typing import Any, Callable, List, Optional, Tuple import torch import torch.nn.functional as F -import torch.nn.init as init +from torch.cuda.amp import custom_bwd, custom_fwd from torch.nn.parameter import Parameter -from torch.cuda.amp import custom_fwd, custom_bwd - +from megatron.core.model_parallel_config import ModelParallelConfig from megatron.core.parallel_state import ( + get_global_memory_buffer, + get_tensor_and_expert_parallel_rank, + get_tensor_and_expert_parallel_world_size, + get_tensor_model_parallel_group, get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, - get_tensor_model_parallel_group, - get_global_memory_buffer, ) + +from ..dist_checkpointing.mapping import ShardedStateDict +from ..transformer.utils import make_sharded_tensors_for_checkpoint +from ..utils import make_tp_sharded_tensor_for_checkpoint, prepare_input_tensors_for_wgrad_compute from .mappings import ( copy_to_tensor_model_parallel_region, - gather_from_tensor_model_parallel_region, gather_from_sequence_parallel_region, + gather_from_tensor_model_parallel_region, reduce_from_tensor_model_parallel_region, - scatter_to_tensor_model_parallel_region, reduce_scatter_to_sequence_parallel_region, + scatter_to_tensor_model_parallel_region, ) - -from .random import get_cuda_rng_tracker -from .utils import ( - divide, - split_tensor_along_last_dim, - VocabUtility, -) +from .random import get_cuda_rng_tracker, get_expert_parallel_rng_tracker_name +from .utils import VocabUtility, divide _grad_accum_fusion_available = True try: @@ -43,17 +42,23 @@ except ImportError: _grad_accum_fusion_available = False -_MODEL_PARALLEL_ATTRIBUTE_DEFAULTS = {'tensor_model_parallel': False, - 'partition_dim': -1, - 'partition_stride': 1} +_MODEL_PARALLEL_ATTRIBUTE_DEFAULTS = { + 'tensor_model_parallel': False, + 'partition_dim': -1, + 'partition_stride': 1, +} + def param_is_not_tensor_parallel_duplicate(param): - return (hasattr(param, 'tensor_model_parallel') and - param.tensor_model_parallel) or ( - get_tensor_model_parallel_rank() == 0) + """Returns true if the passed-in parameter is not a duplicate parameter + on another TP rank.""" + return (hasattr(param, 'tensor_model_parallel') and param.tensor_model_parallel) or ( + 
get_tensor_model_parallel_rank() == 0 + ) def set_tensor_model_parallel_attributes(tensor, is_parallel, dim, stride): + """Sets tp attributes to tensor""" # Make sure the attributes are not set. for attribute in _MODEL_PARALLEL_ATTRIBUTE_DEFAULTS: assert not hasattr(tensor, attribute) @@ -64,67 +69,85 @@ def set_tensor_model_parallel_attributes(tensor, is_parallel, dim, stride): def set_defaults_if_not_set_tensor_model_parallel_attributes(tensor): + """Set default model parallel attributes if not set explicitly already.""" + def maybe_set(attribute, value): if not hasattr(tensor, attribute): setattr(tensor, attribute, value) + for attribute in _MODEL_PARALLEL_ATTRIBUTE_DEFAULTS: maybe_set(attribute, _MODEL_PARALLEL_ATTRIBUTE_DEFAULTS[attribute]) def copy_tensor_model_parallel_attributes(destination_tensor, source_tensor): + """Copy model parallel attributes from one tensor to another.""" + def maybe_copy(attribute): if hasattr(source_tensor, attribute): - setattr(destination_tensor, attribute, - getattr(source_tensor, attribute)) + setattr(destination_tensor, attribute, getattr(source_tensor, attribute)) + for attribute in _MODEL_PARALLEL_ATTRIBUTE_DEFAULTS: maybe_copy(attribute) -def _initialize_affine_weight_gpu(weight, init_method, - partition_dim, stride=1): +def _initialize_affine_weight_gpu( + weight, init_method, partition_dim, stride=1, expert_parallel=False +): """Initialize affine weight for model parallel on GPU.""" - set_tensor_model_parallel_attributes(tensor=weight, - is_parallel=True, - dim=partition_dim, - stride=stride) - - with get_cuda_rng_tracker().fork(): - init_method(weight) - - -def _initialize_affine_weight_cpu(weight, output_size, input_size, - per_partition_size, partition_dim, - init_method, stride=1, - return_master_weight=False, - *, params_dtype=torch.float32): + set_tensor_model_parallel_attributes( + tensor=weight, is_parallel=True, dim=partition_dim, stride=stride + ) + + if not expert_parallel: + with get_cuda_rng_tracker().fork(): + init_method(weight) + else: + with get_cuda_rng_tracker().fork(get_expert_parallel_rng_tracker_name()): + init_method(weight) + + +def _initialize_affine_weight_cpu( + weight, + output_size, + input_size, + per_partition_size, + partition_dim, + init_method, + stride=1, + return_master_weight=False, + *, + params_dtype=torch.float32, + rank=None, + world_size=None, + skip_set_tensor_parallel_attributes=False, +): """Initialize affine weight for model parallel. 
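[Editor's note] The attribute helpers above hang plain Python attributes off parameters so later code (optimizers, grad clipping, checkpointing) can tell which parameters are genuinely sharded. A small sketch of that convention follows, using the same three attribute names; the duplicate check takes an explicit rank argument because the real helper queries the tensor-parallel group.

# Sketch of the tensor-parallel attribute convention (single process, hypothetical ranks).
import torch
from torch.nn.parameter import Parameter

def mark_tensor_parallel(param, dim, stride=1):
    # Mirrors set_tensor_model_parallel_attributes: mark a parameter as sharded.
    param.tensor_model_parallel = True
    param.partition_dim = dim
    param.partition_stride = stride

def is_not_duplicate(param, rank):
    # Mirrors param_is_not_tensor_parallel_duplicate: sharded params count on
    # every rank, replicated params are counted only on rank 0.
    return getattr(param, 'tensor_model_parallel', False) or rank == 0

sharded = Parameter(torch.empty(16, 8))
replicated = Parameter(torch.empty(8))
mark_tensor_parallel(sharded, dim=0)

assert is_not_duplicate(sharded, rank=3)          # counted on every TP rank
assert not is_not_duplicate(replicated, rank=3)   # only rank 0 owns the replicated copy
assert is_not_duplicate(replicated, rank=0)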
Build the master weight on all processes and scatter the relevant chunk.""" - set_tensor_model_parallel_attributes(tensor=weight, - is_parallel=True, - dim=partition_dim, - stride=stride) + if not skip_set_tensor_parallel_attributes: + set_tensor_model_parallel_attributes( + tensor=weight, is_parallel=True, dim=partition_dim, stride=stride + ) # Initialize master weight - master_weight = torch.empty(output_size, input_size, - dtype=torch.float, - requires_grad=False) + master_weight = torch.empty(output_size, input_size, dtype=torch.float, requires_grad=False) init_method(master_weight) master_weight = master_weight.to(dtype=params_dtype) - # Split and copy per_partition_per_stride_size = divide(per_partition_size, stride) - weight_list = torch.split(master_weight, per_partition_per_stride_size, - dim=partition_dim) - rank = get_tensor_model_parallel_rank() - world_size = get_tensor_model_parallel_world_size() + weight_list = torch.split(master_weight, per_partition_per_stride_size, dim=partition_dim) + if rank is None: + rank = get_tensor_model_parallel_rank() + world_size = get_tensor_model_parallel_world_size() my_weight_list = weight_list[rank::world_size] with torch.no_grad(): - torch.cat(my_weight_list, dim=partition_dim, out=weight) + # all tensors must live on the same device + cpu_weight = torch.cat(my_weight_list, dim=partition_dim).to_dense() + weight.data.copy_(cpu_weight) if return_master_weight: return master_weight return None @@ -135,107 +158,267 @@ class VocabParallelEmbedding(torch.nn.Module): This is mainly adapted from torch.nn.Embedding and all the default values are kept. - Arguments: + + Args: num_embeddings: vocabulary size. embedding_dim: size of hidden state. + reduce_scatter_embeddings: Decides whether to perform ReduceScatter after embedding lookup - Keyword Arguments: - init_method: method to initialize weights. - params_dtype - use_cpu_initialization - perform_initialization + Keyword Args: + config: A megatron.core.ModelParallelConfig object """ - def __init__(self, num_embeddings: int, embedding_dim: int, *, - init_method=init.xavier_normal_, - params_dtype: torch.dtype=torch.float32, - use_cpu_initialization: bool=False, - perform_initialization: bool=True): + def __init__( + self, + num_embeddings: int, + embedding_dim: int, + *, + init_method: Callable, + reduce_scatter_embeddings: bool = False, + config: ModelParallelConfig, + ): super(VocabParallelEmbedding, self).__init__() # Keep the input dimensions. self.num_embeddings = num_embeddings self.embedding_dim = embedding_dim - # Set the detauls for compatibility. - self.padding_idx = None - self.max_norm = None - self.norm_type = 2. - self.scale_grad_by_freq = False - self.sparse = False - self._weight = None + self.reduce_scatter_embeddings = reduce_scatter_embeddings self.tensor_model_parallel_size = get_tensor_model_parallel_world_size() # Divide the weight matrix along the vocaburaly dimension. - self.vocab_start_index, self.vocab_end_index = \ + (self.vocab_start_index, self.vocab_end_index) = ( VocabUtility.vocab_range_from_global_vocab_size( - self.num_embeddings, get_tensor_model_parallel_rank(), - self.tensor_model_parallel_size) - self.num_embeddings_per_partition = self.vocab_end_index - \ - self.vocab_start_index + self.num_embeddings, + get_tensor_model_parallel_rank(), + self.tensor_model_parallel_size, + ) + ) + self.num_embeddings_per_partition = self.vocab_end_index - self.vocab_start_index + self.deterministic_mode = config.deterministic_mode # Allocate weights and initialize. 
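[Editor's note] The CPU initialization path above builds the full master weight on every rank and then carves out that rank's slice. The following is a toy reproduction of just the slicing arithmetic (the stride split and the rank::world_size selection) with made-up sizes; it does not call the Megatron helpers.

# Toy reproduction of the master-weight slicing in _initialize_affine_weight_cpu.
import torch

output_size, input_size = 8, 4          # full (unpartitioned) weight shape
world_size, rank, stride = 2, 1, 1      # hypothetical TP layout
per_partition_size = output_size // world_size

master_weight = torch.randn(output_size, input_size)

# Split along the partition dimension, then interleave by stride:
# rank r takes chunks r, r + world_size, r + 2 * world_size, ...
per_partition_per_stride_size = per_partition_size // stride
weight_list = torch.split(master_weight, per_partition_per_stride_size, dim=0)
my_weight = torch.cat(weight_list[rank::world_size], dim=0)

assert my_weight.shape == (per_partition_size, input_size)
# With stride=1 this is simply rows [4:8] of the master weight for rank 1.
assert torch.equal(my_weight, master_weight[4:8])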
- if use_cpu_initialization: - self.weight = Parameter(torch.empty( - self.num_embeddings_per_partition, self.embedding_dim, - dtype=params_dtype)) - if perform_initialization: + if config.use_cpu_initialization: + self.weight = Parameter( + torch.empty( + self.num_embeddings_per_partition, self.embedding_dim, dtype=config.params_dtype + ) + ) + if config.perform_initialization: _initialize_affine_weight_cpu( - self.weight, self.num_embeddings, self.embedding_dim, - self.num_embeddings_per_partition, 0, init_method, - params_dtype=params_dtype) + self.weight, + self.num_embeddings, + self.embedding_dim, + self.num_embeddings_per_partition, + 0, + init_method, + params_dtype=config.params_dtype, + ) else: - self.weight = Parameter(torch.empty( - self.num_embeddings_per_partition, self.embedding_dim, - device=torch.cuda.current_device(), dtype=params_dtype)) - if perform_initialization: - _initialize_affine_weight_gpu(self.weight, init_method, - partition_dim=0, stride=1) + self.weight = Parameter( + torch.empty( + self.num_embeddings_per_partition, + self.embedding_dim, + device=torch.cuda.current_device(), + dtype=config.params_dtype, + ) + ) + if config.perform_initialization: + _initialize_affine_weight_gpu(self.weight, init_method, partition_dim=0, stride=1) def forward(self, input_): + """Forward. + + Args: + input_ (torch.Tensor): Input tensor. + """ if self.tensor_model_parallel_size > 1: # Build the mask. - input_mask = (input_ < self.vocab_start_index) | \ - (input_ >= self.vocab_end_index) + input_mask = (input_ < self.vocab_start_index) | (input_ >= self.vocab_end_index) # Mask the input. masked_input = input_.clone() - self.vocab_start_index masked_input[input_mask] = 0 else: masked_input = input_ - # Get the embeddings. - output_parallel = F.embedding(masked_input, self.weight, - self.padding_idx, self.max_norm, - self.norm_type, self.scale_grad_by_freq, - self.sparse) + # Get the embeddings. + if self.deterministic_mode: + output_parallel = self.weight[masked_input] + else: + # F.embedding currently has a non-deterministic backward function + output_parallel = F.embedding(masked_input, self.weight) # Mask the output embedding. if self.tensor_model_parallel_size > 1: output_parallel[input_mask, :] = 0.0 - # Reduce across all the model parallel GPUs. - output = reduce_from_tensor_model_parallel_region(output_parallel) + + if self.reduce_scatter_embeddings: + # Data format change to avoid explicit tranposes : [b s h] --> [s b h]. + output_parallel = output_parallel.transpose(0, 1).contiguous() + output = reduce_scatter_to_sequence_parallel_region(output_parallel) + else: + # Reduce across all the model parallel GPUs. + output = reduce_from_tensor_model_parallel_region(output_parallel) return output + def sharded_state_dict( + self, + prefix: str = '', + sharded_offsets: Tuple[Tuple[int, int, int]] = (), + metadata: Optional[dict] = None, + ) -> ShardedStateDict: + """Non-default implementation for embeddings due to `allow_shape_mismatch` param""" + state_dict = self.state_dict(prefix='', keep_vars=True) + + weight_prefix = f'{prefix}weight' + return { + weight_prefix: make_tp_sharded_tensor_for_checkpoint( + tensor=state_dict['weight'], + key=weight_prefix, + allow_shape_mismatch=True, + prepend_offsets=sharded_offsets, + ) + } + + +class LinearWithFrozenWeight(torch.autograd.Function): + """Linear operator that does not calculate gradient for weight. + This op and LinearWithGradAccumulationAndAsyncCommunication performs + mathematically-identical forward and DGRAD. 
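[Editor's note] Since each rank's vocab shard is a contiguous [start, end) slice, the masking in VocabParallelEmbedding.forward can be checked numerically. Below is a CPU-only recreation of that logic with a toy 2-way split; plain indexing stands in for F.embedding (this is exactly the deterministic_mode path) and a Python sum stands in for the all-reduce.

# Toy check of VocabParallelEmbedding's mask -> lookup -> zero -> all-reduce logic.
import torch

vocab_size, hidden, tp_size = 8, 3, 2
full_table = torch.randn(vocab_size, hidden)
tokens = torch.tensor([[1, 6, 3], [7, 0, 5]])           # [batch, seq], ids span both shards

partial_outputs = []
per_rank = vocab_size // tp_size
for rank in range(tp_size):
    start, end = rank * per_rank, (rank + 1) * per_rank  # this rank's vocab range
    shard = full_table[start:end]                        # rank-local embedding rows

    mask = (tokens < start) | (tokens >= end)            # ids this rank does not own
    local_ids = tokens.clone() - start
    local_ids[mask] = 0                                  # clamp so the lookup stays in range
    out = shard[local_ids]                               # deterministic embedding lookup
    out[mask, :] = 0.0                                   # zero rows owned by other ranks
    partial_outputs.append(out)

# The all-reduce across TP ranks is just an elementwise sum of the partial outputs.
reduced = sum(partial_outputs)
assert torch.allclose(reduced, full_table[tokens])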
+ + Conceptually this op is the same as torch.nn.functional.linear with + weight.requires_grad==False, but in experiments they are not identical + mathematically.""" + + @staticmethod + @custom_fwd + def forward(ctx, input, weight, bias, allreduce_dgrad): + """Forward with frozen weight.""" + ctx.save_for_backward(weight) + ctx.allreduce_dgrad = allreduce_dgrad + output = torch.matmul(input, weight.t()) + if bias is not None: + output = output + bias + return output + + @staticmethod + @custom_bwd + def backward(ctx, grad_output): + """Backward with frozen weight.""" + (weight,) = ctx.saved_tensors + grad_input = grad_output.matmul(weight) + + if ctx.allreduce_dgrad: + # All-reduce. Note: here async and sync are effectively the same. + torch.distributed.all_reduce(grad_input, group=get_tensor_model_parallel_group()) + + return grad_input, None, None, None + + +def linear_with_frozen_weight( + input: torch.Tensor, + weight: torch.Tensor, + bias: Optional[torch.Tensor], + gradient_accumulation_fusion: bool, + async_grad_allreduce: bool, + sequence_parallel: bool, + grad_output_buffer: Optional[List[torch.Tensor]] = None, + wgrad_deferral_limit: None = None, + allreduce_dgrad: bool = None, +) -> torch.Tensor: + """Linear layer execution with weight.requires_grad == False. + + This function handles linear layers with weight frozen (untrainable). + In the forward, it only saves weight and does not save input activations. + In the backward, it does not perform weight gradient calculation, or + weight gradient allreduce. + + Args: + + input (torch.Tensor required): input like torch.nn.functional.linear + + weight (torch.Tensor required): weight like torch.nn.functional.linear + + bias (torch.Tensor optional): bias like torch.nn.functional.linear + + gradient_accumulation_fusion (bool required): dummy argument, used to + keep the API unified between all forward implementation functions. + + async_grad_allreduce (bool required): dummy argument, used to + keep the API unified between all forward implementation functions. + + sequence_parallel (bool required): Indicates that sequence + parallelism is used and thus in the forward pass the input is + all gathered, and the backward pass the input gradients are + reduce scattered. + + grad_output_buffer (List[torch.Tensor] optional): dummy argument, used to + keep the API unified between all forward implementation functions. + + wgrad_deferral_limit (int optional): dummy argument, used to + keep the API unified between all forward implementation functions. + + allreduce_dgrad (bool): Do the allreduce of input gradients. + Here, async and sync allreduce are the same. If sequence_parallel is + True, this must be False, as no all reduce is performed. + + """ + + assert grad_output_buffer is None, ( + "grad_output_buffer kwarg is only supported with " + "linear_with_grad_accumulation_and_async_allreduce" + ) + + assert wgrad_deferral_limit is None, ( + "This arg is only supported with " "linear_with_grad_accumulation_and_async_allreduce" + ) + + if sequence_parallel: + input = gather_from_sequence_parallel_region(input, tensor_parallel_output_grad=True) + else: + input = input + + if allreduce_dgrad is None: + warnings.warn( + "`async_grad_allreduce` is deprecated and will be removed in a future release. " + "Please ue `allreduce_dgrad` instead." 
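[Editor's note] linear_with_frozen_weight only skips the weight-gradient work; the input-gradient path is unchanged. A minimal CPU sketch of that behaviour with a plain matmul follows (no TP group here, so the dgrad all-reduce is omitted; sizes are arbitrary).

# Minimal sketch: a linear layer whose weight is frozen still produces input grads.
import torch

inp = torch.randn(4, 6, requires_grad=True)
weight = torch.randn(3, 6, requires_grad=False)   # frozen: no wgrad is computed
bias = torch.zeros(3)

out = torch.matmul(inp, weight.t()) + bias        # same math as the frozen-weight forward
out.sum().backward()

assert weight.grad is None                        # no weight gradient accumulated
assert inp.grad is not None and inp.grad.shape == inp.shape
# In Megatron, inp.grad would additionally be all-reduced across the TP group
# when allreduce_dgrad=True; that collective is omitted in this sketch.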
+ ) + allreduce_dgrad = async_grad_allreduce + + args = [input, weight, bias, allreduce_dgrad] + + return LinearWithFrozenWeight.apply(*args) + class LinearWithGradAccumulationAndAsyncCommunication(torch.autograd.Function): """See linear_with_grad_accumulation_and_async_allreduce""" @staticmethod @custom_fwd - def forward(ctx, input, weight, bias, gradient_accumulation_fusion, - async_grad_allreduce, sequence_parallel): + def forward( + ctx, + input, + weight, + bias, + gradient_accumulation_fusion, + allreduce_dgrad, + sequence_parallel, + grad_output_buffer, + wgrad_deferral_limit, + ): + """Forward.""" ctx.save_for_backward(input, weight) ctx.use_bias = bias is not None ctx.gradient_accumulation_fusion = gradient_accumulation_fusion - ctx.async_grad_allreduce = async_grad_allreduce + ctx.allreduce_dgrad = allreduce_dgrad ctx.sequence_parallel = sequence_parallel + ctx.wgrad_deferral_limit = wgrad_deferral_limit + ctx.grad_output_buffer = grad_output_buffer if sequence_parallel: world_size = get_tensor_model_parallel_world_size() dim_size = list(input.size()) dim_size[0] = dim_size[0] * world_size - all_gather_buffer = \ - get_global_memory_buffer().get_tensor(dim_size, input.dtype, "mpu") + all_gather_buffer = get_global_memory_buffer().get_tensor(dim_size, input.dtype, "mpu") torch.distributed._all_gather_base( - all_gather_buffer, - input, - group=get_tensor_model_parallel_group()) + all_gather_buffer, input, group=get_tensor_model_parallel_group() + ) total_input = all_gather_buffer else: total_input = input @@ -248,91 +431,128 @@ def forward(ctx, input, weight, bias, gradient_accumulation_fusion, @staticmethod @custom_bwd def backward(ctx, grad_output): + """Backward.""" input, weight = ctx.saved_tensors use_bias = ctx.use_bias + grad_output_buffer = ctx.grad_output_buffer + wgrad_deferral_limit = ctx.wgrad_deferral_limit + + wgrad_compute = True + if grad_output_buffer is not None: + if wgrad_deferral_limit == 0 or len(grad_output_buffer) < wgrad_deferral_limit: + grad_output_buffer.append(grad_output) + wgrad_compute = False + + if wgrad_compute: + if ctx.sequence_parallel: + world_size = get_tensor_model_parallel_world_size() + dim_size = list(input.size()) + dim_size[0] = dim_size[0] * world_size + + all_gather_buffer = get_global_memory_buffer().get_tensor( + dim_size, input.dtype, "mpu" + ) + handle = torch.distributed._all_gather_base( + all_gather_buffer, input, group=get_tensor_model_parallel_group(), async_op=True + ) - if ctx.sequence_parallel: - world_size = get_tensor_model_parallel_world_size() - dim_size = list(input.size()) - dim_size[0] = dim_size[0] * world_size - - all_gather_buffer = \ - get_global_memory_buffer().get_tensor(dim_size, input.dtype, "mpu") - handle = torch.distributed._all_gather_base( - all_gather_buffer, - input, - group=get_tensor_model_parallel_group(), async_op=True) - - # Here we rely on CUDA_DEVICE_MAX_CONNECTIONS=1 to ensure that the - # gather is scheduled before the input gradient computation - total_input = all_gather_buffer - else: - total_input = input + # Here we rely on CUDA_DEVICE_MAX_CONNECTIONS=1 to ensure that the + # gather is scheduled before the input gradient computation + total_input = all_gather_buffer + else: + total_input = input grad_input = grad_output.matmul(weight) - if ctx.sequence_parallel: + if ctx.sequence_parallel and wgrad_compute: handle.wait() - # Doing gather + slicing during the NeMo forward pass can make this tensor - # not be contiguous. 
PyTorch only checks if the tensor is contiguous, and only - # clones it if it's not contiguous: - # https://github.com/pytorch/pytorch/blob/c47cf9bc7f9e02f649ab4ed53fe4d35732c92ab6/torch/_refs/__init__.py#L2761 - grad_output = grad_output.contiguous() - # Convert the tensor shapes to 2D for execution compatibility - grad_output = grad_output.view(grad_output.shape[0] * grad_output.shape[1], - grad_output.shape[2]) - total_input = total_input.view(total_input.shape[0] * total_input.shape[1], - total_input.shape[2]) - - if ctx.async_grad_allreduce: + if wgrad_compute: + grad_output, total_input = prepare_input_tensors_for_wgrad_compute( + grad_output, total_input + ) + + if ctx.allreduce_dgrad: # Asynchronous all-reduce handle = torch.distributed.all_reduce( - grad_input, group=get_tensor_model_parallel_group(), async_op=True) + grad_input, group=get_tensor_model_parallel_group(), async_op=True + ) # Here we rely on CUDA_DEVICE_MAX_CONNECTIONS=1 to ensure that the # all-reduce is scheduled before the weight gradient computation if ctx.sequence_parallel: - assert not ctx.async_grad_allreduce + assert not ctx.allreduce_dgrad dim_size = list(input.size()) - sub_grad_input = torch.empty(dim_size, dtype=input.dtype, - device=torch.cuda.current_device(), - requires_grad=False) + sub_grad_input = torch.empty( + dim_size, dtype=input.dtype, device=torch.cuda.current_device(), requires_grad=False + ) # reduce_scatter - handle = torch.distributed._reduce_scatter_base(sub_grad_input, grad_input, - group=get_tensor_model_parallel_group(), - async_op=True) + handle = torch.distributed._reduce_scatter_base( + sub_grad_input, grad_input, group=get_tensor_model_parallel_group(), async_op=True + ) # Here we rely on CUDA_DEVICE_MAX_CONNECTIONS=1 to ensure that the # reduce scatter is scheduled before the weight gradient computation - if ctx.gradient_accumulation_fusion: - if weight.main_grad.dtype == torch.float32: - fused_weight_gradient_mlp_cuda.wgrad_gemm_accum_fp32(total_input, grad_output, weight.main_grad) - elif weight.main_grad.dtype in (torch.float16, torch.bfloat16): - fused_weight_gradient_mlp_cuda.wgrad_gemm_accum_fp16(total_input, grad_output, weight.main_grad) + if wgrad_compute: + if weight.main_grad.dtype == torch.float32: + fused_weight_gradient_mlp_cuda.wgrad_gemm_accum_fp32( + total_input, grad_output, weight.main_grad + ) + elif weight.main_grad.dtype in (torch.float16, torch.bfloat16): + fused_weight_gradient_mlp_cuda.wgrad_gemm_accum_fp16( + total_input, grad_output, weight.main_grad + ) + else: + raise RuntimeError("Unsupported gradient type for gradient accumulation fusion") + + if hasattr(weight, 'grad_added_to_main_grad'): + # When overlap_grad_reduce is True, need to ensure that backward hooks + # are all run on the main backprop thread to prevent deadlocks. Setup + # dummy grad_weight tensor to prevent backward hooks from being run + # in a background thread. 
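[Editor's note] The backward above flattens [sequence, batch, hidden] activations and output grads into 2D before the weight-gradient GEMM (or its fused-accumulation equivalent). A small numeric check of that reshaping follows; prepare_input_tensors_for_wgrad_compute is replaced by explicit .view calls since only the shapes matter here.

# Numeric check: flattening (s, b, h) tensors to 2D does not change the wgrad GEMM.
import torch

s, b, h_in, h_out = 5, 2, 4, 3
total_input = torch.randn(s, b, h_in)      # (all-gathered) layer input
grad_output = torch.randn(s, b, h_out)     # gradient w.r.t. the layer output

# Reference: accumulate the outer products token by token.
ref = torch.zeros(h_out, h_in)
for i in range(s):
    for j in range(b):
        ref += torch.outer(grad_output[i, j], total_input[i, j])

# What the backward does: collapse (s, b) into one dimension and run a single GEMM.
go_2d = grad_output.contiguous().view(s * b, h_out)
ti_2d = total_input.contiguous().view(s * b, h_in)
grad_weight = go_2d.t().matmul(ti_2d)      # [h_out, h_in], same layout as the weight

assert torch.allclose(grad_weight, ref, atol=1e-5)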
+ if getattr(weight, 'zero_out_wgrad', False): + grad_weight = torch.zeros( + weight.main_grad.shape, + dtype=input.dtype, + device=torch.cuda.current_device(), + requires_grad=False, + ) + else: + grad_weight = torch.empty( + weight.main_grad.shape, + dtype=input.dtype, + device=torch.cuda.current_device(), + requires_grad=False, + ) + weight.grad_added_to_main_grad = True else: - raise RuntimeError("Unsupported gradient type for gradient accumulation fusion") - grad_weight = None + grad_weight = None else: grad_weight = grad_output.t().matmul(total_input) grad_bias = grad_output.sum(dim=0) if use_bias else None if ctx.sequence_parallel: handle.wait() - return sub_grad_input, grad_weight, grad_bias, None, None, None + # Need to return None's as gradient has to flow for all the input arguments + # provided during forward + return sub_grad_input, grad_weight, grad_bias, None, None, None, None, None - if ctx.async_grad_allreduce: + if ctx.allreduce_dgrad: handle.wait() - return grad_input, grad_weight, grad_bias, None, None, None + return grad_input, grad_weight, grad_bias, None, None, None, None, None + def linear_with_grad_accumulation_and_async_allreduce( input: torch.Tensor, weight: torch.Tensor, bias: Optional[torch.Tensor], gradient_accumulation_fusion: bool, - async_grad_allreduce: bool, - sequence_parallel_enabled: bool, + sequence_parallel: bool, + allreduce_dgrad: bool, + async_grad_allreduce: Optional[bool] = None, + grad_output_buffer: Optional[List[torch.Tensor]] = None, + wgrad_deferral_limit: Optional[int] = 0, ) -> torch.Tensor: """Linear layer execution with asynchronous communication and gradient accumulation fusion in backprop. @@ -359,105 +579,155 @@ def linear_with_grad_accumulation_and_async_allreduce( CUDA_DEVICE_MAX_CONNECTIONS=1 forces the kernels to be scheduled in the order they are called. - Arguments: + Args: + input (torch.Tensor required): input like torch.nn.functional.linear - input (torch.Tensor required): input like torch.nn.functional.linear + weight (torch.Tensor required): weight like torch.nn.functional.linear - weight (torch.Tensor required): weight like torch.nn.functional.linear + bias (torch.Tensor optional): bias like torch.nn.functional.linear - bias (torch.Tensor optional): bias like torch.nn.functional.linear + gradient_accumulation_fusion (bool required): Perform the gradient + accumulation fusion, requires the custom CUDA extension + fused_weight_gradient_mlp_cuda module. To use + gradient_accumulation_fusion you must install APEX with + --cpp_ext and --cuda_ext. For example: "pip install + --global-option=\"--cpp_ext\" --global-option=\"--cuda_ext .\" + " Note that the extension requires CUDA>=11. Otherwise, you + must turn off gradient accumulation fusion." + + allreduce_dgrad (bool required): Do the allreduce of input gradients. + The allreduce is done asynchronously with the computation of weight + gradients. If sequence_parallel is True, this must be + False, as no all reduce is performed. + + async_grad_allreduce (bool optional): Do the allreduce of input + gradients asyncronously with the computation of weight + gradients. If sequence_parallel is True, this must be + False, as no all reduce is performed. Will be deprecated with 0.10.0 + + sequence_parallel (bool required): Indicates that sequence + parallelism is used and thus in the forward pass the input is + all gathered, and the backward pass the input gradients are + reduce scattered. 
+ + grad_output_buffer (List[torch.Tensor] optional): Buffer used to save + output gradients when embedding table wgrad compute is deferred. + Defaults to None. + + wgrad_deferral_limit (int optional): Limit on the number of + micro-batches for which embedding weight gradient GEMM should be + deferred. Disable by setting this to 0. Defaults to 0. - gradient_accumulation_fusion (bool required): Perform the gradient - accumulation fusion, requires the custom CUDA extension - fused_weight_gradient_mlp_cuda module. To use - gradient_accumulation_fusion you must install APEX with - --cpp_ext and --cuda_ext. For example: "pip install - --global-option=\"--cpp_ext\" --global-option=\"--cuda_ext .\" - " Note that the extension requires CUDA>=11. Otherwise, you - must turn off gradient accumulation fusion." - - async_grad_allreduce (bool required): Do the allreduce of input - gradients asyncronously with the computation of weight - gradients. If sequence_parallel_enabled is True, this must be - False, as no all reduce is performed. - - sequence_parallel_enabled (bool required): Indicates that sequence - parallelism is used and thus in the forward pass the input is - all gathered, and the backward pass the input gradients are - reduce scattered. """ + if async_grad_allreduce is not None: + warnings.warn( + "async_grad_allreduce is deprecated, not in use anymore and will" + " be fully removed with 0.10.0. Please use allreduce_dgrad instead." + ) + args = [ input, weight, bias, gradient_accumulation_fusion, - async_grad_allreduce, - sequence_parallel_enabled, + allreduce_dgrad, + sequence_parallel, + grad_output_buffer, + wgrad_deferral_limit, ] if not linear_with_grad_accumulation_and_async_allreduce.warned: if os.environ.get('CUDA_DEVICE_MAX_CONNECTIONS') != "1": - if sequence_parallel_enabled: + if sequence_parallel: warnings.warn( "When using sequence parallelism it is recommended to set the " "environment variable CUDA_DEVICE_MAX_CONNECTIONS to 1 for " - "maximum speedup") + "maximum speedup" + ) linear_with_grad_accumulation_and_async_allreduce.warned = True - if async_grad_allreduce: + if allreduce_dgrad: warnings.warn( "When using async grad allreduce it is recommended to set the " "environment variable CUDA_DEVICE_MAX_CONNECTIONS to 1 for " - "maximum speedup") + "maximum speedup" + ) linear_with_grad_accumulation_and_async_allreduce.warned = True return LinearWithGradAccumulationAndAsyncCommunication.apply(*args) + linear_with_grad_accumulation_and_async_allreduce.warned = False + class ColumnParallelLinear(torch.nn.Module): """Linear layer with column parallelism. The linear layer is defined as Y = XA + b. A is parallelized along its second dimension as A = [A_1, ..., A_p]. - Arguments: - input_size: first dimension of matrix A. - output_size: second dimension of matrix A. - - Keyword Arguments - bias: If true, add bias - gather_output: If true, call all-gather on output and make Y available - to all GPUs, otherwise, every GPU will have its output - which is Y_i = XA_i - init_method: method to initialize weights. Note that bias is always set - to zero. - stride: For the strided linear layers. - keep_master_weight_for_test: This was added for testing and should be - set to False. It returns the master weights - used for initialization. - skip_bias_add: This was added to enable performance optimations where bias - can be fused with other elementwise operations. we skip - adding bias but instead return it. 
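[Editor's note] When defer_embedding_wgrad_compute is enabled, the backward above appends grad_output to a buffer (up to wgrad_deferral_limit entries) instead of running the GEMM immediately. The toy sketch below shows only that bookkeeping; for simplicity the matching input is buffered alongside the grad, whereas Megatron keeps inputs in a separate embedding_activation_buffer, and the drain helper is a stand-in for the later, batched wgrad computation.

# Toy sketch of deferred weight-gradient computation with a deferral limit.
import torch

wgrad_deferral_limit = 2            # defer at most this many micro-batches (0 = unlimited)
grad_output_buffer = []             # filled during backward, drained later
main_grad = torch.zeros(3, 4)       # weight.main_grad equivalent

def backward_step(grad_output, total_input):
    # Mirrors the branch in backward(): either defer or compute the wgrad now.
    if wgrad_deferral_limit == 0 or len(grad_output_buffer) < wgrad_deferral_limit:
        grad_output_buffer.append((grad_output, total_input))    # deferred
    else:
        main_grad.add_(grad_output.t().matmul(total_input))      # immediate

def drain_deferred_wgrads():
    # Later (e.g. at the end of the micro-batch loop) the buffered GEMMs are run.
    while grad_output_buffer:
        grad_output, total_input = grad_output_buffer.pop()
        main_grad.add_(grad_output.t().matmul(total_input))

for _ in range(4):                  # 4 micro-batches: 2 deferred, 2 immediate
    backward_step(torch.randn(6, 3), torch.randn(6, 4))
drain_deferred_wgrads()
assert not grad_output_buffer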
- async_tensor_model_parallel_allreduce: - params_dtype: - use_cpu_initialization: - gradient_accumulation_fusion: - sequence_parallel_enabled: + Args: + input_size: + first dimension of matrix A. + output_size: + second dimension of matrix A. + bias: + If true, add bias + gather_output: + If true, call all-gather on output and make Y available to all GPUs, + otherwise, every GPU will have its output which is Y_i = XA_i + init_method: + method to initialize weights. Note that bias is always set to zero. + stride: + For the strided linear layers. + keep_master_weight_for_test: + This was added for testing and should be set to False. It + returns the master weights used for initialization. + skip_bias_add: + If True, do not add the bias term, instead return it to be added by the + caller. This enables performance optimations where bias can be fused with other + elementwise operations. + skip_weight_param_allocation: + If True, weight parameter is not allocated and must be passed + as a keyword argument `weight` during the forward pass. Note that this does not + affect bias, which will be allocated if bias is True. Defaults to False. + embedding_activation_buffer: + This buffer holds the input activations of the final embedding + linear layer on the last pipeline stage when defer_embedding_wgrad_compute is enabled. + grad_output_buffer: + This buffer holds the gradient outputs of the final embedding linear + layer on the last pipeline stage when defer_embedding_wgrad_compute is enabled. + is_expert: + If True, the layer is treated as an MoE expert layer. + config: + ModelParallelConfig object + tp_comm_buffer_name: + Communication buffer name is not used in non-Transformer-Engine modules. + disable_grad_reduce: + If True, reduction of output gradients across tensor-parallel ranks + will be disabled. Defaults to False. This feature is used by Lora Adapter in Nemo to + delay and fuse reduction along with other gradients for performance optimization. """ - def __init__(self, input_size, output_size, *, - bias=True, gather_output=True, - init_method=init.xavier_normal_, stride=1, - keep_master_weight_for_test=False, - skip_bias_add=False, - async_tensor_model_parallel_allreduce=True, - params_dtype=torch.float32, - use_cpu_initialization=False, - perform_initialization=True, - gradient_accumulation_fusion=False, - sequence_parallel_enabled: bool = False, - ): + def __init__( + self, + input_size, + output_size, + *, + config: ModelParallelConfig, + init_method: Callable, + bias=True, + gather_output=False, + stride=1, + keep_master_weight_for_test=False, + skip_bias_add=False, + skip_weight_param_allocation: bool = False, + embedding_activation_buffer: Optional[List[torch.Tensor]] = None, + grad_output_buffer: Optional[List[torch.Tensor]] = None, + is_expert: bool = False, + tp_comm_buffer_name: str = None, # Not used + disable_grad_reduce: bool = False, + ): super(ColumnParallelLinear, self).__init__() # Keep input parameters @@ -465,216 +735,401 @@ def __init__(self, input_size, output_size, *, self.output_size = output_size self.gather_output = gather_output # Divide the weight matrix along the last dimension. 
- world_size = get_tensor_model_parallel_world_size() - self.output_size_per_partition = divide(output_size, world_size) self.skip_bias_add = skip_bias_add + self.is_expert = is_expert + self.expert_parallel = config.expert_model_parallel_size > 1 + self.embedding_activation_buffer = embedding_activation_buffer + self.grad_output_buffer = grad_output_buffer + self.config = config + self.disable_grad_reduce = disable_grad_reduce + + self.explicit_expert_comm = self.is_expert and ( + config.tensor_model_parallel_size > 1 or self.expert_parallel + ) + if self.explicit_expert_comm and config.moe_extended_tp: + world_size = get_tensor_and_expert_parallel_world_size() + rank = get_tensor_and_expert_parallel_rank() + else: + world_size = get_tensor_model_parallel_world_size() + rank = get_tensor_model_parallel_rank() + + self.output_size_per_partition = divide(output_size, world_size) # Parameters. # Note: torch.nn.functional.linear performs XA^T + b and as a result # we allocate the transpose. # Initialize weight. - if use_cpu_initialization: - self.weight = Parameter(torch.empty(self.output_size_per_partition, - self.input_size, - dtype=params_dtype)) - if perform_initialization: - self.master_weight = _initialize_affine_weight_cpu( - self.weight, self.output_size, self.input_size, - self.output_size_per_partition, 0, init_method, - stride=stride, return_master_weight=keep_master_weight_for_test) + if not skip_weight_param_allocation: + if config.use_cpu_initialization: + self.weight = Parameter( + torch.empty( + self.output_size_per_partition, self.input_size, dtype=config.params_dtype + ) + ) + if config.perform_initialization: + self.master_weight = _initialize_affine_weight_cpu( + self.weight, + self.output_size, + self.input_size, + self.output_size_per_partition, + 0, + init_method, + stride=stride, + return_master_weight=keep_master_weight_for_test, + rank=rank, + world_size=world_size, + ) + else: + self.weight = Parameter( + torch.empty( + self.output_size_per_partition, + self.input_size, + device=torch.cuda.current_device(), + dtype=config.params_dtype, + ) + ) + if config.perform_initialization: + _initialize_affine_weight_gpu( + self.weight, + init_method, + partition_dim=0, + stride=stride, + expert_parallel=(self.is_expert and self.expert_parallel), + ) + + setattr(self.weight, 'allreduce', not (self.is_expert and self.expert_parallel)) else: - self.weight = Parameter(torch.empty( - self.output_size_per_partition, self.input_size, - device=torch.cuda.current_device(), dtype=params_dtype)) - if perform_initialization: - _initialize_affine_weight_gpu(self.weight, init_method, - partition_dim=0, stride=stride) + self.weight = None if bias: - if use_cpu_initialization: - self.bias = Parameter(torch.empty( - self.output_size_per_partition, dtype=params_dtype)) + if config.use_cpu_initialization: + self.bias = Parameter( + torch.empty(self.output_size_per_partition, dtype=config.params_dtype) + ) else: - self.bias = Parameter(torch.empty( - self.output_size_per_partition, - device=torch.cuda.current_device(), - dtype=params_dtype)) + self.bias = Parameter( + torch.empty( + self.output_size_per_partition, + device=torch.cuda.current_device(), + dtype=config.params_dtype, + ) + ) set_tensor_model_parallel_attributes(self.bias, True, 0, stride) - # Always initialize bias to zero. - with torch.no_grad(): - self.bias.zero_() + if config.perform_initialization: + # Always initialize bias to zero. 
+ with torch.no_grad(): + self.bias.zero_() + setattr(self.bias, 'allreduce', not (self.is_expert and self.expert_parallel)) else: self.register_parameter('bias', None) - self.async_tensor_model_parallel_allreduce = ( - async_tensor_model_parallel_allreduce and - world_size > 1) - if sequence_parallel_enabled: - if world_size <= 1: - warnings.warn( - f"`sequence_parallel_enabled` is set to `True`, but tensor model parallel size is {world_size}. " - f"Disabling sequence parallel." - ) - sequence_parallel_enabled = False - self.sequence_parallel_enabled = sequence_parallel_enabled + self.sequence_parallel = config.sequence_parallel + if self.sequence_parallel and world_size <= 1: + warnings.warn( + "`sequence_parallel` is set to `True`, but tensor model parallel size " + f"is {world_size}. Disabling sequence parallel." + ) + self.sequence_parallel = False - if gradient_accumulation_fusion: - if not _grad_accum_fusion_available: - raise RuntimeError( - "ColumnParallelLinear was called with gradient_accumulation_fusion set " - "to True but the custom CUDA extension fused_weight_gradient_mlp_cuda " - "module is not found. To use gradient_accumulation_fusion you must " - "install APEX with --cpp_ext and --cuda_ext. For example: " - "pip install --global-option=\"--cpp_ext\" --global-option=\"--cuda_ext .\" " - "Note that the extension requires CUDA>=11. Otherwise, you must turn off " - "gradient accumulation fusion." - ) - self.gradient_accumulation_fusion = gradient_accumulation_fusion + self.allreduce_dgrad = ( + world_size > 1 and not self.sequence_parallel and not self.disable_grad_reduce + ) - if self.async_tensor_model_parallel_allreduce and self.sequence_parallel_enabled: + if config.gradient_accumulation_fusion and not _grad_accum_fusion_available: raise RuntimeError( - "`async_tensor_model_parallel_allreduce` and `sequence_parallel_enabled` " - "cannot be enabled at the same time." + "ColumnParallelLinear was called with gradient_accumulation_fusion set " + "to True but the custom CUDA extension fused_weight_gradient_mlp_cuda " + "module is not found. To use gradient_accumulation_fusion you must " + "install APEX with --cpp_ext and --cuda_ext. For example: " + "pip install --global-option=\"--cpp_ext\" --global-option=\"--cuda_ext .\" " + "Note that the extension requires CUDA>=11. Otherwise, you must turn off " + "gradient accumulation fusion." ) + self.gradient_accumulation_fusion = config.gradient_accumulation_fusion + if self.allreduce_dgrad and self.sequence_parallel: + raise RuntimeError( + "`allreduce_dgrad` and `sequence_parallel` cannot be enabled at the same time." + ) - def forward(self, input_): + self._forward_impl = linear_with_grad_accumulation_and_async_allreduce + + # Hook adding a default empty _extra_state for state dict + self._register_load_state_dict_pre_hook( + lambda state_dict, prefix, *args, **kwargs: state_dict.setdefault( + f'{prefix}_extra_state' + ) + ) + + def forward( + self, + input_: torch.Tensor, + weight: Optional[torch.Tensor] = None, + runtime_gather_output: Optional[bool] = None, + ): """Forward of ColumnParallelLinear Args: - input_: 3D tensor whose order of dimension is [sequence, batch, hidden] + input_: + 3D tensor whose order of dimension is [sequence, batch, hidden] + weight (optional): + weight tensor to use, compulsory when skip_weight_param_allocation is True. + runtime_gather_output (bool): Gather output at runtime. Default None means + `gather_output` arg in the constructor will be used. 
Returns: - output - bias + """ + if weight is None: + if self.weight is None: + raise RuntimeError( + "weight was not supplied to ColumnParallelLinear forward pass " + "and skip_weight_param_allocation is True." + ) + weight = self.weight + else: + # Check the weight passed in is the correct shape + expected_shape = (self.output_size_per_partition, self.input_size) + if weight.shape != expected_shape: + raise RuntimeError( + f"supplied weight's shape is {tuple(weight.shape)}, " + f"not {expected_shape} as expected" + ) + + if self.config._cpu_offloading_context is not None: + if self.config._cpu_offloading_context.inside_context is True: + assert ( + self.config.cpu_offloading is False + ), "CPU Offloading cannot be enabled while using non-TE modules" + bias = self.bias if not self.skip_bias_add else None - if self.async_tensor_model_parallel_allreduce or \ - self.sequence_parallel_enabled: + if ( + self.allreduce_dgrad + or self.sequence_parallel + or self.explicit_expert_comm + or self.disable_grad_reduce + ): input_parallel = input_ else: input_parallel = copy_to_tensor_model_parallel_region(input_) + + if self.config.defer_embedding_wgrad_compute: + if ( + self.config.wgrad_deferral_limit == 0 + or len(self.embedding_activation_buffer) < self.config.wgrad_deferral_limit + ): + self.embedding_activation_buffer.append(input_parallel) + # Matrix multiply. - output_parallel = linear_with_grad_accumulation_and_async_allreduce( + if not weight.requires_grad: + self._forward_impl = linear_with_frozen_weight + else: + self._forward_impl = linear_with_grad_accumulation_and_async_allreduce + + allreduce_dgrad = False if self.explicit_expert_comm else self.allreduce_dgrad + + output_parallel = self._forward_impl( input=input_parallel, - weight=self.weight, + weight=weight, bias=bias, gradient_accumulation_fusion=self.gradient_accumulation_fusion, - async_grad_allreduce=self.async_tensor_model_parallel_allreduce, - sequence_parallel_enabled=self.sequence_parallel_enabled, + async_grad_allreduce=allreduce_dgrad, + sequence_parallel=False if self.explicit_expert_comm else self.sequence_parallel, + grad_output_buffer=( + self.grad_output_buffer if self.config.defer_embedding_wgrad_compute else None + ), + wgrad_deferral_limit=( + self.config.wgrad_deferral_limit + if self.config.defer_embedding_wgrad_compute + else None + ), + allreduce_dgrad=allreduce_dgrad, ) - if self.gather_output: + + gather_output = self.gather_output + # Use the runtime gather output if it's set explicitly. + if runtime_gather_output is not None: + gather_output = runtime_gather_output + + if gather_output: # All-gather across the partitions. - assert not self.sequence_parallel_enabled + assert not self.sequence_parallel output = gather_from_tensor_model_parallel_region(output_parallel) else: output = output_parallel output_bias = self.bias if self.skip_bias_add else None return output, output_bias + def sharded_state_dict(self, prefix='', sharded_offsets=(), metadata=None): + """Sharding along axis 0, bias sharded""" + state_dict = self.state_dict(prefix='', keep_vars=True) + return make_sharded_tensors_for_checkpoint( + state_dict, prefix, {'weight': 0, 'bias': 0}, sharded_offsets + ) + + def set_extra_state(self, state: Any): + """Extra state is ignored""" + + def get_extra_state(self) -> None: + """Keep compatibility with TE state dict.""" + return None + class RowParallelLinear(torch.nn.Module): """Linear layer with row parallelism. - The linear layer is defined as Y = XA + b. 
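[Editor's note] The forward path above is the standard column-parallel pattern: each rank multiplies by its slice A_i of the weight, and the optional gather reassembles Y = [Y_1, ..., Y_p]. A small CPU-only check of that identity follows; torch.nn.functional.linear computes X A^T, matching the transposed weight allocation noted in the code comments, and the sizes are arbitrary.

# CPU check of column parallelism: concat of per-rank outputs equals the full output.
import torch
import torch.nn.functional as F

tp_size, in_features, out_features = 2, 4, 6
x = torch.randn(3, 5, in_features)                   # [seq, batch, hidden]
full_weight = torch.randn(out_features, in_features)

# Each rank owns a contiguous block of output rows: A = [A_1, ..., A_p] along dim 0.
shards = torch.chunk(full_weight, tp_size, dim=0)
partial = [F.linear(x, a_i) for a_i in shards]        # Y_i = X @ A_i^T, no communication

# gather_output=True corresponds to concatenating along the last (output) dimension.
y_gathered = torch.cat(partial, dim=-1)
assert torch.allclose(y_gathered, F.linear(x, full_weight), atol=1e-5)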
A is parallelized along - its first dimension and X along its second dimension as: - - - - | A_1 | - | . | - A = | . | X = [X_1, ..., X_p] - | . | - | A_p | - - - - Arguments: - input_size: first dimension of matrix A. - output_size: second dimension of matrix A. - - Keyword Arguments: - bias: If true, add bias. Note that bias is not parallelized. - input_is_parallel: If true, we assume that the input is already - split across the GPUs and we do not split - again. - init_method: method to initialize weights. Note that bias is always set - to zero. - stride: For the strided linear layers. - keep_master_weight_for_test: This was added for testing and should be - set to False. It returns the master weights - used for initialization. - skip_bias_add: This was added to enable performance optimization where bias - can be fused with other elementwise operations. We skip - adding bias but instead return it. - params_dtype: - use_cpu_initialization: - perform_initialization: - gradient_accumulation_fusion: - sequence_parallel_enabled: + The linear layer is defined as Y = XA + b. A is parallelized along its first dimension and X + along its second dimension. A = transpose([A_1 .. A_p]) X = [X_1, ..., X_p] + + Args: + input_size: + first dimension of matrix A. + output_size: + second dimension of matrix A. + bias: + If true, add bias. Note that bias is not parallelized. + input_is_parallel: + If true, we assume that the input is already split across the GPUs + and we do not split again. + init_method: + method to initialize weights. Note that bias is always set to zero. + stride: + For the strided linear layers. + keep_master_weight_for_test: + This was added for testing and should be set to False. It returns the master weights + used for initialization. + skip_bias_add: + If True, do not add the bias term, instead return it to be added by the + caller. This enables performance optimations where bias can be fused with other + elementwise operations. + is_expert: + If True, the layer is treated as an MoE expert layer + tp_comm_buffer_name: + Communication buffer name. Not used in non-Transformer-Engine modules. 
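[Editor's note] Row parallelism is the transpose of the column-parallel case: the weight is split along its input dimension, each rank sees only its slice X_i of the activation, and the partial products must be summed (the all-reduce, or reduce-scatter under sequence parallelism). A matching CPU check with arbitrary sizes:

# CPU check of row parallelism: summing per-rank partial products recovers Y = X A^T.
import torch
import torch.nn.functional as F

tp_size, in_features, out_features = 2, 6, 4
x = torch.randn(3, 5, in_features)
full_weight = torch.randn(out_features, in_features)

# Weight split along the input (last) dimension; activation split along its last dimension.
w_shards = torch.chunk(full_weight, tp_size, dim=1)
x_shards = torch.chunk(x, tp_size, dim=-1)

# Each rank computes a full-sized but partial output; no single rank has the answer.
partials = [F.linear(x_i, w_i) for x_i, w_i in zip(x_shards, w_shards)]

# The all-reduce across the TP group is an elementwise sum of the partials.
y = sum(partials)
assert torch.allclose(y, F.linear(x, full_weight), atol=1e-5)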
+ config: + ModelParallelConfig object + """ - def __init__(self, input_size, output_size, *, - bias=True, input_is_parallel=False, - init_method=init.xavier_normal_, stride=1, - keep_master_weight_for_test=False, - skip_bias_add=False, - params_dtype=torch.float32, - use_cpu_initialization=False, - perform_initialization=True, - gradient_accumulation_fusion=False, - sequence_parallel_enabled: bool = False, - ): + def __init__( + self, + input_size: int, + output_size: int, + *, + config: ModelParallelConfig, + init_method: Callable, + bias: bool, + input_is_parallel: bool, + skip_bias_add: bool, + stride: int = 1, + keep_master_weight_for_test: bool = False, + is_expert: bool = False, + tp_comm_buffer_name: str = None, # Not used + ): super(RowParallelLinear, self).__init__() # Keep input parameters self.input_size = input_size self.output_size = output_size self.input_is_parallel = input_is_parallel + self.skip_bias_add = skip_bias_add + self.config = config + self.is_expert = is_expert + self.expert_parallel = config.expert_model_parallel_size > 1 + self.gradient_accumulation_fusion = config.gradient_accumulation_fusion + self.sequence_parallel = config.sequence_parallel + if self.sequence_parallel and not self.input_is_parallel: + raise RuntimeError("To enable `sequence_parallel`, `input_is_parallel` must be `True`") + + self.explicit_expert_comm = self.is_expert and ( + config.tensor_model_parallel_size > 1 or self.expert_parallel + ) + # Divide the weight matrix along the last dimension. - world_size = get_tensor_model_parallel_world_size() + if self.explicit_expert_comm and config.moe_extended_tp: + world_size = get_tensor_and_expert_parallel_world_size() + rank = get_tensor_and_expert_parallel_rank() + else: + world_size = get_tensor_model_parallel_world_size() + rank = get_tensor_model_parallel_rank() + self.input_size_per_partition = divide(input_size, world_size) - self.skip_bias_add = skip_bias_add - self.gradient_accumulation_fusion = gradient_accumulation_fusion - self.sequence_parallel_enabled = sequence_parallel_enabled - if self.sequence_parallel_enabled and not self.input_is_parallel: - raise RuntimeError("To enable `sequence_parallel_enabled`, `input_is_parallel` must be `True`") # Parameters. # Note: torch.nn.functional.linear performs XA^T + b and as a result # we allocate the transpose. # Initialize weight. 
- if use_cpu_initialization: - self.weight = Parameter(torch.empty(self.output_size, - self.input_size_per_partition, - dtype=params_dtype)) - if perform_initialization: + if config.use_cpu_initialization: + self.weight = Parameter( + torch.empty( + self.output_size, self.input_size_per_partition, dtype=config.params_dtype + ) + ) + if config.perform_initialization: self.master_weight = _initialize_affine_weight_cpu( - self.weight, self.output_size, self.input_size, - self.input_size_per_partition, 1, init_method, - stride=stride, return_master_weight=keep_master_weight_for_test, - params_dtype=params_dtype) + self.weight, + self.output_size, + self.input_size, + self.input_size_per_partition, + 1, + init_method, + stride=stride, + return_master_weight=keep_master_weight_for_test, + params_dtype=config.params_dtype, + rank=rank, + world_size=world_size, + ) else: - self.weight = Parameter(torch.empty( - self.output_size, self.input_size_per_partition, - device=torch.cuda.current_device(), dtype=params_dtype)) - if perform_initialization: - _initialize_affine_weight_gpu(self.weight, init_method, - partition_dim=1, stride=stride) + self.weight = Parameter( + torch.empty( + self.output_size, + self.input_size_per_partition, + device=torch.cuda.current_device(), + dtype=config.params_dtype, + ) + ) + if config.perform_initialization: + _initialize_affine_weight_gpu( + self.weight, + init_method, + partition_dim=1, + stride=stride, + expert_parallel=(self.is_expert and self.expert_parallel), + ) + setattr(self.weight, 'allreduce', not (self.is_expert and self.expert_parallel)) + if bias: - if use_cpu_initialization: - self.bias = Parameter(torch.empty(self.output_size, - dtype=params_dtype)) + if config.use_cpu_initialization: + self.bias = Parameter(torch.empty(self.output_size, dtype=config.params_dtype)) else: - self.bias = Parameter(torch.empty( - self.output_size, device=torch.cuda.current_device(), - dtype=params_dtype)) - setattr(self.bias, 'sequence_parallel', sequence_parallel_enabled) - - # Always initialize bias to zero. - with torch.no_grad(): - self.bias.zero_() + self.bias = Parameter( + torch.empty( + self.output_size, + device=torch.cuda.current_device(), + dtype=config.params_dtype, + ) + ) + + if config.perform_initialization: + # Always initialize bias to zero. + with torch.no_grad(): + self.bias.zero_() + setattr(self.bias, 'allreduce', not (self.is_expert and self.expert_parallel)) + setattr(self.bias, 'sequence_parallel', self.sequence_parallel) else: self.register_parameter('bias', None) + self._forward_impl = linear_with_grad_accumulation_and_async_allreduce + # Hook adding a default empty _extra_state for state dict + self._register_load_state_dict_pre_hook( + lambda state_dict, prefix, *args, **kwargs: state_dict.setdefault( + f'{prefix}_extra_state' + ) + ) def forward(self, input_): """Forward of RowParallelLinear @@ -686,31 +1141,64 @@ def forward(self, input_): - output - bias """ + + if self.config._cpu_offloading_context is not None: + if self.config._cpu_offloading_context.inside_context is True: + assert ( + self.config.cpu_offloading is False + ), "CPU Offloading cannot be enabled while using non-TE modules" + # Set up backprop all-reduce. if self.input_is_parallel: input_parallel = input_ else: - assert not self.sequence_parallel_enabled + assert not self.sequence_parallel input_parallel = scatter_to_tensor_model_parallel_region(input_) # Matrix multiply. 
- output_parallel = linear_with_grad_accumulation_and_async_allreduce( + if not self.weight.requires_grad: + self._forward_impl = linear_with_frozen_weight + else: + self._forward_impl = linear_with_grad_accumulation_and_async_allreduce + + allreduce_dgrad = False + + output_parallel = self._forward_impl( input=input_parallel, weight=self.weight, bias=None, gradient_accumulation_fusion=self.gradient_accumulation_fusion, - async_grad_allreduce=False, - sequence_parallel_enabled=False, + async_grad_allreduce=allreduce_dgrad, + sequence_parallel=False, + grad_output_buffer=None, + allreduce_dgrad=allreduce_dgrad, ) # All-reduce across all the partitions. - if self.sequence_parallel_enabled: + if self.explicit_expert_comm: + assert self.skip_bias_add + output_ = output_parallel + elif self.sequence_parallel: output_ = reduce_scatter_to_sequence_parallel_region(output_parallel) else: output_ = reduce_from_tensor_model_parallel_region(output_parallel) if not self.skip_bias_add: - output = output_ + self.bias if self.bias is not None else output_ + output = (output_ + self.bias) if self.bias is not None else output_ output_bias = None else: output = output_ output_bias = self.bias return output, output_bias + + def sharded_state_dict(self, prefix='', sharded_offsets=(), metadata=None): + """Sharding along axis 1, bias not sharded""" + state_dict = self.state_dict(prefix='', keep_vars=True) + return make_sharded_tensors_for_checkpoint( + state_dict, prefix, {'weight': 1}, sharded_offsets + ) + + def set_extra_state(self, state: Any): + """Extra state is ignored""" + + def get_extra_state(self) -> None: + """Keep compatibility with TE state dict.""" + return None diff --git a/megatron/core/tensor_parallel/mappings.py b/megatron/core/tensor_parallel/mappings.py index 624be8054e..3addd8d2ee 100644 --- a/megatron/core/tensor_parallel/mappings.py +++ b/megatron/core/tensor_parallel/mappings.py @@ -3,10 +3,14 @@ import torch from megatron.core.parallel_state import ( + get_expert_model_parallel_group, + get_global_memory_buffer, + get_tensor_and_expert_parallel_group, + get_tensor_model_parallel_group, get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, - get_tensor_model_parallel_group, ) + from .utils import split_tensor_along_last_dim @@ -14,11 +18,11 @@ def _reduce(input_): """All-reduce the input tensor across model parallel group.""" # Bypass the function if we are using only 1 GPU. - if get_tensor_model_parallel_world_size()==1: + if get_tensor_model_parallel_world_size() == 1: return input_ # All-reduce. - torch.distributed.all_reduce(input_, group=get_tensor_model_parallel_group()) + torch.distributed.all_reduce(input_.contiguous(), group=get_tensor_model_parallel_group()) return input_ @@ -53,13 +57,14 @@ def _split_along_first_dim(input_): # Split along first dimension. dim_size = input_.size()[0] - assert dim_size % world_size == 0, \ - "First dimension of the tensor should be divisible by tensor parallel size" + assert ( + dim_size % world_size == 0 + ), "First dimension of the tensor should be divisible by tensor parallel size" local_dim_size = dim_size // world_size rank = get_tensor_model_parallel_rank() dim_offset = rank * local_dim_size - output = input_[dim_offset:dim_offset+local_dim_size].contiguous() + output = input_[dim_offset : dim_offset + local_dim_size].contiguous() return output @@ -72,55 +77,161 @@ def _gather_along_last_dim(input_): if world_size == 1: return input_ - # Size and dimension. 
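[Editor's note] _split_along_first_dim is simply a contiguous slice along the sequence dimension, and the companion gather concatenates those slices back. A quick CPU illustration of the per-rank offsets, with a made-up world size:

# CPU illustration of _split_along_first_dim: each rank keeps one contiguous chunk.
import torch

world_size = 4
x = torch.arange(8 * 3).reshape(8, 3)            # first dim must divide by world_size
assert x.size(0) % world_size == 0
local_dim_size = x.size(0) // world_size

chunks = []
for rank in range(world_size):
    dim_offset = rank * local_dim_size
    chunks.append(x[dim_offset:dim_offset + local_dim_size].contiguous())

# Gathering along the first dimension is the inverse operation.
assert torch.equal(torch.cat(chunks, dim=0), x)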
- last_dim = input_.dim() - 1 - rank = get_tensor_model_parallel_rank() + dim_size = list(input_.size()) + dim_size[0] = dim_size[0] * world_size - tensor_list = [torch.empty_like(input_) for _ in range(world_size)] - tensor_list[rank] = input_ - torch.distributed.all_gather(tensor_list, input_, group=get_tensor_model_parallel_group()) + output = torch.empty(dim_size, dtype=input_.dtype, device=torch.cuda.current_device()) + torch.distributed.all_gather_into_tensor( + output, input_.contiguous(), group=get_tensor_model_parallel_group() + ) + tensor_list = output.chunk(world_size, dim=0) + output = torch.cat(tensor_list, dim=-1).contiguous() + + return output - # Note: torch.cat already creates a contiguous tensor. - output = torch.cat(tensor_list, dim=last_dim).contiguous() +def _reduce_scatter_along_last_dim(input_): + """Reduce-scatter tensors on the last dimension.""" + world_size = get_tensor_model_parallel_world_size() + target_shape = list(input_.size()) + target_shape[-1] = target_shape[-1] // world_size + input_ = input_.reshape(-1, input_.shape[-1]) + split_tensors = torch.split( + input_, split_size_or_sections=input_.shape[-1] // world_size, dim=1 + ) + concat_tensor = torch.cat(split_tensors, dim=0) + output = _reduce_scatter_along_first_dim(concat_tensor).reshape(target_shape) return output -def _gather_along_first_dim(input_): - """Gather tensors and concatinate along the first dimension.""" +def _gather_along_first_dim(input_, output_split_sizes=None): + """Gather tensors and concatenate along the first dimension. + + Args: + input_tensor (torch.Tensor): + A tensor to be gathered. + output_split_sizes (List[int], optional): + A list specifying the sizes of the output splits along the first dimension. + If None, equal splitting is assumed. Default: None. + + Returns: + torch.Tensor: Gathered tensor. + """ world_size = get_tensor_model_parallel_world_size() # Bypass the function if we are using only 1 GPU. if world_size == 1: return input_ + dim_size = list(input_.size()) + if output_split_sizes is None: + dim_size[0] = dim_size[0] * world_size + + output = torch.empty(dim_size, dtype=input_.dtype, device=torch.cuda.current_device()) + torch.distributed._all_gather_base( + output, input_.contiguous(), group=get_tensor_model_parallel_group() + ) + else: + dim_size[0] = sum(output_split_sizes) + output = torch.empty(dim_size, dtype=input_.dtype, device=torch.cuda.current_device()) + output_tensor_list = list(torch.split(output, output_split_sizes, dim=0)) + torch.distributed.all_gather( + output_tensor_list, input_, group=get_tensor_model_parallel_group() + ) + + return output + + +def _reduce_scatter_along_first_dim(input_, input_split_sizes=None): + """Reduce-scatter the input tensor across model parallel group. + + Args: + input_ (torch.Tensor): The input tensor to be reduce-scattered. + input_split_sizes (List[int], optional): A list specifying the sizes of + the input splits along the first dimension for each rank. If None, + equal splitting is assumed. Default: None. + """ + world_size = get_tensor_model_parallel_world_size() + # Bypass the function if we are using only 1 GPU. 
+ if world_size == 1: + return input_ + + if input_split_sizes is None: + dim_size = list(input_.size()) + assert ( + dim_size[0] % world_size == 0 + ), "First dimension of the tensor should be divisible by tensor parallel size" + + dim_size[0] = dim_size[0] // world_size + + output = torch.empty(dim_size, dtype=input_.dtype, device=torch.cuda.current_device()) + torch.distributed._reduce_scatter_base( + output, input_.contiguous(), group=get_tensor_model_parallel_group() + ) + else: + rank = torch.distributed.get_rank(get_tensor_model_parallel_group()) + input_tensor_list = list(torch.split(input_, input_split_sizes, dim=0)) + output = torch.empty_like(input_tensor_list[rank]) + torch.distributed.reduce_scatter( + output, input_tensor_list, group=get_tensor_model_parallel_group() + ) + return output + + +def _gather_along_first_dim_moe(input_, use_global_buffer=False): + """Gather tensors and concatenate along the first dimension.""" + group = get_tensor_and_expert_parallel_group() + world_size = torch.distributed.get_world_size(group=group) + # Bypass the function if we are using only 1 GPU. + if world_size == 1: + return input_ + dim_size = list(input_.size()) dim_size[0] = dim_size[0] * world_size - output = torch.empty(dim_size, dtype=input_.dtype, - device=torch.cuda.current_device()) - torch.distributed._all_gather_base(output, input_.contiguous(), - group=get_tensor_model_parallel_group()) + if use_global_buffer: + output = get_global_memory_buffer().get_tensor(dim_size, input_.dtype, "mpu") + else: + output = torch.empty(dim_size, dtype=input_.dtype, device=torch.cuda.current_device()) + torch.distributed._all_gather_base(output, input_.contiguous(), group=group) return output -def _reduce_scatter_along_first_dim(input_): + +def _reduce_scatter_along_first_dim_moe(input_, use_global_buffer=False): """Reduce-scatter the input tensor across model parallel group.""" - world_size = get_tensor_model_parallel_world_size() + group = get_tensor_and_expert_parallel_group() + world_size = torch.distributed.get_world_size(group=group) # Bypass the function if we are using only 1 GPU. if world_size == 1: return input_ dim_size = list(input_.size()) - assert dim_size[0] % world_size == 0, \ - "First dimension of the tensor should be divisible by tensor parallel size" - + assert dim_size[0] % world_size == 0 dim_size[0] = dim_size[0] // world_size - - output = torch.empty(dim_size, dtype=input_.dtype, - device=torch.cuda.current_device()) - torch.distributed._reduce_scatter_base(output, input_.contiguous(), - group=get_tensor_model_parallel_group()) + + if use_global_buffer: + output = get_global_memory_buffer().get_tensor(dim_size, input_.dtype, "mpu") + else: + output = torch.empty(dim_size, dtype=input_.dtype, device=torch.cuda.current_device()) + torch.distributed._reduce_scatter_base(output, input_.contiguous(), group=group) + return output + + +def _gather_along_first_dim_expert_parallel(input_): + """Gather tensors and concatenate along the first dimension.""" + group = get_expert_model_parallel_group() + world_size = torch.distributed.get_world_size(group=group) + # Bypass the function if we are using only 1 GPU. 
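[Editor's note] All of the reduce-scatter variants above implement the same contract: elementwise-sum the per-rank tensors, then hand each rank its chunk of the first dimension. The single-process emulation below shows that contract; the real code issues torch.distributed._reduce_scatter_base on the relevant process group instead of the explicit stack/sum/chunk used here.

# Single-process emulation of reduce-scatter along the first dimension.
import torch

world_size = 2
# One full-sized tensor per rank, as each rank would hold before the collective.
per_rank_inputs = [torch.randn(6, 3) for _ in range(world_size)]

summed = torch.stack(per_rank_inputs).sum(dim=0)        # the "reduce" part
outputs = list(torch.chunk(summed, world_size, dim=0))  # the "scatter" part

for rank, out in enumerate(outputs):
    assert out.shape == (3, 3)                          # 6 rows / 2 ranks
    # Rank `rank` ends up with the sum of everyone's rows [rank*3 : (rank+1)*3].
    assert torch.allclose(out, sum(t[rank * 3:(rank + 1) * 3] for t in per_rank_inputs))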
+ if world_size == 1: + return input_ + + dim_size = list(input_.size()) + dim_size[0] = dim_size[0] * world_size + + output = torch.empty(dim_size, dtype=input_.dtype, device=torch.cuda.current_device()) + torch.distributed._all_gather_base(output, input_.contiguous(), group=group) + return output @@ -129,14 +240,17 @@ class _CopyToModelParallelRegion(torch.autograd.Function): @staticmethod def symbolic(graph, input_): + """Symbolic function for tracing.""" return input_ - + @staticmethod def forward(ctx, input_): + """Forward function.""" return input_ @staticmethod def backward(ctx, grad_output): + """Backward function.""" return _reduce(grad_output) @@ -145,14 +259,17 @@ class _ReduceFromModelParallelRegion(torch.autograd.Function): @staticmethod def symbolic(graph, input_): + """Symbolic function for tracing.""" return _reduce(input_) - + @staticmethod def forward(ctx, input_): + """Forward function.""" return _reduce(input_) @staticmethod def backward(ctx, grad_output): + """Backward function.""" return grad_output @@ -161,14 +278,17 @@ class _ScatterToModelParallelRegion(torch.autograd.Function): @staticmethod def symbolic(graph, input_): + """Symbolic function for tracing.""" return _split_along_last_dim(input_) @staticmethod def forward(ctx, input_): + """Forward function.""" return _split_along_last_dim(input_) @staticmethod def backward(ctx, grad_output): + """Backward function.""" return _gather_along_last_dim(grad_output) @@ -177,14 +297,17 @@ class _GatherFromModelParallelRegion(torch.autograd.Function): @staticmethod def symbolic(graph, input_): + """Symbolic function for tracing.""" return _gather_along_last_dim(input_) - + @staticmethod def forward(ctx, input_): + """Forward function.""" return _gather_along_last_dim(input_) @staticmethod def backward(ctx, grad_output): + """Backward function.""" return _split_along_last_dim(grad_output) @@ -193,87 +316,314 @@ class _ScatterToSequenceParallelRegion(torch.autograd.Function): @staticmethod def symbolic(graph, input_): + """Symbolic function for tracing.""" return _split_along_first_dim(input_) @staticmethod def forward(ctx, input_): + """Forward function.""" return _split_along_first_dim(input_) @staticmethod def backward(ctx, grad_output): + """Backward function.""" return _gather_along_first_dim(grad_output) class _GatherFromSequenceParallelRegion(torch.autograd.Function): - """Gather the input from sequence parallel region and concatinate.""" + """Gather the input from sequence parallel region and concatinate.""" @staticmethod - def symbolic(graph, input_, tensor_parallel_output_grad=True): - return _gather_along_first_dim(input_) - + def symbolic(graph, input_, tensor_parallel_output_grad=True, output_split_sizes=None): + """Symbolic function for tracing.""" + return _gather_along_first_dim(input_, output_split_sizes) + @staticmethod - def forward(ctx, input_, tensor_parallel_output_grad=True): + def forward(ctx, input_, tensor_parallel_output_grad=True, output_split_sizes=None): + """Forward function.""" ctx.tensor_parallel_output_grad = tensor_parallel_output_grad - return _gather_along_first_dim(input_) + ctx.output_split_sizes = output_split_sizes + return _gather_along_first_dim(input_, ctx.output_split_sizes) @staticmethod def backward(ctx, grad_output): + """Backward function.""" tensor_parallel_output_grad = ctx.tensor_parallel_output_grad # If the computation graph after the gather operation is - # in the tensor parallel mode, output gradients need to reduce - # scattered and whereas if the computation is 
duplicated, + # in the tensor parallel mode, output gradients need to reduce + # scattered and whereas if the computation is duplicated, # output gradients need to be scattered. if tensor_parallel_output_grad: - return _reduce_scatter_along_first_dim(grad_output), None + return ( + _reduce_scatter_along_first_dim(grad_output, ctx.output_split_sizes), + None, + None, + ) else: - return _split_along_first_dim(grad_output), None + assert ctx.output_split_sizes is None + return _split_along_first_dim(grad_output), None, None class _ReduceScatterToSequenceParallelRegion(torch.autograd.Function): """Reduce scatter the input from the model parallel region.""" + @staticmethod + def symbolic(graph, input_, input_split_sizes=None): + """Symbolic function for tracing.""" + return _reduce_scatter_along_first_dim(input_, input_split_sizes) + + @staticmethod + def forward(ctx, input_, input_split_sizes=None): + """Forward function.""" + ctx.input_split_sizes = input_split_sizes + return _reduce_scatter_along_first_dim(input_, input_split_sizes) + + @staticmethod + def backward(ctx, grad_output): + """Backward function.""" + input_split_sizes = ctx.input_split_sizes + return _gather_along_first_dim(grad_output, input_split_sizes), None + + +class _GatherFromSequenceParallelRegionToMOE(torch.autograd.Function): + """Gather the input from model parallel region and concatenate.""" # TODO + + @staticmethod + def symbolic(graph, input_, use_global_buffer=False): + """Symbolic function for tracing.""" + return _gather_along_first_dim_moe(input_, use_global_buffer) + + @staticmethod + def forward(ctx, input_, use_global_buffer=False): + """Forward function.""" + ctx.use_global_buffer = use_global_buffer + return _gather_along_first_dim_moe(input_, use_global_buffer) + + @staticmethod + def backward(ctx, grad_output): + """Backward function.""" + use_global_buffer = ctx.use_global_buffer + return _reduce_scatter_along_first_dim_moe(grad_output, use_global_buffer), None + + +class _ReduceScatterToSequenceParallelRegionFromMOE(torch.autograd.Function): + """Reduce scatter the input from the model parallel region.""" + + @staticmethod + def symbolic(graph, input_, use_global_buffer=False): + """Symbolic function for tracing.""" + return _reduce_scatter_along_first_dim_moe(input_, use_global_buffer) + + @staticmethod + def forward(ctx, input_, use_global_buffer=False): + """Forward function.""" + ctx.use_global_buffer = use_global_buffer + return _reduce_scatter_along_first_dim_moe(input_, use_global_buffer) + + @staticmethod + def backward(ctx, grad_output): + """Backward function.""" + use_global_buffer = ctx.use_global_buffer + return _gather_along_first_dim_moe(grad_output, use_global_buffer), None + + +class _AllGatherFromTensorParallelRegion(torch.autograd.Function): + """Gather the input from model parallel region and concatenate.""" + + @staticmethod + def symbolic(graph, input_): + """Symbolic function for tracing.""" + return _gather_along_last_dim(input_) + + @staticmethod + def forward(ctx, input_): + """Forward function.""" + return _gather_along_last_dim(input_) + + @staticmethod + def backward(ctx, grad_output): + """Backward function.""" + return _reduce_scatter_along_last_dim(grad_output) + + +class _ReduceScatterToTensorParallelRegion(torch.autograd.Function): + """Reduce scatter the input from the model parallel region.""" + @staticmethod def symbolic(graph, input_): - return _reduce_scatter_along_first_dim(input_) - + """Symbolic function for tracing.""" + return 
_reduce_scatter_along_last_dim(input_) + @staticmethod def forward(ctx, input_): - return _reduce_scatter_along_first_dim(input_) + """Forward function.""" + return _reduce_scatter_along_last_dim(input_) @staticmethod def backward(ctx, grad_output): - return _gather_along_first_dim(grad_output) + """Backward function.""" + return _gather_along_last_dim(grad_output) + + +class _AllToAll(torch.autograd.Function): + @staticmethod + def forward(ctx, group, input, output_split_sizes, input_split_sizes): + """Forward function.""" + ctx.group = group + ctx.output_split_sizes = output_split_sizes + ctx.input_split_sizes = input_split_sizes + + world_size = torch.distributed.get_world_size(group=group) + # Bypass the function if we are using only 1 GPU. + if world_size == 1: + return input + + input = input.contiguous() + if output_split_sizes is None: + # Equal split (all2all) + output = torch.empty_like(input) + else: + # Unequal split (all2all-v) + output = input.new_empty( + size=[sum(output_split_sizes)] + list(input.size()[1:]), + dtype=input.dtype, + device=torch.cuda.current_device(), + ) + torch.distributed.all_to_all_single( + output, + input, + output_split_sizes=output_split_sizes, + input_split_sizes=input_split_sizes, + group=group, + ) + return output + + @staticmethod + def backward(ctx, *grad_output): + """Backward function.""" + return ( + None, + _AllToAll.apply(ctx.group, *grad_output, ctx.input_split_sizes, ctx.output_split_sizes), + None, + None, + ) # ----------------- # Helper functions. # ----------------- + def copy_to_tensor_model_parallel_region(input_): + """Wrapper for autograd function""" return _CopyToModelParallelRegion.apply(input_) def reduce_from_tensor_model_parallel_region(input_): + """Wrapper for autograd function""" return _ReduceFromModelParallelRegion.apply(input_) def scatter_to_tensor_model_parallel_region(input_): + """Wrapper for autograd function""" return _ScatterToModelParallelRegion.apply(input_) def gather_from_tensor_model_parallel_region(input_): + """Wrapper for autograd function""" return _GatherFromModelParallelRegion.apply(input_) def scatter_to_sequence_parallel_region(input_): + """Wrapper for autograd function""" return _ScatterToSequenceParallelRegion.apply(input_) -def gather_from_sequence_parallel_region(input_, tensor_parallel_output_grad=True): - return _GatherFromSequenceParallelRegion.apply(input_, tensor_parallel_output_grad) +def gather_from_sequence_parallel_region( + input_, tensor_parallel_output_grad=True, output_split_sizes=None +): + """Wrapper for autograd function""" + return _GatherFromSequenceParallelRegion.apply( + input_, tensor_parallel_output_grad, output_split_sizes + ) + + +def reduce_scatter_to_sequence_parallel_region(input_, input_split_sizes=None): + """Wrapper for autograd function""" + return _ReduceScatterToSequenceParallelRegion.apply(input_, input_split_sizes) + + +def gather_from_sequence_parallel_region_to_moe(input_, use_global_buffer=False): + """Wrapper for autograd function""" + return _GatherFromSequenceParallelRegionToMOE.apply(input_, use_global_buffer) + + +def reduce_scatter_to_sequence_parallel_region_from_moe(input_, use_global_buffer=False): + """Wrapper for autograd function""" + return _ReduceScatterToSequenceParallelRegionFromMOE.apply(input_, use_global_buffer) + +def all_gather_last_dim_from_tensor_parallel_region(input_): + """Wrapper for autograd function""" + return _AllGatherFromTensorParallelRegion.apply(input_) -def reduce_scatter_to_sequence_parallel_region(input_): - return 
_ReduceScatterToSequenceParallelRegion.apply(input_) +def reduce_scatter_last_dim_to_tensor_parallel_region(input_): + """Wrapper for autograd function""" + return _ReduceScatterToTensorParallelRegion.apply(input_) + + +def all_to_all(group, input_, output_split_sizes_=None, input_split_sizes=None): + """Wrapper for autograd function""" + return _AllToAll.apply(group, input_, output_split_sizes_, input_split_sizes) + + +def all_to_all_sp2hp(input_): + """ + Perform AlltoAll communication on tensor parallel group, transform the input tensor from shape + [num_tokens/TP, H] to [num_tokens, H/TP]. + + Args: + input_ (torch.Tensor): + The input tensor which has been distributed along the sequence + dimension. + + Returns: + torch.Tensor: The output tensor with shape [num_tokens, H/TP]. + + """ + world_size = get_tensor_model_parallel_world_size() + tp_group = get_tensor_model_parallel_group() + input_ = input_.reshape(-1, input_.shape[-1]) + split_tensors = torch.split( + input_, split_size_or_sections=input_.shape[-1] // world_size, dim=1 + ) + concat_tensor = torch.cat(split_tensors, dim=0) + output = all_to_all(tp_group, concat_tensor) + return output + + +def all_to_all_hp2sp(input_): + """ + Perform AlltoAll communication on tensor parallel group, transform the input tensor from shape + [num_tokens, H/TP] to [num_tokens/TP, H]. + + Args: + input_ (torch.Tensor): + The input tensor which has been distributed along the hidden + dimension. + + Returns: + torch.Tensor: The output tensor with shape [num_tokens/TP, H]. + """ + world_size = get_tensor_model_parallel_world_size() + input_ = input_.reshape(-1, input_.shape[-1]) + tp_group = get_tensor_model_parallel_group() + input_exchanged = all_to_all(tp_group, input_) + input_reshaped = input_exchanged.reshape(-1, input_exchanged.shape[-1]) + split_tensors = torch.split( + input_reshaped, split_size_or_sections=input_reshaped.shape[0] // world_size, dim=0 + ) + output = torch.cat(split_tensors, dim=-1) + return output diff --git a/megatron/core/tensor_parallel/random.py b/megatron/core/tensor_parallel/random.py index 23059fc1f5..4b144d4163 100644 --- a/megatron/core/tensor_parallel/random.py +++ b/megatron/core/tensor_parallel/random.py @@ -4,28 +4,26 @@ # repo: https://github.com/pytorch/pytorch import contextlib +import logging import torch from torch import _C -from torch.cuda import _lazy_call, device as device_ctx_manager +from torch.cuda import _lazy_call +from torch.cuda import device as device_ctx_manager from torch.utils.checkpoint import detach_variable from megatron.core.parallel_state import ( - get_data_parallel_rank, - get_tensor_model_parallel_group, + get_expert_model_parallel_rank, get_tensor_model_parallel_rank, - get_tensor_model_parallel_world_size, ) +from megatron.core.utils import is_te_min_version, safely_set_viewless_tensor_data -from .utils import ( - split_tensor_into_1d_equal_chunks, - gather_split_1d_tensor, -) - -from megatron.core.utils import safely_set_viewless_tensor_data +from .utils import gather_split_1d_tensor, split_tensor_into_1d_equal_chunks # Default name for the model parallel rng tracker. 
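+# (Separate named RNG states are tracked for the model-, expert-, and
+# data-parallel streams; see the constants below.)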
_MODEL_PARALLEL_RNG_TRACKER_NAME = 'model-parallel-rng' +_EXPERT_PARALLEL_RNG_TRACKER_NAME = 'expert-parallel-rng' +_DATA_PARALLEL_RNG_TRACKER_NAME = 'data-parallel-rng' def _set_cuda_rng_state(new_state, device=-1): @@ -42,6 +40,7 @@ def _set_cuda_rng_state(new_state, device=-1): def cb(): with device_ctx_manager(device): _C._cuda_setRNGState(new_state) + else: # newer PyTorch if device == -1: @@ -61,6 +60,17 @@ def cb(): _lazy_call(cb) +def get_expert_parallel_rng_tracker_name(): + """Get the expert parallel rng tracker name""" + global _EXPERT_PARALLEL_RNG_TRACKER_NAME + return _EXPERT_PARALLEL_RNG_TRACKER_NAME + + +def get_data_parallel_rng_tracker_name(): + """Get the data parallel rng tracker name""" + global _DATA_PARALLEL_RNG_TRACKER_NAME + return _DATA_PARALLEL_RNG_TRACKER_NAME + class CudaRNGStatesTracker: """Tracker for the cuda RNG states. @@ -72,14 +82,22 @@ class CudaRNGStatesTracker: """ def __init__(self): - # Map from a string name to the cuda rng state. - self.states_ = {} - # Seeds are just for book keeping and ensure no seed is set twice. - self.seeds_ = set() + self.reset() + + def is_initialized(self): + """Checks if the internal RNG state has been set wirth set_states().""" + return self._is_initialized def reset(self): """Set to the initial state (no tracker).""" + + # Track if initialized. + self._is_initialized = False + + # Map from a string name to the cuda rng state. self.states_ = {} + + # Seeds are just for book keeping and ensure no seed is set twice. self.seeds_ = set() def get_states(self): @@ -93,10 +111,12 @@ def get_states(self): def set_states(self, states): """Set the rng states. For efficiency purposes, we do not check the size of seed for compatibility.""" + self._is_initialized = True self.states_ = states def add(self, name, seed): """Track the rng state.""" + self._is_initialized = True # Check seed is not already used. if seed in self.seeds_: raise Exception('seed {} already exists'.format(seed)) @@ -123,10 +143,15 @@ def fork(self, name=_MODEL_PARALLEL_RNG_TRACKER_NAME): orig_cuda_rng_state = torch.cuda.get_rng_state() # Set rng state to the desired one _set_cuda_rng_state(self.states_[name]) + # Record cpu RNG state + cpu_rng_state = torch.get_rng_state() # Do the stuff we wanted to do. try: yield finally: + # Throw a warning if cpu RNG state changed + if not torch.all(cpu_rng_state == torch.get_rng_state()).item(): + logging.getLogger(__name__).warning('CPU RNG state changed within GPU RNG context') # Update the current rng state for later use. self.states_[name] = torch.cuda.get_rng_state() # And set the state to the original state we started with. @@ -134,11 +159,35 @@ def fork(self, name=_MODEL_PARALLEL_RNG_TRACKER_NAME): # RNG tracker object. -_CUDA_RNG_STATE_TRACKER = CudaRNGStatesTracker() +_CUDA_RNG_STATE_TRACKER = None +_CUDA_RNG_STATE_TRACKER_INITIALIZED = False + + +def initialize_rng_tracker(use_te_rng_tracker: bool = False): + """Create the RNG tracker. 'use_te_rng_tracker' determines whether to use + Megatron or TransformerEngine's implementation. + In particular, TransformerEngine's implementation is cudagraphable and supports FP8. 
+ """ + + global _CUDA_RNG_STATE_TRACKER + global _CUDA_RNG_STATE_TRACKER_INITIALIZED + if _CUDA_RNG_STATE_TRACKER_INITIALIZED: + return + + if use_te_rng_tracker: + if not is_te_min_version("1.5.0"): + raise RuntimeError("use_te_rng_tracker requires TransformerEngine version >= 1.5") + from megatron.core.extensions.transformer_engine import TECudaRNGStatesTracker + + _CUDA_RNG_STATE_TRACKER = TECudaRNGStatesTracker() + else: + _CUDA_RNG_STATE_TRACKER = CudaRNGStatesTracker() + _CUDA_RNG_STATE_TRACKER_INITIALIZED = True -def get_cuda_rng_tracker(): +def get_cuda_rng_tracker(use_te_rng_tracker=False): """Get cuda rng tracker.""" + initialize_rng_tracker(use_te_rng_tracker) return _CUDA_RNG_STATE_TRACKER @@ -150,14 +199,12 @@ def model_parallel_cuda_manual_seed(seed): after this function. Basically, this is replacement for that function. Two set of RNG states are tracked: - default state: This is for data parallelism and is the same among a - set of model parallel GPUs but different across - different model paralle groups. This is used for - example for dropout in the non-tensor-model-parallel regions. - tensor-model-parallel state: This state is different among a set of model - parallel GPUs, but the same across data parallel - groups. This is used for example for dropout in - model parallel regions. + default state: This is for data parallelism and is the same among a set of model parallel GPUs + but different across different model parallel groups. This is used for example for dropout + in the non-tensor-model-parallel regions. + tensor-model-parallel state: This state is different among a set of model parallel GPUs, + but the same across data parallel groups. This is used for example for dropout + in model parallel regions. """ # 2718 is just for fun and any POSITIVE value will work. offset = seed + 2718 @@ -165,26 +212,33 @@ def model_parallel_cuda_manual_seed(seed): # Data parallel gets the original seed. data_parallel_seed = seed + initialize_rng_tracker() _CUDA_RNG_STATE_TRACKER.reset() # Set the default state. torch.cuda.manual_seed(data_parallel_seed) + _CUDA_RNG_STATE_TRACKER.add(_DATA_PARALLEL_RNG_TRACKER_NAME, data_parallel_seed) + # and model parallel state. - _CUDA_RNG_STATE_TRACKER.add(_MODEL_PARALLEL_RNG_TRACKER_NAME, - tensor_model_parallel_seed) + _CUDA_RNG_STATE_TRACKER.add(_MODEL_PARALLEL_RNG_TRACKER_NAME, tensor_model_parallel_seed) + + expert_parallel_seed = ( + seed + 1024 + 100 * get_expert_model_parallel_rank() + get_tensor_model_parallel_rank() + ) + _CUDA_RNG_STATE_TRACKER.add(_EXPERT_PARALLEL_RNG_TRACKER_NAME, expert_parallel_seed) class CheckpointFunction(torch.autograd.Function): - """This function is adapted from torch.utils.checkpoint with - two main changes: - 1) torch.cuda.set_rng_state is replaced with `_set_cuda_rng_state` - 2) the states in the model parallel tracker are also properly - tracked/set/reset. + """Checkpoint Function + + This function is adapted from torch.utils.checkpoint with two main changes: + 1) torch.cuda.set_rng_state is replaced with `_set_cuda_rng_state` + 2) the states in the model parallel tracker are also properly tracked/set/reset. """ + @staticmethod def forward(ctx, run_function, distribute_saved_activations, *args): ctx.run_function = run_function - ctx.distribute_saved_activations \ - = distribute_saved_activations + ctx.distribute_saved_activations = distribute_saved_activations # Copy the rng states. 
ctx.fwd_cpu_rng_state = torch.get_rng_state() @@ -199,8 +253,8 @@ def forward(ctx, run_function, distribute_saved_activations, *args): if distribute_saved_activations: ctx.input_0_shape = args[0].data.shape safely_set_viewless_tensor_data( - args[0], - split_tensor_into_1d_equal_chunks(args[0].data, new_buffer=True)) + args[0], split_tensor_into_1d_equal_chunks(args[0].data, new_buffer=True) + ) # Store everything. ctx.save_for_backward(*args) @@ -210,13 +264,15 @@ def forward(ctx, run_function, distribute_saved_activations, *args): @staticmethod def backward(ctx, *args): if not torch.autograd._is_checkpoint_valid(): - raise RuntimeError("Checkpointing is not compatible with .grad(), " - "please use .backward() if possible") + raise RuntimeError( + "Checkpointing is not compatible with .grad(), " + "please use .backward() if possible" + ) inputs = ctx.saved_tensors if ctx.distribute_saved_activations: safely_set_viewless_tensor_data( - inputs[0], - gather_split_1d_tensor(inputs[0].data).view(ctx.input_0_shape)) + inputs[0], gather_split_1d_tensor(inputs[0].data).view(ctx.input_0_shape) + ) # Store the current states. bwd_cpu_rng_state = torch.get_rng_state() @@ -240,14 +296,15 @@ def backward(ctx, *args): if isinstance(outputs, torch.Tensor): outputs = (outputs,) + + # filter out non tensor outputs for backward pass + outputs, args = zip(*filter(lambda x: torch.is_tensor(x[0]), zip(outputs, args))) torch.autograd.backward(outputs, args) - grads = tuple(inp.grad if isinstance(inp, torch.Tensor) else inp - for inp in detached_inputs) + grads = tuple(inp.grad if isinstance(inp, torch.Tensor) else inp for inp in detached_inputs) return (None, None) + grads def checkpoint(function, distribute_saved_activations, *args): """Checkpoint a model or part of the model. This has been directly copied from torch.utils.checkpoint.""" - return CheckpointFunction.apply(function, - distribute_saved_activations, *args) + return CheckpointFunction.apply(function, distribute_saved_activations, *args) diff --git a/megatron/core/tensor_parallel/utils.py b/megatron/core/tensor_parallel/utils.py index a4c7cb77cc..d7c191b411 100644 --- a/megatron/core/tensor_parallel/utils.py +++ b/megatron/core/tensor_parallel/utils.py @@ -1,26 +1,31 @@ # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -import torch from typing import List, Sequence -from megatron.core.utils import divide +import torch + from megatron.core import parallel_state +from megatron.core.parallel_state import ( + get_tensor_model_parallel_group, + get_tensor_model_parallel_rank, + get_tensor_model_parallel_world_size, +) +from megatron.core.utils import divide + def split_tensor_along_last_dim( - tensor: torch.Tensor, - num_partitions: int, - contiguous_split_chunks: bool = False, + tensor: torch.Tensor, num_partitions: int, contiguous_split_chunks: bool = False ) -> List[torch.Tensor]: - """ Split a tensor along its last dimension. + """Split a tensor along its last dimension. - Arguments: - tensor: input tensor. - num_partitions: number of partitions to split the tensor - contiguous_split_chunks: If True, make each chunk contiguous - in memory. + Args: + tensor: input tensor. + num_partitions: number of partitions to split the tensor + contiguous_split_chunks: If True, make each chunk contiguous + in memory. - Returns: - A list of Tensors + Returns: + A list of Tensors """ # Get the size and dimension. 
last_dim = tensor.dim() - 1 @@ -33,28 +38,31 @@ def split_tensor_along_last_dim( return tensor_list + def split_tensor_into_1d_equal_chunks(tensor, new_buffer=False): - """ Break a tensor into equal 1D chunks across tensor parallel ranks. + """Break a tensor into equal 1D chunks across tensor parallel ranks. - Returns a Tensor or View with this rank's portion of the data. + Returns a Tensor or View with this rank's portion of the data. - Arguments: - tensor: The tensor to split + Args: + tensor: The tensor to split - Keyword Arguments: - new_buffer (bool): If True, returns a new Tensor. - If False, returns a view into the existing Tensor. - Default is False + Keyword Args: + new_buffer (bool): If True, returns a new Tensor. + If False, returns a view into the existing Tensor. + Default is False """ - partition_size = torch.numel(tensor) // \ - parallel_state.get_tensor_model_parallel_world_size() + partition_size = torch.numel(tensor) // parallel_state.get_tensor_model_parallel_world_size() start_index = partition_size * parallel_state.get_tensor_model_parallel_rank() end_index = start_index + partition_size if new_buffer: - data = torch.empty(partition_size, dtype=tensor.dtype, - device=torch.cuda.current_device(), - requires_grad=False) + data = torch.empty( + partition_size, + dtype=tensor.dtype, + device=torch.cuda.current_device(), + requires_grad=False, + ) data.copy_(tensor.view(-1)[start_index:end_index]) else: data = tensor.view(-1)[start_index:end_index] @@ -62,33 +70,33 @@ def split_tensor_into_1d_equal_chunks(tensor, new_buffer=False): def gather_split_1d_tensor(tensor): - """ Opposite of split_tensor_into_1d_equal_chunks. Gather values from tensor - model parallel ranks. + """Opposite of split_tensor_into_1d_equal_chunks. Gather values from tensor + model parallel ranks. - Returns a new Tensor with the gathered data. + Returns a new Tensor with the gathered data. - Arguments: - tensor: A Tensor or view of this rank's portion of the data. + Args: + tensor: A Tensor or view of this rank's portion of the data. """ - numel_gathered = torch.numel(tensor) * \ - parallel_state.get_tensor_model_parallel_world_size() - gathered = torch.empty(numel_gathered, dtype=tensor.dtype, - device=torch.cuda.current_device(), - requires_grad=False) + numel_gathered = torch.numel(tensor) * parallel_state.get_tensor_model_parallel_world_size() + gathered = torch.empty( + numel_gathered, dtype=tensor.dtype, device=torch.cuda.current_device(), requires_grad=False + ) # TODO: This API is experimental in pytorch (as of Feb 2022) and # this might break in future pytorch releases. We chose this API # as opposed to torch.distributed.all_gather for efficiency reasons. # This API calls directly NCCL all-gather versus the former does # internal copies and can potentially cause slow down. 
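+    # (Newer PyTorch releases expose this primitive publicly as
+    # torch.distributed.all_gather_into_tensor.)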
- torch.distributed._all_gather_base(gathered, tensor, - group=parallel_state.get_tensor_model_parallel_group()) + torch.distributed._all_gather_base( + gathered, tensor, group=parallel_state.get_tensor_model_parallel_group() + ) return gathered class VocabUtility: - """ Split the vocabulary into `world_size` chunks and return the first - and last index of the vocabulary belonging to the `rank` - partition: Note that indices in [fist, last) + """Split the vocabulary into `world_size` chunks and return the first + and last index of the vocabulary belonging to the `rank` + partition: Note that indices in [fist, last) """ @@ -101,7 +109,9 @@ def vocab_range_from_per_partition_vocab_size( return index_f, index_l @staticmethod - def vocab_range_from_global_vocab_size(global_vocab_size: int, rank: int, world_size: int) -> Sequence[int]: + def vocab_range_from_global_vocab_size( + global_vocab_size: int, rank: int, world_size: int + ) -> Sequence[int]: per_partition_vocab_size = divide(global_vocab_size, world_size) return VocabUtility.vocab_range_from_per_partition_vocab_size( per_partition_vocab_size, rank, world_size diff --git a/megatron/timers.py b/megatron/core/timers.py similarity index 54% rename from megatron/timers.py rename to megatron/core/timers.py index a9478fa014..e7070e37d8 100644 --- a/megatron/timers.py +++ b/megatron/core/timers.py @@ -2,16 +2,14 @@ """Megatron timers.""" -from abc import ABC -from abc import abstractmethod import time +from abc import ABC, abstractmethod +from typing import List import torch - class TimerBase(ABC): - def __init__(self, name): self.name = name @@ -32,9 +30,7 @@ def elapsed(self, reset=True, barrier=False): pass - class DummyTimer(TimerBase): - def __init__(self): super().__init__('dummy timer') @@ -48,13 +44,13 @@ def reset(self): return def elapsed(self, reset=True, barrier=False): - raise Exception('dummy timer should not be used to ' - 'calculate elapsed time') - + raise Exception('dummy timer should not be used to calculate elapsed time') class Timer(TimerBase): """ + Timer class with ability to start/stop. + Comment on using `barrier`: If this flag is passed, then all the caller processes will wait till all reach the timing routine. It is up to the user to make sure all the ranks in `barrier_group` @@ -64,20 +60,33 @@ class Timer(TimerBase): """ def __init__(self, name): + """Initialize Timer. + + Args: + name (str): Name of the timer. + """ super().__init__(name) self._elapsed = 0.0 + self._active_time = 0.0 self._started = False # Note that None will default to the global process group self._barrier_group = None self._start_time = time.time() - def set_barrier_group(self, barrier_group): - self._barrier_group = barrier_group + """Sets barrier group. + Args: + barrier_group (ProcessGroup): Torch ProcessGroup for barrier. + """ + self._barrier_group = barrier_group def start(self, barrier=False): - """Start the timer.""" + """Start the timer. + + Args: + barrier (bool, optional): Synchronizes ranks before starting. Defaults to False. + """ assert not self._started, 'timer has already been started' if barrier: torch.distributed.barrier(group=self._barrier_group) @@ -85,25 +94,37 @@ def start(self, barrier=False): self._start_time = time.time() self._started = True - def stop(self, barrier=False): - """Stop the timer.""" + """Stop the timer. + + Args: + barrier (bool, optional): Synchronizes ranks before stopping. Defaults to False. 
+ """ assert self._started, 'timer is not started' if barrier: torch.distributed.barrier(group=self._barrier_group) torch.cuda.synchronize() - self._elapsed += (time.time() - self._start_time) + elapsed = time.time() - self._start_time + self._elapsed += elapsed + self._active_time += elapsed self._started = False - def reset(self): """Reset timer.""" + # Don't reset _active_time self._elapsed = 0.0 self._started = False - def elapsed(self, reset=True, barrier=False): - """Calculate the elapsed time.""" + """Calculates the elapsed time and restarts timer. + + Args: + reset (bool, optional): Resets timer before restarting. Defaults to True. + barrier (bool, optional): Synchronizes ranks before stopping. Defaults to False. + + Returns: + float: Elapsed time. + """ _started = self._started # If the timing in progress, end it first. if self._started: @@ -118,37 +139,53 @@ def elapsed(self, reset=True, barrier=False): self.start(barrier=barrier) return _elapsed + def active_time(self): + return self._active_time class Timers: - """Group of timers.""" + """Class for a group of Timers.""" def __init__(self, log_level, log_option): + """Initialize group of timers. + + Args: + log_level (int): Log level to control what timers are enabled. + log_option (str): Setting for logging statistics over ranks for all the timers. Allowed: ['max', 'minmax', 'all']. + """ self._log_level = log_level + allowed_log_options = set(['max', 'minmax', 'all']) + assert ( + log_option in allowed_log_options + ), 'input log option {} is invalid. It must be one of {}'.format( + log_option, allowed_log_options + ) self._log_option = log_option self._timers = {} self._log_levels = {} self._dummy_timer = DummyTimer() self._max_log_level = 2 - def __call__(self, name, log_level=None): + """Call timer with name and log level.""" # If the timer has already been set, then check if the log-level # is provided, it matches the one that the timer was created with. if name in self._timers: if log_level is not None: - assert log_level == self._log_levels[name], \ - 'input log level {} does not match already existing '\ - 'log level {} for {} timer'.format( - log_level, self._log_levels[name], name) + assert log_level == self._log_levels[name], ( + 'input log level {} does not match already existing ' + 'log level {} for {} timer'.format(log_level, self._log_levels[name], name) + ) return self._timers[name] # If timer does not exist and no log level is provided, # set it to the max log level which is 2. if log_level is None: log_level = self._max_log_level - assert log_level <= self._max_log_level, \ - 'log level {} is larger than max supported log level {}'.format( - log_level, self._max_log_level) + assert ( + log_level <= self._max_log_level + ), 'log level {} is larger than max supported log level {}'.format( + log_level, self._max_log_level + ) # Now if the input log level is larger than the one set for # the timers class, just ignore it and return a dummy timer. if log_level > self._log_level: @@ -158,18 +195,21 @@ def __call__(self, name, log_level=None): self._log_levels[name] = log_level return self._timers[name] - def _get_elapsed_time_all_ranks(self, names, reset, barrier): - """ + """Returns elapsed times of timers in names. Assumptions: - All the ranks call this function. - `names` are identical on all ranks. If the above assumptions are not met, calling this function will result in hang. 
- Arguments: - - names: list of timer names - - reset: reset the timer after recording the elapsed time - - barrier: if set, do a global barrier before time measurments + + Args: + names (List[str]): list of timer names + reset (bool): reset the timer after recording the elapsed time + barrier (bool): if set, do a global barrier before time measurments + + Returns: + torch.tensor: Tensor of size [world_size, len(names)] with times in float. """ # First make sure all the callers are in sync. @@ -184,30 +224,28 @@ def _get_elapsed_time_all_ranks(self, names, reset, barrier): # pytorch yet. It is simpler to deal with a single tensor # and since we are only gathering a small amount of data, # it should be ok to use all-gather instead of gather. - rank_name_to_time = torch.zeros((world_size, len(names)), - dtype=torch.float, - device=torch.cuda.current_device()) + rank_name_to_time = torch.zeros( + (world_size, len(names)), dtype=torch.float, device=torch.cuda.current_device() + ) for i, name in enumerate(names): if name in self._timers: # Here we don't need to pass the barrier flag as all # the processes are already in sync. This avoids the # issue of different timers having different barrier # groups inside their class. - rank_name_to_time[rank, i] = self._timers[name].elapsed( - reset=reset) + rank_name_to_time[rank, i] = self._timers[name].elapsed(reset=reset) # See the note above for why we are not using gather. - torch.distributed._all_gather_base(rank_name_to_time.view(-1), - rank_name_to_time[rank, :].view(-1)) + torch.distributed._all_gather_base( + rank_name_to_time.view(-1), rank_name_to_time[rank, :].view(-1) + ) return rank_name_to_time - def _get_global_min_max_time(self, names, reset, barrier, normalizer): """Report only min and max times across all ranks.""" - rank_name_to_time = self._get_elapsed_time_all_ranks(names, reset, - barrier) + rank_name_to_time = self._get_elapsed_time_all_ranks(names, reset, barrier) name_to_min_max_time = {} for i, name in enumerate(names): rank_to_time = rank_name_to_time[:, i] @@ -217,32 +255,32 @@ def _get_global_min_max_time(self, names, reset, barrier, normalizer): if rank_to_time.numel() > 0: name_to_min_max_time[name] = ( rank_to_time.min().item() / normalizer, - rank_to_time.max().item() / normalizer) + rank_to_time.max().item() / normalizer, + ) return name_to_min_max_time - - def _get_global_min_max_time_string(self, names, reset, barrier, - normalizer, max_only): - name_to_min_max_time = self._get_global_min_max_time( - names, reset, barrier, normalizer) + def _get_global_min_max_time_string(self, names, reset, barrier, normalizer, max_only): + """Report strings for max/minmax times across all ranks.""" + name_to_min_max_time = self._get_global_min_max_time(names, reset, barrier, normalizer) if not name_to_min_max_time: return None - output_string = '(min, max) time across ranks (ms):' + if max_only: + output_string = 'max time across ranks (ms):' + else: + output_string = '(min, max) time across ranks (ms):' for name in name_to_min_max_time: min_time, max_time = name_to_min_max_time[name] if max_only: - output_string += '\n {}: {:.2f}'.format( - (name+' ').ljust(48, '.'), max_time) + output_string += '\n {}: {:.2f}'.format((name + ' ').ljust(48, '.'), max_time) else: output_string += '\n {}: ({:.2f}, {:.2f})'.format( - (name+' ').ljust(48, '.'), min_time, max_time) + (name + ' ').ljust(48, '.'), min_time, max_time + ) return output_string - def _get_all_ranks_time_string(self, names, reset, barrier, normalizer): """Report times across 
all ranks.""" - rank_name_to_time = self._get_elapsed_time_all_ranks(names, reset, - barrier) + rank_name_to_time = self._get_elapsed_time_all_ranks(names, reset, barrier) output_string = 'times across ranks (ms):' no_reported_timing = True @@ -255,49 +293,103 @@ def _get_all_ranks_time_string(self, names, reset, barrier, normalizer): not_yet_found = False output_string += '\n {}:'.format(name) output_string += '\n rank {:2d}: {:.2f}'.format( - rank, rank_name_to_time[rank, i] / normalizer) + rank, rank_name_to_time[rank, i] / normalizer + ) if no_reported_timing: return None return output_string + def get_all_timers_string( + self, + names: List[str] = None, + normalizer: float = 1.0, + reset: bool = True, + barrier: bool = False, + ): + """Returns the output string with logged timer values according to configured options. + + Args: + names (List[str]): Names of the timers to log. If None, all registered timers are fetched. Defaults to None. + normalizer (float, optional): Normalizes the timer values by the factor. Defaults to 1.0. + reset (bool, optional): Whether to reset timer values after logging. Defaults to True. + barrier (bool, optional): Whether to do a global barrier before time measurments. Defaults to False. + + Raises: + Exception: Raises if log option is invalid. + + Returns: + str: Formatted string with the timer values. + """ - def log(self, names, rank=None, normalizer=1.0, reset=True, barrier=False): - """Log a group of timers.""" + if names == None: # get all registered timers + names = self._timers.keys() - # Print. assert normalizer > 0.0 if self._log_option in ['max', 'minmax']: max_only = False if self._log_option == 'max': max_only = True output_string = self._get_global_min_max_time_string( - names, reset, barrier, normalizer/1000.0, max_only) + names, reset, barrier, normalizer / 1000.0, max_only + ) elif self._log_option == 'all': - output_string = self._get_all_ranks_time_string(names, - reset, barrier, - normalizer/1000.0) + output_string = self._get_all_ranks_time_string( + names, reset, barrier, normalizer / 1000.0 + ) else: - raise Exception('unknown timing log option {}'.format( - self._log_option)) + raise Exception('unknown timing log option {}'.format(self._log_option)) + return output_string + + def log( + self, + names: List[str], + rank: int = None, + normalizer: float = 1.0, + reset: bool = True, + barrier: bool = False, + ): + """logs the timers passed in names to stdout. Example usage is to log average per step value for timer 'foo', + this function can be called with normalizer factor set to logging interval. + + Args: + names (List[str]): Names of the timers to log. + rank (int, optional): logs the timers to a specific rank. If set to None, logs to the last rank. Defaults to None. + normalizer (float, optional): Normalizes the timer values by the factor. Defaults to 1.0. + reset (bool, optional): Whether to reset timer values after logging. Defaults to True. + barrier (bool, optional): Whether to do a global barrier before time measurments. Defaults to False. + """ + output_string = self.get_all_timers_string(names, normalizer, reset, barrier) # If no input rank is provided, log on last rank. 
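+        # Example (hypothetical names): timers.log(['fwd-compute'], normalizer=log_interval)
+        # prints the average per-iteration time accumulated since the last reset.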
if rank is None: rank = torch.distributed.get_world_size() - 1 if rank == torch.distributed.get_rank() and output_string is not None: print(output_string, flush=True) - - def write(self, names, writer, iteration, normalizer=1.0, - reset=False, barrier=False): - """Write timers to a tensorboard writer - Note that we only report maximum time across ranks to tensorboard. + def write( + self, + names: List[str], + writer, + iteration: int, + normalizer: float = 1.0, + reset: bool = True, + barrier: bool = False, + ): + """Write timers to a tensorboard writer. Note that we only report maximum time across ranks to tensorboard. + + Args: + names (List[str]): Names of the timers to log. + writer (SummaryWriter): Tensorboard SummaryWriter object + iteration (int): Current iteration. + normalizer (float, optional): Normalizes the timer values by the factor. Defaults to 1.0. + reset (bool, optional): Whether to reset timer values after logging. Defaults to True. + barrier (bool, optional): Whether to do a global barrier before time measurments. Defaults to False. """ # currently when using add_scalars, # torch.utils.add_scalars makes each timer its own run, which # polutes the runs list, so we just add each as a scalar assert normalizer > 0.0 - name_to_min_max_time = self._get_global_min_max_time( - names, reset, barrier, normalizer) + name_to_min_max_time = self._get_global_min_max_time(names, reset, barrier, normalizer) if writer is not None: for name in name_to_min_max_time: _, max_time = name_to_min_max_time[name] diff --git a/megatron/core/transformer/__init__.py b/megatron/core/transformer/__init__.py new file mode 100644 index 0000000000..0e3cdcfa57 --- /dev/null +++ b/megatron/core/transformer/__init__.py @@ -0,0 +1,6 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +from .module import MegatronModule +from .spec_utils import ModuleSpec, build_module +from .transformer_config import MLATransformerConfig, TransformerConfig +from .transformer_layer import TransformerLayer, TransformerLayerSubmodules diff --git a/megatron/core/transformer/attention.py b/megatron/core/transformer/attention.py new file mode 100644 index 0000000000..850dec88e1 --- /dev/null +++ b/megatron/core/transformer/attention.py @@ -0,0 +1,582 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
+from abc import ABC, abstractmethod +from dataclasses import dataclass +from typing import Union + +import torch + +from megatron.core import parallel_state, tensor_parallel +from megatron.core.models.common.embeddings import apply_rotary_pos_emb +from megatron.core.parallel_state import ( + get_data_parallel_group, + get_data_parallel_rank, + get_data_parallel_world_size, + get_tensor_model_parallel_group, + get_tensor_model_parallel_rank, + get_tensor_model_parallel_world_size, +) +from megatron.core.transformer.module import MegatronModule +from megatron.core.transformer.spec_utils import ModuleSpec, build_module +from megatron.core.utils import divide + +from .enums import AttnMaskType +from .transformer_config import TransformerConfig + +try: + import transformer_engine # pylint: disable=unused-import + + HAVE_TE = True + from megatron.core.transformer.custom_layers.transformer_engine import SplitAlongDim +except ImportError: + HAVE_TE = False + SplitAlongDim = None + + +@dataclass +class SelfAttentionSubmodules: + linear_qkv: Union[ModuleSpec, type] = None + core_attention: Union[ModuleSpec, type] = None + linear_proj: Union[ModuleSpec, type] = None + q_layernorm: Union[ModuleSpec, type] = None + k_layernorm: Union[ModuleSpec, type] = None + + +@dataclass +class CrossAttentionSubmodules: + linear_q: Union[ModuleSpec, type] = None + linear_kv: Union[ModuleSpec, type] = None + core_attention: Union[ModuleSpec, type] = None + linear_proj: Union[ModuleSpec, type] = None + + +class Attention(MegatronModule, ABC): + """Attention layer abstract class. + + This layer only contains common modules required for the "self attn" and + "cross attn" specializations. + """ + + def __init__( + self, + config: TransformerConfig, + submodules: Union[SelfAttentionSubmodules, CrossAttentionSubmodules], + layer_number: int, + attn_mask_type: AttnMaskType, + attention_type: str, + ): + super().__init__(config=config) + + self.config = config + self.layer_number = layer_number + self.attn_mask_type = attn_mask_type + self.attention_type = attention_type + + # For normal attention without groups, num_query_groups == num_attention_heads, + # so these two will be the same + self.query_projection_size = self.config.kv_channels * self.config.num_attention_heads + self.kv_projection_size = self.config.kv_channels * self.config.num_query_groups + + # Per attention head and per partition values. + world_size = parallel_state.get_tensor_model_parallel_world_size() + self.hidden_size_per_attention_head = divide( + self.query_projection_size, self.config.num_attention_heads + ) + self.num_attention_heads_per_partition = divide(self.config.num_attention_heads, world_size) + self.num_query_groups_per_partition = divide(self.config.num_query_groups, world_size) + + self.core_attention = build_module( + submodules.core_attention, + config=self.config, + layer_number=self.layer_number, + attn_mask_type=self.attn_mask_type, + attention_type=self.attention_type, + ) + + self.checkpoint_core_attention = self.config.recompute_granularity == 'selective' + + # Output. 
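+        # linear_proj projects the attention context (query_projection_size in total,
+        # already split across tensor parallel ranks) back to the model hidden size.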
+ self.linear_proj = build_module( + submodules.linear_proj, + self.query_projection_size, + self.config.hidden_size, + config=self.config, + init_method=self.config.output_layer_init_method, + bias=self.config.add_bias_linear, + input_is_parallel=True, + skip_bias_add=True, + is_expert=False, + tp_comm_buffer_name='proj', + ) + + def _checkpointed_attention_forward( + self, + query, + key, + value, + attention_mask, + rotary_pos_emb=None, + attn_mask_type=None, + packed_seq_params=None, + ): + """Forward method with selective activation checkpointing.""" + + def custom_forward(*inputs): + query = inputs[0] + key = inputs[1] + value = inputs[2] + attention_mask = inputs[3] + attn_mask_type = inputs[5] + attn_mask_type = AttnMaskType(attn_mask_type.item()) + output_ = self.core_attention( + query, + key, + value, + attention_mask, + attn_mask_type=attn_mask_type, + packed_seq_params=packed_seq_params, + ) + return output_ + + if attn_mask_type is None: + attn_mask_type = self.attn_mask_type + attn_mask_type = torch.tensor([attn_mask_type.value], dtype=torch.int) + hidden_states = tensor_parallel.checkpoint( + custom_forward, False, query, key, value, attention_mask, rotary_pos_emb, attn_mask_type + ) + + return hidden_states + + def _allocate_memory(self, inference_max_sequence_length, batch_size, dim, dtype): + """Allocate memory to store kv cache during inference.""" + + return torch.empty( + inference_max_sequence_length, + batch_size, + self.num_query_groups_per_partition, + dim, + dtype=dtype, + device=torch.cuda.current_device(), + ) + + def _adjust_key_value_for_inference(self, inference_params, key, value, rotary_pos_emb): + """ + Saves the generated key and value tensors to the end of the buffers in inference_params. + Returns the full size keys and values from the provided inference_params, as well as + adjusted rotary_pos_emb. + + Returns a tuple: (key, value, rotary_pos_emb) + + """ + attn_mask_type = self.attn_mask_type + if inference_params is None: + return key, value, rotary_pos_emb, attn_mask_type + + # ================================================= + # Pre-allocate memory for key-values for inference. + # ================================================= + if self.layer_number not in inference_params.key_value_memory_dict: + inf_max_seq_length = inference_params.max_sequence_length + inf_max_batch_size = inference_params.max_batch_size + inference_key_memory = self._allocate_memory( + inf_max_seq_length, inf_max_batch_size, key.shape[-1], key.dtype + ) + inference_value_memory = self._allocate_memory( + inf_max_seq_length, inf_max_batch_size, value.shape[-1], value.dtype + ) + inference_params.key_value_memory_dict[self.layer_number] = ( + inference_key_memory, + inference_value_memory, + ) + else: + # Get the pre-allocated buffers for this layer + inference_key_memory, inference_value_memory = inference_params.key_value_memory_dict[ + self.layer_number + ] + + if inference_params.sequence_len_offset > 0: + # This should mean that we are past the prompt forward_step + # and so we need to turn off masking + attn_mask_type = AttnMaskType.no_mask + + batch_start = inference_params.batch_size_offset + batch_end = batch_start + key.size(1) + assert batch_end <= inference_key_memory.size(1) + sequence_start = inference_params.sequence_len_offset + sequence_end = sequence_start + key.size(0) + assert sequence_end <= inference_key_memory.size(0) + # Copy key and values. + inference_key_memory[sequence_start:sequence_end, batch_start:batch_end, ...] 
= key + inference_value_memory[sequence_start:sequence_end, batch_start:batch_end, ...] = value + key = inference_key_memory[:sequence_end, batch_start:batch_end, ...] + value = inference_value_memory[:sequence_end, batch_start:batch_end, ...] + + # adjust the key rotary positional embedding + if rotary_pos_emb is None: + return key, value, rotary_pos_emb, attn_mask_type + + q_pos_emb, k_pos_emb = rotary_pos_emb + q_pos_emb = q_pos_emb[sequence_start:sequence_end, :, :, :] + k_pos_emb = k_pos_emb[:sequence_end, :, :, :] + rotary_pos_emb = (q_pos_emb, k_pos_emb) + + return key, value, rotary_pos_emb, attn_mask_type + + @abstractmethod + def get_query_key_value_tensors(self, hidden_states, key_value_states): + """ + This method needs to be implemented based on whether the derived class + is "self-attn" or "cross-attn". + """ + + def forward( + self, + hidden_states, + attention_mask, + key_value_states=None, + inference_params=None, + rotary_pos_emb=None, + packed_seq_params=None, + ): + # hidden_states: [sq, b, h] + + # For self attention we just duplicate the rotary_pos_emb if it isn't already + if rotary_pos_emb is not None and not isinstance(rotary_pos_emb, tuple): + rotary_pos_emb = (rotary_pos_emb,) * 2 + + # ===================== + # Query, Key, and Value + # ===================== + # Get the query, key and value tensors based on the type of attention - + # self or cross attn. + query, key, value = self.get_query_key_value_tensors(hidden_states, key_value_states) + + # =================================================== + # Adjust key, value, and rotary_pos_emb for inference + # =================================================== + key, value, rotary_pos_emb, attn_mask_type = self._adjust_key_value_for_inference( + inference_params, key, value, rotary_pos_emb + ) + + if packed_seq_params is not None: + query = query.squeeze(1) + key = key.squeeze(1) + value = value.squeeze(1) + + # ================================================ + # relative positional embedding (rotary embedding) + # ================================================ + if rotary_pos_emb is not None: + q_pos_emb, k_pos_emb = rotary_pos_emb + + if packed_seq_params is not None: + cu_seqlens_q = packed_seq_params.cu_seqlens_q + cu_seqlens_kv = packed_seq_params.cu_seqlens_kv + else: + cu_seqlens_q = cu_seqlens_kv = None + query = apply_rotary_pos_emb( + query, q_pos_emb, config=self.config, cu_seqlens=cu_seqlens_q + ) + key = apply_rotary_pos_emb(key, k_pos_emb, config=self.config, cu_seqlens=cu_seqlens_kv) + + # TODO, can apply positional embedding to value_layer so it has + # absolute positional embedding. 
+ # otherwise, only relative positional embedding takes effect + # value_layer = apply_rotary_pos_emb(value_layer, k_pos_emb) + + # ================================== + # core attention computation + # ================================== + + if self.checkpoint_core_attention and self.training: + core_attn_out = self._checkpointed_attention_forward( + query, + key, + value, + attention_mask, + attn_mask_type=attn_mask_type, + packed_seq_params=packed_seq_params, + ) + else: + core_attn_out = self.core_attention( + query, + key, + value, + attention_mask, + attn_mask_type=attn_mask_type, + packed_seq_params=packed_seq_params, + ) + + if packed_seq_params is not None: + # reshape to same output shape as unpacked case + # (t, np, hn) -> (t, b=1, h=np*hn) + # t is the pack size = sum (sq_i) + # note that batch is a dummy dimension in the packed case + core_attn_out = core_attn_out.reshape(core_attn_out.size(0), 1, -1) + + # ================= + # Output. [sq, b, h] + # ================= + + output, bias = self.linear_proj(core_attn_out) + + return output, bias + + +class SelfAttention(Attention): + """Self-attention layer class + + Self-attention layer takes input with size [s, b, h] + and returns output of the same size. + """ + + def __init__( + self, + config: TransformerConfig, + submodules: SelfAttentionSubmodules, + layer_number: int, + attn_mask_type=AttnMaskType.padding, + ): + super().__init__( + config=config, + submodules=submodules, + layer_number=layer_number, + attn_mask_type=attn_mask_type, + attention_type="self", + ) + + self.linear_qkv = build_module( + submodules.linear_qkv, + self.config.hidden_size, + self.query_projection_size + 2 * self.kv_projection_size, + config=self.config, + init_method=self.config.init_method, + gather_output=False, + bias=self.config.add_bias_linear or self.config.add_qkv_bias, + skip_bias_add=False, + is_expert=False, + tp_comm_buffer_name='qkv', + ) + + if submodules.q_layernorm is not None: + self.q_layernorm = build_module( + submodules.q_layernorm, + hidden_size=self.hidden_size_per_attention_head, + config=self.config, + eps=self.config.layernorm_epsilon, + ) + else: + self.q_layernorm = None + + if submodules.k_layernorm is not None: + self.k_layernorm = build_module( + submodules.k_layernorm, + hidden_size=self.hidden_size_per_attention_head, + config=self.config, + eps=self.config.layernorm_epsilon, + ) + else: + self.k_layernorm = None + + def run_realtime_tests(self): + """Performs a consistency check. + + This function makes sure that tensors across devices are the same during an experiment. + This is often not guaranteed to be so because of silent hardware failures (eg, memory + corruption loading a checkpoint, network traffic corruption encountered during + data transmission). + + (TODO) In the future, more tensors should be checked across the training run and + checked every X iterations. This is left for future work. Equality of tensors is probably + not required; transmitting hashes is sufficient.""" + + if not self.config.qk_layernorm: + return + + # check that all tensor parallel and data parallel ranks have the same + # Q & K layernorm parameters. 
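+        # The check all-gathers the local Q/K layernorm weights and biases and
+        # asserts that every remote copy matches the local tensors exactly.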
+ rank = get_data_parallel_rank() + inputs = torch.stack( + [ + self.q_layernorm.weight.data, + self.q_layernorm.bias.data, + self.k_layernorm.weight.data, + self.k_layernorm.bias.data, + ] + ) + dp_list = [torch.empty_like(inputs) for _ in range(get_data_parallel_world_size())] + dp_list[rank] = inputs + torch.distributed.all_gather(dp_list, inputs, group=get_data_parallel_group()) + + def _compare(srcs, tgts, names, parallelism): + assert len(srcs) == len(tgts) == len(names) + for src, tgt, name in zip(srcs, tgts, names): + assert torch.all(src == tgt), ( + f"Discrepancy between {name} in {parallelism} ranks {i} and {rank}. " + f"Diff: {torch.norm(src - tgt)}" + ) + + for i, dp in enumerate(dp_list): + q_w, q_b, k_w, k_b = torch.unbind(dp) + _compare( + [q_w, q_b, k_w, k_b], + [ + self.q_layernorm.weight.data, + self.q_layernorm.bias.data, + self.k_layernorm.weight.data, + self.k_layernorm.bias.data, + ], + ["q_w", "q_b", "k_w", "k_b"], + "DP", + ) + + rank = get_tensor_model_parallel_rank() + tp_list = [torch.empty_like(inputs) for _ in range(get_tensor_model_parallel_world_size())] + tp_list[rank] = inputs + torch.distributed.all_gather(tp_list, inputs, group=get_tensor_model_parallel_group()) + + for i, tp in enumerate(tp_list): + q_w, q_b, k_w, k_b = torch.unbind(tp) + _compare( + [q_w, q_b, k_w, k_b], + [ + self.q_layernorm.weight.data, + self.q_layernorm.bias.data, + self.k_layernorm.weight.data, + self.k_layernorm.bias.data, + ], + ["q_w", "q_b", "k_w", "k_b"], + "TP", + ) + + def get_query_key_value_tensors(self, hidden_states, key_value_states=None): + """ + Derives `query`, `key` and `value` tensors from `hidden_states`. + """ + # Attention heads [sq, b, h] --> [sq, b, ng * (np/ng + 2) * hn)] + mixed_qkv, _ = self.linear_qkv(hidden_states) + + # [sq, b, hp] --> [sq, b, ng, (np/ng + 2) * hn] + new_tensor_shape = mixed_qkv.size()[:-1] + ( + self.num_query_groups_per_partition, + ( + (self.num_attention_heads_per_partition // self.num_query_groups_per_partition + 2) + * self.hidden_size_per_attention_head + ), + ) + mixed_qkv = mixed_qkv.view(*new_tensor_shape) + + split_arg_list = [ + ( + self.num_attention_heads_per_partition + // self.num_query_groups_per_partition + * self.hidden_size_per_attention_head + ), + self.hidden_size_per_attention_head, + self.hidden_size_per_attention_head, + ] + + if SplitAlongDim is not None: + + # [sq, b, ng, (np/ng + 2) * hn] + # --> [sq, b, ng, np/ng * hn], [sq, b, ng, hn], [sq, b, ng, hn] + (query, key, value) = SplitAlongDim(mixed_qkv, 3, split_arg_list) + else: + + # [sq, b, ng, (np/ng + 2) * hn] + # --> [sq, b, ng, np/ng * hn], [sq, b, ng, hn], [sq, b, ng, hn] + (query, key, value) = torch.split(mixed_qkv, split_arg_list, dim=3) + + # [sq, b, ng, np/ng * hn] -> [sq, b, np, hn] + query = query.reshape(query.size(0), query.size(1), -1, self.hidden_size_per_attention_head) + + if self.q_layernorm is not None: + query = self.q_layernorm(query) + + if self.k_layernorm is not None: + key = self.k_layernorm(key) + + if self.config.test_mode: + self.run_realtime_tests() + + return query, key, value + + +class CrossAttention(Attention): + """Cross-attention layer class + + Cross-attention layer takes input with size [s, b, h] and context with size + [s, b, h] and returns output of the same size. 
+ """ + + def __init__( + self, + config: TransformerConfig, + submodules: CrossAttentionSubmodules, + layer_number: int, + attn_mask_type=AttnMaskType.padding, + ): + super().__init__( + config=config, + submodules=submodules, + layer_number=layer_number, + attn_mask_type=attn_mask_type, + attention_type="cross", + ) + + if self.config.num_query_groups != self.config.num_attention_heads: + raise ValueError("Group query attention is not currently supported in cross attention.") + assert self.query_projection_size == self.kv_projection_size + + self.linear_q = build_module( + submodules.linear_q, + self.config.hidden_size, + self.query_projection_size, + config=self.config, + init_method=self.config.init_method, + gather_output=False, + bias=self.config.add_bias_linear, + skip_bias_add=False, + is_expert=False, + ) + + self.linear_kv = build_module( + submodules.linear_kv, + self.config.hidden_size, + 2 * self.kv_projection_size, + config=self.config, + init_method=self.config.init_method, + gather_output=False, + bias=self.config.add_bias_linear, + skip_bias_add=False, + is_expert=False, + ) + + def get_query_key_value_tensors(self, hidden_states, key_value_states): + """ + Derives `query` tensor from `hidden_states`, and `key`/`value` tensors + from `key_value_states`. + """ + # Attention heads [sk, b, h] --> [sk, b, (np * 2 * hn)] + mixed_kv, _ = self.linear_kv(key_value_states) + + # [sk, b, (np * 2 * hn)] --> [sk, b, np, 2 * hn] + new_tensor_shape = mixed_kv.size()[:-1] + ( + self.num_attention_heads_per_partition, + 2 * self.hidden_size_per_attention_head, + ) + mixed_kv = mixed_kv.view(*new_tensor_shape) + + # [sk, b, np, 2 * hn] --> 2 [sk, b, np, hn] + (key, value) = tensor_parallel.split_tensor_along_last_dim(mixed_kv, 2) + + # Attention head [sq, b, h] --> [sq, b, hp] + query, _ = self.linear_q(hidden_states) + + # [sq, b, hp] --> [sq, b, np, hn] + new_tensor_shape = query.size()[:-1] + ( + self.num_attention_heads_per_partition, + self.hidden_size_per_attention_head, + ) + query = query.view(*new_tensor_shape) + + return query, key, value diff --git a/megatron/core/transformer/cuda_graphs.py b/megatron/core/transformer/cuda_graphs.py new file mode 100644 index 0000000000..2588980b5b --- /dev/null +++ b/megatron/core/transformer/cuda_graphs.py @@ -0,0 +1,313 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +import logging +import time +from enum import Enum + +import torch + +from megatron.core.transformer.module import MegatronModule + +try: + from transformer_engine.pytorch import make_graphed_callables + from transformer_engine.pytorch.fp8 import FP8GlobalStateManager + + HAVE_TE_GRAPHS = True +except: + HAVE_TE_GRAPHS = False + + +class GraphStatus(Enum): + """An Enum to track if a cudagraph is ready to perform a forward or backward pass.""" + + FWD_READY = 0 + BWD_READY = 1 + + +class GraphStatusFunc(torch.autograd.Function): + """Inserts a node into the autograd graph that tracks whether an object has an outstanding + backward pass by toggling the value of GraphStatus. This is mainly used to detect when to create + multiple graphs per transformer layer for pipeline parallelism. + We don't use backward module hooks as they change forward output tensors to views, see: + https://pytorch.org/docs/stable/generated/torch.nn.Module.html#torch.nn.Module.register_full_backward_hook + """ + + @staticmethod + def forward(ctx, runner, obj): + """Occurs immediately before the graph's forward pass. 
+ Marks the graph's backward pass as ready.""" + ctx.runner = runner + runner.status = GraphStatus.BWD_READY + return obj + + @staticmethod + def backward(ctx, grad): + """Occurs immediately after the graph's backward pass. + Marks the graph's forward pass as ready.""" + assert ctx.runner.status == GraphStatus.BWD_READY + ctx.runner.status = GraphStatus.FWD_READY + return None, grad + + +class TensorDescription: + """Records the attributes of a tensor. Used to check if a + tensor argument matches the tensor with which the module + was graph captured with.""" + + def __init__(self, tensor): + self.shape = tuple(tensor.shape) + self.dtype = tensor.dtype + self.device = tensor.device + + def matches_tensor(self, tensor): + """Check if 'tensor' matches the attributes of this TensorDescription.""" + + assert torch.is_tensor(tensor) + return ( + tensor.shape == self.shape + and tensor.dtype == self.dtype + and tensor.device == self.device + ) + + +class CudaGraphCallable(torch.nn.Module): + """Wraps a module to be cudagraphable, records the output of the cudagraph. + Reinserts non-tensor args, kwargs that were previously filtered out by 'get_tensor_args'. + """ + + def __init__(self, module, groundtruth_args, groundtruth_kwargs): + super().__init__() + self.add_module('base_module', module) + + # The Pytorch cudagraph API requires only tensor inputs, so we strip + # non-tensor arguments and reinsert them in forward() using these groundtruth attributes. + # We will also check future calls to the cudagraph against these to ensure the cudagraph + # is called with the same inputs as it was captured with. + self.groundtruth_outputs = [] + self.groundtruth_args = tuple( + TensorDescription(a) if torch.is_tensor(a) else a for a in groundtruth_args + ) + self.groundtruth_kwargs = { + k: TensorDescription(v) if torch.is_tensor(v) else v + for k, v in groundtruth_kwargs.items() + } + + def forward(self, *arg_tensors, **kwarg_tensors): + """Call the forward pass of the cudagraph. Also checks the outputs + of the cudagraph matches what the graph was traced with.""" + + args = list(self.groundtruth_args) + arg_tensors = list(arg_tensors) + for idx, groundtruth_arg in enumerate(self.groundtruth_args): + if isinstance(groundtruth_arg, TensorDescription): + args[idx] = arg_tensors.pop(0) + + kwargs = dict(self.groundtruth_kwargs) + for k, v in self.groundtruth_kwargs.items(): + if isinstance(v, TensorDescription): + kwargs[k] = kwarg_tensors[k] + + # Use forward() instead of __call__ to avoid triggering hooks + out = self.base_module.forward(*args, **kwargs) + if torch.is_tensor(out): + out = tuple(out) + + self.groundtruth_outputs = [TensorDescription(o) if torch.is_tensor(o) else o for o in out] + + out = tuple(o for o in out if torch.is_tensor(o)) + assert ( + len(out) > 0 + ), """A graphed module returned no tensors in training mode, however the graphed module + must output at least one tensor, so that a corresponding backward node + may be registered in the autograd graph.""" + + if len(out) == 1: + return out[0] + return out + + +class CudaGraphRunner(torch.nn.Module): + """Wraps a single cudagraph and its expected arguments. Checks that + the provided args are the same as what the graph was traced with. 
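The argument bookkeeping done by `CudaGraphCallable` above can be hard to follow, so here is a minimal standalone sketch (independent of Megatron, with made-up argument values) of the underlying idea: tensors are recorded as `TensorDescription`s at capture time, non-tensor arguments are kept by value, and later tensor-only calls are merged back into the full argument list:

```python
import torch

class TensorDescription:
    """Records shape/dtype/device of a tensor at graph-capture time."""
    def __init__(self, t):
        self.shape, self.dtype, self.device = tuple(t.shape), t.dtype, t.device

def record_groundtruth(args):
    # Non-tensor args are kept by value; tensors are replaced by descriptions.
    return tuple(TensorDescription(a) if torch.is_tensor(a) else a for a in args)

def reinsert(groundtruth, tensor_args):
    # Rebuild the full argument list: descriptions are filled from the
    # tensor-only call, everything else comes from the recorded groundtruth.
    tensor_args = list(tensor_args)
    return tuple(
        tensor_args.pop(0) if isinstance(g, TensorDescription) else g for g in groundtruth
    )

# Capture-time call with a mix of tensor and non-tensor arguments.
hidden = torch.randn(4, 2, 8)
groundtruth = record_groundtruth((hidden, None, True))

# Later, the cudagraph is invoked with tensors only; non-tensors are reinserted.
full_args = reinsert(groundtruth, (torch.randn(4, 2, 8),))
print([type(a).__name__ for a in full_args])  # ['Tensor', 'NoneType', 'bool']
```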
+ """ + + def __init__(self, graphed_module, wrapped_module): + super().__init__() + + self.graphed_module = graphed_module + self.groundtruth_args = wrapped_module.groundtruth_args + self.groundtruth_kwargs = wrapped_module.groundtruth_kwargs + self.groundtruth_outputs = wrapped_module.groundtruth_outputs + self.status = GraphStatus.FWD_READY + + def static_args_match(self, args, kwargs): + """Check the the passed args, kwargs match with the arg, kwargs + the graph was created with.""" + + def check(val, ref): + if isinstance(ref, TensorDescription): + return ref.matches_tensor(val) + return ref == val + + if len(args) != len(self.groundtruth_args): + return False + for idx, groundtruth_arg in enumerate(self.groundtruth_args): + if not check(args[idx], groundtruth_arg): + return False + + if kwargs.keys() != self.groundtruth_kwargs.keys(): + return False + for k, v in self.groundtruth_kwargs.items(): + if not check(kwargs[k], v): + return False + return True + + def forward(self, args, kwargs, is_first_microbatch=None): + """Call the forward pass of the cuda graph.""" + if self.training and torch.is_grad_enabled(): + args = list(args) + for pos in range(len(args)): + if torch.is_tensor(args[pos]): + args[pos] = GraphStatusFunc.apply(self, args[pos]) + for k, v in kwargs.items(): + if torch.is_tensor(v): + kwargs[k] = GraphStatusFunc.apply(self, v) + + ret_tensors = self.graphed_module(is_first_microbatch=is_first_microbatch, *args, **kwargs) + ret_tensors = [ret_tensors] if torch.is_tensor(ret_tensors) else list(ret_tensors) + out = tuple( + ret_tensors.pop(0) if isinstance(o, TensorDescription) else o + for o in self.groundtruth_outputs + ) + + # Check that the static graph matches what was recorded during graph capture + assert len(out) == len(self.groundtruth_outputs) + for idx, o in enumerate(self.groundtruth_outputs): + if isinstance(o, TensorDescription): + assert o.matches_tensor(out[idx]) + else: + assert o == out[idx] + + if len(out) == 1: + return out[0] + return out + + +class CudaGraphManager(torch.nn.Module): + """Creates and runs cudagraphs for a megatron module.""" + + def __init__(self): + super().__init__() + self.cudagraph_runners = [] + self.is_first_microbatch = True + assert HAVE_TE_GRAPHS, "CudaGraphManager currently requires TransformerEngine" + + # Cudagraph stream capture requires no operations on the default stream prior to the + # capture, so change to a side stream. At graph capture change it back. + self.stream = torch.cuda.current_stream() + torch.cuda.set_stream(torch.cuda.Stream()) + + def __call__(self, megatron_module, args, kwargs): + """Calls the forward pass of the cudagraphed module. + + Args: + megatron_module (torch.nn.module): The megatron module to be graphed and run + + args (tuple): The positional args to be passed to the module. + + kwargs (dict): The keyword args to be passed to the module. + + """ + + # param.data_ptr() below is used to trigger any hooks that have attached to the parameter. + # Specifically, this is trying to trigger the param sync hook for the APEX optimizer, which + # triggers param syncs by hooking into any param references. + # However cudagraphs disables this, so we workaround by manually referencing params here. 
+ # For more information see: + # https://github.com/NVIDIA/apex/blob/7001836/apex/contrib/optimizers/distributed_fused_adam.py#L885C9 + for param in megatron_module.parameters(): + param.data_ptr() + + runner = None + for _runner in self.cudagraph_runners: + if _runner.static_args_match(args, kwargs) and _runner.status == GraphStatus.FWD_READY: + runner = _runner + break + + if runner is None: + if self.training and torch.is_grad_enabled(): + runner = self.create_cudagraph_module(megatron_module, args, kwargs) + self.cudagraph_runners.append(runner) + logging.getLogger(__name__).info( + f"Creating cudagraph; now have {len(self.cudagraph_runners)}" + ) + else: + # No cudagraphs were found in inference mode, so fallback to eager since + # tensor.requires_grad is needed to correctly trace the backward graph. + return super(MegatronModule, megatron_module).__call__(*args, **kwargs) + + tensor_args, tensor_kwargs = self.get_tensor_args(args, kwargs) + out = runner(tensor_args, tensor_kwargs, is_first_microbatch=self.is_first_microbatch) + self.is_first_microbatch = False + return out + + def get_tensor_args(self, args, kwargs): + """Filter out non-tensor arguments from args and kwargs. + Needed since 'make_graphed_callables' expects Torch.tensor arg, kwargs.""" + tensor_kwargs = {} + for k, v in kwargs.items(): + if torch.is_tensor(v): + tensor_kwargs[k] = v + tensor_args = tuple(arg for arg in args if torch.is_tensor(arg)) + return tensor_args, tensor_kwargs + + def create_cudagraph_module(self, megatron_module, args, kwargs): + """Record the graph capture stream. Runs warmup iterations of + megatron_module, and creates a autograd function, where the + forward, backward functions are the cudagraphs of module's forward, + backward passes. Finally wraps this cudagraph function with a CudaGraphRunner. 
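The manager above is intended to be invoked from a module's `__call__`, passing the module itself together with its args and kwargs. The sketch below shows one hypothetical way such wiring could look; the attribute name `cudagraph_manager` and the `enable_cuda_graph` flag are illustrative placeholders rather than names guaranteed to exist elsewhere in the codebase, and `MegatronModule`/`CudaGraphManager` are the classes already available in this file:

```python
# Hypothetical integration sketch: route a module's forward through CudaGraphManager.
class GraphedTransformerLayer(MegatronModule):
    def __init__(self, config):
        super().__init__(config=config)
        if getattr(config, "enable_cuda_graph", False):  # placeholder flag name
            self.cudagraph_manager = CudaGraphManager()

    def __call__(self, *args, **kwargs):
        # Replay (or capture, on first use) a cudagraph when a manager is attached,
        # otherwise fall back to the normal eager __call__.
        if hasattr(self, "cudagraph_manager"):
            return self.cudagraph_manager(self, args, kwargs)
        return super().__call__(*args, **kwargs)
```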
+ """ + + torch.cuda.synchronize() + torch.cuda.set_stream(self.stream) + start = time.time() + + wrapped_module = CudaGraphCallable(megatron_module, args, kwargs) + sample_args, sample_kwargs = self.get_tensor_args(args, kwargs) + + # Cudagraphs require no autograd history recorded on sample inputs + sample_args_detached = tuple(n.detach() for n in sample_args) + sample_kwargs_detached = {k: v.detach() for k, v in sample_kwargs.items()} + sample_args_copy = tuple(torch.clone(n) for n in sample_args_detached) + sample_kwargs_copy = {k: torch.clone(v) for k, v in sample_kwargs_detached.items()} + + # Zero out input args inplace so cudagraph warmup doesnt affect grads + for orig, detach in zip(sample_args, sample_args_detached): + detach.zero_() + detach.requires_grad = orig.requires_grad + for k, detach in sample_kwargs_detached.items(): + detach.zero_() + detach.requires_grad = sample_kwargs[k].requires_grad + + fp8_enabled = megatron_module.config.fp8 is not None + fp8_recipe = FP8GlobalStateManager.get_fp8_recipe() if fp8_enabled else None + graphed_module = make_graphed_callables( + modules=wrapped_module, + sample_args=sample_args_detached, + sample_kwargs=sample_kwargs_detached, + _order=[1, -1], + allow_unused_input=True, + fp8_enabled=fp8_enabled, + fp8_recipe=fp8_recipe, + fp8_weight_caching=True, + ) + + # Restore zeroed out sample args + # Detach again since pytorch prohibits inplace ops on leaf nodes + for orig, copy in zip(sample_args, sample_args_copy): + orig.detach().copy_(copy) + for k, orig in sample_kwargs.items(): + orig.detach().copy_(sample_kwargs_copy[k]) + + logging.getLogger(__name__).info(f'Time spent in cudagraph capture: {time.time() - start}s') + return CudaGraphRunner(graphed_module, wrapped_module) diff --git a/megatron/core/transformer/custom_layers/__init__.py b/megatron/core/transformer/custom_layers/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/megatron/core/transformer/custom_layers/transformer_engine.py b/megatron/core/transformer/custom_layers/transformer_engine.py new file mode 100644 index 0000000000..02ce9ad5a7 --- /dev/null +++ b/megatron/core/transformer/custom_layers/transformer_engine.py @@ -0,0 +1,12 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +import warnings + +warnings.warn( + """The 'megatron.core.transformer.custom_layers.transformer_engine' + module is deprecated and will be removed in 0.10.0. Please use + 'megatron.core.extensions.transformer_engine' instead.""", + DeprecationWarning, + stacklevel=2, +) +from megatron.core.extensions.transformer_engine import * diff --git a/megatron/core/transformer/dot_product_attention.py b/megatron/core/transformer/dot_product_attention.py new file mode 100644 index 0000000000..d5c014cabf --- /dev/null +++ b/megatron/core/transformer/dot_product_attention.py @@ -0,0 +1,202 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
+ + +import math +from typing import Optional + +import torch +from torch import Tensor + +from megatron.core import parallel_state, tensor_parallel +from megatron.core.fusions.fused_softmax import FusedScaleMaskSoftmax +from megatron.core.packed_seq_params import PackedSeqParams +from megatron.core.transformer.enums import AttnMaskType +from megatron.core.transformer.module import MegatronModule +from megatron.core.transformer.transformer_config import TransformerConfig +from megatron.core.transformer.utils import attention_mask_func +from megatron.core.utils import divide + + +class DotProductAttention(MegatronModule): + """ + Region where selective activation recomputation is applied. + This region is memory intensive but less compute intensive which + makes activation checkpointing more efficient for LLMs (20B+). + See Reducing Activation Recomputation in Large Transformer Models: + https://arxiv.org/abs/2205.05198 for more details. + + We use the following notation: + h: hidden size + n: number of attention heads + p: number of tensor model parallel partitions + b: batch size + s: sequence length + """ + + def __init__( + self, + config: TransformerConfig, + layer_number: int, + attn_mask_type: AttnMaskType, + attention_type: str, + attention_dropout: float = None, + softmax_scale: float = None, + ): + super().__init__(config=config) + + self.config: TransformerConfig = config + + assert ( + self.config.context_parallel_size == 1 + ), "Context parallelism is only supported by TEDotProductAttention!" + + assert ( + self.config.window_size is None + ), "Sliding Window Attention is only supported by TEDotProductAttention!" + + self.layer_number = max(1, layer_number) + self.attn_mask_type = attn_mask_type + self.attention_type = attention_type # unused for now + + projection_size = self.config.kv_channels * self.config.num_attention_heads + + # Per attention head and per partition values. + world_size = parallel_state.get_tensor_model_parallel_world_size() + self.hidden_size_per_partition = divide(projection_size, world_size) + self.hidden_size_per_attention_head = divide(projection_size, config.num_attention_heads) + self.num_attention_heads_per_partition = divide(self.config.num_attention_heads, world_size) + self.num_query_groups_per_partition = divide(self.config.num_query_groups, world_size) + + coeff = None + if softmax_scale is None: + self.softmax_scale = 1.0 / math.sqrt(self.hidden_size_per_attention_head) + else: + self.softmax_scale = softmax_scale + + if self.config.apply_query_key_layer_scaling: + coeff = self.layer_number + self.softmax_scale /= coeff + + self.scale_mask_softmax = FusedScaleMaskSoftmax( + input_in_fp16=self.config.fp16, + input_in_bf16=self.config.bf16, + attn_mask_type=self.attn_mask_type, + scaled_masked_softmax_fusion=self.config.masked_softmax_fusion, + mask_func=attention_mask_func, + softmax_in_fp32=self.config.attention_softmax_in_fp32, + scale=coeff, + ) + + # Dropout. Note that for a single iteration, this layer will generate + # different outputs on different number of parallel partitions but + # on average it should not be partition dependent. 
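The interaction between `softmax_scale` and `apply_query_key_layer_scaling` set up above is easier to see numerically: the raw scores are shrunk by the layer number to keep the fp16 matmul in range, and the fused softmax multiplies the same factor back (via `scale=coeff`) in fp32, so the resulting attention distribution is mathematically unchanged. A small sketch with made-up sizes:

```python
import math
import torch

# Illustrative values (assumptions): head dim 64, layer number 12.
hidden_size_per_attention_head = 64
layer_number = 12

softmax_scale = 1.0 / math.sqrt(hidden_size_per_attention_head)  # 1/sqrt(d)
coeff = layer_number
softmax_scale /= coeff  # raw scores are computed with 1/(sqrt(d) * layer_number)

scores = torch.randn(2, 4, 8, 8, dtype=torch.float32)  # [b, np, sq, sk]

# Net effect when the softmax re-applies scale=coeff in fp32:
probs_scaled = torch.softmax(scores * softmax_scale * coeff, dim=-1)

# Mathematically identical to the version without layer scaling:
probs_plain = torch.softmax(scores / math.sqrt(hidden_size_per_attention_head), dim=-1)
print(torch.allclose(probs_scaled, probs_plain))  # True
```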
+ self.attention_dropout = torch.nn.Dropout( + self.config.attention_dropout if attention_dropout is None else attention_dropout + ) + + def forward( + self, + query: Tensor, + key: Tensor, + value: Tensor, + attention_mask: Tensor, + attn_mask_type: AttnMaskType = None, + packed_seq_params: Optional[PackedSeqParams] = None, + ): + assert packed_seq_params is None, ( + "Packed sequence is not supported by DotProductAttention." + "Please use TEDotProductAttention instead." + ) + + # =================================== + # Raw attention scores. [b, n/p, s, s] + # =================================== + + # expand the key and value [sk, b, ng, hn] -> [sk, b, np, hn] + # This is a noop for normal attention where ng == np. When using group query attention this + # creates a view that has the keys and values virtually repeated along their dimension to + # match the number of queries. + + # attn_mask_type is not used. + if self.num_attention_heads_per_partition // self.num_query_groups_per_partition > 1: + key = key.repeat_interleave( + self.num_attention_heads_per_partition // self.num_query_groups_per_partition, dim=2 + ) + value = value.repeat_interleave( + self.num_attention_heads_per_partition // self.num_query_groups_per_partition, dim=2 + ) + + # [b, np, sq, sk] + output_size = (query.size(1), query.size(2), query.size(0), key.size(0)) + + # [sq, b, np, hn] -> [sq, b * np, hn] + # This will be a simple view when doing normal attention, but in group query attention + # the key and value tensors are repeated to match the queries so you can't use + # simple strides to extract the queries. + query = query.reshape(output_size[2], output_size[0] * output_size[1], -1) + # [sk, b, np, hn] -> [sk, b * np, hn] + key = key.view(output_size[3], output_size[0] * output_size[1], -1) + + # preallocting input tensor: [b * np, sq, sk] + matmul_input_buffer = parallel_state.get_global_memory_buffer().get_tensor( + (output_size[0] * output_size[1], output_size[2], output_size[3]), query.dtype, "mpu" + ) + + # Raw attention scores. [b * np, sq, sk] + matmul_result = torch.baddbmm( + matmul_input_buffer, + query.transpose(0, 1), # [b * np, sq, hn] + key.transpose(0, 1).transpose(1, 2), # [b * np, hn, sk] + beta=0.0, + alpha=self.softmax_scale, + ) + + # change view to [b, np, sq, sk] + attention_scores = matmul_result.view(*output_size) + + # =========================== + # Attention probs and dropout + # =========================== + + # attention scores and attention mask [b, np, sq, sk] + attention_probs: Tensor = self.scale_mask_softmax(attention_scores, attention_mask) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + + if not self.config.sequence_parallel: + with tensor_parallel.get_cuda_rng_tracker().fork(): + attention_probs = self.attention_dropout(attention_probs) + else: + attention_probs = self.attention_dropout(attention_probs) + + # ========================= + # Context layer. [sq, b, hp] + # ========================= + + # value -> context layer. 
+ # [sk, b, np, hn] --> [b, np, sq, hn] + + # context layer shape: [b, np, sq, hn] + output_size = (value.size(1), value.size(2), query.size(0), value.size(3)) + + # change view [sk, b * np, hn] + value = value.view(value.size(0), output_size[0] * output_size[1], -1) + + # change view [b * np, sq, sk] + attention_probs = attention_probs.view(output_size[0] * output_size[1], output_size[2], -1) + + # matmul: [b * np, sq, hn] + context = torch.bmm(attention_probs, value.transpose(0, 1)) + + # change view [b, np, sq, hn] + context = context.view(*output_size) + + # [b, np, sq, hn] --> [sq, b, np, hn] + context = context.permute(2, 0, 1, 3).contiguous() + + # [sq, b, np, hn] --> [sq, b, hp] + new_context_shape = context.size()[:-2] + (self.hidden_size_per_partition,) + context = context.view(*new_context_shape) + + return context diff --git a/megatron/core/transformer/enums.py b/megatron/core/transformer/enums.py new file mode 100644 index 0000000000..99d0ddefbd --- /dev/null +++ b/megatron/core/transformer/enums.py @@ -0,0 +1,28 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +import enum + + +# can we get rid of this? +# it's being used in pipeline schedules +class ModelType(enum.Enum): + encoder_or_decoder = 1 + encoder_and_decoder = 2 + + +# class LayerType(enum.Enum): +# encoder = 1 +# decoder = 2 + + +class AttnType(enum.Enum): + self_attn = 1 + cross_attn = 2 + + +class AttnMaskType(enum.Enum): + padding = 1 + causal = 2 + no_mask = 3 # only used for TE + padding_causal = 4 # only used for thd attention + arbitrary = 5 diff --git a/megatron/core/transformer/identity_op.py b/megatron/core/transformer/identity_op.py new file mode 100644 index 0000000000..5d9388ffcc --- /dev/null +++ b/megatron/core/transformer/identity_op.py @@ -0,0 +1,28 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +import torch + + +class IdentityOp(torch.nn.Module): + """ + This is a placeholder for IdentityOp(x) -> x + """ + + def __init__(self, *args, **kwargs): + super().__init__() + + def forward(self, x, *args, **kwargs): + return x + + +class IdentityFuncOp(IdentityOp): + """ + This is a placeholder for IdentityFuncOp(...)(x) -> IdentityOp(x) -> x. + Such a func is handy for ops like `bias_dropout_fusion` which themselves + return a function at runtime based on passed arguments + """ + + def __init__(self, *args, **kwargs): + super().__init__() + + def forward(self, *args, **kwargs): + return super().forward diff --git a/megatron/core/transformer/mlp.py b/megatron/core/transformer/mlp.py new file mode 100644 index 0000000000..e82d6ecd20 --- /dev/null +++ b/megatron/core/transformer/mlp.py @@ -0,0 +1,255 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
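As a quick illustration of the placeholders defined in `identity_op.py` above: `IdentityOp` stands in for a module that should pass its input through unchanged, while `IdentityFuncOp` stands in for an op that returns a function at runtime (as fused bias-dropout implementations do), so call sites can keep the same two-step calling convention. A minimal sketch, assuming the module added above is importable; the keyword arguments are placeholders and are simply ignored:

```python
import torch
from megatron.core.transformer.identity_op import IdentityFuncOp, IdentityOp

x = torch.randn(3, 4)

# IdentityOp: a no-op module; extra args/kwargs are accepted and ignored.
assert torch.equal(IdentityOp()(x, some_unused_flag=True), x)

# IdentityFuncOp: calling it returns a function, mirroring ops such as
# bias-dropout fusion that build and return a callable at runtime.
fused_fn = IdentityFuncOp()(training=False)  # placeholder kwargs, ignored
assert torch.equal(fused_fn(x), x)
```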
+ +from dataclasses import dataclass +from typing import Optional, Tuple, Union + +import numpy as np +import torch +import torch.nn.functional as F + +from megatron.core import parallel_state +from megatron.core.dist_checkpointing import ShardedTensor +from megatron.core.dist_checkpointing.mapping import ( + ReplicaId, + ShardedStateDict, + ShardedTensorFactory, +) +from megatron.core.fusions.fused_bias_geglu import bias_geglu_impl +from megatron.core.fusions.fused_bias_gelu import bias_gelu_impl +from megatron.core.fusions.fused_bias_swiglu import bias_swiglu_impl +from megatron.core.transformer.module import MegatronModule +from megatron.core.transformer.spec_utils import ModuleSpec, build_module +from megatron.core.transformer.transformer_config import TransformerConfig +from megatron.core.transformer.utils import make_sharded_tensors_for_checkpoint + + +@dataclass +class MLPSubmodules: + linear_fc1: Union[ModuleSpec, type] = None + linear_fc2: Union[ModuleSpec, type] = None + + +class MLP(MegatronModule): + """ + MLP will take the input with h hidden state, project it to 4*h + hidden dimension, perform nonlinear transformation, and project the + state back into h hidden dimension. + + + Returns an output and a bias to be added to the output. + If config.add_bias_linear is False, the bias returned is None. + + We use the following notation: + h: hidden size + p: number of tensor model parallel partitions + b: batch size + s: sequence length + """ + + def __init__( + self, + config: TransformerConfig, + submodules: MLPSubmodules, + is_expert: bool = False, + input_size: int = None, + ): + super().__init__(config=config) + + self.config: TransformerConfig = config + + self.input_size = input_size if input_size != None else self.config.hidden_size + + # If this is a gated linear unit we double the output width, see https://arxiv.org/pdf/2002.05202.pdf + ffn_hidden_size = self.config.ffn_hidden_size + if self.config.gated_linear_unit: + ffn_hidden_size *= 2 + + self.linear_fc1 = build_module( + submodules.linear_fc1, + self.input_size, + ffn_hidden_size, + config=self.config, + init_method=self.config.init_method, + gather_output=False, + bias=self.config.add_bias_linear, + skip_bias_add=True, + is_expert=is_expert, + tp_comm_buffer_name='fc1', + ) + + self.activation_func = self.config.activation_func + + self.linear_fc2 = build_module( + submodules.linear_fc2, + self.config.ffn_hidden_size, + self.config.hidden_size, + config=self.config, + init_method=self.config.output_layer_init_method, + bias=self.config.add_bias_linear, + input_is_parallel=True, + skip_bias_add=True, + is_expert=is_expert, + tp_comm_buffer_name='fc2', + ) + + def forward(self, hidden_states): + + # [s, b, 4 * h/p] + intermediate_parallel, bias_parallel = self.linear_fc1(hidden_states) + + if self.config.bias_activation_fusion: + if self.activation_func == F.gelu: + if self.config.gated_linear_unit: + intermediate_parallel = bias_geglu_impl(intermediate_parallel, bias_parallel) + else: + assert self.config.add_bias_linear is True + intermediate_parallel = bias_gelu_impl(intermediate_parallel, bias_parallel) + elif self.activation_func == F.silu and self.config.gated_linear_unit: + intermediate_parallel = bias_swiglu_impl( + intermediate_parallel, + bias_parallel, + self.config.activation_func_fp8_input_store, + ) + else: + raise ValueError("Only support fusion of gelu and swiglu") + else: + if bias_parallel is not None: + intermediate_parallel = intermediate_parallel + bias_parallel + if 
self.config.gated_linear_unit: + + def glu(x): + x = torch.chunk(x, 2, dim=-1) + return self.config.activation_func(x[0]) * x[1] + + intermediate_parallel = glu(intermediate_parallel) + else: + intermediate_parallel = self.activation_func(intermediate_parallel) + + # [s, b, h] + output, output_bias = self.linear_fc2(intermediate_parallel) + + return output, output_bias + + def sharded_state_dict( + self, prefix: str = '', sharded_offsets: tuple = (), metadata: Optional[dict] = None + ) -> ShardedStateDict: + sharded_state_dict = {} + for name, module in self._modules.items(): + sub_sd = module.sharded_state_dict(f'{prefix}{name}.', sharded_offsets, metadata) + if self.config.gated_linear_unit and name == 'linear_fc1': + assert f'{prefix}{name}.weight' in sub_sd, sub_sd.keys() + for k, v in sub_sd.items(): + if k in (f'{prefix}{name}.weight', f'{prefix}{name}.bias'): + sub_sd[k] = apply_swiglu_sharded_factory(v, sharded_offsets) + sharded_state_dict.update(sub_sd) + return sharded_state_dict + + +def apply_swiglu_sharded_factory(original_sh_ten, sharded_offsets): + # We must split the tensor into 2 parts, each sharded separately. + # This requires a ShardedTensorFactory which `chunk`s during saving + # and `cat`s during loading + tp_rank = parallel_state.get_tensor_model_parallel_rank() + tp_size = parallel_state.get_tensor_model_parallel_world_size() + swiglu_shard_axis = 0 + prepend_axis_num = len(sharded_offsets) + original_shape = original_sh_ten.local_shape + original_numel = int(np.prod(original_shape)) + + @torch.no_grad() + def sh_ten_build_fn( + key: str, t: torch.Tensor, replica_id: ReplicaId, flattened_range: Optional[slice] + ): + offset_w = (swiglu_shard_axis + prepend_axis_num, tp_rank, tp_size * 2) + offset_v = (swiglu_shard_axis + prepend_axis_num, tp_size + tp_rank, tp_size * 2) + if flattened_range is None: + tensor_w, tensor_v = torch.chunk(t, 2, dim=swiglu_shard_axis) + return [ + ShardedTensor.from_rank_offsets( + key, + tensor_w, + *sharded_offsets, + offset_w, + replica_id=replica_id, + prepend_axis_num=prepend_axis_num, + ), + ShardedTensor.from_rank_offsets( + key, + tensor_v, + *sharded_offsets, + offset_v, + replica_id=replica_id, + prepend_axis_num=prepend_axis_num, + ), + ] + else: + # Here we need to map a slice `t` (`flattened_range` specifies slice start and stop) + # of the *original* flattened tensor into slices `w` and `v` of chunked + # and flattened tensor. 
+ # Example: + # If original tensor has (16, 5) shape and flattened_range is `slice(8, 64)`, + # then `t` has shape `(56,)` and we need to create 2 tensors: + # w: first 32 elements of `t` with flattened_range slice(8, 40) + # v: last 24 elements of `t` with flattened_range slice(0, 24) + # Global offsets are the same as in the non-flattened case + assert t.ndim == 1, (key, t.shape) + non_flat_local_shape = (original_shape[0] // 2, *original_shape[1:]) + chunk_numel = original_numel // 2 + result = [] + if flattened_range.start < chunk_numel: + # Non-empty `w` chunk + tensor_w = t[: chunk_numel - flattened_range.start] + flattened_range_w = slice( + flattened_range.start, min(chunk_numel, flattened_range.stop) + ) + assert len(tensor_w) == flattened_range_w.stop - flattened_range_w.start + result.append( + ShardedTensor.from_rank_offsets_flat( + key, + tensor_w, + non_flat_local_shape, + *sharded_offsets, + offset_w, + replica_id=replica_id, + prepend_axis_num=prepend_axis_num, + flattened_range=flattened_range_w, + ) + ) + if flattened_range.stop > chunk_numel: + # Non-empty `v` chunk + tensor_v = t[-(flattened_range.stop - chunk_numel) :] + flattened_range_v = slice( + max(chunk_numel, flattened_range.start) - chunk_numel, + flattened_range.stop - chunk_numel, + ) + assert len(tensor_v) == flattened_range_v.stop - flattened_range_v.start, ( + len(tensor_v), + flattened_range_v, + ) + + result.append( + ShardedTensor.from_rank_offsets_flat( + key, + tensor_v, + non_flat_local_shape, + *sharded_offsets, + offset_v, + replica_id=replica_id, + prepend_axis_num=prepend_axis_num, + flattened_range=flattened_range_v, + ) + ) + assert sum(sh_ten.data.numel() for sh_ten in result) == t.numel(), (result, t.shape) + return result + + def sh_ten_merge_fn(sub_state_dict): + with torch.no_grad(): + return torch.cat(sub_state_dict) + + return ShardedTensorFactory( + original_sh_ten.key, + original_sh_ten.data, + sh_ten_build_fn, + sh_ten_merge_fn, + original_sh_ten.replica_id, + ) diff --git a/megatron/core/transformer/module.py b/megatron/core/transformer/module.py new file mode 100644 index 0000000000..c89acec400 --- /dev/null +++ b/megatron/core/transformer/module.py @@ -0,0 +1,195 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +"""Megatron Module.""" +from typing import Optional, Tuple + +import torch +from torch.autograd import Variable +from torch.nn.parameter import Parameter + +from megatron.core import parallel_state +from megatron.core.dist_checkpointing.mapping import ShardedStateDict +from megatron.core.transformer.transformer_config import TransformerConfig +from megatron.core.transformer.utils import ( + make_sharded_tensors_for_checkpoint, + sharded_state_dict_default, +) + +_FLOAT_TYPES = (torch.FloatTensor, torch.cuda.FloatTensor) +_HALF_TYPES = (torch.HalfTensor, torch.cuda.HalfTensor) +_BF16_TYPES = (torch.BFloat16Tensor, torch.cuda.BFloat16Tensor) + + +def param_is_not_shared(param): + return not hasattr(param, 'shared') or not param.shared + + +class MegatronModule(torch.nn.Module): + """Base Megatron module inhertied by all Models. 
+ + Megatron specific extensions of torch Module with support + for pipelining + + Args: + config (TransformerConfig): Transformer config + """ + + # def __init__(self, config: TransformerConfig, share_word_embeddings=True): + def __init__(self, config: TransformerConfig): + super().__init__() + self.config = config + + def state_dict_for_save_checkpoint(self, prefix: str = '', keep_vars: bool = False): + """Override state dict for saving checkpoints Use this function to override the + state dict for saving checkpoints. + + Args: + prefix (str, optional): _description_. Defaults to ''. + keep_vars (bool, optional): _description_. Defaults to False. + + Returns: + _type_: _description_ + """ + + return self.state_dict(prefix=prefix, keep_vars=keep_vars) + + def sharded_state_dict( + self, + prefix: str = '', + sharded_offsets: Tuple[Tuple[int, int, int]] = (), + metadata: Optional[dict] = None, + ) -> ShardedStateDict: + """Default implementation for sharded state dict for distributed checkpointing. + + General definition of sharded_state_dict simply calls `sharded_state_dict_default` + (which call sharded_state_dict method if possible or a default implementation otherwise) + recursively on all submodules. + + Args: + prefix (str): prefix for the state dict keys + sharded_offsets (Tuple[Tuple[int, int, int]], optional): sharding already + applied (e.g. PP related) by sup-modules. Passed along to ShardedTensor + metadata (dict, optional): metadata passed recursively to sharded_state_dict methods + + Returns: + dict: dictionary of state dict keys mapped to ShardedTensors + """ + sharded_state_dict = {} + # Save parameters + self._save_to_state_dict(sharded_state_dict, '', keep_vars=True) + sharded_state_dict = make_sharded_tensors_for_checkpoint( + sharded_state_dict, prefix, sharded_offsets=sharded_offsets + ) + # Recurse into submodules + for name, module in self.named_children(): + sharded_state_dict.update( + sharded_state_dict_default(module, f'{prefix}{name}.', sharded_offsets, metadata) + ) + return sharded_state_dict + + def set_is_first_microbatch(self): + """Sets the is_first_microbatch flag if it exists and config.fp8==True. + When this flag is set, TE modules will update their fp8 parameter cache. + """ + if self.config.fp8 is not None: + if not hasattr(self, "modules_with_is_first_microbatch"): + self.modules_with_is_first_microbatch = [] + for m in self.modules(): + if hasattr(m, "is_first_microbatch"): + self.modules_with_is_first_microbatch.append(m) + for m in self.modules_with_is_first_microbatch: + m.is_first_microbatch = True + + +def conversion_helper(val, conversion): + if not isinstance(val, (tuple, list)): + return conversion(val) + rtn = [conversion_helper(v, conversion) for v in val] + if isinstance(val, tuple): + rtn = tuple(rtn) + return rtn + + +def fp32_to_float16(val, float16_convertor): + def half_conversion(val): + val_typecheck = val + if isinstance(val_typecheck, (Parameter, Variable)): + val_typecheck = val.data + if isinstance(val_typecheck, _FLOAT_TYPES): + val = float16_convertor(val) + return val + + return conversion_helper(val, half_conversion) + + +def float16_to_fp32(val): + def float_conversion(val): + val_typecheck = val + if isinstance(val_typecheck, (Parameter, Variable)): + val_typecheck = val.data + if isinstance(val_typecheck, (_BF16_TYPES, _HALF_TYPES)): + val = val.float() + return val + + return conversion_helper(val, float_conversion) + + +class Float16Module(MegatronModule): + """Float 16 Module. 
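The conversion helpers above recurse through arbitrarily nested tuples and lists, converting only floating-point tensors and passing everything else through untouched. Below is a small standalone sketch of that behaviour; it uses a simplified dtype check instead of the `_FLOAT_TYPES` tensor-type tuple used in the real helpers:

```python
import torch

def conversion_helper(val, conversion):
    """Apply `conversion` to every element of a (possibly nested) tuple/list."""
    if not isinstance(val, (tuple, list)):
        return conversion(val)
    rtn = [conversion_helper(v, conversion) for v in val]
    return tuple(rtn) if isinstance(val, tuple) else rtn

def to_half(val):
    # Simplified check: only convert float32 tensors; ints, None, etc. pass through.
    if torch.is_tensor(val) and val.dtype == torch.float32:
        return val.half()
    return val

inputs = (torch.randn(2, 2), [torch.randn(3), None], 7)
converted = conversion_helper(inputs, to_half)
print(converted[0].dtype, converted[1][0].dtype, converted[1][1], converted[2])
# torch.float16 torch.float16 None 7
```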
+ + Attributes: + config (TransformerConfig): Transformer config + fp16 (bool) : Specifies if the model runs in fp16 mode + bf16 (bool) : Specifies if the model runs in bf16 mode + + Args: + config (TransformerConfig): The transformer config used to initalize the model + """ + + def __init__(self, config: TransformerConfig, module: torch.nn.Module): + super(Float16Module, self).__init__(config) + self.config = config + self.fp16 = config.fp16 + self.bf16 = config.bf16 + + if self.fp16: + self.add_module('module', module.half()) + + def float16_convertor(val): + return val.half() + + elif self.bf16: + self.add_module('module', module.bfloat16()) + + def float16_convertor(val): + return val.bfloat16() + + else: + raise Exception('Either config.fp16 or config.bf16 should be True.') + + self.float16_convertor = float16_convertor + + def set_input_tensor(self, input_tensor): + return self.module.set_input_tensor(input_tensor) + + def forward(self, *inputs, **kwargs): + if parallel_state.is_pipeline_first_stage(): + inputs = fp32_to_float16(inputs, self.float16_convertor) + outputs = self.module(*inputs, **kwargs) + if parallel_state.is_pipeline_last_stage(): + outputs = float16_to_fp32(outputs) + return outputs + + def state_dict(self, destination=None, prefix='', keep_vars=False): + return self.module.state_dict(destination=destination, prefix=prefix, keep_vars=keep_vars) + + def state_dict_for_save_checkpoint(self, prefix='', keep_vars=False): + """Retrieve state_dict from the module being wrapped.""" + return self.module.state_dict_for_save_checkpoint(prefix=prefix, keep_vars=keep_vars) + + def sharded_state_dict(self, prefix='', *args, **kwargs): + """Retrieve sharded_state_dict from the module being wrapped.""" + return self.module.sharded_state_dict(prefix, *args, **kwargs) + + def load_state_dict(self, state_dict, strict=True): + self.module.load_state_dict(state_dict, strict=strict) diff --git a/megatron/core/transformer/moe/README.md b/megatron/core/transformer/moe/README.md new file mode 100644 index 0000000000..a7ee75bcbf --- /dev/null +++ b/megatron/core/transformer/moe/README.md @@ -0,0 +1,366 @@ +# Megatron Core MoE Key Features + +Megatron-Core offers rich parallelism mappings, combining Expert Parallelism with tensor, data, sequence, and pipeline parallelism. This boosts Mixtral 8X7B bf16 training to achieve **438 TFLOPS** as of MCore v0.8. + + +### Parallelism +- **Expert Parallelism** + - A specific method of parallelism for MoE models, where experts are partitioned onto different workers and each worker processes a different batch of training samples, each worker process one or more experts for each MoE layer. +- **3D Parallelism**: Data Parallelism, Tensor Parallelism, Pipeline Parallelism + - Note: When using MoE with expert parallelism and tensor parallelism, sequence parallelism must be enabled. +- **Context Parallelism**: + - Split the sequence dimension to support long context training. +- **Richer parallel mappings**: EP can be combined with DP/TP/PP/CP for handling larger MoE variants. 
+- **Full distributed optimizer support.** + +### Router and Load Balancing +- Router type: + - Top-K MLP router +- Load Balancing algorithms: + - Sinkhorn (S-BASE) + - Aux loss / Load balancing loss + +### Performance Optimizations +- GroupedGEMM when num local experts > 1 + - Supported dtype: bf16 + - Performance improvements for larger MoE models +- Enable `--tp-comm-overlap` for MoE + +### Token Dispatch Mechanism +- Dropless / No token drop +- Token drop, with or without padding to capacity + +### Ease of use +- Checkpoint converter for Mixtral models, see the [example](https://github.com/NVIDIA/Megatron-LM/tree/main/examples/mixtral) for details. +- Distributed checkpoining +- Per-layer logging + +## Upcoming features +- Token permutation / unpermutation fusion +- Fused Sinkhorn Kernel +- FP8 training support + +# User Guide + +### MoE Related Arguments + +| Item | Description | +| --- | --- | +| --num-experts | Number of Experts in MoE (None means no MoE) | +| --expert-model-parallel-size | Degree of expert model parallelism. Default is 1. | +| --moe-grouped-gemm | When there are multiple experts per rank, launch multiple local GEMM kernels in multiple streams to improve the utilization and performance with GroupedLinear in TransformerEngine. | +| --moe-router-load-balancing-type | Determines the load balancing strategy for the router. "aux_loss" corresponds to the load balancing loss used in GShard and SwitchTransformer, "sinkhorn" corresponds to the balancing algorithm used in S-BASE, and "none" implies no load balancing. The default is "aux_loss". | +| --moe-router-topk | Number of experts to route to for each token. The default is 2. | +| --moe-aux-loss-coeff | Scaling coefficient for the aux loss: a starting value of 1e-2 is recommended. Default is 0.0. | +| --moe-z-loss-coeff | Scaling coefficient for the z-loss: a starting value of 1e-3 is recommended. Default is None. | +| --moe-input-jitter-eps | Add noise to the input tensor by applying jitter with a specified epsilon value. Default is None. | +| --moe-token-dispatcher-type | Determines the token dispatcher type. Choices are "allgather", "alltoall" and "alltoall_seq". Default is "allgather". We recommend using 'alltoall' if expert parallelism is applied. We have upgraded the "alltoall" dispatcher in place during MCore v0.9, while retaining the original implementation, renamed as "alltoall_seq".| +| --moe-per-layer-logging | Enable per-layer logging for MoE, currently supports auxiliary loss and z loss. | +| --moe-expert-capacity-factor | The capacity factor for each expert, None means no token will be dropped. Default is None. | +| --moe-pad-expert-input-to-capacity | Pads the input for each expert to match the expert capacity length, effective only after the --moe-expert-capacity-factor is set. | +| --moe-token-drop-policy | The policy to drop tokens. Can be either "probs" or "position". If "probs", the tokens with the lowest probabilities will be dropped. If "position", tokens at the end of each batch will be dropped. | +| --moe-layer-recompute | Enable activation checkpointing for moe_layer, should be used when memory is not sufficient. | +| --moe-extended-tp | (Experimental) Alternative parallelization strategy for expert parallelism. Instead of distributing experts across *expert_model_parallel_size*, each expert is sharded along extendended tensor parallel domain (tensor_model_paralle_size * expert_model_parallel_size). It avoids the load balancing problem with MOE training. 
Only available with `--moe-token-dispatcher-type allgather`. | +| --moe-shared-expert-intermediate-size | Set shared expert total ffn hidden size. It should be equal to `num_shared_experts * ffn_size_of_each_shared_expert` if there are multiple shared experts. None means no shared expert. | +| --moe-shared-expert-overlap | (Experimental, may changed) If this is set, the communications/computations in the shared experts and the dispatcher will overlap (The `alltoall` dispatcher is needed.) Otherwise, the shared expert runs after the routed experts. | +| --moe-use-upcycling | Load the dense model checkpoint, convert it into an MoE model at runtime and start training. The converted model will be saved to the path specified by `--save` before training begins. Upcycling is implemented on the top of distributed checkpointing, so it supports parallel modes different from the dense model.| + + +## Usage + +### Quick Start +To train a top-2 MoE model with 8 experts and auxiliary loss, include the following arguments: + +```bash +--num-experts 8 +--expert-model-parallel-size 8 +--moe-grouped-gemm +--moe-router-load-balancing-type aux_loss # options: aux_loss, sinkhorn, none. Default is aux_loss. +--moe-router-topk 2 +--moe-aux-loss-coeff 1e-2 +--use-distributed-optimizer +--moe-token-dispatcher-type alltoall +``` + +To enable the token drop mechanism, such as GShard and SwitchTransformer, include the following arguments: + +```bash +--moe-expert-capacity-factor 1.0 +--moe-pad-expert-input-to-capacity # Optional +``` + +The following figure illustrates differenting dropping strategies in MCore: + + + +1. The default dropless strategy will not drop or pad any token. +2. By setting `--moe-expert-capacity-factor`, the tokens exceed the capacity of expert will be dropped based on their selected probabilities. + The dropping is performed before the token exchange operation between EP ranks when EP > 1. + The formula of capacity is `capacity = num_tokens_per_rank * topk * capacity_factor / num_experts`. +3. By setting `--moe-pad-expert-input-to-capacity`, the experts with tokens less than capacity will be padded to the capacity. + +### Fine-tuning Mixtral Models +Megatron-Core has full support for Mixtral MoE models, and we provide the checkpoint converter for Mixtral models from huggingface format to MCore format. + + +### Distributed Checkpointing +MCore v0.7 introduced fully parallel and asynchronous saving capabilities to distributed checkpointing, +which addresses the issues of low efficiency in the traditional checkpoint saving methods. +It also solved the problem of incompatibility between checkpoints of different parallel mappings in the traditional format. +With the new distributed checkpointing solution, MCore can achieve flexible parallelism configurations by saving and loading the unified format checkpoints. +Compared to native PyTorch solution, MCore achieves up to 50x reduction in checkpointing overhead. + +From MCore v0.8, MoE supports Distributed Checkpointing, which means users can save and load with any combination of parallelism and it is currently available, including expert parallel. +1. Loading weight and distributed optimizer states with TPxCPxEPxPP resharding with SequentialMLP is supported in version 0.8. +2. GroupedMLP weight resharding is supported in version 0.8.0 and optimizer state resharding is supported in version 0.10.0. Switching between GroupedMLP/SequentialMLP when loading and saving is partially supported. +3. 
TEGroupedMLP has fully support on distributed checkpointing and is fully exchangable with SequentialMLP in version 0.9.0. +4. Optimizer state resharding cannot do across EP=1 with EP>1 due to the different optimizer type. + +Usage +- `--ckpt-format torch_dist` The main argument, it will attempt to save and load using distributed checkpointing. +- `--auto-detect-ckpt-format` With this, it can load both distributed checkpointing and legacy checkpointing. + +Checkpoint compatibility across SequentialMLP, GroupedMLP, and TEGroupedMLP: +```text + ┌───────────────┐ ┌───────────────┐ ┌───────────────┐ + │ GroupedMLP │ │ SequentialMLP │ │ TEGroupedMLP │ + │ │ │ │ │ │ + │ │ │ │ │ │ + │ ┌───────────┐ │ │ ┌───────────┐ │ │ ┌───────────┐ │ + │ │legacy ckpt│ │ │ │legacy ckpt│ │ │ │legacy ckpt│ │ + │ └─────┬─────┘ │ │ └─────┬─────┘ │ │ └─────┬─────┘ │ + │ ▼ │ │ ▼ │ │ ▼ │ + │ ┌─────────┐ │ │ ┌─────────┐ │ │ ┌─────────┐ │ + │ │dist ckpt│ │ │ │dist ckpt│ │ │ │dist ckpt│ │ +┌──►│ │ weight │ │◄────────►│ │ weight │ │◄────────►│ │ weight │ │◄──┐ +│ │ └─────────┘ │ │ └─────────┘ │ │ └─────────┘ │ │ +└───┼───────────────┼──────────┼───────────────┼──────────┼───────────────┼───┘ + │┌─────────────┐│ │┌─────────────┐│ │┌─────────────┐│ + ││ dist ckpt ││ ││ dist ckpt ││ ││ dist ckpt ││ + ││optim states ││ ││optim states ││◄────────►││optim states ││ + │└─────────────┘│ │└─────────────┘│ │└─────────────┘│ + └───────────────┘ └───────────────┘ └───────────────┘ +``` + +Best practices for distributed checkpointing: +1. Convert a legacy checkpoint to a distributed checkpoint. To achieve this, we can add both `--ckpt-format torch_dist --auto-detect-ckpt-format`, then it will load the legacy one and save as the distributed checkpoint format later when the training progress tries to save checkpoints. +2. Convert checkpoint of the legacy GroupedMLP to TEGroupedMLP. This is only supported for the weight parts. To achieve this, we can use the above method to convert the legacy checkpoint to a distributed checkpoint of the legacy GroupedMLP. After updating the libraries and using TEGroupedMLP, we can directly load the previously saved checkpoint by adding argument `--no-load-optim`. + +### Shared Experts +MCore v0.9 introduced the shared expert feature. We can enable this feature by setting suitable `--moe-shared-expert-intermediate-size`. + +The parallelism patterns of the shared experts follow the settings of the dense part, i.e., the attention module. The shared experts are not distributed but replicated in EP ranks. + +We also have an experimental feature that tries to overlap the communications and computations in the shared experts and the dispatcher. +We can set `--moe-shared-expert-overlap` and use `alltoall` dispatcher to enable it. +The overlapping relies on the envirionment setting `CUDA_DEVICE_MAX_CONNECTIONS=1`. +The `AllGather` and `ReduceScatter` communications in the shared experts are overlapped with `permute`/`unpermute` in the dispatcher. +The `MLP` computation part in the shared experts are overlapped with the `AlltoAll` communications in the dispatcher. +Both the forward and the backward pass can overlap. But to get the overlapping in the backward pass, the PyTorch version should `>= 2.2.0`. + +### Upcycling +Use `--moe-use-upcycling` to enable the upcycling feature, which will load the dense model from the directory specified by `--load`, convert it into an MoE model at runtime and start training. The converted model will be saved to the path specified by `--save` before training begins. 
+ Upcycling is implemented on top of distributed checkpointing, so it supports parallel modes different from those of the dense model.
+
+The MoE model structure is defined through script arguments. All MoE-related arguments (such as `--num-experts`) can be customized; however, the other model structure arguments must be consistent with those of the dense model.
+
+## MoE training example:
+
+Click here. + +```bash +#!/bin/bash + +# Runs Mixtral 8x7B model on 32 H100/A100 GPUs +# The Dropless MoE suffers from an imbalanced token distribution at the early stage of training (the first few hundred iterations), which may lead to poor performance and out-of-memory (OOM) issues. +# To check the performance of a Dropless MoE model, we should run the model for at least 500 iterations or resume from trained checkpoints. + +export CUDA_DEVICE_MAX_CONNECTIONS=1 + +GPUS_PER_NODE=8 +# Change for multinode config +MASTER_ADDR=${MASTER_ADDR:-"localhost"} +MASTER_PORT=${MASTER_PORT:-"6000"} +NNODES=${NNODES:-"1"} +NODE_RANK=${RANK:-"0"} +WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) + +CHECKPOINT_PATH=$1 +TOKENIZER_MODEL=$2 +DATA_PATH=$3 + +DISTRIBUTED_ARGS=( + --nproc_per_node $GPUS_PER_NODE + --nnodes $NNODES + --node_rank $NODE_RANK + --master_addr $MASTER_ADDR + --master_port $MASTER_PORT +) + +MODEL_ARGS=( + --disable-bias-linear + --seq-length 4096 + --max-position-embeddings 32768 + --num-layers 32 + --hidden-size 4096 + --ffn-hidden-size 14336 + --num-attention-heads 32 + --init-method-std 0.01 + --attention-dropout 0.0 + --hidden-dropout 0.0 + --normalization RMSNorm + --position-embedding-type rope + --swiglu + --untie-embeddings-and-output-weights + --group-query-attention + --num-query-groups 8 + --no-masked-softmax-fusion + --no-position-embedding +) + +MOE_ARGS=( + --num-experts 8 + --expert-model-parallel-size 8 + --moe-router-load-balancing-type aux_loss # options: aux_loss, sinkhorn, None. Default is aux_loss. + --moe-router-topk 2 + --moe-aux-loss-coeff 1e-2 + --moe-grouped-gemm +) + +DATA_ARGS=( + --tokenizer-type Llama2Tokenizer + --tokenizer-model ${TOKENIZER_MODEL} + --data-path $DATA_PATH + --split 99990,8,2 +) + +TRAINING_ARGS=( + --micro-batch-size 1 + --global-batch-size 128 + --lr 1e-4 + --train-iters 500000 + --lr-decay-iters 320000 + --lr-decay-style cosine + --min-lr 1.0e-5 + --weight-decay 0.1 + --lr-warmup-iters 500 + --clip-grad 1.0 + --bf16 + --overlap-grad-reduce + --overlap-param-gather +) + +MODEL_PARALLEL_ARGS=( + --tensor-model-parallel-size 1 + --pipeline-model-parallel-size 4 + --num-layers-per-virtual-pipeline-stage 8 + --sequence-parallel + --use-distributed-optimizer +) + +LOGGING_ARGS=( + --log-interval 1 \ + --save-interval 10000 \ + --eval-interval 1000 \ + --eval-iters 10 \ + --save $CHECKPOINT_PATH \ + --load $CHECKPOINT_PATH \ + --tensorboard-dir "${CHECKPOINT_PATH}/tensorboard" \ + --no-load-optim \ + --no-load-rng +) + +if [ -n "${WANDB_API_KEY}" ]; then + LOGGING_ARGS+=( + --wandb-project ${WANDB_PROJECT:-"Mixtral-Finetuning"} + --wandb-exp-name ${WANDB_NAME:-"Mixtral_8x7B"} + ) +fi + +torchrun ${DISTRIBUTED_ARGS[@]} pretrain_gpt.py \ + ${MODEL_ARGS[@]} \ + ${MOE_ARGS[@]} \ + ${DATA_ARGS[@]} \ + ${TRAINING_ARGS[@]} \ + ${MODEL_PARALLEL_ARGS[@]} \ + ${LOGGING_ARGS[@]} +``` +
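For readers tuning the token-drop settings, the capacity formula quoted earlier (`capacity = num_tokens_per_rank * topk * capacity_factor / num_experts`) can be sanity-checked in a few lines. The numbers below are illustrative assumptions matching the example run above (sequence length 4096, micro batch size 1, top-2 routing, 8 experts), with tokens per rank taken as `seq_length * micro_batch_size`:

```python
# Illustrative capacity calculation for a dropping MoE configuration.
# Assumed values taken from the example script above, not required settings.
seq_length = 4096
micro_batch_size = 1
topk = 2
num_experts = 8
capacity_factor = 1.0  # --moe-expert-capacity-factor 1.0

num_tokens_per_rank = seq_length * micro_batch_size
capacity = num_tokens_per_rank * topk * capacity_factor / num_experts
print(capacity)  # 1024.0 tokens per expert before dropping/padding applies
```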
+
+
+# Performance Best Practice
+
+### Tuning Guide of Parallel Mappings
+
+To find a parallel mapping that achieves high throughput for a new model, there are some general rules that can help. Here is an overview of the properties of each parallel strategy along several dimensions.
+
+| Parallel Strategy | Peak Activation Memory | Weight Memory | Optimizer states | Communication (Per-Layer) |
+|:-----------------:|:-------------------------------:|:--------------:|:---------------------------------:|:-------------------------:|
+| TP | 1/N (with SP on) | 1/N | 1/N | High |
+| EP | 1 | 1/N in MoELayer| 1/N | Medium |
+| PP | 1 (>1 with virtual pipeline) | 1/N | 1/N | Medium |
+| CP | 1/N | 1 | 1/N (with distributed optimizer) | Medium |
+| DP | 1 | 1 | 1/N (with distributed optimizer) | Low |
+
+For a specific model, the best parallel mapping varies with the model architecture, the trained sequence length and the hardware platform.
+Here are some general rules for getting better performance:
+1. Keep the model parallelism size as small as possible.
+    - For large language models, model parallelism is often required to prevent OOM, but it brings communication overhead and hurts performance.
+    - With the distributed optimizer, master weights and optimizer states are sharded across all DP ranks with only slight communication overhead.
+    So try to reduce the model parallelism size and increase the data parallelism size when there is plenty of free GPU memory during training.
+2. Keep EP and TP communication within the NVLink domain.
+    - Communications of EP and TP should remain within the NVLink domain as much as possible, as both are communication-intensive.
+    - If the model is too large and requires scaling across multiple nodes, consider PP before TP and EP. See item 3 for details.
+3. Use Pipeline Parallelism to scale the model further.
+    - Enable Virtual Pipeline Parallelism (VPP) to reduce PP bubbles when PP_size >= 2 by setting `num_layers_per_virtual_pipeline_stage`.
+    - VPP_size tuning: the legal values of vpp_size are all common divisors of num_layers/pp_size. E.g., with num_layers=24 and pp_size=4, vpp_size can be picked from {1, 2, 3, 6}. The larger the vpp_size, the lower the pipeline bubbles, but the more P2P communications between PP stages. Empirically, a value in the middle often gives the best trade-off. `VPP_size = num_layers / PP_size / num_layers_per_virtual_pipeline_stage`
+4. Prefer EP over TP for the expert layer when possible:
+    - TP saves more memory than EP, but EP achieves better GEMM efficiency and less communication overhead than TP.
+    - If the EP size is increased to the number of experts, the local token permutation/un-permutation for expert computation is omitted.
+    - EP also simplifies the computation graph of MoE layers, which makes it more convenient to perform potential communication-computation overlapping.
+    - In practice, EP8TP1 is better than EP4TP2 for 8x7B.
+5. Enable Context Parallelism for long-context training.
+    - The efficiency of CP largely depends on whether its communication can be overlapped with computation.
+    - Empirically, use CP when the sequence length is >= 8K.
+
+
+### End-to-End Training Practice
+**Use the latest NVIDIA PyTorch or NeMo Docker Image**
+- [NGC PyTorch Image](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/pytorch)
+- [NGC NeMo Image](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/nemo)
+
+**Token Dispatcher Choices**
+- The token dispatcher sends tokens to the designated experts; it involves tensor rearrangement and communications.
+- Dispatcher `allgather` is the default option. It achieves better performance and efficiency when only tensor parallelism is used or when the Top-k value is very large.
+- Dispatcher `alltoall` is recommended if expert parallelism is applied.
+- Dispatcher `alltoall_seq` is the original implementation of `alltoall` and is retained in case of compatibility issues.
+
+**Enable Communication Overlap**
+- Enable `--overlap-param-gather` and `--overlap-grad-reduce` with the distributed optimizer.
+- Enable `--tp-comm-overlap` when TP > 1.
+- Enable P2P communication overlap when PP > 1 by setting `num_layers_per_virtual_pipeline_stage`.
+
+**Enable GroupedGEMM when num_local_experts > 1 with `--moe-grouped-gemm`**
+- GroupedGEMM has higher efficiency than vanilla sequential GEMMs for each expert.
+- We recommend using the TE version of Grouped GEMM (by upgrading to MCore v0.8 and TE v1.9), which supports gradient accumulation fusion and FP8 training.
+
+**OOM Caused by Token Distribution Imbalance when Training From Scratch**
+MoE suffers from a severe load imbalance issue when the router is under-trained, which makes the model prone to running out of memory (OOM); this typically occurs in the first 100~300 steps when training from scratch.
+There are two recommended ways to avoid the OOM problem during the first ~200 steps; both can be removed once the token distribution becomes more stable:
+1. Use Extended-TP (`--moe-extended-tp`) to replace EP with TP in the MoELayer, which prevents load imbalance between EP ranks. Since the current ETP implementation has some memory overhead, you can additionally enable activation recomputation only for the MoE layer by adding `--moe-layer-recompute`.
+2. Set the capacity factor to a relatively small number, such as 1.0, by adding `--moe-expert-capacity-factor 1.0`.
+
+### Reference Best Parallel Mapping
+
+Here are the reference parallel mappings of MCore v0.8 for the Mixtral 8x7B and 8x22B models:
+| Model | Vocab Size| Dispatcher | Precision | #GPUs | SEQ LEN | TP | EP | PP | VP | MBS | GBS |
+|:-----------------------:|:---------:|:----------:|:---------:|:-----:|:-------:|:--:|:--:|:--:|:--:|:---:|:---:|
+| Mixtral 8x7B(Dropless)  | 32K | All-to-All | BF16 | 64 | 4096 | 1 | 8 | 4 | 8 | 1 | 256 |
+| Mixtral 8x22B(Dropless) | 32K | All-to-All | BF16 | 128 | 4096 | 4 | 2 | 8 | 7 | 1 | 256 |
+
+Detailed Benchmark Information:
+Server:
+- 8xH100 80GB HBM3
+- NVLink 4th Generation
+- InfiniBand 8x400 Gbit/s
+
+Docker Image:
+- PyTorch 24.04 with TransformerEngine v1.9
\ No newline at end of file
diff --git a/megatron/core/transformer/moe/__init__.py b/megatron/core/transformer/moe/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/megatron/core/transformer/moe/experts.py b/megatron/core/transformer/moe/experts.py
new file mode 100644
index 0000000000..1bb5da588b
--- /dev/null
+++ b/megatron/core/transformer/moe/experts.py
@@ -0,0 +1,808 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
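A quick way to sanity-check mappings like the two reference rows above is to recompute the implied data-parallel size and confirm that the expert-parallel size fits inside it. This is a hedged sketch that assumes CP = 1 and that EP must evenly divide DP, which holds for the configurations shown here:

```python
def check_mapping(num_gpus, tp, pp, ep, cp=1):
    """Recompute DP from the other parallel sizes and check that EP fits inside it."""
    assert num_gpus % (tp * pp * cp) == 0, "GPU count must be divisible by TP*PP*CP"
    dp = num_gpus // (tp * pp * cp)
    assert dp % ep == 0, "expert-parallel size should evenly divide data-parallel size"
    return dp

print(check_mapping(num_gpus=64, tp=1, pp=4, ep=8))   # Mixtral 8x7B row  -> DP = 16
print(check_mapping(num_gpus=128, tp=4, pp=8, ep=2))  # Mixtral 8x22B row -> DP = 4
```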
+ +import itertools +from copy import deepcopy +from functools import partial +from math import ceil +from typing import Optional, Tuple + +import torch +import torch.nn.functional as F +from torch.nn.parameter import Parameter + +from megatron.core import parallel_state +from megatron.core.dist_checkpointing import ShardedTensor +from megatron.core.dist_checkpointing.mapping import ( + LocalNonpersistentObject, + ReplicaId, + ShardedStateDict, + ShardedTensorFactory, +) +from megatron.core.dist_checkpointing.utils import replace_prefix_for_sharding +from megatron.core.fusions.fused_bias_geglu import bias_geglu_impl +from megatron.core.fusions.fused_bias_gelu import bias_gelu_impl +from megatron.core.fusions.fused_bias_swiglu import bias_swiglu_impl +from megatron.core.jit import jit_fuser +from megatron.core.tensor_parallel.layers import ( + _initialize_affine_weight_cpu, + _initialize_affine_weight_gpu, +) +from megatron.core.tensor_parallel.utils import divide +from megatron.core.transformer.mlp import MLP, MLPSubmodules, apply_swiglu_sharded_factory +from megatron.core.transformer.module import MegatronModule +from megatron.core.transformer.moe import grouped_gemm_util as gg +from megatron.core.transformer.spec_utils import build_module +from megatron.core.transformer.transformer_config import TransformerConfig +from megatron.core.transformer.utils import make_sharded_object_for_checkpoint + + +class GroupedMLP(MegatronModule): + """An efficient implementation of the Experts layer using GroupedGEMM. + + Executes multiple experts in parallel to maximize computational efficiency. + """ + + def __init__(self, num_local_experts: int, config: TransformerConfig): + super().__init__(config=config) + self.config: TransformerConfig = config + self.num_local_experts = num_local_experts + gg.assert_grouped_gemm_is_available() + assert ( + config.add_bias_linear == False + ), "bias not supported in Grouped GEMM yet, please set '--disable-bias-linear' instead." + + self.expert_parallel = config.expert_model_parallel_size > 1 + if self.config.gated_linear_unit: + if self.config.activation_func not in (F.silu, F.gelu): + raise ValueError("Activation function must be silu or gelu when using GroupedMLP.") + + @jit_fuser + def glu(x): + x = torch.chunk(x, 2, dim=-1) + return self.config.activation_func(x[0]) * x[1] + + self.activation_func = glu + else: + self.activation_func = self.config.activation_func + + # How many feature each rank holds for fc1 and fc2, respectively. + self.moe_extended_tp = config.moe_extended_tp + if config.moe_extended_tp: + tp_size = parallel_state.get_tensor_and_expert_parallel_world_size() + else: + tp_size = parallel_state.get_tensor_model_parallel_world_size() + + fc1_output_size = self.config.ffn_hidden_size * self.num_local_experts + if config.gated_linear_unit: + # Project to 4h. If using swiglu double the output width, + # see https://arxiv.org/pdf/2002.05202.pdf + fc1_output_size *= 2 + fc1_output_size_per_partition = divide(fc1_output_size, tp_size) + + fc2_input_size = self.config.ffn_hidden_size * self.num_local_experts + fc2_input_size_per_partition = divide(fc2_input_size, tp_size) + + # Note: The current kernel implementations of grouped_gemm + # does not support transposition with CUTLASS grouped GEMM + # (https://github.com/fanshiqing/grouped_gemm/blob/main/csrc/grouped_gemm.cu#L355-L358) + # and as a result we avoid allocate the transpose of weights. + # Initialize weight. 
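+        # Weight layout used below (viewed per expert in forward()):
+        #   weight1: [hidden_size, num_local_experts * ffn_per_partition (* 2 with GLU)]
+        #   weight2: [num_local_experts * ffn_per_partition, hidden_size]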
+ if config.use_cpu_initialization: + self.weight1 = Parameter( + torch.empty( + self.config.hidden_size, + fc1_output_size_per_partition, + dtype=config.params_dtype, + ) + ) + self.weight2 = Parameter( + torch.empty( + fc2_input_size_per_partition, self.config.hidden_size, dtype=config.params_dtype + ) + ) + if config.perform_initialization: + _initialize_affine_weight_cpu( + self.weight1, + self.config.hidden_size, + fc1_output_size, + fc1_output_size_per_partition, + partition_dim=1, + init_method=config.init_method, + params_dtype=config.params_dtype, + ) + _initialize_affine_weight_cpu( + self.weight2, + fc2_input_size, + self.config.hidden_size, + fc2_input_size_per_partition, + partition_dim=0, + init_method=config.output_layer_init_method, + params_dtype=config.params_dtype, + ) + else: + self.weight1 = Parameter( + torch.empty( + self.config.hidden_size, + fc1_output_size_per_partition, + device=torch.cuda.current_device(), + dtype=config.params_dtype, + ) + ) + self.weight2 = Parameter( + torch.empty( + fc2_input_size_per_partition, + self.config.hidden_size, + device=torch.cuda.current_device(), + dtype=config.params_dtype, + ) + ) + if config.perform_initialization: + _initialize_affine_weight_gpu( + self.weight1, + config.init_method, + partition_dim=1, + expert_parallel=self.expert_parallel, + ) + _initialize_affine_weight_gpu( + self.weight2, + config.output_layer_init_method, + partition_dim=0, + expert_parallel=self.expert_parallel, + ) + setattr(self.weight1, 'allreduce', not self.expert_parallel) + setattr(self.weight2, 'allreduce', not self.expert_parallel) + + def remove_extra_states_check(self, incompatible_keys): + """ + Remove _extra_state from unexpected keys. + These keys are for dist ckpt compatibility with SequentialMLP. + """ + keys = deepcopy(incompatible_keys.unexpected_keys) + for key in keys: + if '_extra_state' in key: + incompatible_keys.unexpected_keys.remove(key) + + self.register_load_state_dict_post_hook(remove_extra_states_check) + + def forward(self, permuted_local_hidden_states: torch.Tensor, tokens_per_expert: torch.Tensor): + """Forward step of the GroupedMLP.""" + if permuted_local_hidden_states.nelement() != 0: + # Reshape the weights for the grouped GEMMs. + w1 = self.weight1.view(self.num_local_experts, self.config.hidden_size, -1) + w2 = self.weight2.view(self.num_local_experts, -1, self.config.hidden_size) + + fc1_output = gg.ops.gmm( + permuted_local_hidden_states, w1, tokens_per_expert, trans_b=False + ) + + intermediate_parallel = self.activation_func(fc1_output) + + fc2_output = gg.ops.gmm(intermediate_parallel, w2, tokens_per_expert, trans_b=False) + else: + # No token is allocated for local experts. + assert torch.count_nonzero(tokens_per_expert) == 0 + + # Make sure params of experts still have gradients even given zero tokens. + w1 = self.weight1.view(self.config.hidden_size, -1) + w2 = self.weight2.view(-1, self.config.hidden_size) + h = torch.matmul(permuted_local_hidden_states, w1) + h = self.activation_func(h) + h = torch.matmul(h, w2) + + fc2_output = h + + return fc2_output, None + + def sharded_state_dict(self, prefix='', sharded_offsets=(), metadata=None): + """ + Maps local expert to global experts. + The sharded_state_dict for the weight parts are compatible with the SequentialMLP, + whereas the optimizer states are not due to the limitation from weight transposing. + That is, for finetuning scenario, the checkpoint is compatible with the SequentialMLP. 
+ """ + if self.moe_extended_tp: + raise NotImplementedError( + 'Currently distributed checkpointing is not supported for moe_extended_tp' + ) + + sharded_state_dict = {} + num_global_experts = ( + parallel_state.get_expert_model_parallel_world_size() * self.num_local_experts + ) + local_expert_indices_offset = ( + parallel_state.get_expert_model_parallel_rank() * self.num_local_experts + ) + tp_size = parallel_state.get_tensor_model_parallel_world_size() + tp_rank = parallel_state.get_tensor_model_parallel_rank() + + prepend_axis_num = len(sharded_offsets) + replica_id = ( + 0, + 0, + parallel_state.get_data_modulo_expert_parallel_rank(with_context_parallel=True), + ) + + local_ffn_dim_size = ( + self.weight2.numel() // self.num_local_experts // self.config.hidden_size + ) + + @torch.no_grad() + def sh_ten_build_fn( + key: str, + t: torch.Tensor, + replica_id: ReplicaId, + flattened_range: Optional[slice], + tp_axis: int, + with_glu: bool, + ): + # TODO: write a generic implementation to cover both cases with and without GLU + if tp_axis == 1: + # weight1 + if with_glu: + last_dim_size = local_ffn_dim_size * 2 + else: + last_dim_size = local_ffn_dim_size + real_shape = (self.num_local_experts, self.config.hidden_size, last_dim_size) + elif tp_axis == 0: + # weight2 + real_shape = (self.num_local_experts, local_ffn_dim_size, self.config.hidden_size) + assert with_glu == False + else: + raise ValueError("tp_axis should be 0 or 1.") + if flattened_range is None: + # weights + t = t.view(real_shape).transpose(-1, -2) + # change tp_axis due to the transposing + tp_axis = 1 - tp_axis + if with_glu: + local_tensors = torch.chunk(t, 2, -2) + sub_states = [ + ShardedTensor.from_rank_offsets( + key, + local_tensors[0].contiguous(), + *sharded_offsets, + ( + prepend_axis_num, + parallel_state.get_expert_model_parallel_rank(), + parallel_state.get_expert_model_parallel_world_size(), + ), + (prepend_axis_num + 1, tp_rank, tp_size * 2), + replica_id=replica_id, + prepend_axis_num=prepend_axis_num, + ), + ShardedTensor.from_rank_offsets( + key, + local_tensors[1].contiguous(), + *sharded_offsets, + ( + prepend_axis_num, + parallel_state.get_expert_model_parallel_rank(), + parallel_state.get_expert_model_parallel_world_size(), + ), + (prepend_axis_num + 1, tp_size + tp_rank, tp_size * 2), + replica_id=replica_id, + prepend_axis_num=prepend_axis_num, + ), + ] + else: + sub_states = ShardedTensor.from_rank_offsets( + key, + t.contiguous(), + *sharded_offsets, + ( + prepend_axis_num, + parallel_state.get_expert_model_parallel_rank(), + parallel_state.get_expert_model_parallel_world_size(), + ), + (prepend_axis_num + 1 + tp_axis, tp_rank, tp_size), + replica_id=replica_id, + prepend_axis_num=prepend_axis_num, + ) + else: + # flattened optmizer states + # the non-flattened weight shape is [local_expert_num, hidden_size, ffn_size] + # + # For the case without GLU, it is straightforward, we just need to split each + # expert along the dim-0. + # + # For the case with GLU, we need to split the experts along dim-0 and split the + # two tensors for GLU along dim-2. + # To split along the non-first dim, we need to chunk the tensor into small pieces, + # since they belong to different tenors and are interleaved in the flattened space. + # Refer to the below sketch graph. 
+ # |................| |........|........| + # |............FFFF| |........|....BBBB| + # |FFFFFFFFFFFFFFFF| -> |AAAAAAAA|BBBBBBBB| + # |FFFFFFFFFFFFFFFF| |AAAAAAAA|BBBBBBBB| + # |FF..............| |AA......|........| + # |................| |........|........| + # + # But too many chunks have severe performance issues. We merge these chunks during + # the save process along with some length information and recover them during the + # load process. + assert t.ndim == 1, (key, t.shape) + if with_glu: + non_flat_local_shape = (1, self.config.hidden_size, local_ffn_dim_size) + chunk_numel = local_ffn_dim_size + sub_states = [] + start_pos = 0 + for local_expert_idx in range(self.num_local_experts): + first_glu_idx = -1 + w_start_range = -1 + v_start_range = -1 + w_tensors = [] + v_tensors = [] + w_lens = [] + v_lens = [] + for input_dim_idx in range(self.config.hidden_size): + for glu_idx in range(2): + local_idx = ( + local_expert_idx * self.config.hidden_size * 2 + + input_dim_idx * 2 + + glu_idx + ) + if ( + flattened_range.start < chunk_numel * (local_idx + 1) + and flattened_range.stop > chunk_numel * local_idx + ): + if first_glu_idx == -1: + first_glu_idx = glu_idx + end_pos = min( + flattened_range.stop, + chunk_numel * (local_idx + 1) - flattened_range.start, + ) + local_tensor = t[start_pos:end_pos] + local_flattened_range = slice( + max(0, flattened_range.start - chunk_numel * local_idx), + min( + chunk_numel, + flattened_range.stop - chunk_numel * local_idx, + ), + ) + assert ( + len(local_tensor) + == local_flattened_range.stop - local_flattened_range.start + ) + start_pos += len(local_tensor) + expert_global_idx = ( + local_expert_indices_offset + local_expert_idx + ) + if glu_idx == 0: + w_tensors.append(local_tensor) + w_lens.append(len(local_tensor)) + if w_start_range == -1: + w_start_range = max( + 0, flattened_range.start - chunk_numel * local_idx + ) + else: + v_tensors.append(local_tensor) + v_lens.append(len(local_tensor)) + if v_start_range == -1: + v_start_range = max( + 0, flattened_range.start - chunk_numel * local_idx + ) + sub_states.append( + { + 'w_tensors': ShardedTensor.from_rank_offsets_flat( + key, + ( + torch.cat(w_tensors, -1) + if len(w_tensors) > 0 + else torch.Tensor() + ), + non_flat_local_shape, + *sharded_offsets, + (prepend_axis_num, expert_global_idx, num_global_experts), + (prepend_axis_num + 1 + tp_axis, tp_rank, tp_size * 2), + replica_id=replica_id, + prepend_axis_num=prepend_axis_num, + flattened_range=slice( + w_start_range, w_start_range + sum(w_lens) + ), + ), + 'w_lens': LocalNonpersistentObject(w_lens), + 'v_tensors': ShardedTensor.from_rank_offsets_flat( + key, + ( + torch.cat(v_tensors, -1) + if len(v_tensors) > 0 + else torch.Tensor() + ), + non_flat_local_shape, + *sharded_offsets, + (prepend_axis_num, expert_global_idx, num_global_experts), + ( + prepend_axis_num + 1 + tp_axis, + tp_rank + tp_size, + tp_size * 2, + ), + replica_id=replica_id, + prepend_axis_num=prepend_axis_num, + flattened_range=slice( + v_start_range, v_start_range + sum(v_lens) + ), + ), + 'v_lens': LocalNonpersistentObject(v_lens), + 'first_glu_idx': LocalNonpersistentObject(first_glu_idx), + } + ) + else: + non_flat_local_shape = ( + real_shape[0] // self.num_local_experts, + *real_shape[1:], + ) + chunk_numel = local_ffn_dim_size * self.config.hidden_size + sub_states = [] + start_pos = 0 + for local_expert_idx in range(self.num_local_experts): + if ( + flattened_range.start < chunk_numel * (local_expert_idx + 1) + and flattened_range.stop > chunk_numel * 
local_expert_idx + ): + end_pos = min( + flattened_range.stop, + chunk_numel * (local_expert_idx + 1) - flattened_range.start, + ) + local_tensor = t[start_pos:end_pos] + local_flattened_range = slice( + max(0, flattened_range.start - chunk_numel * local_expert_idx), + min( + chunk_numel, + flattened_range.stop - chunk_numel * local_expert_idx, + ), + ) + assert ( + len(local_tensor) + == local_flattened_range.stop - local_flattened_range.start + ) + start_pos += len(local_tensor) + expert_global_idx = local_expert_indices_offset + local_expert_idx + sub_states.append( + ShardedTensor.from_rank_offsets_flat( + key, + local_tensor, + non_flat_local_shape, + *sharded_offsets, + (prepend_axis_num, expert_global_idx, num_global_experts), + (prepend_axis_num + 1 + tp_axis, tp_rank, tp_size), + replica_id=replica_id, + prepend_axis_num=prepend_axis_num, + flattened_range=local_flattened_range, + ) + ) + return sub_states + + @torch.no_grad() + def sh_ten_merge_fn(sub_state_dict, tp_axis: int, with_glu: bool): + if tp_axis == 1: + # weight1 + weight_shape = (self.config.hidden_size, -1) + elif tp_axis == 0: + # weight2 + weight_shape = (-1, self.config.hidden_size) + assert with_glu == False + else: + raise ValueError("tp_axis should be 0 or 1.") + if isinstance(sub_state_dict, list) and isinstance(sub_state_dict[0], dict): + # flattened tensor with glu + res = [] + for local_expert_dict in sub_state_dict: + w_tensors = torch.split( + local_expert_dict['w_tensors'], local_expert_dict['w_lens'] + ) + v_tensors = torch.split( + local_expert_dict['v_tensors'], local_expert_dict['v_lens'] + ) + first_glu_idx = local_expert_dict['first_glu_idx'] + if first_glu_idx == 0: + res += [ + x for x in itertools.chain(*itertools.zip_longest(w_tensors, v_tensors)) + ] + else: + res += [ + x for x in itertools.chain(*itertools.zip_longest(v_tensors, w_tensors)) + ] + return torch.cat(res) + elif isinstance(sub_state_dict, list) and sub_state_dict[0].ndim == 1: + # flattened tensor without glu + return torch.cat(sub_state_dict) + else: + if with_glu: + sub_state_dict = torch.cat(sub_state_dict, -2) + return sub_state_dict.transpose(-1, -2).reshape(weight_shape) + + state_dict = self.state_dict(prefix='', keep_vars=True) + for name, tensor in state_dict.items(): + if name == 'weight1': + tp_axis = 1 + with_glu = self.config.gated_linear_unit + wkey = f'{prefix}experts.linear_fc1.weight' + else: + tp_axis = 0 + with_glu = False + wkey = f'{prefix}experts.linear_fc2.weight' + sharded_state_dict[f'{prefix}{name}'] = ShardedTensorFactory( + wkey, + tensor, + partial(sh_ten_build_fn, tp_axis=tp_axis, with_glu=with_glu), + partial(sh_ten_merge_fn, tp_axis=tp_axis, with_glu=with_glu), + replica_id, + ) + + replica_id = ( + 0, + parallel_state.get_tensor_model_parallel_rank(), + parallel_state.get_data_modulo_expert_parallel_rank(with_context_parallel=True), + ) + # Add fake _extra_state to be compatible with SequentialMLP + for expert_local_idx in range(self.num_local_experts): + expert_global_idx = local_expert_indices_offset + expert_local_idx + expert_sharded_offsets = ( + *sharded_offsets, + (len(sharded_offsets), expert_global_idx, num_global_experts), + ) + for mod in ['linear_fc1', 'linear_fc2']: + sharded_state_dict[f'{prefix}expert{expert_global_idx}.{mod}._extra_state'] = ( + make_sharded_object_for_checkpoint( + None, + f'{prefix}experts.{mod}._extra_state', + expert_sharded_offsets, + replica_id, + ) + ) + + return sharded_state_dict + + +class TEGroupedMLP(MegatronModule): + """An efficient implementation of 
the Experts layer using TE's GroupedLinear. + + Executes multiple experts in parallel to maximize computational efficiency. + """ + + def __init__(self, num_local_experts, config: TransformerConfig, submodules: MLPSubmodules): + super().__init__(config=config) + self.moe_extended_tp = config.moe_extended_tp + self.num_local_experts = num_local_experts + self.input_size = self.config.hidden_size + + # Double the output width with gated linear unit, see https://arxiv.org/pdf/2002.05202.pdf + ffn_hidden_size = self.config.ffn_hidden_size + if self.config.gated_linear_unit: + ffn_hidden_size *= 2 + + self.linear_fc1 = build_module( + submodules.linear_fc1, + self.num_local_experts, + self.input_size, + ffn_hidden_size, + config=self.config, + init_method=self.config.init_method, + bias=self.config.add_bias_linear, + skip_bias_add=True, + is_expert=True, + tp_comm_buffer_name='fc1', + ) + + self.activation_func = self.config.activation_func + + self.linear_fc2 = build_module( + submodules.linear_fc2, + self.num_local_experts, + self.config.ffn_hidden_size, + self.config.hidden_size, + config=self.config, + init_method=self.config.output_layer_init_method, + bias=self.config.add_bias_linear, + skip_bias_add=True, + is_expert=True, + tp_comm_buffer_name='fc2', + ) + + def remove_extra_states_check(self, incompatible_keys): + """ + Remove extra _extra_state from unexpected keys. + These keys are for dist ckpt compatibility with SequentialMLP. + """ + keys = deepcopy(incompatible_keys.unexpected_keys) + for key in keys: + if '_extra_state' in key: + incompatible_keys.unexpected_keys.remove(key) + + self.register_load_state_dict_post_hook(remove_extra_states_check) + + def forward( + self, permuted_local_hidden_states: torch.Tensor, tokens_per_expert: torch.Tensor + ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: + """Forward of TEGroupedMLP + + Args: + permuted_local_hidden_states (torch.Tensor): The permuted input hidden states of the + local experts. + tokens_per_expert (torch.Tensor): The number of tokens per expert. + + Return: + output (torch.Tensor): The output of the local experts. 
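+            output_bias (Optional[torch.Tensor]): The output bias of the local experts.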
+ """ + tokens_per_expert = tokens_per_expert.tolist() + intermediate_parallel, bias_parallel = self.linear_fc1( + permuted_local_hidden_states, tokens_per_expert + ) + + if self.config.bias_activation_fusion: + if self.activation_func == F.gelu: + if self.config.gated_linear_unit: + intermediate_parallel = bias_geglu_impl(intermediate_parallel, bias_parallel) + else: + assert self.config.add_bias_linear is True + intermediate_parallel = bias_gelu_impl(intermediate_parallel, bias_parallel) + elif self.activation_func == F.silu and self.config.gated_linear_unit: + intermediate_parallel = bias_swiglu_impl( + intermediate_parallel, + bias_parallel, + self.config.activation_func_fp8_input_store, + ) + else: + raise ValueError("Only support fusion of gelu and swiglu") + else: + if bias_parallel is not None: + intermediate_parallel = intermediate_parallel + bias_parallel + if self.config.gated_linear_unit: + + def glu(x): + x = torch.chunk(x, 2, dim=-1) + return self.config.activation_func(x[0]) * x[1] + + intermediate_parallel = glu(intermediate_parallel) + else: + intermediate_parallel = self.activation_func(intermediate_parallel) + + output, output_bias = self.linear_fc2(intermediate_parallel, tokens_per_expert) + + return output, output_bias + + def sharded_state_dict( + self, prefix: str = '', sharded_offsets: tuple = (), metadata: Optional[dict] = None + ) -> ShardedStateDict: + """ + Maps local expert to global experts. + The sharded state dict is interchangable with SequentialMLP's. + """ + if self.moe_extended_tp: + raise NotImplementedError( + 'Currently distributed checkpointing is not supported for moe_extended_tp' + ) + sharded_state_dict = {} + for name, module in self._modules.items(): + sub_sd = module.sharded_state_dict(f'{name}.', sharded_offsets, metadata) + if name == 'linear_fc1' and self.config.gated_linear_unit: + num_global_experts = ( + parallel_state.get_expert_model_parallel_world_size() * self.num_local_experts + ) + local_expert_indices_offset = ( + parallel_state.get_expert_model_parallel_rank() * self.num_local_experts + ) + ep_axis = len(sharded_offsets) + for i in range(self.num_local_experts): + new_sharded_offsets = ( + *sharded_offsets, + (ep_axis, local_expert_indices_offset + i, num_global_experts), + ) + for k in (f'{name}.weight{i}', f'{name}.bias{i}'): + if k in sub_sd: + sub_sd[k] = apply_swiglu_sharded_factory(sub_sd[k], new_sharded_offsets) + # Add prefix here to match sequential's keys + replace_prefix_for_sharding(sub_sd, f'{name}.', f'{prefix}experts.{name}.') + sharded_state_dict.update({f"{prefix}{k}": v for k, v in sub_sd.items()}) + return sharded_state_dict + + +class SequentialMLP(MegatronModule): + """An implementation of the Experts layer using a sequence of MLP layers. + + This class executes each expert sequentially. 
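+
+    Used when `--moe-grouped-gemm` is not enabled; tokens are split per expert and each local
+    expert is a separate `MLP` module that processes its own tokens in turn.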
+ """ + + def __init__(self, num_local_experts, config: TransformerConfig, submodules: MLPSubmodules): + super().__init__(config=config) + self.add_bias = config.add_bias_linear + self.moe_extended_tp = config.moe_extended_tp + self.num_local_experts = num_local_experts + self.local_experts = torch.nn.ModuleList() + for _ in range(self.num_local_experts): + expert = MLP(self.config, submodules, is_expert=True) + self.local_experts.append(expert) + + def _pad_tensor_for_fp8(self, hidden): + """Padding tensor shape to multiples of 16.""" + actual_num_tokens = hidden.shape[0] + divisor = 16 + padded_num_tokens = ceil(actual_num_tokens / divisor) * divisor - actual_num_tokens + if padded_num_tokens > 0: + pad_tensor = torch.zeros( + padded_num_tokens, hidden.shape[1], dtype=hidden.dtype, device=hidden.device + ) + hidden = torch.cat((hidden, pad_tensor), dim=0) + return hidden + + def forward(self, permuted_local_hidden_states: torch.Tensor, tokens_per_expert: torch.Tensor): + """Forward step of the SequentialMLP.""" + if self.num_local_experts == 1: + if self.config.fp8: + hidden = self._pad_tensor_for_fp8(permuted_local_hidden_states) + output, output_bias = self.local_experts[0](hidden) + output = output[: permuted_local_hidden_states.shape[0]] + else: + output, output_bias = self.local_experts[0](permuted_local_hidden_states) + + return output, output_bias + else: + tokens_per_expert = tokens_per_expert.tolist() + tokens_list = torch.split(permuted_local_hidden_states, tokens_per_expert) + + output_local_list = [] + output_bias_list = [] + + for expert, tokens in zip(self.local_experts, tokens_list): + if self.config.fp8: + hidden = self._pad_tensor_for_fp8(tokens) + output, output_bias = expert(hidden) + output = output[: tokens.shape[0]] + else: + output, output_bias = expert(tokens) + output_local_list.append(output) + if self.add_bias: + output_bias_list.append(output_bias.expand_as(output)) + + output_local = torch.cat(output_local_list, dim=0) + if self.add_bias: + output_bias_local = torch.cat(output_bias_list, dim=0) + else: + output_bias_local = None + + return output_local, output_bias_local + + def sharded_state_dict(self, prefix='', sharded_offsets=(), metadata=None): + """Maps local expert to global experts.""" + if self.moe_extended_tp: + raise NotImplementedError( + 'Currently distributed checkpointing is not supported for moe_extended_tp' + ) + + sharded_state_dict = {} + num_global_experts = ( + parallel_state.get_expert_model_parallel_world_size() * self.num_local_experts + ) + local_expert_indices_offset = ( + parallel_state.get_expert_model_parallel_rank() * self.num_local_experts + ) + + expert_sharded_prefix = f'{prefix}experts.' + for expert_local_idx, expert in enumerate(self.local_experts): + expert_global_idx = local_expert_indices_offset + expert_local_idx + expert_state_dict_prefix = f'{prefix}local_experts.{expert_local_idx}.' 
+ expert_sharded_offsets = ( + *sharded_offsets, + (len(sharded_offsets), expert_global_idx, num_global_experts), + ) + + expert_state_dict = expert.sharded_state_dict( + expert_state_dict_prefix, expert_sharded_offsets, metadata + ) + # Remove expert layers indexing from sharded keys + replace_prefix_for_sharding( + expert_state_dict, expert_state_dict_prefix, expert_sharded_prefix + ) + # Adjust replica ids - replication along DP modulo EP + for k, sh_ten in expert_state_dict.items(): + replica_id = sh_ten.replica_id + assert ( + len(replica_id) == 3 + ), f'Expected replica_id for {k} to be in (PP, TP, DP) format, got: {replica_id}' + sh_ten.replica_id = ( + *replica_id[:2], + parallel_state.get_data_modulo_expert_parallel_rank(with_context_parallel=True), + ) + + sharded_state_dict.update(expert_state_dict) + return sharded_state_dict diff --git a/megatron/core/transformer/moe/grouped_gemm_util.py b/megatron/core/transformer/moe/grouped_gemm_util.py new file mode 100644 index 0000000000..5dd344816b --- /dev/null +++ b/megatron/core/transformer/moe/grouped_gemm_util.py @@ -0,0 +1,22 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +try: + import grouped_gemm +except ImportError: + grouped_gemm = None + + +def grouped_gemm_is_available(): + """Check if grouped_gemm is available.""" + return grouped_gemm is not None + + +def assert_grouped_gemm_is_available(): + """Assert that grouped_gemm is available.""" + assert grouped_gemm_is_available(), ( + "Grouped GEMM is not available. Please run " + "`pip install git+https://github.com/fanshiqing/grouped_gemm@v1.1.4`." + ) + + +ops = grouped_gemm.ops if grouped_gemm_is_available() else None diff --git a/megatron/core/transformer/moe/legacy_a2a_token_dispatcher.py b/megatron/core/transformer/moe/legacy_a2a_token_dispatcher.py new file mode 100644 index 0000000000..872c36aaa9 --- /dev/null +++ b/megatron/core/transformer/moe/legacy_a2a_token_dispatcher.py @@ -0,0 +1,304 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +from typing import List, Optional, Tuple + +import torch + +from megatron.core import parallel_state, tensor_parallel +from megatron.core.tensor_parallel.mappings import _gather_along_first_dim_expert_parallel +from megatron.core.transformer.moe.moe_utils import permute, unpermute +from megatron.core.transformer.moe.token_dispatcher import MoETokenDispatcher +from megatron.core.transformer.transformer_config import TransformerConfig + + +class MoEAlltoAllSEQTokenDispatcher(MoETokenDispatcher): + """ + The legacy implementation of the AlltoAll-based token dispatcher, which handles token dispatching on the sequence level instead of token level. The core of this implementation lies each device dispatching on the entire sequence, with the hidden state being partitioned. + Note: This class is a replica of the MoEAlltoAllTokenDispatcher from version 0.8. + """ + + def __init__( + self, num_local_experts: int, local_expert_indices: List[int], config: TransformerConfig + ) -> None: + """ + Initialize the AlltoAll token dispatcher. + + Args: + num_local_experts (int): Number of local experts on the current device. + local_expert_indices (List[int]): Indices of local experts on the current device. + config (TransformerConfig): Configuration for the transformer model. 
+ """ + super().__init__(config=config) + self.hidden_shape = None + self.num_input_tokens = None + self.num_local_experts = num_local_experts + self.num_experts = config.num_moe_experts + assert self.num_local_experts > 0, "Expected at least one expert" + if self.num_local_experts > 1: + self.expert_ids_per_ep_rank = torch.tensor( + [i % self.num_local_experts for i in range(self.num_experts)], + dtype=torch.int32, + device=torch.cuda.current_device(), + ) + self.local_expert_indices = local_expert_indices + assert ( + len(self.local_expert_indices) == self.num_local_experts + ), "Invalid local expert indices" + for i in range(len(self.local_expert_indices) - 1): + assert ( + self.local_expert_indices[i] == self.local_expert_indices[i + 1] - 1 + ), "local_expert_indices must be continous" + self.router_topk = config.moe_router_topk + self.add_bias = config.add_bias_linear + self.ep_size = config.expert_model_parallel_size + self.probs = None + self.input_splits = None + self.output_splits = None + self.num_global_tokens_per_local_expert = None + + # Token drop and padding. + # We need to keep track of the token num if we drop tokens without padding them. + self.num_out_tokens = None + # Drop and pad the input to capacity. + self.drop_and_pad = self.config.moe_pad_expert_input_to_capacity + if self.drop_and_pad: + assert self.config.moe_expert_capacity_factor is not None + self.capacity = None + + # A cuda stream synchronization is needed in self.token_permutation() in some cases, + # because there are several non-blocking DtoH data transfers called in self.preprocess(). + # The synchronization happens at different points based on MoE settings as late as possible. + # Valid sync points are "before_permutation_1", "before_ep_alltoall", "before_finish", and "no_sync". + self.cuda_sync_point = "no_sync" + + def preprocess(self, indices: torch.Tensor) -> torch.Tensor: + """ + Preprocess token indices for AlltoAll communication and token permutation. This method computes the number of tokens assigned to each expert based on the input indices. + It also initializes the necessary data structures for AlltoAll communication, such as input + and output splits, and the mapping between global tokens and local experts. + + Args: + indices (torch.Tensor): Tensor of indices mapping tokens to experts. + + Returns: + torch.Tensor: Tensor containing the number of tokens assigned to local expert. + """ + num_local_tokens_per_expert = torch.histc( + indices, bins=self.num_experts, min=0, max=self.num_experts + ) + # num_local_tokens_per_expert: [num_experts] + + ep_size = self.config.expert_model_parallel_size + if self.drop_and_pad: + # probs: [num_experts, capacity] + self.capacity = self.probs.size(1) + num_tokens_per_local_expert = torch.full( + (self.num_local_experts,), self.capacity * self.ep_size, dtype=torch.long + ) + return num_tokens_per_local_expert + elif self.config.moe_expert_capacity_factor is not None: + # Token drop but no pad. A synchronization is needed before the first + # permutation to get the `num_out_tokens` CPU value. + self.num_out_tokens = num_local_tokens_per_expert.sum().to( + torch.device("cpu"), non_blocking=True + ) + self.cuda_sync_point = "before_permutation_1" + elif ep_size > 1: + # Token dropless and enable ep. A synchronization is needed before expert parallel + # AlltoAll communication to get the `input_splits` and `output_splits` CPU values. + self.cuda_sync_point = "before_ep_alltoall" + else: + # Token dropless and no ep. 
A synchronization is needed before the token_permutation() + # function returns to get the `tokens_per_expert` CPU value. + self.cuda_sync_point = "before_finish" + + if ep_size > 1: + # =================================================== + # Calculate input_splits, output_splits for alltoall-v. + # =================================================== + self.input_splits = ( + num_local_tokens_per_expert.reshape(ep_size, self.num_local_experts) + .sum(axis=1) + .to(torch.device("cpu"), non_blocking=True) + .numpy() + ) + num_global_tokens_per_expert = _gather_along_first_dim_expert_parallel( + num_local_tokens_per_expert + ).reshape(ep_size, self.num_experts) + self.num_global_tokens_per_local_expert = num_global_tokens_per_expert[ + :, self.local_expert_indices[0] : self.local_expert_indices[-1] + 1 + ] + self.output_splits = ( + self.num_global_tokens_per_local_expert.sum(axis=-1) + .to(torch.device("cpu"), non_blocking=True) + .numpy() + ) + num_tokens_per_local_expert = self.num_global_tokens_per_local_expert.sum(axis=0).to( + torch.device("cpu"), non_blocking=True + ) + # =================================================== + # num_global_tokens_per_expert: [ep_size, num_experts] + # num_global_tokens_per_local_expert: [ep_size, num_local_experts] + # num_tokens_per_local_expert: [num_local_experts] + # =================================================== + else: + self.num_global_tokens_per_local_expert = num_local_tokens_per_expert.reshape( + -1, self.num_experts + ) + num_tokens_per_local_expert = num_local_tokens_per_expert.to( + torch.device("cpu"), non_blocking=True + ) + + if self.num_local_experts > 1: + # No further synchronization is needed because torch.repeat_interleave() calls stream + # synchronization internally when the `output_size` parameter is not provided. + self.cuda_sync_point = "no_sync" + self.global_input_tokens_local_experts_indices = torch.repeat_interleave( + self.expert_ids_per_ep_rank, self.num_global_tokens_per_local_expert.ravel() + ) + + return num_tokens_per_local_expert + + def token_permutation( + self, hidden_states: torch.Tensor, probs: torch.Tensor, indices: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Dispatch tokens to local experts using AlltoAll communication. + + Args: + hidden_states (torch.Tensor): Input token embeddings. + probs (torch.Tensor): Probs of tokens assigned to experts. + indices (torch.Tensor): Indices of tokens assigned to experts. + + Returns: + Tuple[torch.Tensor, torch.Tensor]: + - Permuted token embeddings for local experts. + - Number of tokens per expert. + """ + # Preprocess: Get the metadata for communication, permutation and computation operations. 
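+        # The dispatch pipeline implemented below:
+        #   1. preprocess(): compute all-to-all splits and per-expert token counts.
+        #   2. TP all-to-all (sp2hp): [S*B/TP, H] -> [S*B, H/TP] when TP > 1.
+        #   3. Permutation 1: sort tokens by target expert for the EP all-to-all.
+        #   4. EP all-to-all: exchange tokens across expert-parallel ranks.
+        #   5. Permutation 2: group received tokens by local expert (when num_local_experts > 1).
+        #   6. TP all-gather on the hidden dimension before the expert computation.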
+ self.hidden_shape = hidden_states.shape + self.probs = probs + assert probs.dim() == 2, "Expected 2D tensor for probs" + assert indices.dim() == 2, "Expected 2D tensor for indices" + hidden_states = hidden_states.view(-1, self.hidden_shape[-1]) + tokens_per_expert = self.preprocess(indices) + + # Perform tensor parallel AlltoAll communication + # hidden_states: [S*B/TP, H] -> [S*B, H/TP] + if parallel_state.get_tensor_model_parallel_world_size() > 1: + hidden_states = tensor_parallel.all_to_all_sp2hp(hidden_states) + + # Permutation 1: input to AlltoAll input + self.hidden_shape_before_permute = hidden_states.shape + if self.cuda_sync_point == "before_permutation_1": + torch.cuda.current_stream().synchronize() + permutated_local_input_tokens, self.reversed_local_input_permutation_mapping = permute( + hidden_states, + indices, + num_out_tokens=self.num_out_tokens, + padded_mode=self.drop_and_pad, + ) + + # Perform expert parallel AlltoAll communication + if self.cuda_sync_point == "before_ep_alltoall": + torch.cuda.current_stream().synchronize() + global_input_tokens = tensor_parallel.all_to_all( + parallel_state.get_expert_model_parallel_group(), + permutated_local_input_tokens, + self.output_splits, + self.input_splits, + ) + + # Permutation 2: Sort alltoall output by local experts when num_local_experts > 1. + if self.num_local_experts > 1: + if not self.drop_and_pad: + global_input_tokens, self.reversed_global_input_permutation_mapping = permute( + global_input_tokens, self.global_input_tokens_local_experts_indices + ) + else: + global_input_tokens = global_input_tokens.reshape( + self.ep_size, self.num_local_experts, self.capacity, -1 + ) + global_input_tokens = ( + global_input_tokens.transpose(0, 1) + .reshape(self.num_local_experts * self.ep_size * self.capacity, -1) + .contiguous() + ) + + # Perform tensor parallel AllGather on the hidden dimension to obtain the input tokens. + # global_input_tokens: [SEQL, H/TP] -> [SEQL, H] + if parallel_state.get_tensor_model_parallel_world_size() > 1: + global_input_tokens = tensor_parallel.all_gather_last_dim_from_tensor_parallel_region( + global_input_tokens + ) + if self.cuda_sync_point == "before_finish": + torch.cuda.current_stream().synchronize() + + return global_input_tokens, tokens_per_expert + + def token_unpermutation( + self, hidden_states: torch.Tensor, bias: torch.Tensor = None + ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: + """ + Reverse the token permutation to restore the original order. + + Args: + hidden_states (torch.Tensor): Output from local experts. + bias (torch.Tensor, optional): Bias tensor (not supported). + + Returns: + Tuple[torch.Tensor, Optional[torch.Tensor]]: + - Unpermuted token embeddings in the original order. + - None (bias is not supported). 
+ """ + assert bias is None, "Bias is not supported in MoEAlltoAllTokenDispatcher" + + # Perform tensor parallel Reduce-Scatter + # hidden_states: [SEQL, H] -> [SEQL, H/TP] + if parallel_state.get_tensor_model_parallel_world_size() > 1: + hidden_states = tensor_parallel.reduce_scatter_last_dim_to_tensor_parallel_region( + hidden_states + ) + + # Unpermutation 2: expert output to AlltoAll input + if self.num_local_experts > 1: + if not self.drop_and_pad: + hidden_states = unpermute( + hidden_states, self.reversed_global_input_permutation_mapping + ) + else: + hidden_states = hidden_states.reshape( + self.num_local_experts, self.ep_size, self.capacity, -1 + ) + hidden_states = ( + hidden_states.transpose(0, 1) + .reshape(self.ep_size * self.num_local_experts * self.capacity, -1) + .contiguous() + ) + + # Perform expert parallel AlltoAll communication + # hidden_states: [SEQL, H] -> [SEQL, H/TP] + permutated_local_input_tokens = tensor_parallel.all_to_all( + parallel_state.get_expert_model_parallel_group(), + hidden_states, + self.input_splits, + self.output_splits, + ) + + # Unpermutation 1: AlltoAll output to output + output = unpermute( + permutated_local_input_tokens, + self.reversed_local_input_permutation_mapping, + probs=self.probs, + padded_mode=self.drop_and_pad, + restore_shape=self.hidden_shape_before_permute, + ) + + # Perform tensor parallel AlltoAll communication + # output: [S*B, H/TP] -> [S*B/TP, H] + if parallel_state.get_tensor_model_parallel_world_size() > 1: + output = tensor_parallel.all_to_all_hp2sp(output) + + # Reshape the output tensor + output = output.view(self.hidden_shape) + return output, None diff --git a/megatron/core/transformer/moe/moe_layer.py b/megatron/core/transformer/moe/moe_layer.py new file mode 100644 index 0000000000..8b393abc77 --- /dev/null +++ b/megatron/core/transformer/moe/moe_layer.py @@ -0,0 +1,164 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +from abc import ABC, abstractmethod +from dataclasses import dataclass +from typing import Union + +import torch + +from megatron.core import parallel_state, tensor_parallel +from megatron.core.transformer.mlp import MLPSubmodules +from megatron.core.transformer.module import MegatronModule +from megatron.core.transformer.moe.experts import GroupedMLP, SequentialMLP, TEGroupedMLP +from megatron.core.transformer.moe.legacy_a2a_token_dispatcher import MoEAlltoAllSEQTokenDispatcher +from megatron.core.transformer.moe.router import TopKRouter +from megatron.core.transformer.moe.shared_experts import SharedExpertMLP +from megatron.core.transformer.moe.token_dispatcher import ( + MoEAllGatherTokenDispatcher, + MoEAlltoAllTokenDispatcher, +) +from megatron.core.transformer.spec_utils import ModuleSpec +from megatron.core.transformer.transformer_config import TransformerConfig + + +@dataclass +class MoESubmodules: + """MoE Layer Submodule spec""" + + experts: Union[ModuleSpec, type] = None + shared_experts: Union[ModuleSpec, type] = None + + +class BaseMoELayer(MegatronModule, ABC): + """Base class for a mixture of experts layer. + + Args: + config (TransformerConfig): Configuration object for the transformer model. 
+ """ + + def __init__(self, config: TransformerConfig, layer_number: int = None): + super(BaseMoELayer, self).__init__(config) + self.config = config + self.expert_parallel_size = parallel_state.get_expert_model_parallel_world_size() + assert self.expert_parallel_size > 0, "Expected non-negative expert parallel size" + + if self.config.moe_extended_tp: + self.num_local_experts = self.config.num_moe_experts + local_expert_indices_offset = 0 + else: + assert self.config.num_moe_experts % self.expert_parallel_size == 0 + self.num_local_experts = self.config.num_moe_experts // self.expert_parallel_size + local_expert_indices_offset = ( + parallel_state.get_expert_model_parallel_rank() * self.num_local_experts + ) + + self.use_shared_expert = self.config.moe_shared_expert_intermediate_size is not None + self.shared_expert_overlap = self.config.moe_shared_expert_overlap + + self.local_expert_indices = [ + local_expert_indices_offset + i for i in range(self.num_local_experts) + ] + assert all(map(lambda x: x < self.config.num_moe_experts, self.local_expert_indices)) + self.router = None + self.experts = None + self.shared_experts = None + self.token_dispatcher = None + self.layer_number = layer_number + + @abstractmethod + def forward(self, hidden_states): + """Forward method for the MoE layer.""" + pass + + def set_layer_number(self, layer_number: int): + """Set the layer number for the MoE layer.""" + self.layer_number = layer_number + self.router.set_layer_number(layer_number) + + +class MoELayer(BaseMoELayer): + """Mixture of experts Layer **currently only supports no token dropping**. + + Args: + BaseMoELayer (MegatronModule): Base class for MoE layers + """ + + def __init__( + self, config: TransformerConfig, submodules: MLPSubmodules = None, layer_number: int = None + ): + self.submodules = submodules + super(MoELayer, self).__init__(config=config, layer_number=layer_number) + self.moe_layer_recompute = config.moe_layer_recompute + + # Initialize router + self.router = TopKRouter(config=self.config) + + # Initialize experts + if self.config.moe_grouped_gemm: + if isinstance(self.submodules.experts, MLPSubmodules): + self.experts = TEGroupedMLP( + self.num_local_experts, self.config, self.submodules.experts + ) + else: + self.experts = GroupedMLP(self.num_local_experts, self.config) + else: + assert isinstance(self.submodules.experts, MLPSubmodules) + self.experts = SequentialMLP( + self.num_local_experts, self.config, self.submodules.experts + ) + + # Initialize token dispatcher + if config.moe_token_dispatcher_type == "allgather": + self.token_dispatcher = MoEAllGatherTokenDispatcher( + self.num_local_experts, self.local_expert_indices, config=self.config + ) + elif config.moe_token_dispatcher_type == "alltoall": + self.token_dispatcher = MoEAlltoAllTokenDispatcher( + self.num_local_experts, self.local_expert_indices, config=self.config + ) + elif config.moe_token_dispatcher_type == "alltoall_seq": + self.token_dispatcher = MoEAlltoAllSEQTokenDispatcher( + self.num_local_experts, self.local_expert_indices, config=self.config + ) + else: + raise ValueError( + f"Unsupported token dispatcher type: {config.moe_token_dispatcher_type}" + ) + + # Initialize shared experts + if self.use_shared_expert: + self.shared_experts = SharedExpertMLP(self.config, self.submodules.shared_experts) + if self.shared_expert_overlap: + self.token_dispatcher.set_shared_experts(self.shared_experts) + + def forward(self, hidden_states: torch.Tensor): + if ( + self.training + and 
self.config.tensor_model_parallel_size > 1 + and not self.config.sequence_parallel + ): + raise ValueError( + "During training, performance may degrade if MoE and tensor parallelism" + "are enabled without also enabling sequence parallelism." + ) + + # process MoE + def custom_forward(hidden_states): + probs, indices = self.router(hidden_states) + (dispatched_input, tokens_per_expert) = self.token_dispatcher.token_permutation( + hidden_states, probs, indices + ) + expert_output, mlp_bias = self.experts(dispatched_input, tokens_per_expert) + output, mlp_bias = self.token_dispatcher.token_unpermutation(expert_output, mlp_bias) + if self.use_shared_expert and not self.shared_expert_overlap: + # if shared_expert_overlap is True, the expert calculation happens in + # the token_dispatcher to overlap communications and computations + output += self.shared_experts(hidden_states) + return output, mlp_bias + + if self.moe_layer_recompute: + output, mlp_bias = tensor_parallel.checkpoint(custom_forward, False, hidden_states) + else: + output, mlp_bias = custom_forward(hidden_states) + + return output, mlp_bias diff --git a/megatron/core/transformer/moe/moe_utils.py b/megatron/core/transformer/moe/moe_utils.py new file mode 100644 index 0000000000..02a2cccca5 --- /dev/null +++ b/megatron/core/transformer/moe/moe_utils.py @@ -0,0 +1,558 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +import math + +import torch + +from megatron.core import parallel_state + + +def switch_load_balancing_loss_func( + probs: torch.Tensor, + tokens_per_expert: torch.Tensor, + topk: int, + moe_aux_loss_coeff: float, + sequence_partition_group=None, +): + """Calculate the auxiliary loss for load balancing. + Refer to the Switch Transformer paper (https://arxiv.org/abs/2101.03961) for details. + + Args: + probs (torch.Tensor): Softmax probabilities output by the router for each token. + Shape in [num_tokens, num_experts]. + tokens_per_expert (torch.Tensor): Number of tokens assigned to each expert. + Shape in [num_experts] + topk (int): The number of experts selected for each token. + moe_aux_loss_coeff (float): The coefficient for the auxiliary loss. + sequence_partition_group (optional): The parallel group over which the sequence is + partitioned. If None, no partitioning is applied. + Defaults to None. + + Returns: + torch.Tensor: The auxiliary loss for load balancing. + """ + num_sub_sequence = 1 + + # If the sequence is partitioned by certain parallelism strategies like Sequence Parallelism + # or Context Parallelism, compute the gradient of the auxiliary loss with respect to the full + # sequence. + if sequence_partition_group is not None: + # We can keep `aggregated_probs_per_expert` local since we don't need the gradient for + # `tokens_per_expert`, saving one allreduce operation for `aggregated_probs_per_expert`. + num_sub_sequence = torch.distributed.get_world_size(sequence_partition_group) + torch.distributed.all_reduce(tokens_per_expert, group=sequence_partition_group) + + num_tokens = probs.shape[0] * num_sub_sequence + num_experts = probs.shape[1] + + # The formula of aux_loss: aux_loss = sum((probs_per_expert/num_tokens) * + # (tokens_per_expert/(num_tokens*topk))) * num_experts * moe_aux_loss_coeff. + # This can be simplified to fuse the division and multiplication operations. 
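+    # Spelled out, with P_e = probs.sum(0)[e] and T_e = tokens_per_expert[e]:
+    #   aux_loss = moe_aux_loss_coeff * num_experts
+    #              * sum_e (P_e / num_tokens) * (T_e / (num_tokens * topk))
+    #            = sum_e (P_e * T_e) * num_experts * moe_aux_loss_coeff / (num_tokens^2 * topk)
+    # which is the fused form computed below.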
+ aggregated_probs_per_expert = probs.sum(dim=0) + aux_loss = torch.sum(aggregated_probs_per_expert * tokens_per_expert) * ( + num_experts * moe_aux_loss_coeff / (num_tokens * num_tokens * topk) + ) + return aux_loss + + +def z_loss_func(logits, z_loss_coeff): + """Encourages the router's logits to remain small to enhance stability. + Please refer to the ST-MoE paper (https://arxiv.org/pdf/2202.08906.pdf) for details. + + Args: + logits (torch.Tensor): The logits of the router. + + Returns: + torch.Tensor: The logits after applying the z-loss. + """ + + z_loss = torch.mean(torch.square(torch.logsumexp(logits, dim=-1))) * z_loss_coeff + return z_loss + + +def sinkhorn(cost: torch.Tensor, tol: float = 0.0001): + """Sinkhorn based MoE routing function""" + cost = torch.exp(cost) + d0 = torch.ones(cost.size(0), device=cost.device, dtype=cost.dtype) + d1 = torch.ones(cost.size(1), device=cost.device, dtype=cost.dtype) + + eps = 0.00000001 + error = 1e9 + d1_old = d1 + while error > tol: + d0 = (1 / d0.size(0)) * 1 / (torch.sum(d1 * cost, 1) + eps) + d1 = (1 / d1.size(0)) * 1 / (torch.sum(d0.unsqueeze(1) * cost, 0) + eps) + error = torch.mean(torch.abs(d1_old - d1)) + d1_old = d1 + return d1 * cost * d0.unsqueeze(1) + + +def get_capacity(num_tokens: int, num_experts: int, capacity_factor: float, min_capacity=None): + """ + Calculate the capacity of each expert. + + Args: + num_tokens (int): num of the input tokens. + num_experts (int): num of the experts. + capacity_factor (float): Capacity factor. + min_capacity (int, optional): Minimum capacity. Defaults to None. + + Returns: + Tensor: Capacity of each expert. + """ + capacity = math.ceil((num_tokens / num_experts) * capacity_factor) + if min_capacity is not None and capacity < min_capacity: + capacity = min_capacity + return capacity + + +class MoEAuxLossAutoScaler(torch.autograd.Function): + """An AutoScaler that compute and scales the grad for auxiliary loss.""" + + main_loss_backward_scale: torch.Tensor = torch.tensor(1.0) + + @staticmethod + def forward(ctx, output: torch.Tensor, aux_loss: torch.Tensor): + """Preserve the aux_loss by storing it in the context to avoid garbage collection. + + Args: + output (torch.Tensor): The output tensor. + aux_loss (torch.Tensor): The auxiliary loss tensor. + + Returns: + torch.Tensor: The output tensor. + """ + ctx.save_for_backward(aux_loss) + return output + + @staticmethod + def backward(ctx, grad_output: torch.Tensor): + """Compute and scale the gradient for auxiliary loss.. + + Args: + grad_output (torch.Tensor): The gradient of the output. + + Returns: + Tuple[torch.Tensor, torch.Tensor]: The gradient of the output, scaled auxiliary loss + gradient. + """ + (aux_loss,) = ctx.saved_tensors + aux_loss_backward_scale = MoEAuxLossAutoScaler.main_loss_backward_scale + scaled_aux_loss_grad = torch.ones_like(aux_loss) * aux_loss_backward_scale + return grad_output, scaled_aux_loss_grad + + @staticmethod + def set_loss_scale(scale: torch.Tensor): + """set the scale of the aux loss. + + Args: + scale (torch.Tensor): The scale value to set. Please ensure that the scale passed in + matches the scale of the main_loss. + """ + MoEAuxLossAutoScaler.main_loss_backward_scale = scale + + +def permute(tokens, indices, num_out_tokens: int = None, padded_mode: bool = False): + """Permute the tokens based on the indices. Token with the same index will be grouped together. + The input indices shape is [tokens, top_k], it indicates which experts were selected by each + token separately. 
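+
+    For example (hypothetical values), with `topk=2` and `indices = [[0, 2], [1, 2]]`, the
+    flattened expert ids are `[0, 2, 1, 2]`; a stable argsort orders the rows as
+    (token 0 -> expert 0), (token 1 -> expert 1), (token 0 -> expert 2), (token 1 -> expert 2),
+    so tokens routed to the same expert become contiguous in the permuted output.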
+ Args: + tokens (torch.Tensor): The input token tensor. + indices (torch.Tensor): The token to expert indices tensor, should have a shape of + [num_tokens] or [num_tokens, topk]. + num_out_tokens (int, optional): The effective output token count, when enabling the + capacity factor, should equal the number of tokens not + dropped. By default, set to None, meaning no tokens are + dropped. + padded_mode (bool, optional): If True, indicating the indices are padded to + [num_expert, capacity] to denote selected tokens per expert. + Defaults to False. + + Returns: + torch.Tensor: The permuted tensor. + torch.Tensor: The sorted_indices corresponding permuted tensor. + """ + if padded_mode: + return permute_with_padded_tokens(tokens, indices) + + if indices.dim() == 1: + indices = indices.unsqueeze(1) + + topk = indices.size(1) + flatten_indices = indices.view(-1) + sorted_indices = torch.argsort(flatten_indices, stable=True) + if num_out_tokens is not None: + sorted_indices = sorted_indices[:num_out_tokens] + moe_gather_indices = (sorted_indices // topk).unsqueeze(1).expand(-1, tokens.size(-1)) + permuted_tokens = moe_gather.apply(tokens, moe_gather_indices) + + return permuted_tokens, sorted_indices + + +def unpermute( + permuted_tokens: torch.Tensor, + sorted_indices: torch.Tensor, + probs: torch.Tensor = None, + padded_mode: bool = False, + restore_shape: torch.Size = None, +): + """Unpermute a tensor of permuted tokens based on sorted indices, and optionally merge the + tokens with their corresponding probabilities. + + Args: + permuted_tokens (torch.Tensor): 2D tensor [num_tokens*topk, hidden]. The tensor of permuted + tokens to be unpermuted. + sorted_indices (torch.Tensor): 1D tensor [num_tokens*topk]. The tensor of sorted indices + used to unpermute the tokens. + probs (torch.Tensor, optional): 2D tensor [num_tokens, topk]. The tensor of probabilities + corresponding to the permuted tokens. If provided, + the unpermuted tokens will be merged with their respective + probabilities. + padded_mode (bool, optional): If True, indicating the indices are padded to + [num_expert, capacity] to denote selected tokens per expert. + Defaults to False. + restore_shape (torch.Size, optional): The input shape before permutation, only used in + padding mode. Defaults to None. + + Returns: + torch.Tensor: The unpermuted tokens, optionally merged with probabilities. + """ + if padded_mode: + return unpermute_with_padded_tokens( + permuted_tokens, sorted_indices, probs, restore_shape=restore_shape + ) + + assert sorted_indices.numel() == permuted_tokens.size( + 0 + ), f"Got {sorted_indices.numel()} != {permuted_tokens.size(0)}." + if probs is not None: + # Unpermute and merge the tokens with their probabilities + num_unpermuted_tokens = probs.numel() + assert probs.dim() == 2, f"Expected 2D tensor for probs, got {probs.dim()} dims." 
+ topk = probs.size(1) + else: + # Unpermute the tokens without merge + num_unpermuted_tokens = permuted_tokens.size(0) + topk = 1 + + output_size = [num_unpermuted_tokens, permuted_tokens.shape[-1]] + moe_scatter_indices = sorted_indices.unsqueeze(1).expand(-1, permuted_tokens.size(-1)) + unpermuted_tokens = moe_scatter.apply(permuted_tokens, moe_scatter_indices, output_size) + unpermuted_tokens = unpermuted_tokens.reshape(-1, topk, permuted_tokens.size(-1)) + if probs is not None: + unpermuted_tokens = unpermuted_tokens * probs.unsqueeze(-1) + unpermuted_tokens = unpermuted_tokens.sum(dim=1) + + return unpermuted_tokens + + +def permute_with_padded_tokens(tokens, indices): + """Permute the tokens based on the indices, only used in padding mode. + The input indices shape is [num_expert, capacity], it indicates which tokens were selected + by each expert separately. + Args: + tokens (torch.Tensor): The input token tensor. + indices (torch.Tensor): A tensor with shape [num_expert, capacity], indicating the selected + tokens for each expert. + + Returns: + torch.Tensor: The permuted tensor. + torch.Tensor: The sorted_indices corresponding permuted tensor. + """ + permuted_tokens = tokens.index_select(dim=0, index=indices.view(-1)) + + return permuted_tokens, indices + + +def unpermute_with_padded_tokens( + permuted_tokens: torch.Tensor, + indices: torch.Tensor, + probs: torch.Tensor, + restore_shape: torch.Size, +) -> torch.Tensor: + """ + Unpermutes a padded permuted tokens based on sorted indices and merges the tokens with their + corresponding probabilities. + + This function takes a tensor of permuted tokens and reorders them according to the provided + indices. It also combines the tokens with their associated probabilities. + + Parameters: + permuted_tokens (torch.Tensor): A 2D tensor containing permuted tokens. + indices (torch.Tensor): A tensor with shape [num_expert, capacity], indicating the selected + tokens for each expert. + probs (torch.Tensor): A tensor with the same shape as indices, containing probabilities + corresponding to each token. + restore_shape (torch.Size): The target shape for the unpermuted tokens tensor. + + Returns: + torch.Tensor: A tensor of unpermuted tokens, merged with their probabilities. + + """ + # Ensure permuted_tokens is 2D + assert permuted_tokens.dim() == 2, f"Got {permuted_tokens.dim()}D." + + # Reshape and expand probabilities and indices to match permuted_tokens + probs = probs.view(-1).unsqueeze(-1) + indices = indices.view(-1, 1).expand(-1, permuted_tokens.shape[1]) + assert ( + permuted_tokens.shape == indices.shape + ), "Shape mismatch between permuted_tokens and indices." 
+ + # Combine tokens with their probabilities + combined_output = probs * permuted_tokens + + # Prepare a tensor of zeros with the desired output shape + empty_tokens = torch.zeros( + restore_shape, dtype=combined_output.dtype, device=combined_output.device + ) + + # Scatter the combined tokens back to their original positions + unpermuted_tokens = torch.scatter_add(empty_tokens, 0, indices, combined_output) + + return unpermuted_tokens + + +def sort_chunks_by_idxs(input: torch.Tensor, split_sizes: torch.Tensor, sorted_idxs: torch.Tensor): + """Split and sort the input tensor based on the split_sizes and sorted indices.""" + input = torch.split(input, split_sizes.tolist(), dim=0) + output = torch.cat([input[i] for i in sorted_idxs], dim=0) + return output + + +def topk_softmax_with_capacity( + logits: torch.Tensor, + topk: int, + capacity_factor: float = None, + pad_to_capacity: bool = False, + drop_policy: str = "probs", + use_pre_softmax: bool = False, + deterministic_mode: bool = False, +): + """Apply capacity and padding to the top-k selection. + Args: + logits (torch.Tensor): Logits tensor. + topk (int): The number of experts to select for each token. + capacity_factor (int): The capacity factor of each expert. Will drop tokens if the number + of tokens exceeds the capacity. + pad_to_capacity (bool): Whether to need padding in token drop mode. + drop_policy (str): The policy to drop tokens. Can be either "prob" or "position". + If "prob", the tokens with the lowest probabilities will be dropped. + If "position", tokens at the end of each batch will be dropped. + + Returns: + Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: Probs, indices and tokens_per_expert + tensor. + + (1) If there's no token padding, the shape of probs and indices is [tokens, top_k], + indicating the selected experts for each token. + (2) If there's token padding, the shape of probs and indices is [num_expert, capacity], + indicating the tokens selected for each expert. + """ + assert logits.dim() == 2, f"Expected 2D logits [num_tokens, num_experts], got {logits.dim()}." + num_tokens = logits.shape[0] + num_experts = logits.shape[1] + if use_pre_softmax: + # Pre softmax + scores = torch.softmax(logits, dim=-1, dtype=torch.float32).type_as(logits) + probs, top_indices = torch.topk(scores, k=topk, dim=1) + else: + # Post softmax + if topk == 1: + # Requires applying softmax before selecting the top-k when k is 1, + # since softmax on a [num_tokens, 1] would yield a zero gradient. 
+ raise ValueError("Please use --moe-router-pre-softmax when topk is 1.") + scores, top_indices = torch.topk(logits, k=topk, dim=1) + probs = torch.softmax(scores, dim=-1, dtype=torch.float32).type_as(logits) + + if capacity_factor is None: + # TopK without capacity + if deterministic_mode: + tokens_per_expert = torch.bincount(top_indices.view(-1), minlength=num_experts) + else: + tokens_per_expert = torch.histc(top_indices, bins=num_experts, min=0, max=num_experts) + return probs, top_indices, tokens_per_expert + else: + # TopK with capacity + expert_capacity = get_capacity( + num_tokens=num_tokens * topk, num_experts=num_experts, capacity_factor=capacity_factor + ) + # TopK selection, Maskout unused experts + topk_masked_gates = torch.zeros_like(logits).scatter(1, top_indices, probs) + topk_mask = torch.zeros_like(logits).scatter(1, top_indices, 1) + + # Maskout exceeded tokens + if drop_policy == "probs": + capacity_probs, capacity_indices = torch.topk( + topk_masked_gates, k=expert_capacity, dim=0, sorted=False + ) + capacity_mask = torch.zeros_like(logits).scatter(0, capacity_indices, 1) + elif drop_policy == "position": + _, capacity_indices = torch.topk(topk_mask, k=expert_capacity, dim=0, sorted=False) + capacity_mask = torch.zeros_like(logits).scatter(0, capacity_indices, 1) + capacity_probs = torch.gather(topk_masked_gates, 0, capacity_indices) + else: + raise ValueError(f"Invalid drop_policy: {drop_policy}") + + if pad_to_capacity: + final_probs, final_indices = ( + capacity_probs.T.contiguous(), + capacity_indices.T.contiguous(), + ) + tokens_per_expert_before_capacity = topk_mask.sum(dim=0) + else: + # Get exceed mask and maskout exceeded probs and indices + final_mask = torch.logical_and(topk_mask, capacity_mask) + drop_mask = torch.logical_not(final_mask) + exceed_mask = torch.gather(drop_mask, 1, top_indices) + final_probs = probs * torch.logical_not(exceed_mask) + final_indices = top_indices.clone().masked_fill_( + exceed_mask, torch.iinfo(torch.long).max + ) + tokens_per_expert_before_capacity = topk_mask.sum(dim=0) + return final_probs, final_indices, tokens_per_expert_before_capacity + + +def save_to_aux_losses_tracker( + name: str, + loss: torch.Tensor, + layer_number: int, + num_layers: int, + reduce_group: torch.distributed.ProcessGroup = None, + avg_group: torch.distributed.ProcessGroup = None, +): + """Save the auxiliary loss for logging. + Args: + name (str): The name of the loss. + loss (torch.Tensor): The loss tensor. + layer_number (int): Layer index of the loss. + num_layers (int): The number of total layers. + reduce_group (torch.distributed.ProcessGroup): The group for reducing the loss. + mean_group (torch.distributed.ProcessGroup): The group for averaging the loss. + """ + # Skip aux loss logging if layer_number is None. + if layer_number is None: + return + + tracker = parallel_state.get_moe_layer_wise_logging_tracker() + if name not in tracker: + tracker[name] = {} + tracker[name]["values"] = torch.zeros(num_layers, device=loss.device) + tracker[name]["values"][layer_number - 1] += loss.detach() # Aggregate the loss for the layer. 
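+    # Remember the process groups so reduce_aux_losses_tracker_across_ranks() can
+    # all-reduce (and optionally average) this entry later.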
+ tracker[name]["reduce_group"] = reduce_group + tracker[name]["avg_group"] = avg_group + + +def clear_aux_losses_tracker(): + """Clear the auxiliary losses.""" + tracker = parallel_state.get_moe_layer_wise_logging_tracker() + for name in tracker: + tracker[name]["values"].zero_() + tracker[name]["reduce_group"] = None + tracker[name]["avg_group"] = None + + +def reduce_aux_losses_tracker_across_ranks(): + """Collect and reduce the auxiliary losses across ranks.""" + tracker = parallel_state.get_moe_layer_wise_logging_tracker() + for name in tracker: + values = tracker[name]["values"] + # Collect aux losses across PP. + torch.distributed.all_reduce( + values, group=parallel_state.get_pipeline_model_parallel_group() + ) + # Reduce aux losses across ranks. + if tracker[name].get('reduce_group') is not None: + torch.distributed.all_reduce(values, group=tracker[name].get('reduce_group')) + if tracker[name].get('avg_group') is not None: + torch.distributed.all_reduce( + values, group=tracker[name]['avg_group'], op=torch.distributed.ReduceOp.AVG + ) + + +def track_moe_metrics( + loss_scale, iteration, writer, wandb_writer=None, total_loss_dict=None, per_layer_logging=False +): + """Track the MoE metrics for logging.""" + # Aux loss logging + reduce_aux_losses_tracker_across_ranks() + tracker = parallel_state.get_moe_layer_wise_logging_tracker() + if writer is not None: + aux_losses = {k: v['values'].float() * loss_scale for k, v in tracker.items()} + for name, loss_list in aux_losses.items(): + if total_loss_dict is not None: + if name not in total_loss_dict: + total_loss_dict[name] = loss_list.mean() + else: + total_loss_dict[name] += loss_list.mean() + + # currently when using add_scalars, + # torch.utils.add_scalars makes each timer its own run, which + # polutes the runs list, so we just add each as a scalar + writer.add_scalar(name, loss_list.mean(), iteration) + if per_layer_logging: + for i, loss in enumerate(loss_list.tolist()): + writer.add_scalar(f"moe/{name}_layer_{i}", loss, iteration) + + # W&B logging lacks support for logging multiple scalars simultaneously. + # As a workaround, we log each scalar individually first, then we can create + # a custom panel to manually group them to a single plot. 
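+            # As with the TensorBoard branch above, only the mean over layers is logged
+            # by default; per-layer values are emitted when per_layer_logging is enabled.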
+ if wandb_writer: + wandb_writer.log({f"{name}": loss_list.mean()}, iteration) + if per_layer_logging: + wandb_writer.log( + { + f"moe/{name}_layer_{i}": loss + for i, loss in enumerate(loss_list.tolist()) + }, + iteration, + ) + + clear_aux_losses_tracker() + + +class moe_gather(torch.autograd.Function): + """Gather the input tensor based on the map tensor.""" + + @staticmethod + def forward(ctx, input_, map_): + """Gather the input tensor based on the map tensor.""" + ctx.input_size = input_.size() + ctx.map = map_ + return torch.gather(input_, 0, map_) + + @staticmethod + def backward(ctx, grad_output): + """Scatter the grad_output tensor based on the map tensor.""" + input_size = ctx.input_size + map_ = ctx.map + + output = torch.zeros( + input_size, dtype=grad_output.dtype, device=torch.cuda.current_device() + ) + output.scatter_add_(0, map_, grad_output) + return output, None, None + + +class moe_scatter(torch.autograd.Function): + """Scatter the input tensor based on the map tensor.""" + + @staticmethod + def forward(ctx, input_, map_, output_size=None): + """Scatter the input tensor based on the map tensor.""" + ctx.map = map_ + + if output_size is not None: + output = torch.zeros(output_size, dtype=input_.dtype, device=input_.device) + else: + output = torch.zeros_like(input_) + + output.scatter_add_(0, map_, input_) + return output + + @staticmethod + def backward(ctx, grad_output): + """Gather the grad_output tensor based on the map tensor.""" + map_ = ctx.map + grad_input = torch.gather(grad_output, 0, map_) + return grad_input, None, None, None diff --git a/megatron/core/transformer/moe/router.py b/megatron/core/transformer/moe/router.py new file mode 100644 index 0000000000..3e85ec53c5 --- /dev/null +++ b/megatron/core/transformer/moe/router.py @@ -0,0 +1,309 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +from abc import ABC, abstractmethod + +import torch + +from megatron.core import parallel_state +from megatron.core.tensor_parallel import ( + gather_from_sequence_parallel_region, + get_cuda_rng_tracker, + get_data_parallel_rng_tracker_name, +) +from megatron.core.transformer.module import MegatronModule +from megatron.core.transformer.moe.moe_utils import ( + MoEAuxLossAutoScaler, + save_to_aux_losses_tracker, + sinkhorn, + switch_load_balancing_loss_func, + topk_softmax_with_capacity, + z_loss_func, +) +from megatron.core.transformer.transformer_config import TransformerConfig + + +class Router(ABC, MegatronModule): + """Base Router class""" + + def __init__(self, config: TransformerConfig) -> None: + """ + Initialize the Router module. + + Args: + config (TransformerConfig): Configuration object for the Transformer model. + """ + super().__init__(config) + self.config = config + self.num_experts = self.config.num_moe_experts + self.moe_aux_loss_func = None + self.layer_number = None + + # Initialize the gate weights. + self.weight = torch.nn.Parameter( + torch.empty((self.config.num_moe_experts, self.config.hidden_size), dtype=torch.float32) + ) + if config.perform_initialization: + if get_cuda_rng_tracker().is_initialized(): + with get_cuda_rng_tracker().fork(get_data_parallel_rng_tracker_name()): + config.init_method(self.weight) + else: + config.init_method(self.weight) + self.weight.data = self.weight.data.to(dtype=config.params_dtype) + setattr(self.weight, 'sequence_parallel', config.sequence_parallel) + + def gating(self, input: torch.Tensor): + """Forward pass of the router gate. + + Args: + input (torch.Tensor): Input tensor. 
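+                Expected to have hidden_size as its last dimension.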
+ + Returns: + torch.Tensor: Logits tensor. + """ + if self.weight.device.type == 'cpu': + # move weights to GPU + self.weight.data = self.weight.data.to(device=torch.cuda.current_device()) + logits = torch.nn.functional.linear(input, self.weight) + return logits + + @abstractmethod + def routing(self, logits: torch.Tensor): + """Routing function. + + Args: + logits (torch.Tensor): Logits tensor. + + Returns: + Tuple[torch.Tensor, torch.Tensor]: + Tuple of tensors representing max probs and the indices. + """ + raise NotImplementedError("Routing function not implemented.") + + @abstractmethod + def forward(self, input: torch.Tensor): + """ + Forward pass of the router. + + Args: + input (torch.Tensor): Input tensor. + """ + raise NotImplementedError("Forward function not implemented.") + + def set_layer_number(self, layer_number: int): + """Set the layer number for the router.""" + self.layer_number = layer_number + + +class TopKRouter(Router): + """Route each token to the top-k experts.""" + + def __init__(self, config: TransformerConfig) -> None: + """Initialize the zero token dropping router. + + Args: + config (TransformerConfig): The configuration for the transformer model. + """ + super().__init__(config=config) + self.topk = self.config.moe_router_topk + self.routing_type = self.config.moe_router_load_balancing_type + self.input_jitter = None + + def sinkhorn_load_balancing(self, logits: torch.Tensor): + """Apply sinkhorn routing to the logits tensor. + + Args: + logits (torch.Tensor): The logits tensor. + + Returns: + torch.Tensor: The logits tensor after applying sinkhorn routing. + """ + + def _sinkhorn_activation(logits): + if self.topk == 1: + logits = torch.sigmoid(logits) + else: # k > 1 + logits = torch.softmax(logits, dim=-1, dtype=torch.float32).type_as(logits) + return logits + + assert self.config.moe_aux_loss_coeff == 0, "Sinkhorn routing does not support aux loss." + if self.training: + with torch.no_grad(): + norm_logits = sinkhorn( + logits.to(dtype=torch.float32) + ) # explicit fp32 conversion for stability + _, indices = torch.topk(norm_logits, k=self.topk, dim=1) + logits = _sinkhorn_activation(logits) + scores = torch.gather(logits, 1, indices) + else: + logits = _sinkhorn_activation(logits) + scores, indices = torch.topk(logits, k=self.topk, dim=1) + return scores, indices + + def aux_loss_load_balancing(self, logits: torch.Tensor): + """Apply loss-based load balancing to the logits tensor. + + Args: + logits (torch.Tensor): the logits tensor after gating, shape: [num_tokens, num_experts]. + + Returns: + probs (torch.Tensor): the probabilities tensor after load balancing. + indices (torch.Tensor): the indices tensor after top-k selection. + """ + probs, indices, tokens_per_expert = topk_softmax_with_capacity( + logits, + self.topk, + capacity_factor=self.config.moe_expert_capacity_factor, + pad_to_capacity=self.config.moe_pad_expert_input_to_capacity, + drop_policy=self.config.moe_token_drop_policy, + use_pre_softmax=self.config.moe_router_pre_softmax, + deterministic_mode=self.config.deterministic_mode, + ) + + if self.training: + # Apply load balancing loss + scores = torch.softmax(logits, dim=-1, dtype=torch.float32) + probs = self.apply_load_balancing_loss(scores, tokens_per_expert, activation=probs) + return probs, indices + + def apply_load_balancing_loss( + self, + probs: torch.Tensor, + num_local_tokens_per_expert: torch.Tensor, + activation: torch.Tensor, + ): + """Applies auxiliary loss to the MoE layer. 
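+
+        The auxiliary loss is computed by switch_load_balancing_loss_func and attached to
+        `activation` via MoEAuxLossAutoScaler, so the activation's forward value is left
+        unchanged while the auxiliary-loss gradient is injected during the backward pass.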
+ + Args: + probs (torch.Tensor): + The probs output by the router for each token. [num_tokens, num_experts] + num_local_tokens_per_expert (torch.Tensor): + The number of tokens per expert. [num_experts] + activation (torch.Tensor): The activation tensor to attach the gradient function to. + + Returns: + torch.Tensor: The activation tensor with the attached gradient function. + """ + moe_aux_loss_coeff = self.config.moe_aux_loss_coeff + sequence_partition_group = None + if self.config.moe_token_dispatcher_type == "alltoall_seq": + sequence_partition_group = parallel_state.get_context_parallel_group() + moe_aux_loss_coeff /= parallel_state.get_tensor_model_parallel_world_size() + else: + sequence_partition_group = parallel_state.get_tensor_and_context_parallel_group() + + aux_loss = switch_load_balancing_loss_func( + probs, + num_local_tokens_per_expert, + self.topk, + moe_aux_loss_coeff, + sequence_partition_group=sequence_partition_group, + ) + save_to_aux_losses_tracker( + "load_balancing_loss", + aux_loss / moe_aux_loss_coeff, + self.layer_number, + self.config.num_layers, + reduce_group=sequence_partition_group, + ) + activation = MoEAuxLossAutoScaler.apply(activation, aux_loss) + return activation + + def apply_z_loss(self, logits): + """Encourages the router's logits to remain small to enhance stability. + Please refer to the ST-MoE paper (https://arxiv.org/pdf/2202.08906.pdf) for details. + + Args: + logits (torch.Tensor): The logits of the router. + + Returns: + torch.Tensor: The logits after applying the z-loss. + """ + if self.config.moe_z_loss_coeff is not None and self.training: + moe_z_loss_coeff = ( + self.config.moe_z_loss_coeff + / parallel_state.get_tensor_and_context_parallel_world_size() + ) + z_loss = z_loss_func(logits, moe_z_loss_coeff) + logits = MoEAuxLossAutoScaler.apply(logits, z_loss) + save_to_aux_losses_tracker( + "z_loss", z_loss / moe_z_loss_coeff, self.layer_number, self.config.num_layers + ) + return logits + + def apply_input_jitter(self, input: torch.Tensor): + """Add noise to the input tensor. + Refer to https://arxiv.org/abs/2101.03961. + + Args: + input (Tensor): Input tensor. + + Returns: + Tensor: Jittered input. + """ + if self.config.moe_input_jitter_eps is not None: + eps = self.config.moe_input_jitter_eps + if self.input_jitter is None: + self.input_jitter = torch.distributions.uniform.Uniform( + torch.tensor(1.0 - eps, device=input.device), + torch.tensor(1.0 + eps, device=input.device), + ).rsample + return input * self.input_jitter(input.shape) + else: + return input + + def routing(self, logits: torch.Tensor): + """Top-k routing function + + Args: + logits (torch.Tensor): Logits tensor after gating. + + Returns: + probs (torch.Tensor): the probabilities tensor after load balancing. + indices (torch.Tensor): the indices tensor after top-k selection. 
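+
+        Example:
+            With 8 tokens, 4 experts, topk=2 and no token padding, `logits` is viewed as
+            [8, 4] and the returned probs and indices are both [8, 2].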
+ """ + logits = logits.view(-1, self.config.num_moe_experts) + + # Apply Z-Loss + logits = self.apply_z_loss(logits) + + if self.config.moe_token_dispatcher_type == "alltoall_seq": + # Gather the logits from the TP region + logits = gather_from_sequence_parallel_region(logits) + + if self.routing_type == "sinkhorn": + scores, indices = self.sinkhorn_load_balancing(logits) + elif self.routing_type == "aux_loss": + scores, indices = self.aux_loss_load_balancing(logits) + elif self.routing_type == "none": + # A naive top-k routing without load balancing + scores, indices, _ = topk_softmax_with_capacity( + logits, + self.topk, + capacity_factor=self.config.moe_expert_capacity_factor, + pad_to_capacity=self.config.moe_pad_expert_input_to_capacity, + drop_policy=self.config.moe_token_drop_policy, + use_pre_softmax=self.config.moe_router_pre_softmax, + deterministic_mode=self.config.deterministic_mode, + ) + else: + raise ValueError(f"Unsupported MoE routing type: {self.routing_type}") + + return scores, indices + + def forward(self, input: torch.Tensor): + """ + Forward pass of the router. + + Args: + input (torch.Tensor): Input tensor. + """ + self.hidden = input.shape[-1] + + # Apply input jitter + input = self.apply_input_jitter(input) + logits = self.gating(input) + logits = logits.view(-1, self.config.num_moe_experts) + + scores, indices = self.routing(logits) + + return scores, indices diff --git a/megatron/core/transformer/moe/shared_experts.py b/megatron/core/transformer/moe/shared_experts.py new file mode 100644 index 0000000000..c2d9c188e3 --- /dev/null +++ b/megatron/core/transformer/moe/shared_experts.py @@ -0,0 +1,262 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +import warnings +from copy import deepcopy +from typing import Optional + +import torch +import torch.nn.functional as F + +from megatron.core.dist_checkpointing.mapping import ShardedStateDict +from megatron.core.fusions.fused_bias_geglu import bias_geglu_impl +from megatron.core.fusions.fused_bias_gelu import bias_gelu_impl +from megatron.core.fusions.fused_bias_swiglu import bias_swiglu_impl +from megatron.core.tensor_parallel.mappings import ( + copy_to_tensor_model_parallel_region, + gather_from_sequence_parallel_region, + reduce_from_tensor_model_parallel_region, + reduce_scatter_to_sequence_parallel_region, +) +from megatron.core.tensor_parallel.random import ( + get_cuda_rng_tracker, + get_data_parallel_rng_tracker_name, +) +from megatron.core.transformer.mlp import MLP +from megatron.core.transformer.spec_utils import ModuleSpec +from megatron.core.transformer.transformer_config import TransformerConfig +from megatron.core.utils import make_sharded_tensor_for_checkpoint + + +class SharedExpertMLP(MLP): + """ + MLP layer for Shared Experts. + """ + + # This stream is used when '--moe-shared-expert-overlap' is set. + # The shared experts are scheduled into this stream to be overlapped with the dispatcher. + stream = None + + def __init__(self, config: TransformerConfig, spec: ModuleSpec): + config = deepcopy(config) + assert config.add_bias_linear == False, "bias is not supported in the shared experts, " + "please set '--disable-bias-linear' instead." 
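+        # config is deep-copied above so that the shared-expert overrides below do not
+        # leak into the config shared with the rest of the model.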
+ + config.ffn_hidden_size = config.moe_shared_expert_intermediate_size + super().__init__(config=config, submodules=spec.submodules) + + self.use_shared_expert_gate = spec.params.get("gate", False) + if self.use_shared_expert_gate: + self.gate_weight = torch.nn.Parameter(torch.empty((1, self.config.hidden_size))) + if config.perform_initialization: + if get_cuda_rng_tracker().is_initialized(): + with get_cuda_rng_tracker().fork(get_data_parallel_rng_tracker_name()): + config.init_method(self.gate_weight) + else: + config.init_method(self.gate_weight) + self.gate_weight.data = self.gate_weight.data.to(dtype=config.params_dtype) + setattr(self.gate_weight, 'sequence_parallel', self.config.sequence_parallel) + else: + self.gate_weight = None + + if self.config.moe_shared_expert_overlap: + # disable TP related AG/RS communications in the linear module + for linear in [self.linear_fc1, self.linear_fc2]: + if hasattr(linear, 'parallel_mode'): + # TELinear + linear.parallel_mode = None + else: + # MCore legacy Linear + linear.explicit_expert_comm = True + + # The overlapped version is splitted into some separated functions and is put inside + # the token dispatcher. These functions should be called in this order and no one can + # be skipped: + # pre_forward_comm(input) + # linear_fc1_forward_and_act() + # linear_fc2_forward() + # post_forward_comm() + # output = get_output() + # + # We use cached intermediate results to avoid messy arg passing in the dispatcher. + self.cached_fc1_input = None + self.cached_fc2_input = None + self.cached_fc2_output = None + self.cached_output = None + self.gate_score = None + + if self.stream is None: + self.stream = torch.cuda.Stream() + + def forward(self, hidden_states): + """Forward function""" + output, _ = super().forward(hidden_states) + if self.use_shared_expert_gate: + logits = torch.nn.functional.linear(hidden_states, self.gate_weight) + gate_score = torch.nn.functional.sigmoid(logits) + output = output * gate_score + return output + + def sharded_state_dict( + self, prefix: str = '', sharded_offsets: tuple = (), metadata: Optional[dict] = None + ) -> ShardedStateDict: + """Gets sharded state dict.""" + sharded_state_dict = super().sharded_state_dict(prefix, sharded_offsets, metadata) + if self.use_shared_expert_gate: + name = 'gate_weight' + state_dict = self.state_dict(prefix='', keep_vars=True) + sub_sd = { + f'{prefix}{name}': make_sharded_tensor_for_checkpoint( + state_dict[name], f'{prefix}{name}', prepend_offsets=sharded_offsets + ) + } + sharded_state_dict.update(sub_sd) + return sharded_state_dict + + def pre_forward_comm(self, input): + """ + All Gather for SP before forward. + This function is used to overlap shared experts with the dispatcher. + It is only useful when --moe-shared-expert-overlap is set and may be changed. 
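+        When the shared-expert gate is enabled, the gate score is also computed here on
+        the side stream so that it is ready when get_output() is called.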
+ """ + assert self.config.moe_shared_expert_overlap + assert self.cached_output is None + self.stream.wait_stream(torch.cuda.current_stream()) + with torch.cuda.stream(self.stream): + if self.use_shared_expert_gate: + logits = torch.nn.functional.linear(input, self.gate_weight) + self.gate_score = torch.nn.functional.sigmoid(logits) + if self.config.sequence_parallel: + self.cached_fc1_input = gather_from_sequence_parallel_region( + input, tensor_parallel_output_grad=True + ) + else: + self.cached_fc1_input = copy_to_tensor_model_parallel_region(input) + set_tensor_grad_fn_sequence_sr(self.cached_fc1_input, torch.iinfo(torch.int).max) + + def linear_fc1_forward_and_act(self, overlapped_comm_output=None): + """ + Do Linear FC1 and activation function forward. + This function is used to overlap shared experts with the dispatcher. + It is only useful when --moe-shared-expert-overlap is set and may be changed. + """ + assert self.config.moe_shared_expert_overlap + assert self.cached_fc1_input is not None + if overlapped_comm_output is not None: + set_tensor_grad_fn_sequence_sr(overlapped_comm_output, torch.iinfo(torch.int).max) + with torch.cuda.stream(self.stream): + # [s, b, 4 * h/p] + intermediate_parallel, bias_parallel = self.linear_fc1(self.cached_fc1_input) + self.cached_fc1_input = None + + if self.config.bias_activation_fusion: + if self.activation_func == F.gelu: + if self.config.gated_linear_unit: + intermediate_parallel = bias_geglu_impl( + intermediate_parallel, bias_parallel + ) + else: + assert self.config.add_bias_linear is True + intermediate_parallel = bias_gelu_impl(intermediate_parallel, bias_parallel) + elif self.activation_func == F.silu and self.config.gated_linear_unit: + intermediate_parallel = bias_swiglu_impl( + intermediate_parallel, + bias_parallel, + self.config.activation_func_fp8_input_store, + ) + else: + raise ValueError("Only support fusion of gelu and swiglu") + else: + if bias_parallel is not None: + intermediate_parallel = intermediate_parallel + bias_parallel + if self.config.gated_linear_unit: + + def glu(x): + x = torch.chunk(x, 2, dim=-1) + return self.config.activation_func(x[0]) * x[1] + + intermediate_parallel = glu(intermediate_parallel) + else: + intermediate_parallel = self.activation_func(intermediate_parallel) + + self.cached_fc2_input = intermediate_parallel + + def linear_fc2_forward(self, overlapped_comm_output=None): + """ + Do Linear FC2 forward. + This function is used to overlap shared experts with the dispatcher. + It is only useful when --moe-shared-expert-overlap is set and may be changed. + """ + assert self.config.moe_shared_expert_overlap + assert self.cached_fc2_input is not None + if overlapped_comm_output is not None: + set_tensor_grad_fn_sequence_sr(overlapped_comm_output, torch.iinfo(torch.int).max) + with torch.cuda.stream(self.stream): + # [s, b, h] + self.cached_fc2_output, _ = self.linear_fc2(self.cached_fc2_input) + self.cached_fc2_input = None + + def post_forward_comm(self): + """ + Reduce scatter for SP after forward. + This function is used to overlap shared experts with the dispatcher. + It is only useful when --moe-shared-expert-overlap is set and may be changed. 
+ """ + assert self.config.moe_shared_expert_overlap + assert self.cached_fc2_output is not None + with torch.cuda.stream(self.stream): + if self.config.sequence_parallel: + self.cached_output = reduce_scatter_to_sequence_parallel_region( + self.cached_fc2_output + ) + else: + self.cached_output = reduce_from_tensor_model_parallel_region( + self.cached_fc2_output + ) + self.cached_fc2_output = None + set_tensor_grad_fn_sequence_sr(self.cached_output, torch.iinfo(torch.int).max) + + def get_output(self): + """ + Gets the module forward output. + This function is used to overlap shared experts with the dispatcher. + It is only useful when --moe-shared-expert-overlap is set and may be changed. + """ + assert self.config.moe_shared_expert_overlap + assert self.cached_output is not None + with torch.cuda.stream(self.stream): + if self.use_shared_expert_gate: + assert self.gate_score is not None + output = self.cached_output * self.gate_score + self.gate_score = None + else: + output = self.cached_output + self.cached_output = None + torch.cuda.current_stream().wait_stream(self.stream) + return output + + +TORCH_MAJOR = int(torch.__version__.split(".")[0]) +TORCH_MINOR = int(torch.__version__.split(".")[1]) +TORCH_LAST = torch.__version__.split(".")[2] + + +def set_tensor_grad_fn_sequence_sr(tensor, value): + """ + Set sequence_sr for the grad_fn of a tensor to control the backward order. + For older PyTorch version, do nothing (backward order is not changed). + The bigger the value is, the earlier the grad_fn is scheduled. + """ + if ( + (TORCH_MAJOR > 2) + or (TORCH_MAJOR == 2 and TORCH_MINOR > 2) + or (TORCH_MAJOR == 2 and TORCH_MINOR == 2 and '+' not in TORCH_LAST) + ): + # In NVIDIA PyTorch container 24.01, the PyTorch version is 2.2.0a0+81ea7a4, + # which does not contian the set_sequence_nr commit. + if tensor is not None and tensor.grad_fn is not None: + tensor.grad_fn._set_sequence_nr(value) + else: + warnings.warn( + "WARNING : PyTorch is too old to set sequence_sr and the performance may not " + "optimal. Please use PyTorch >= 2.2.0 for better performance." + ) diff --git a/megatron/core/transformer/moe/token_dispatcher.py b/megatron/core/transformer/moe/token_dispatcher.py new file mode 100644 index 0000000000..db1b1920fa --- /dev/null +++ b/megatron/core/transformer/moe/token_dispatcher.py @@ -0,0 +1,627 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +from abc import abstractmethod +from typing import List, Optional, Tuple + +import torch + +from megatron.core import parallel_state, tensor_parallel +from megatron.core.tensor_parallel.mappings import ( + _gather_along_first_dim_moe, + gather_from_sequence_parallel_region, + reduce_scatter_to_sequence_parallel_region, +) +from megatron.core.transformer.moe.moe_utils import ( + moe_gather, + moe_scatter, + permute, + sort_chunks_by_idxs, + unpermute, +) +from megatron.core.transformer.moe.shared_experts import SharedExpertMLP +from megatron.core.transformer.transformer_config import TransformerConfig + +""" We use the following notation throughout this file: + H: hidden size + B: micro batch size + S: sequence length + TP: tensor model parallel size + EP: expert model parallel size + num_local_tokens: S/TP*B + num_global_tokens: num_local_tokens*TP*EP +""" + + +class MoETokenDispatcher: + """ + MoE Token Dispatcher + """ + + def __init__(self, config: TransformerConfig) -> None: + """ + Initialize the MoE Token Dispatcher. 
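+
+        Args:
+            config (TransformerConfig): Configuration for the transformer model.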
+ """ + self.config = config + self.shared_experts: Optional[SharedExpertMLP] = None + + @abstractmethod + def token_permutation(self, tokens: torch.Tensor, indices: torch.Tensor): + """Dispatch tokens to experts. + + Args: + tokens (torch.Tensor): Input tokens. + indices (torch.Tensor): indices tensor. + + Returns: + torch.Tensor: Tokens tensor. + """ + raise NotImplementedError("Dispatch function not implemented.") + + @abstractmethod + def token_unpermutation( + self, expert_output: torch.Tensor, probs: torch.Tensor, indices: torch.Tensor + ): + """Restores the expert output to its original ordering. + + Args: + expert_output (torch.Tensor): The output tensor from the expert models. + probs (torch.Tensor): Each token's score with each expert. + indices (torch.Tensor): The indices used to reorder the expert output. + + Returns: + (torch.Tensor, torch.Tensor): Unpermuted activation and optional bias. + """ + raise NotImplementedError("Restore function not implemented.") + + def set_shared_experts(self, shared_experts): + """Set shared expert to the dispatcher.""" + self.shared_experts = shared_experts + + +class MoEAllGatherTokenDispatcher(MoETokenDispatcher): + """ + AllGather Based Token dispatcher. + Note that this allgather spans the communication domain of TP*EP: + """ + + def __init__( + self, num_local_experts: int, local_expert_indices: List[int], config: TransformerConfig + ) -> None: + """ + Initialize the zero token dropping router. + """ + super().__init__(config=config) + self.num_local_experts = num_local_experts + assert self.num_local_experts > 0, "Expected at least one expert" + self.local_expert_indices = local_expert_indices + assert len(self.local_expert_indices) > 0, "Expected at least one local expert index" + self.router_topk = config.moe_router_topk + self.add_bias = config.add_bias_linear + + # self.local_probs: probs of global token assignment to local experts. + self.local_probs = None + + # self.global_local_map: 2D tensor. A mask of mapping between global and local tokens where + # each element is True if it's between the local_expert_indices. Only useful when cross + # device token permutation is enabled and **AllGahter** is performed. + self.global_local_map = None + + def token_permutation( + self, hidden_states: torch.Tensor, max_prob: torch.Tensor, max_ind: torch.Tensor + ): + """Dispatch tokens to local experts. It's composed of two stages: + (1) Permute the tokens across the expert parallel devices. After this stage, + each device receives all of the tokens assigned to its local set of experts + in its local HBM. + (2) Permute the tokens locally so that they are grouped by their expert + assignment. After the stage (1), the tokens are grouped by which device + they came from. We re-order them locally for subsequent efficient computation. + + Args: + hidden_states: 3D tensor [S/TP, B, H]. Input tokens. + max_prob: 2D tensor [S/TP*B, topk]. Each row of max_prob contains + the probility distribution across `topk` experts for one local token. + For 'aux_loss' load balancing, the sum of the values in each row is 1, + thus for `top1` gating, it degenerates into a full 1 tensor. + max_ind: 2D tensor [num_local_tokens, topk], where + `num_local_tokens=S/TP*B`. Token assignment to global experts. + + Returns: + permuted_local_hidden_states: Permutation of tokens to local experts group. + tokens_per_expert: the number of tokens each local expert to process. 
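+
+        Example:
+            With S=16, TP=2, B=1, H=8 and topk=2, hidden_states is [8, 1, 8] while
+            max_prob and max_ind are both [8, 2].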
+ """ + self.hidden_shape = hidden_states.shape + # [S/TP, B, H] -> [S*B/TP, H] + hidden_states = hidden_states.view(-1, self.hidden_shape[-1]) + + # Permute the tokens across the expert parallel devices. + if (self.config.tensor_model_parallel_size > 1) or ( + self.config.expert_model_parallel_size > 1 + ): + ## local_indices calculation + with torch.no_grad(): + # [num_local_tokens, topk] -> [num_global_tokens, topk], where: + # num_local_tokens=(S/TP)*B, num_global_tokens=S*B*EP + global_indices = tensor_parallel.gather_from_sequence_parallel_region_to_moe( + max_ind + ) + # Create a mask of mapping between global and local tokens where each + # element is True if it's between the local_expert_indices + global_local_mask = (global_indices >= self.local_expert_indices[0]) & ( + global_indices <= self.local_expert_indices[-1] + ) + local_indices = global_indices.masked_select(global_local_mask) + + ## local_probs calculation + # max_prob: [S/TP*B, topk] -> global_probs: [S*B*EP, topk] + global_probs = tensor_parallel.gather_from_sequence_parallel_region_to_moe(max_prob) + self.local_probs = global_probs.masked_select(global_local_mask) + self.local_probs = self.local_probs.view(-1, 1) + # Note that this allgather spans the communication domain of TP*EP. + # [(S/TP)*B, H] -> [((S/TP)*B)*(TP*EP), H] = [S*B*EP, H] + global_hidden_states = tensor_parallel.gather_from_sequence_parallel_region_to_moe( + hidden_states, use_global_buffer=True + ) + # Reshape global_local_mask to be compatible with Tensor.gather + global_local_map = global_local_mask.nonzero()[:, 0] + self.global_local_map = global_local_map.view(-1, 1).expand(-1, hidden_states.shape[-1]) + local_hidden_states = moe_gather.apply(global_hidden_states, self.global_local_map) + else: + if self.router_topk > 1: + global_local_mask = torch.ones_like(max_ind).bool() + local_indices = max_ind.masked_select(global_local_mask) + self.local_probs = max_prob.masked_select(global_local_mask) + self.local_probs = self.local_probs.view(-1, 1) + global_local_map = global_local_mask.nonzero()[:, 0] + self.global_local_map = global_local_map.view(-1, 1).expand( + -1, hidden_states.shape[-1] + ) + local_hidden_states = torch.gather(hidden_states, 0, self.global_local_map) + else: + local_indices = max_ind + self.local_probs = max_prob.view(-1, 1) + local_hidden_states = hidden_states + self.global_local_map = None + + with torch.no_grad(): + # The indices of local_indices that give its sorted order along dim 0. 
+ self.indices = torch.argsort(local_indices, dim=0) + if self.config.deterministic_mode: + tokens_per_expert = torch.bincount( + local_indices.view(-1), minlength=self.config.num_moe_experts + ) + if self.num_local_experts < self.config.num_moe_experts: + tokens_per_expert = tokens_per_expert[ + self.local_expert_indices[0] : self.local_expert_indices[-1] + 1 + ] + else: + tokens_per_expert = torch.histc( + local_indices, + bins=self.num_local_experts, + min=self.local_expert_indices[0], + max=self.local_expert_indices[-1], + ) + tokens_per_expert = tokens_per_expert.cpu().to(torch.long) + + # Stage2: permute the tokens locally so that they are grouped by their expert assignment + # Reshape indices to be compatible with Tensor.gather + + permuted_local_hidden_states, self.reversed_local_input_permutation_mapping = permute( + local_hidden_states, local_indices + ) + + return permuted_local_hidden_states, tokens_per_expert + + def token_unpermutation(self, hidden_states: torch.Tensor, bias: torch.Tensor = None): + """ + Reverse process of `dispatch()` which permutes the output of local + experts locallay and across expert parallel rank into the original order to + produce the final output. + + Args: + hidden_states: 2D tensor [num_permuted_tokens_for_local_experts, H], + output of local experts. + bias (optional): The bias tensor. + + Returns: + output_total: un-permuted updated hidden states output from all local experts + with shape of [S/TP, B, H] + """ + # Stage1: unpermute the tokens and bias locally respectively. + # Scale the expert output prior to reduction and subsequent to local unpermutation if k > 1. + + unpermuted_local_hidden = unpermute( + hidden_states, self.reversed_local_input_permutation_mapping + ) + unpermuted_local_hidden = unpermuted_local_hidden * self.local_probs + + unpermuted_local_bias = None + if self.add_bias: + assert bias is not None + unpermuted_local_bias = torch.zeros_like(hidden_states) + unpermuted_local_bias = unpermute(bias, self.reversed_local_input_permutation_mapping) + unpermuted_local_bias = unpermuted_local_bias * self.local_probs + + output_total = unpermuted_local_hidden + output_bias_total = unpermuted_local_bias + + # Unpermute the tokens across expert parallel devices. + if (self.config.tensor_model_parallel_size > 1) or ( + self.config.expert_model_parallel_size > 1 + ): + assert ( + self.global_local_map is not None + ), "global_local_map is necessary for `AllGather`." + ep_group_size = parallel_state.get_tensor_and_expert_parallel_world_size() + # hidden_shape: [S/TP, B, H], gloal_num_tokens = S/TP*B*(TP*EP) + global_num_tokens = self.hidden_shape[0] * self.hidden_shape[1] * ep_group_size + global_hidden_shape = [global_num_tokens, hidden_states.shape[-1]] + assert self.global_local_map.shape == unpermuted_local_hidden.shape + unpermuted_global_hidden = moe_scatter.apply( + unpermuted_local_hidden, self.global_local_map, global_hidden_shape + ) + output_total = tensor_parallel.reduce_scatter_to_sequence_parallel_region_from_moe( + unpermuted_global_hidden + ) + if self.add_bias: + # Unpermute the bias across expert parallel devices. 
+ unpermuted_global_bias = torch.zeros_like(unpermuted_global_hidden) + unpermuted_global_bias = unpermuted_global_bias.scatter_add( + 0, self.global_local_map, unpermuted_local_bias + ) + output_bias_total = ( + tensor_parallel.reduce_scatter_to_sequence_parallel_region_from_moe( + unpermuted_global_bias + ) + ) + # bias is duplicated across tensor parallelism ranks; + # reduce scatter reduces bias across tensor parallel_ranks + output_bias_total = ( + output_bias_total / parallel_state.get_tensor_model_parallel_world_size() + ) + else: + if self.router_topk > 1: + global_num_tokens = self.hidden_shape[0] * self.hidden_shape[1] + global_hidden_shape = [global_num_tokens, hidden_states.shape[-1]] + unpermuted_global_hidden = torch.zeros( + global_hidden_shape, + dtype=hidden_states.dtype, + device=torch.cuda.current_device(), + ) + output_total = unpermuted_global_hidden.scatter_add( + 0, self.global_local_map, unpermuted_local_hidden + ) + if self.add_bias: + unpermuted_global_bias = torch.zeros_like(unpermuted_global_hidden) + output_bias_total = unpermuted_global_bias.scatter_add( + 0, self.global_local_map, unpermuted_local_bias + ) + + output_total = output_total.view(self.hidden_shape) + if self.add_bias: + output_bias_total = output_bias_total.view(self.hidden_shape) + else: + output_bias_total = None + + return output_total, output_bias_total + + +class MoEAlltoAllTokenDispatcher(MoETokenDispatcher): + """ + AlltoAll-based token dispatcher. + """ + + def __init__( + self, num_local_experts: int, local_expert_indices: List[int], config: TransformerConfig + ) -> None: + """ + Initialize the AlltoAll token dispatcher. + + Args: + num_local_experts (int): Number of local experts on the current device. + local_expert_indices (List[int]): Indices of local experts on the current device. + config (TransformerConfig): Configuration for the transformer model. + """ + super().__init__(config=config) + self.hidden_shape = None + self.num_local_experts = num_local_experts + self.num_experts = config.num_moe_experts + assert self.num_local_experts > 0, "Expected at least one expert" + self.local_expert_indices = local_expert_indices + assert ( + len(self.local_expert_indices) == self.num_local_experts + ), "Invalid local expert indices" + for i in range(len(self.local_expert_indices) - 1): + assert ( + self.local_expert_indices[i] == self.local_expert_indices[i + 1] - 1 + ), "local_expert_indices must be continous" + self.ep_size = config.expert_model_parallel_size + self.tp_size = config.tensor_model_parallel_size + self.probs = None + + # [ep_size]. Represents the number of tokens sent by the current rank to other + # EP ranks. + self.input_splits = None + # [ep_size]. Represents the number of tokens received by the current rank from + # other EP ranks. + self.output_splits = None + # [tp_size]. Represents the number of tokens received by the current rank from + # other TP ranks. + self.output_splits_tp = None + # [tp_size * ep_size, num_local_experts]. Represents the number of tokens sent + # to each local expert by all ranks. + self.num_global_tokens_per_local_expert_cpu = None + input_chunk_idxs = torch.arange(self.num_experts * self.tp_size) + # [num_local_experts, tp_size * ep_size]. Sort the input chunks by local experts. + self.sort_input_by_local_experts = ( + input_chunk_idxs.reshape(-1, self.num_local_experts).T.ravel().tolist() + ) + # [tp_size * ep_size, num_local_experts]. Restore the output chunks by local experts. 
+ self.restore_output_by_local_experts = ( + input_chunk_idxs.reshape(self.num_local_experts, -1).T.ravel().tolist() + ) + + # Token drop and padding. + # We need to keep track of the token num if we drop tokens without padding them. + self.num_out_tokens = None + # Drop and pad the input to capacity. + self.drop_and_pad = self.config.moe_pad_expert_input_to_capacity + if self.drop_and_pad: + assert self.config.moe_expert_capacity_factor is not None + self.capacity = None + + # A cuda stream synchronization is needed in self.token_permutation() in some cases, + # because there are several non-blocking DtoH data transfers called in self.preprocess(). + # The synchronization happens at different points based on MoE settings as late as possible. + # Valid sync points are "before_permutation_1", "before_ep_alltoall", "before_finish", + # and "no_sync". + self.cuda_sync_point = "no_sync" + + self.shared_experts = None + + def preprocess(self, indices: torch.Tensor) -> torch.Tensor: + """ + Preprocess token indices for AlltoAll communication and token permutation. This method + computes the number of tokens assigned to each expert based on the input indices. + It also initializes the necessary data structures for AlltoAll communication, such as input + and output splits, and the mapping between global tokens and local experts. + + Args: + indices (torch.Tensor): Tensor of indices mapping tokens to experts. + + Returns: + torch.Tensor: Tensor containing the number of tokens assigned to local expert. + """ + if self.config.deterministic_mode: + num_local_tokens_per_expert = torch.bincount( + indices.view(-1), minlength=self.num_experts + ) + else: + num_local_tokens_per_expert = torch.histc( + indices, bins=self.num_experts, min=0, max=self.num_experts + ) + # num_local_tokens_per_expert: [num_experts] + + tp_rank = parallel_state.get_tensor_model_parallel_rank() + if self.drop_and_pad: + # probs: [num_experts, local_capacity] + self.capacity = self.probs.size(1) + num_tokens_per_local_expert = torch.full( + (self.num_local_experts,), + self.capacity * self.tp_size * self.ep_size, + dtype=torch.long, + ) + # [tp_size * ep_size, num_local_experts]. + self.num_global_tokens_per_local_expert_cpu = torch.full( + (self.num_experts * self.tp_size,), self.capacity, dtype=torch.long + ) + return num_tokens_per_local_expert + elif self.config.moe_expert_capacity_factor is not None: + # Token drop but no pad. A synchronization is needed before the first + # permutation to get the `num_out_tokens` CPU value. + self.num_out_tokens = num_local_tokens_per_expert.sum().to( + torch.device("cpu"), non_blocking=True + ) + self.cuda_sync_point = "before_permutation_1" + elif self.ep_size > 1 or self.num_local_experts > 1: + # Token dropless and enable ep. A synchronization is needed before expert parallel + # AlltoAll communication to get the `input_splits` and `output_splits` CPU values. + self.cuda_sync_point = "before_ep_alltoall" + else: + # Token dropless and no ep. A synchronization is needed before the token_permutation() + # function returns to get the `tokens_per_expert` CPU value. + self.cuda_sync_point = "before_finish" + + if self.ep_size > 1 or self.tp_size > 1: + # =================================================== + # Calculate input_splits, output_splits for alltoall/allgather in variable size. 
+ # =================================================== + self.input_splits = ( + num_local_tokens_per_expert.reshape(self.ep_size, self.num_local_experts) + .sum(axis=1) + .to(torch.device("cpu"), non_blocking=True) + .numpy() + ) + # Gather the global distribution of tokens across ranks. + # num_global_tokens_per_expert represents the number of tokens sent to each + # expert by all ranks. + # [tp_size, ep_size, num_experts] + num_global_tokens_per_expert = ( + _gather_along_first_dim_moe(num_local_tokens_per_expert) + .reshape(self.ep_size, self.tp_size, self.num_experts) + .transpose(0, 1) + ) + # [tp_size, ep_size, num_experts] -> [tp_size, ep_size, num_local_experts] + num_global_tokens_per_local_expert = num_global_tokens_per_expert[ + :, :, self.local_expert_indices[0] : self.local_expert_indices[-1] + 1 + ].contiguous() + # [tp_size, ep_size, num_local_experts] -> [tp_size, ep_size] + num_global_tokens_per_rank = num_global_tokens_per_local_expert.sum(axis=2) + # [tp_size, ep_size] -> [ep_size] + # self.output_splits represents the number of tokens received by the current rank + # from other EP rank. + self.output_splits = ( + num_global_tokens_per_rank[tp_rank] + .to(torch.device("cpu"), non_blocking=True) + .numpy() + ) + # [tp_size, ep_size] -> [tp_size] + # self.output_splits_tp represents the number of tokens received by the current + # rank from other TP rank. + self.output_splits_tp = ( + num_global_tokens_per_rank.sum(axis=1) + .to(torch.device("cpu"), non_blocking=True) + .numpy() + ) + # [tp_size, ep_size, num_local_experts] -> [num_local_experts] + num_tokens_per_local_expert = num_global_tokens_per_local_expert.sum(dim=(0, 1)).to( + torch.device("cpu"), non_blocking=True + ) + else: + num_global_tokens_per_local_expert = num_local_tokens_per_expert.reshape( + self.num_experts + ) + num_tokens_per_local_expert = num_local_tokens_per_expert.to( + torch.device("cpu"), non_blocking=True + ) + + if self.num_local_experts > 1: + self.num_global_tokens_per_local_expert_cpu = num_global_tokens_per_local_expert.view( + -1, self.num_local_experts + ).to(torch.device("cpu"), non_blocking=True) + + return num_tokens_per_local_expert + + def token_permutation( + self, hidden_states: torch.Tensor, probs: torch.Tensor, indices: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Dispatch tokens to local experts using AlltoAll communication. + + Args: + hidden_states (torch.Tensor): Input token embeddings. + probs (torch.Tensor): Probs of tokens assigned to experts. + indices (torch.Tensor): Indices of tokens assigned to experts. + + Returns: + Tuple[torch.Tensor, torch.Tensor]: + - Permuted token embeddings for local experts. + - Number of tokens per expert. + """ + # Preprocess: Get the metadata for communication, permutation and computation operations. 
+ self.hidden_shape = hidden_states.shape + self.probs = probs + assert probs.dim() == 2, "Expected 2D tensor for probs" + assert indices.dim() == 2, "Expected 2D tensor for indices" + hidden_states = hidden_states.view(-1, self.hidden_shape[-1]) + tokens_per_expert = self.preprocess(indices) + + if self.shared_experts is not None: + self.shared_experts.pre_forward_comm(hidden_states.view(self.hidden_shape)) + + # Permutation 1: input to AlltoAll input + self.hidden_shape_before_permute = hidden_states.shape + if self.cuda_sync_point == "before_permutation_1": + torch.cuda.current_stream().synchronize() + permutated_local_input_tokens, self.reversed_local_input_permutation_mapping = permute( + hidden_states, + indices, + num_out_tokens=self.num_out_tokens, + padded_mode=self.drop_and_pad, + ) + + # Perform expert parallel AlltoAll communication + if self.cuda_sync_point == "before_ep_alltoall": + torch.cuda.current_stream().synchronize() + global_input_tokens = tensor_parallel.all_to_all( + parallel_state.get_expert_model_parallel_group(), + permutated_local_input_tokens, + self.output_splits, + self.input_splits, + ) + if self.shared_experts is not None: + self.shared_experts.linear_fc1_forward_and_act(global_input_tokens) + + if parallel_state.get_tensor_model_parallel_world_size() > 1: + global_input_tokens = gather_from_sequence_parallel_region( + global_input_tokens, + output_split_sizes=( + self.output_splits_tp.tolist() if self.output_splits_tp is not None else None + ), + ) + + # Permutation 2: Sort tokens by local expert. + if self.num_local_experts > 1: + global_input_tokens = sort_chunks_by_idxs( + global_input_tokens, + self.num_global_tokens_per_local_expert_cpu.ravel(), + self.sort_input_by_local_experts, + ) + + if self.cuda_sync_point == "before_finish": + torch.cuda.current_stream().synchronize() + + return global_input_tokens, tokens_per_expert + + def token_unpermutation( + self, hidden_states: torch.Tensor, bias: torch.Tensor = None + ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: + """ + Reverse the token permutation to restore the original order. + + Args: + hidden_states (torch.Tensor): Output from local experts. + bias (torch.Tensor, optional): Bias tensor (not supported). + + Returns: + Tuple[torch.Tensor, Optional[torch.Tensor]]: + - Unpermuted token embeddings in the original order. + - None (bias is not supported). + """ + assert bias is None, "Bias is not supported in MoEAlltoAllTokenDispatcher" + + # Unpermutation 2: Unsort tokens by local expert. + if self.num_local_experts > 1: + hidden_states = sort_chunks_by_idxs( + hidden_states, + self.num_global_tokens_per_local_expert_cpu.T.ravel(), + self.restore_output_by_local_experts, + ) + + if parallel_state.get_tensor_model_parallel_world_size() > 1: + hidden_states = reduce_scatter_to_sequence_parallel_region( + hidden_states, + input_split_sizes=( + self.output_splits_tp.tolist() if self.output_splits_tp is not None else None + ), + ) + + # Perform expert parallel AlltoAll communication + # hidden_states: [SEQL, H] -> [SEQL, H/TP] + permutated_local_input_tokens = tensor_parallel.all_to_all( + parallel_state.get_expert_model_parallel_group(), + hidden_states, + self.input_splits, + self.output_splits, + ) + if self.shared_experts is not None: + self.shared_experts.linear_fc2_forward(permutated_local_input_tokens) + self.shared_experts.post_forward_comm() + + # Unpermutation 1: Unsort input tokens to restore the original order. 
+ output = unpermute( + permutated_local_input_tokens, + self.reversed_local_input_permutation_mapping, + probs=self.probs, + padded_mode=self.drop_and_pad, + restore_shape=self.hidden_shape_before_permute, + ) + + # Reshape the output tensor + output = output.view(self.hidden_shape) + + # Add shared experts output + if self.shared_experts is not None: + shared_expert_output = self.shared_experts.get_output() + output += shared_expert_output + return output, None diff --git a/megatron/core/transformer/moe/upcycling_utils.py b/megatron/core/transformer/moe/upcycling_utils.py new file mode 100644 index 0000000000..b905fc99be --- /dev/null +++ b/megatron/core/transformer/moe/upcycling_utils.py @@ -0,0 +1,196 @@ +# Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved. +""" Helpers for converting a dense model to a MoE model in runtime """ +from megatron.core import mpu + + +def _get_keys_endswith(model, suffix): + """ + Retrieve keys from the model that end with a specified suffix. + """ + return [k for k in model if k.endswith(suffix)] + + +def _covert_to_moe_state_dict(state_dict, moe_model): + """ + Convert a dense model's state_dict to a MoE model's state_dict. + + This function takes the state dictionary of a dense model and modifies it to fit the + structure required by a Mixture of Experts model. It handles the necessary + transformations for weights and biases specific to the MoE architecture. + + Args: + state_dict (dict): The dense model's state_dict. + moe_model (nn.Module): The MoE model instance from which to get the submodule + and state_dict, must be a model without FP16 and/or + DDP wrapper. + + Returns: + dict: The converted MoE model state_dict, ready for use in the MoE architecture. + """ + + mlp = moe_model.get_submodule('decoder.layers.0.mlp') + + moe_state_dict = moe_model.state_dict() + new_state_dict = state_dict + + mlp_lm_weight_keys = _get_keys_endswith(new_state_dict, 'mlp.linear_fc1.layer_norm_weight') + mlp_lm_bias_keys = _get_keys_endswith(new_state_dict, 'mlp.linear_fc1.layer_norm_bias') + mlp_fc1_weight_keys = _get_keys_endswith(new_state_dict, 'mlp.linear_fc1.weight') + mlp_fc2_weight_keys = _get_keys_endswith(new_state_dict, 'mlp.linear_fc2.weight') + mlp_fc1_bias_keys = _get_keys_endswith(new_state_dict, 'mlp.linear_fc1.bias') + mlp_fc2_bias_keys = _get_keys_endswith(new_state_dict, 'mlp.linear_fc2.bias') + mlp_fc1_extra_state_keys = _get_keys_endswith(new_state_dict, 'mlp.linear_fc1._extra_state') + mlp_fc2_extra_state_keys = _get_keys_endswith(new_state_dict, 'mlp.linear_fc2._extra_state') + + for key in mlp_lm_weight_keys: + params = new_state_dict.pop(key) + new_key = key.replace('mlp.linear_fc1.layer_norm_weight', 'pre_mlp_layernorm.weight') + new_state_dict[new_key] = params + + for key in mlp_lm_bias_keys: + params = new_state_dict.pop(key) + new_key = key.replace('mlp.linear_fc1.layer_norm_bias', 'pre_mlp_layernorm.bias') + new_state_dict[new_key] = params + + for mlp_weight_key in mlp_fc1_weight_keys: + router_key = mlp_weight_key.replace('mlp.linear_fc1.weight', 'mlp.router.weight') + new_state_dict[router_key] = moe_state_dict[router_key].data.data.clone() + + use_te_grouped_gemm = 'decoder.layers.0.mlp.experts.linear_fc1.weight0' in moe_state_dict + + if mlp.config.moe_grouped_gemm and use_te_grouped_gemm: + for mlp_weight_key in mlp_fc1_weight_keys: + weight_tensor = new_state_dict.pop(mlp_weight_key) + for expert_i in range(mlp.num_local_experts): + new_key = mlp_weight_key.replace( + 'mlp.linear_fc1.weight', 
f'mlp.experts.linear_fc1.weight{expert_i}' + ) + new_state_dict[new_key] = weight_tensor.clone() + + for mlp_weight_key in mlp_fc2_weight_keys: + weight_tensor = new_state_dict.pop(mlp_weight_key) + for expert_i in range(mlp.num_local_experts): + new_key = mlp_weight_key.replace( + 'mlp.linear_fc2.weight', f'mlp.experts.linear_fc2.weight{expert_i}' + ) + new_state_dict[new_key] = weight_tensor.clone() + + for extra_state_key in mlp_fc1_extra_state_keys: + new_state_dict.pop(extra_state_key) + new_key = extra_state_key.replace( + 'mlp.linear_fc1._extra_state', 'mlp.experts.linear_fc1._extra_state' + ) + new_state_dict[new_key] = None + + for extra_state_key in mlp_fc2_extra_state_keys: + new_state_dict.pop(extra_state_key) + new_key = extra_state_key.replace( + 'mlp.linear_fc2._extra_state', 'mlp.experts.linear_fc2._extra_state' + ) + new_state_dict[new_key] = None + + elif mlp.config.moe_grouped_gemm: + for mlp_weight_key in mlp_fc1_weight_keys: + weight_tensor = new_state_dict.pop(mlp_weight_key) + shape = weight_tensor.shape + weight_tensor = weight_tensor.repeat(mlp.num_local_experts, 1, 1) + weight_tensor = weight_tensor.permute(0, 2, 1).reshape( + shape[1], mlp.num_local_experts * shape[0] + ) + new_key = mlp_weight_key.replace('mlp.linear_fc1.weight', 'mlp.experts.weight1') + new_state_dict[new_key] = weight_tensor + + for mlp_weight_key in mlp_fc2_weight_keys: + weight_tensor = new_state_dict.pop(mlp_weight_key) + shape = weight_tensor.shape + weight_tensor = weight_tensor.repeat(mlp.num_local_experts, 1, 1) + weight_tensor = weight_tensor.permute(0, 2, 1).reshape( + mlp.num_local_experts * shape[1], shape[0] + ) + new_key = mlp_weight_key.replace('mlp.linear_fc2.weight', 'mlp.experts.weight2') + new_state_dict[new_key] = weight_tensor + + else: + + def covert_to_experts(keys): + for key in keys: + params = new_state_dict.pop(key) + new_key_format_str = key.replace('mlp', 'mlp.experts.local_experts.{}') + for expert_i in range(mlp.num_local_experts): + new_key = new_key_format_str.format(expert_i) + if hasattr(params, 'clone'): + new_state_dict[new_key] = params.clone() + else: + # set extra_state to None for now + new_state_dict[new_key] = None + + covert_to_experts(mlp_fc1_weight_keys) + covert_to_experts(mlp_fc2_weight_keys) + covert_to_experts(mlp_fc1_bias_keys) + covert_to_experts(mlp_fc2_bias_keys) + covert_to_experts(mlp_fc1_extra_state_keys) + covert_to_experts(mlp_fc2_extra_state_keys) + + return new_state_dict + + +def upcycle_state_dict(moe_model, dense_model): + """ + Convert a dense model's state_dict to a MoE model's state_dict. + + This function facilitates the conversion of the state_dict from a dense model to + a MoE model, ensuring that the parameters are correctly mapped for each model. + + Args: + moe_model (nn.Module): The MoE model, must be a model without FP16 and/or DDP wrapper. + dense_model (nn.Module): The dense model instance. + + Returns: + dict: A dictionary containing the converted state_dict for the MoE model. 
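+
+    Example:
+        # Without virtual pipeline parallelism, both lists contain a single model chunk.
+        state_dict = upcycle_state_dict(moe_model, dense_model)
+        moe_model[0].load_state_dict(state_dict['model'], strict=True)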
+ """ + + state_dict = {} + if len(moe_model) == 1: + assert len(dense_model) == 1 + state_dict['model'] = _covert_to_moe_state_dict(dense_model[0].state_dict(), moe_model[0]) + else: + assert len(moe_model) == len(dense_model) + for i in range(len(moe_model)): + mpu.set_virtual_pipeline_model_parallel_rank(i) + state_dict['model%d' % i] = _covert_to_moe_state_dict( + dense_model[i].state_dict(), moe_model[i] + ) + return state_dict + + +def load_and_upcycle_model( + load_dense_ckpt_func, moe_model, dense_model, strict=True, load_args=(), load_kwargs={} +): + """ + Load a dense model checkpoint and convert it to a MoE model. + + This function loads a checkpoint for a dense model and converts it to the MoE model format, + allowing for the integration of the dense model's parameters into the MoE architecture. + + Args: + load_dense_ckpt_func (callable): The function to load the dense model checkpoint. + moe_model (nn.Module): The MoE model instance. + dense_model (nn.Module): The dense model instance. + strict (bool): Whether to strictly load the state dictionary (default is True). + load_args (tuple): Positional arguments to pass to the loading function. + load_kwargs (dict): Keyword arguments to pass to the loading function. + """ + + iteration, num_floating_point_operations_so_far = load_dense_ckpt_func( + *load_args, **load_kwargs + ) + state_dict = upcycle_state_dict(moe_model, dense_model) + + if len(moe_model) == 1: + moe_model[0].load_state_dict(state_dict['model'], strict=strict) + else: + for i in range(len(moe_model)): + mpu.set_virtual_pipeline_model_parallel_rank(i) + moe_model[i].load_state_dict(state_dict['model%d' % i], strict=strict) + + return iteration, num_floating_point_operations_so_far diff --git a/megatron/core/transformer/multi_latent_attention.py b/megatron/core/transformer/multi_latent_attention.py new file mode 100644 index 0000000000..d637e2b448 --- /dev/null +++ b/megatron/core/transformer/multi_latent_attention.py @@ -0,0 +1,375 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + + +import math +from dataclasses import dataclass +from typing import Union + +import torch + +from megatron.core import parallel_state +from megatron.core.models.common.embeddings import ( + YarnRotaryEmbedding, + _yarn_get_mscale, + apply_rotary_pos_emb, +) +from megatron.core.transformer.attention import Attention +from megatron.core.transformer.enums import AttnMaskType +from megatron.core.transformer.spec_utils import ModuleSpec, build_module +from megatron.core.transformer.transformer_config import MLATransformerConfig + + +@dataclass +class MLASelfAttentionSubmodules: + """Submodules for the MLA self-attention layer.""" + + linear_q_proj: Union[ModuleSpec, type] = None + linear_q_down_proj: Union[ModuleSpec, type] = None + linear_q_up_proj: Union[ModuleSpec, type] = None + linear_kv_down_proj: Union[ModuleSpec, type] = None + linear_kv_up_proj: Union[ModuleSpec, type] = None + core_attention: Union[ModuleSpec, type] = None + linear_proj: Union[ModuleSpec, type] = None + q_layernorm: Union[ModuleSpec, type] = None + kv_layernorm: Union[ModuleSpec, type] = None + + +class MultiLatentAttention(Attention): + """Multi-Latent Attention layer abstract class. + + This layer only contains common modules required for the "self attn" and + "cross attn" specializations. 
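+
+    See MLASelfAttention for the concrete self-attention specialization, which adds the
+    optional low-rank query projection (when q_lora_rank is set) and the compressed
+    key/value down-projection (kv_lora_rank).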
+ """ + + def __init__( + self, + config: MLATransformerConfig, + submodules: Union[MLASelfAttentionSubmodules], + layer_number: int, + attn_mask_type: AttnMaskType, + attention_type: str, + ) -> None: + world_size = parallel_state.get_tensor_model_parallel_world_size() + assert ( + world_size == 1 + ), "MLA is not supported with Tensor Parallelism yet, \ + use Expert Parallelism and Pipeline Parallelism for better performance." + + super().__init__( + config=config, + submodules=submodules, + layer_number=layer_number, + attention_type=attention_type, + attn_mask_type=attn_mask_type, + ) + + self.query_projection_size = self.config.v_head_dim * self.config.num_attention_heads + + self.q_head_dim = self.config.qk_head_dim + self.config.qk_pos_emb_head_dim + + mscale = _yarn_get_mscale(self.config.rotary_scaling_factor, self.config.mscale) + self.softmax_scale = mscale * mscale / math.sqrt(self.q_head_dim) + + self.rotary_pos_emb = YarnRotaryEmbedding( + self.config.qk_pos_emb_head_dim, + rotary_base=self.config.rotary_base, + scaling_factor=self.config.rotary_scaling_factor, + original_max_position_embeddings=self.config.max_position_embeddings, + beta_fast=self.config.beta_fast, + beta_slow=self.config.beta_slow, + mscale=self.config.mscale, + mscale_all_dim=self.config.mscale_all_dim, + ) + + self.core_attention = build_module( + submodules.core_attention, + config=self.config, + layer_number=self.layer_number, + attn_mask_type=self.attn_mask_type, + attention_type=self.attention_type, + softmax_scale=self.softmax_scale, + k_channels=self.q_head_dim, + v_channels=self.config.v_head_dim, + ) + + # Output. + self.linear_proj = build_module( + submodules.linear_proj, + self.query_projection_size, + self.config.hidden_size, + config=self.config, + init_method=self.config.output_layer_init_method, + bias=self.config.add_bias_linear, + input_is_parallel=True, + skip_bias_add=True, + is_expert=False, + tp_comm_buffer_name='proj', + ) + + def forward( + self, + hidden_states, + attention_mask, + key_value_states=None, + inference_params=None, + rotary_pos_emb=None, + packed_seq_params=None, + position_ids=None, + ): + assert rotary_pos_emb is None, "Rotary position embeddings should not be passed into MLA." + + # hidden_states: [sq, b, h] + + # ===================== + # Query, Key, and Value + # ===================== + # Get the query, key and value tensors based on the type of attention - + # self or cross attn. 
+ # query: [96, 1, 16, 128], key:[96, 1, 16, 128], value:[96, 1, 16, 128] + query, key, value = self.get_query_key_value_tensors( + hidden_states, + key_value_states, + position_ids, + packed_seq_params, + inference_params=inference_params, + ) + + # =================================================== + # Adjust key, value for inference + # =================================================== + # rotary_pos_emb = None + key, value, _, attn_mask_type = self._adjust_key_value_for_inference( + inference_params, key, value, rotary_pos_emb=None + ) + + # ================================== + # core attention computation + # ================================== + # Need corresponding TE change + if self.checkpoint_core_attention and self.training: + core_attn_out = self._checkpointed_attention_forward( + query, key, value, attention_mask, packed_seq_params=packed_seq_params + ) + else: + core_attn_out = self.core_attention( + query, + key, + value, + attention_mask, + packed_seq_params=packed_seq_params, + attn_mask_type=attn_mask_type, + ) + + if packed_seq_params is not None: + # reshape to same output shape as unpacked case + # (t, np, hn) -> (t, b=1, h=np*hn) + # t is the pack size = sum (sq_i) + # note that batch is a dummy dimension in the packed case + core_attn_out = core_attn_out.reshape(core_attn_out.size(0), 1, -1) + + # ================= + # Output. [sq, b, h] + # ================= + output, bias = self.linear_proj(core_attn_out) + + return output, bias + + +class MLASelfAttention(MultiLatentAttention): + """MLA Self-attention layer class + + Self-attention layer takes input with size [s, b, h] + and returns output of the same size. + """ + + def __init__( + self, + config: MLATransformerConfig, + submodules: MLASelfAttentionSubmodules, + layer_number: int, + attn_mask_type=AttnMaskType.padding, + ): + super().__init__( + config=config, + submodules=submodules, + layer_number=layer_number, + attn_mask_type=attn_mask_type, + attention_type="self", + ) + + if self.config.q_lora_rank is None: + # Not projectiing query + self.linear_q_proj = build_module( + submodules.linear_q_proj, + self.config.hidden_size, + self.config.num_attention_heads * self.q_head_dim, + config=self.config, + init_method=self.config.init_method, + gather_output=False, + bias=False, + skip_bias_add=False, + is_expert=False, + ) + + else: + + self.linear_q_down_proj = build_module( + submodules.linear_q_down_proj, + self.config.hidden_size, + self.config.q_lora_rank, + config=self.config, + init_method=self.config.init_method, + gather_output=False, + bias=False, + skip_bias_add=False, + is_expert=False, + ) + + self.linear_q_up_proj = build_module( + submodules.linear_q_up_proj, + self.config.q_lora_rank, + self.config.num_attention_heads * self.q_head_dim, + config=self.config, + init_method=self.config.init_method, + gather_output=False, + bias=False, + skip_bias_add=False, + is_expert=False, + ) + + self.linear_kv_down_proj = build_module( + submodules.linear_kv_down_proj, + self.config.hidden_size, + self.config.kv_lora_rank + self.config.qk_pos_emb_head_dim, + config=self.config, + init_method=self.config.init_method, + gather_output=False, + bias=False, + skip_bias_add=False, + is_expert=False, + ) + + self.linear_kv_up_proj = build_module( + submodules.linear_kv_up_proj, + self.config.kv_lora_rank, + self.config.num_attention_heads * (self.config.qk_head_dim + self.config.v_head_dim), + config=self.config, + init_method=self.config.init_method, + gather_output=False, + bias=False, + skip_bias_add=False, + 
is_expert=False, + ) + + if self.config.q_lora_rank is not None: + self.q_layernorm = build_module( + submodules.q_layernorm, + hidden_size=self.config.q_lora_rank, + config=self.config, + eps=self.config.layernorm_epsilon, + ) + + self.kv_layernorm = build_module( + submodules.kv_layernorm, + hidden_size=self.config.kv_lora_rank, + config=self.config, + eps=self.config.layernorm_epsilon, + ) + + def get_query_key_value_tensors( + self, + hidden_states, + key_value_states=None, + position_ids=None, + packed_seq_params=None, + inference_params=None, + ): + """ + Derives `query`, `key` and `value` tensors from `hidden_states`. + """ + # s = sequence length, b = batch size, h = hidden size, n = num attention heads + # Attention heads [s, b, n*h] + assert ( + hidden_states.ndim == 3 + ), f"hidden_states should be 3D, [s, b, n*h], got {hidden_states.ndim}D" + q_len, bsz, _ = hidden_states.size() + + if self.config.q_lora_rank is not None: + q_compressed, _ = self.linear_q_down_proj(hidden_states) + q_compressed = self.q_layernorm(q_compressed) + q, _ = self.linear_q_up_proj(q_compressed) + else: + # hidden_states:[s, b, 2048], q: [s, b, n * 192] + q, _ = self.linear_q_proj(hidden_states) + + # q: [s, b, n, 192] + q = q.view(q_len, bsz, self.num_attention_heads_per_partition, self.q_head_dim) + + # q: [s, b, n, 128], q_pos_emb: [s, b, n, 64] + q_no_pe, q_pos_emb = torch.split( + q, [self.config.qk_head_dim, self.config.qk_pos_emb_head_dim], dim=-1 + ) + + # kv_combined: [s, b, 576] + kv_combined, _ = self.linear_kv_down_proj(hidden_states) + + # kv_compressed:[s, b, 512], k_pos_emb: [s, b, 64] + kv_compressed, k_pos_emb = torch.split( + kv_combined, [self.config.kv_lora_rank, self.config.qk_pos_emb_head_dim], dim=-1 + ) + + # kv: [s, b, 2048] + kv, _ = self.linear_kv_up_proj(self.kv_layernorm(kv_compressed)) + + # kv: [s, b, n, 256] + kv = kv.view( + q_len, + bsz, + self.num_attention_heads_per_partition, + self.config.qk_head_dim + self.config.v_head_dim, + ) + + # k_no_pe: [s, b, n, 128], value: [s, b, n, 128] + k_no_pe, value = torch.split(kv, [self.config.qk_head_dim, self.config.v_head_dim], dim=-1) + + # rotary_pos_emb:[s, b, 1, 64] + rotary_pos_emb = self.rotary_pos_emb(max_seq_len=self.config.max_position_embeddings) + + if len(rotary_pos_emb) == 2: + mscale = rotary_pos_emb[1] + rotary_pos_emb = rotary_pos_emb[0] + + if inference_params is not None: + # add offset to the sequence start for inference + sequence_start = inference_params.sequence_len_offset + sequence_end = sequence_start + q_len + rotary_pos_emb = rotary_pos_emb[sequence_start:sequence_end] + + # [s, b, 64] -> [s, b, 1, 64] + k_pos_emb = torch.unsqueeze(k_pos_emb, 2) + + if packed_seq_params is not None: + cu_seqlens_q = packed_seq_params.cu_seqlens_q + cu_seqlens_kv = packed_seq_params.cu_seqlens_kv + else: + cu_seqlens_q = cu_seqlens_kv = None + + # q_pos_emb: [s, b, n, 64], k_pos_emb:[s, b, 1, 64] + q_pos_emb = apply_rotary_pos_emb( + q_pos_emb, rotary_pos_emb, config=self.config, cu_seqlens=cu_seqlens_q, mscale=mscale + ) + k_pos_emb = apply_rotary_pos_emb( + k_pos_emb, rotary_pos_emb, config=self.config, cu_seqlens=cu_seqlens_kv, mscale=mscale + ) + + # query: [s, b, n, 192] + query = torch.cat([q_no_pe, q_pos_emb], dim=-1) + + # key: [s, b, n, 192] + key = torch.cat([k_no_pe, k_pos_emb], dim=-1) + + query = query.contiguous() + key = key.contiguous() + value = value.contiguous() + + return query, key, value diff --git a/megatron/core/transformer/spec_utils.py b/megatron/core/transformer/spec_utils.py new file mode 
100644 index 0000000000..b3de854173 --- /dev/null +++ b/megatron/core/transformer/spec_utils.py @@ -0,0 +1,106 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +import types +from dataclasses import dataclass, field +from typing import Tuple, Union + + +@dataclass +class ModuleSpec: + """This is a Module Specification dataclass. + + Specification defines the location of the module (to import dynamically) + or the imported module itself. It also defines the params that need to be + passed to initialize the module. + + Args: + module (Union[Tuple, type]): A tuple describing the location of the + module class e.g. `(module.location, ModuleClass)` or the imported + module class itself e.g. `ModuleClass` (which is already imported + using `from module.location import ModuleClass`). + params (dict): A dictionary of params that need to be passed while init. + + """ + + module: Union[Tuple, type] + params: dict = field(default_factory=lambda: {}) + submodules: type = None + + +def import_module(module_path: Tuple[str]): + """Import a named object from a module in the context of this function. + + TODO: make this importer module more robust, at least make sure there + are no side effects of using this as is + """ + base_path, name = module_path + try: + module = __import__(base_path, globals(), locals(), [name]) + except ImportError as e: + print(f"couldn't import module due to {e}") + return None + return vars(module)[name] + + +def get_module(spec_or_module: Union[ModuleSpec, type], **additional_kwargs): + # If a module clas is already provided return it as is + if isinstance(spec_or_module, (type, types.FunctionType)): + return spec_or_module + + # If the module is provided instead of module path, then return it as is + if isinstance(spec_or_module.module, (type, types.FunctionType)): + return spec_or_module.module + + # Otherwise, return the dynamically imported module from the module path + return import_module(spec_or_module.module) + + +def build_module(spec_or_module: Union[ModuleSpec, type], *args, **kwargs): + # If the passed `spec_or_module` is + # a `Function`, then return it as it is + # NOTE: to support an already initialized module add the following condition + # `or isinstance(spec_or_module, torch.nn.Module)` to the following if check + if isinstance(spec_or_module, types.FunctionType): + return spec_or_module + + # If the passed `spec_or_module` is actually a spec (instance of + # `ModuleSpec`) and it specifies a `Function` using its `module` + # field, return the `Function` as it is + if isinstance(spec_or_module, ModuleSpec) and isinstance( + spec_or_module.module, types.FunctionType + ): + return spec_or_module.module + + # Check if a module class is provided as a spec or if the module path + # itself is a class + if isinstance(spec_or_module, type): + module = spec_or_module + elif hasattr(spec_or_module, "module") and isinstance(spec_or_module.module, type): + module = spec_or_module.module + else: + # Otherwise, dynamically import the module from the module path + module = import_module(spec_or_module.module) + + # If the imported module is actually a `Function` return it as it is + if isinstance(module, types.FunctionType): + return module + + # Finally return the initialized module with params from the spec as well + # as those passed as **kwargs from the code + + # Add the `submodules` argument to the module init call if it exists in the + # spec. 
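+    # Note: `spec_or_module.params` and the call-site **kwargs are merged in the call
+    # below; if the same keyword appears in both, Python raises a TypeError
+    # ("got multiple values for keyword argument"), so specs should avoid duplicating
+    # arguments that callers always pass explicitly.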
+ if hasattr(spec_or_module, "submodules") and spec_or_module.submodules is not None: + kwargs["submodules"] = spec_or_module.submodules + + try: + return module( + *args, **spec_or_module.params if hasattr(spec_or_module, "params") else {}, **kwargs + ) + except Exception as e: + # improve the error message since we hide the module name in the line above + import sys + + raise type(e)(f"{str(e)} when instantiating {module.__name__}").with_traceback( + sys.exc_info()[2] + ) diff --git a/megatron/core/transformer/torch_layer_norm.py b/megatron/core/transformer/torch_layer_norm.py new file mode 100644 index 0000000000..11cf406f04 --- /dev/null +++ b/megatron/core/transformer/torch_layer_norm.py @@ -0,0 +1,44 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +import warnings + +import torch + +from megatron.core.transformer import TransformerConfig + + +class WrappedTorchLayerNorm(torch.nn.LayerNorm): + + def __init__( + self, + config: TransformerConfig, + hidden_size: int, + eps: float = 1e-5, + persist_layer_norm: bool = False, ## TODO: unused arguments. See https://gitlab-master.nvidia.com/ADLR/megatron-lm/-/issues/223 + zero_centered_gamma: bool = False, + normalization: str = "LayerNorm", # included to match TE interface + ): + self.config = config + assert ( + not self.config.layernorm_zero_centered_gamma + ), f"zero_centered_gamma not supported by torch LayerNorm" + + assert ( + self.config.normalization == "LayerNorm" + ), f'({self.config.normalization}) is not supported in by torch Layernorm' + + assert ( + not self.config.persist_layer_norm + ), f"persist_layer_norm not supported by torch LayerNorm" + + assert ( + not self.config.sequence_parallel + ), f"sequence parallel not supported by torch LayerNorm" + + assert ( + not self.config.memory_efficient_layer_norm + ), f"memory_efficient_layer_norm not supported by torch LayerNorm" + + super().__init__( + normalized_shape=hidden_size, ## applied to last len(normalized_shape.size) dimensions + eps=eps, + ) diff --git a/megatron/core/transformer/transformer_block.py b/megatron/core/transformer/transformer_block.py new file mode 100755 index 0000000000..3a88f1ab22 --- /dev/null +++ b/megatron/core/transformer/transformer_block.py @@ -0,0 +1,597 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
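The `ModuleSpec` / `build_module` pair introduced in `spec_utils.py` above is consumed roughly as follows. This is a minimal sketch with a made-up `ToyNorm` class (not code from this change), assuming Megatron-Core is importable:

```python
import torch

from megatron.core.transformer.spec_utils import ModuleSpec, build_module


class ToyNorm(torch.nn.Module):
    """Stand-in for a real submodule implementation (hypothetical)."""

    def __init__(self, hidden_size: int, eps: float = 1e-5):
        super().__init__()
        self.norm = torch.nn.LayerNorm(hidden_size, eps=eps)

    def forward(self, x):
        return self.norm(x)


# The spec records the target class (or an import path) plus default constructor params.
spec = ModuleSpec(module=ToyNorm, params={"eps": 1e-6})

# build_module merges spec.params with call-site kwargs and instantiates the class.
norm = build_module(spec, hidden_size=1024)
print(norm)
```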
+ +from contextlib import nullcontext +from dataclasses import dataclass +from typing import List, Optional, Union + +import torch +from torch import Tensor + +from megatron.core import InferenceParams, parallel_state, tensor_parallel +from megatron.core.dist_checkpointing.mapping import ShardedStateDict +from megatron.core.dist_checkpointing.utils import replace_prefix_for_sharding +from megatron.core.fusions.fused_layer_norm import FusedLayerNorm +from megatron.core.packed_seq_params import PackedSeqParams +from megatron.core.transformer.module import MegatronModule +from megatron.core.transformer.spec_utils import ModuleSpec, build_module +from megatron.core.transformer.transformer_config import TransformerConfig +from megatron.core.transformer.transformer_layer import BaseTransformerLayer +from megatron.core.transformer.utils import sharded_state_dict_default +from megatron.core.utils import is_te_min_version, make_viewless_tensor + +try: + from megatron.core.extensions.transformer_engine import ( + TEDelayedScaling, + TENorm, + get_cpu_offload_context, + te_checkpoint, + ) + + HAVE_TE = True + LayerNormImpl = TENorm +except ImportError: + HAVE_TE = False + get_cpu_offload_context = None + + try: + import apex # pylint: disable=unused-import + + LayerNormImpl = FusedLayerNorm + + except ImportError: + from megatron.core.transformer.torch_layer_norm import WrappedTorchLayerNorm + + LayerNormImpl = WrappedTorchLayerNorm + + +def get_num_layers_to_build(config: TransformerConfig) -> int: + """ + Determine the number of transformer layers to build for the current pipeline stage. + Args: + config (TransformerConfig): Configuration object containing transformer model parameters. + + Returns: + int: The number of layers to be built for the current pipeline stage. + """ + if config.first_pipeline_num_layers is not None or config.last_pipeline_num_layers is not None: + assert ( + parallel_state.get_virtual_pipeline_model_parallel_world_size() is None + ), "Uneven number of layer not compatible with interleaved pipeline schedule" + + # Number of layers to distribute over rest of pipeline stages + layers_to_distribute = config.num_layers + # Number of pipeline stages left for distributing transformer layers + pipeline_stages_left = parallel_state.get_pipeline_model_parallel_world_size() + + if config.first_pipeline_num_layers is not None: + layers_to_distribute -= config.first_pipeline_num_layers + pipeline_stages_left -= 1 + if parallel_state.is_pipeline_first_stage(): + return config.first_pipeline_num_layers + + if config.last_pipeline_num_layers is not None: + layers_to_distribute -= config.last_pipeline_num_layers + pipeline_stages_left -= 1 + if parallel_state.is_pipeline_last_stage(): + return config.last_pipeline_num_layers + + assert ( + layers_to_distribute % pipeline_stages_left == 0 + ), "With uneven pipelineing the left over layers must be divisible by left over stages" + num_layers_per_pipeline_rank = layers_to_distribute // pipeline_stages_left + else: + pipeline_ranks = config.pipeline_model_parallel_size + num_layers_per_pipeline_rank = config.num_layers // pipeline_ranks + + if parallel_state.get_virtual_pipeline_model_parallel_world_size() is not None: + # Interleaved pipeline parallelism: + # Number of layers in each model chunk is the number of layers in the stage, + # divided by the number of model chunks in a stage. 
+ # With 8 layers, 2 stages, and 4 model chunks, we want an assignment of + # layers to stages like (each list is a model chunk): + # Stage 0: [0] [2] [4] [6] + # Stage 1: [1] [3] [5] [7] + # With 8 layers, 2 stages, and 2 virtual stages, we want an assignment of + # layers to stages like (each list is a model chunk): + # Stage 0: [0, 1] [4, 5] + # Stage 1: [2, 3] [6, 7] + + vp_size = parallel_state.get_virtual_pipeline_model_parallel_world_size() + + num_layers_per_virtual_rank = num_layers_per_pipeline_rank // vp_size + + num_layers_to_build = num_layers_per_virtual_rank + + else: + # Non-interleaved pipeline parallelism: + # Each stage gets a contiguous set of layers. + + num_layers_to_build = num_layers_per_pipeline_rank + + return num_layers_to_build + + +@dataclass +class TransformerBlockSubmodules: + """ + Dataclass for specifying the submodules of a transformer block. + + This class defines the structure for configuring the layers and normalization + within a transformer block, allowing for flexible and customizable architecture designs. + + Args: + layer_specs (List[ModuleSpec], optional): A list of module specifications for + the layers within the transformer block. Each specification typically + defines a complete transformer layer (e.g., self-attention, feed-forward network). + layer_norm (Optional[Union[ModuleSpec, torch.nn.Module]], optional): Specification + or instance of the layer normalization to be applied. + """ + + layer_specs: List[ModuleSpec] = None + layer_norm: Optional[Union[ModuleSpec, torch.nn.Module]] = None + + +def _get_block_submodules( + config: TransformerConfig, spec: Union[TransformerBlockSubmodules, ModuleSpec] +) -> TransformerBlockSubmodules: + """ + Retrieve or construct TransformerBlockSubmodules based on the provided specification. + + Args: + config (TransformerConfig): Configuration object for the transformer model. + spec (Union[TransformerBlockSubmodules, ModuleSpec]): Specification for the + transformer block submodules. Can be either a TransformerBlockSubmodules + instance or a ModuleSpec. + + Returns: + TransformerBlockSubmodules: The submodules for the transformer block. + """ + + # Transformer block submodules. + if isinstance(spec, TransformerBlockSubmodules): + return spec + + # ModuleSpec here is generally assumed to be for a transformer layer that + # is implemented in `transformer_layer.py` or if it subclasses + # `BaseTransformerLayer` from the `transformer_layer.py` file. + elif isinstance(spec, ModuleSpec): + if issubclass(spec.module, TransformerBlock): + return spec.submodules + elif issubclass(spec.module, BaseTransformerLayer): + num_layers = get_num_layers_to_build(config) + return TransformerBlockSubmodules( + layer_specs=[spec] * num_layers, layer_norm=LayerNormImpl + ) + else: + raise Exception(f"specialize for {spec.module.__name__}.") + else: + raise Exception(f"specialize for {type(spec).__name__}.") + + +class TransformerBlock(MegatronModule): + """Transformer class.""" + + def __init__( + self, + config: TransformerConfig, + spec: Union[TransformerBlockSubmodules, ModuleSpec], + post_layer_norm: bool = True, + pre_process: bool = True, + post_process: bool = True, + ): + super().__init__(config=config) + + self.submodules = _get_block_submodules(config, spec) + self.post_layer_norm = post_layer_norm + self.pre_process = pre_process + self.post_process = post_process + # Dictionary to store CUDA graphs. Number of items in the dictionary = len(self.layers). 
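To make the uneven-pipeline branch of `get_num_layers_to_build` above concrete, here is its arithmetic with illustrative numbers (`num_layers=32`, four pipeline stages, four layers pinned to the first and last stages):

```python
num_layers = 32
pipeline_size = 4
first_pipeline_num_layers = 4
last_pipeline_num_layers = 4

layers_to_distribute = num_layers - first_pipeline_num_layers - last_pipeline_num_layers  # 24
pipeline_stages_left = pipeline_size - 2                                                   # 2 middle stages
assert layers_to_distribute % pipeline_stages_left == 0
num_layers_per_middle_stage = layers_to_distribute // pipeline_stages_left                 # 12

# The four stages therefore build 4, 12, 12 and 4 layers respectively.
print(num_layers_per_middle_stage)
```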
+ # Item `i` in the dictionary is a list of `N` CUDA graphs for layer 'i' where N is the + # number of microbatches. Multiple CUDA graphs per layer is required to support + # pipelining which requires running FWD graph of multiple microbatches before BWD graph. + # To enable CUDA graph, this dictionary should be populated in the model training script + # with the graphs returned by make_graphed_callables API before the first trainng step. + self.cuda_graphs = {} + self.current_microbatch = -1 + + # required for pipeline parallel schedules + self.input_tensor = None + + self.checkpoint_core_attention = self.config.recompute_granularity == 'selective' + + if get_cpu_offload_context is not None: + (self.offload_context, self.group_prefetch_offload_commit_async) = ( + get_cpu_offload_context( + self.config.cpu_offloading, + self.config.cpu_offloading_num_layers, + self.config.num_layers, + self.config.cpu_offloading_activations, + self.config.cpu_offloading_weights, + ) + ) + self.config._cpu_offloading_context = ( + self.offload_context if self.config.cpu_offloading else None + ) + else: + assert ( + self.config.cpu_offloading is False + ), "CPU Offloading is enabled when TE is not present" + + self.offload_context, self.group_prefetch_offload_commit_async = nullcontext(), None + self.config._cpu_offloading_context = None + + self._build_layers() + self.num_layers_per_pipeline_rank = len(self.layers) + self.tp_only_amax_red = config.tp_only_amax_red + + def _build_layers(self): + # Transformer layers. + # @jcasper can we improve how we deal with layer_number? + # currently it's only used in CoreAttention? + # if self.apply_query_key_layer_scaling: + # coeff = self.layer_number + # self.norm_factor *= coeff + def build_layer(layer_spec, layer_number): + return build_module(layer_spec, config=self.config, layer_number=layer_number) + + # offset is implicit in TransformerLayer + self.layers = torch.nn.ModuleList( + [ + build_layer(layer_spec, i + 1) + for i, layer_spec in enumerate(self.submodules.layer_specs) + ] + ) + + # @TODO: add back standalone_embedding_stage (see issue #293) + # In pipeline parallelism, we want to add this LN only to the last stage of the pipeline + # self.post_process and self.post_layer_norm guide this behavior + if self.submodules.layer_norm and self.post_process and self.post_layer_norm: + self.final_layernorm = build_module( + self.submodules.layer_norm, + config=self.config, + hidden_size=self.config.hidden_size, + eps=self.config.layernorm_epsilon, + ) + else: + self.final_layernorm = None # Either this or nn.Identity + + def _get_layer(self, layer_number: int): + return self.layers[layer_number] + + def _checkpointed_forward( + self, + hidden_states: Tensor, + attention_mask: Tensor, + context: Tensor, + context_mask: Tensor, + rotary_pos_emb: Tensor, + packed_seq_params: PackedSeqParams, + ): + """Forward method with activation checkpointing.""" + + def custom(start: int, end: int): + def custom_forward( + hidden_states, attention_mask, context, context_mask, rotary_pos_emb + ): + for index in range(start, end): + layer = self._get_layer(index) + hidden_states, context = layer( + hidden_states=hidden_states, + attention_mask=attention_mask, + context=context, + context_mask=context_mask, + rotary_pos_emb=rotary_pos_emb, + inference_params=None, + packed_seq_params=packed_seq_params, + ) + return hidden_states, context + + return custom_forward + + def checkpoint_handler(forward_func): + if self.config.fp8: + return te_checkpoint( + forward_func, + 
self.config.distribute_saved_activations, + tensor_parallel.random.get_cuda_rng_tracker, + parallel_state.get_tensor_model_parallel_group(), + hidden_states, + attention_mask, + context, + context_mask, + rotary_pos_emb, + ) + else: + return tensor_parallel.checkpoint( + forward_func, + self.config.distribute_saved_activations, + hidden_states, + attention_mask, + context, + context_mask, + rotary_pos_emb, + ) + + if self.config.recompute_method == 'uniform': + # Uniformly divide the total number of Transformer layers and checkpoint + # the input activation of each divided chunk. + # A method to further reduce memory usage reducing checkpoints. + layer_idx = 0 + while layer_idx < self.num_layers_per_pipeline_rank: + hidden_states, context = checkpoint_handler( + custom(layer_idx, layer_idx + self.config.recompute_num_layers) + ) + + layer_idx += self.config.recompute_num_layers + + elif self.config.recompute_method == 'block': + # Checkpoint the input activation of only a set number of individual + # Transformer layers and skip the rest. + # A method fully use the device memory removing redundant re-computation. + recompute_skip_num_layers = 0 + for layer_idx in range(self.num_layers_per_pipeline_rank): + # Skip recomputation when input grad computation is not needed. + # Need to have at least one input tensor with gradient computation + # for re-enterant autograd engine. + if self.config.fp8 and not hidden_states.requires_grad: + recompute_skip_num_layers += 1 + if ( + layer_idx >= recompute_skip_num_layers + and layer_idx < self.config.recompute_num_layers + recompute_skip_num_layers + ): + hidden_states, context = checkpoint_handler(custom(layer_idx, layer_idx + 1)) + else: + hidden_states, context = custom(layer_idx, layer_idx + 1)( + hidden_states, attention_mask, context, context_mask, rotary_pos_emb + ) + else: + raise ValueError("Invalid activation recompute method.") + + return hidden_states + + def set_input_tensor(self, input_tensor: Tensor): + """Set input tensor to be used instead of forward()'s input. + + When doing pipeline parallelism the input from the previous + stage comes from communication, not from the input, so the + model's forward_step_func won't have it. This function is thus + used by internal code to bypass the input provided by the + forward_step_func""" + self.input_tensor = input_tensor + + def get_cuda_graph_optional_args( + self, + attention_mask: Tensor, + context: Tensor, + context_mask: Tensor, + rotary_pos_emb: Tensor, + inference_params: InferenceParams, + packed_seq_params: PackedSeqParams, + ): + """Get optional tensor arguments for CUDA graph.""" + + optional_inputs = {} + optional_inputs['is_first_microbatch'] = self.current_microbatch == 0 + try: + import transformer_engine.pytorch as te # pylint: disable=unused-import + + if is_te_min_version("1.10.0", check_equality=False): + assert not any( + [attention_mask, context, context_mask, rotary_pos_emb] + ), "Keyword Arguments not supported with CUDA graph." 
+ else: + optional_inputs['attention_mask'] = attention_mask + optional_inputs['context'] = context + optional_inputs['context_mask'] = context_mask + optional_inputs['rotary_pos_emb'] = rotary_pos_emb + except ImportError: + raise RuntimeError("CUDAGraph requires TransformerEngine, but not installed") + return optional_inputs + + def forward( + self, + hidden_states: Tensor, + attention_mask: Tensor, + context: Tensor = None, + context_mask: Tensor = None, + rotary_pos_emb: Tensor = None, + inference_params: InferenceParams = None, + packed_seq_params: PackedSeqParams = None, + ): + """ + Perform the forward pass through the transformer block. + + This method handles the core computation of the transformer, including + self-attention, optional cross-attention, and feed-forward operations. + + Args: + hidden_states (Tensor): Input tensor of shape [s, b, h] where s is the + sequence length, b is the batch size, and h is the hidden size. + attention_mask (Tensor): Boolean tensor of shape [1, 1, s, s] for masking + self-attention. + context (Tensor, optional): Context tensor for cross-attention. + context_mask (Tensor, optional): Mask for cross-attention context + rotary_pos_emb (Tensor, optional): Rotary positional embeddings. + inference_params (InferenceParams, optional): Parameters for inference-time + optimizations. + packed_seq_params (PackedSeqParams, optional): Parameters for packed sequence + processing. + + Returns: + Union[Tensor, Tuple[Tensor, Tensor]]: The output hidden states tensor of shape + [s, b, h], and optionally the updated context tensor if cross-attention is used. + """ + + if not self.pre_process: + # See set_input_tensor() + hidden_states = self.input_tensor + + # Viewless tensor. + # - We only need to create a viewless tensor in the case of micro batch + # size (mbs) == 1, since in this case, 'hidden_states.transpose()' + # above creates a view tensor, and '.contiguous()' is a pass-through. + # For mbs >= 2, '.contiguous()' creates a new tensor, eliminating + # the need to make it viewless. + # + # However, we don't explicitly check mbs == 1 here because + # make_viewless_tensor() has negligible overhead when its input + # is already viewless. + # + # - For the 'else' case above, calling make_viewless_tensor() here is + # likely redundant, since p2p_communication.py (likely originator) + # already creates viewless tensors. That said, make_viewless_tensor() + # is called here to be future-proof and corner-case-proof. 
+ hidden_states = make_viewless_tensor(inp=hidden_states, requires_grad=True, keep_graph=True) + + if self.config.sequence_parallel: + rng_context = tensor_parallel.get_cuda_rng_tracker().fork() + else: + rng_context = nullcontext() + + if self.config.fp8: + import transformer_engine # To keep out TE dependency when not training in fp8 + + if self.config.fp8 == "e4m3": + fp8_format = transformer_engine.common.recipe.Format.E4M3 + elif self.config.fp8 == "hybrid": + fp8_format = transformer_engine.common.recipe.Format.HYBRID + else: + raise ValueError("E4M3 and HYBRID are the only supported FP8 formats.") + + fp8_recipe = TEDelayedScaling( + config=self.config, + fp8_format=fp8_format, + override_linear_precision=(False, False, not self.config.fp8_wgrad), + ) + fp8_group = None + if parallel_state.model_parallel_is_initialized(): + fp8_group = parallel_state.get_amax_reduction_group( + with_context_parallel=True, tp_only_amax_red=self.tp_only_amax_red + ) + fp8_context = transformer_engine.pytorch.fp8_autocast( + enabled=True, fp8_recipe=fp8_recipe, fp8_group=fp8_group + ) + else: + fp8_context = nullcontext() + + with rng_context and fp8_context: + # Forward pass. + if self.config.recompute_granularity == 'full' and self.training: + hidden_states = self._checkpointed_forward( + hidden_states=hidden_states, + attention_mask=attention_mask, + context=context, + context_mask=context_mask, + rotary_pos_emb=rotary_pos_emb, + packed_seq_params=packed_seq_params, + ) + else: + for l_no, layer in enumerate(self.layers): + with self.offload_context: + layer.use_cudagraph = True + if (len(self.cuda_graphs) == 0) or (not self.training): + hidden_states, context = layer( + hidden_states=hidden_states, + attention_mask=attention_mask, + context=context, + context_mask=context_mask, + rotary_pos_emb=rotary_pos_emb, + inference_params=inference_params, + packed_seq_params=packed_seq_params, + ) + else: + # CUDA graph replay for layer `l_no` and microbatch + # `self.current_microbatch`. TransformerEngine versions>=1.10 + # allow keyword arguments with CUDA graph. However, CUDA graph + # acccepts only Tensor inputs and Tensor outputs. Hence, + # `inference_params` and `packed_seq_params` are excluded from + # input list while output is limited to `hidden_states`. + cg_index = self.current_microbatch % len(self.cuda_graphs[l_no]) + assert not any( + [inference_params, packed_seq_params] + ), "CUDA graph accepts only Tensor inputs." + optional_inputs = self.get_cuda_graph_optional_args( + attention_mask, + context, + context_mask, + rotary_pos_emb, + inference_params, + packed_seq_params, + ) + hidden_states = self.cuda_graphs[l_no][cg_index]( + hidden_states, **optional_inputs + ) + + if ( + torch.is_grad_enabled() + and self.config.cpu_offloading + and self.group_prefetch_offload_commit_async is not None + ): + hidden_states = self.group_prefetch_offload_commit_async(hidden_states) + + # Final layer norm. + if self.final_layernorm is not None: + hidden_states = self.final_layernorm(hidden_states) + # TENorm produces a "viewed" tensor. This will result in schedule.py's + # deallocate_output_tensor() throwing an error, so a viewless tensor is + # created to prevent this. + hidden_states = make_viewless_tensor( + inp=hidden_states, requires_grad=True, keep_graph=True + ) + + return hidden_states + + def sharded_state_dict( + self, prefix: str = '', sharded_offsets: tuple = (), metadata: dict = None + ) -> ShardedStateDict: + """ + Generate a sharded state dictionary for the transformer block. 
+ + Args: + prefix (str, optional): Prefix to be added to all keys in the state dict. + Defaults to an empty string. + sharded_offsets (tuple, optional): Tuple of sharding offsets. + metadata (dict, optional): Additional metadata for sharding. + Can specify if layers are non-homogeneous. Defaults to None. + + Returns: + ShardedStateDict: A dictionary containing the sharded state of the model. + """ + assert not sharded_offsets, "Unexpected sharded offsets" + non_homogeneous_layers = metadata is not None and metadata.get( + 'non_homogeneous_layers', False + ) + sharded_state_dict = {} + + layer_prefix = f'{prefix}layers.' + num_layers = self.config.num_layers + for layer in self.layers: + offset = layer._get_layer_offset() + + global_layer_offset = layer.layer_number - 1 # self.layer_number starts at 1 + state_dict_prefix = f'{layer_prefix}{global_layer_offset - offset}.' # module list index in TransformerBlock # pylint: disable=line-too-long + if non_homogeneous_layers: + sharded_prefix = f'{layer_prefix}{global_layer_offset}.' + sharded_pp_offset = [] + else: + sharded_prefix = layer_prefix + sharded_pp_offset = [ + (0, global_layer_offset, num_layers) + ] # PP sharding offset for ShardedTensors + layer_sharded_state_dict = layer.sharded_state_dict( + state_dict_prefix, sharded_pp_offset, metadata + ) + replace_prefix_for_sharding(layer_sharded_state_dict, state_dict_prefix, sharded_prefix) + + sharded_state_dict.update(layer_sharded_state_dict) + + # Add modules other than self.layers + for name, module in self.named_children(): + if not module is self.layers: + sharded_state_dict.update( + sharded_state_dict_default( + module, f'{prefix}{name}.', sharded_offsets, metadata + ) + ) + + return sharded_state_dict diff --git a/megatron/core/transformer/transformer_config.py b/megatron/core/transformer/transformer_config.py new file mode 100644 index 0000000000..a63171686a --- /dev/null +++ b/megatron/core/transformer/transformer_config.py @@ -0,0 +1,566 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +from dataclasses import dataclass +from typing import Callable, Optional, Tuple + +import torch.nn.functional as F + +from ..model_parallel_config import ModelParallelConfig +from ..utils import get_te_version, init_method_normal, is_te_min_version, scaled_init_method_normal + + +@dataclass +class TransformerConfig(ModelParallelConfig): + """Configuration object for megatron-core transformers. + + The initialization function has an argument for each parameter, + including those in ModelParallelConfig. + """ + + #################### + # model architecture + #################### + num_layers: int = 0 + """Number of transformer layers in a transformer block.""" + + first_pipeline_num_layers: int = None + """Number of transformer layers on first pipeline stage. + None implies equal layer division across PP ranks.""" + + last_pipeline_num_layers: int = None + """Number of transformer layers on last pipeline stage. + None implies equal layer division across PP ranks.""" + + hidden_size: int = 0 + """Transformer hidden size.""" + + num_attention_heads: int = 0 + """Number of transformer attention heads.""" + + num_query_groups: int = None + """Number of query groups for group query attention. If None, normal attention is used.""" + + ffn_hidden_size: int = None + """Transformer Feed-Forward Network hidden size. This is set to 4*hidden_size + if not provided.""" + + kv_channels: int = None + """Projection weights dimension in multi-head attention. 
This is set to hidden_size // + num_attention_heads if not provided.""" + + hidden_dropout: float = 0.1 + """Dropout probability for transformer hidden state.""" + + attention_dropout: float = 0.1 + """Post attention dropout probability.""" + + fp32_residual_connection: bool = False + """If true, move residual connections to fp32.""" + + # @jcasper should we keep this option? + apply_residual_connection_post_layernorm: bool = False + """If True, uses the original BERT residule connection ordering.""" + + layernorm_epsilon: float = 1e-5 + """Epsilon value for any LayerNorm operations.""" + + layernorm_zero_centered_gamma: bool = False + """If set to True, the LayerNorm is adjusted to center the gamma values around 0. This improves + numerical stability.""" + + add_bias_linear: bool = True + """Include a bias term in all linear layers (QKV projections, after core attention, and two in + MLP layer).""" + + add_qkv_bias: bool = False + """Add a bias term only for QKV projections.""" + + gated_linear_unit: bool = False + """Use a gated linear unit for the first linear layer in the MLP.""" + + activation_func: Callable = F.gelu + """Activation function to use for the non-linearity in the MLP.""" + + activation_func_fp8_input_store: bool = False + """Store the input of MLP activation function in FP8 for backprop to save memory. + The stored input is casted back to the original precision before backprop compuatation.""" + + num_moe_experts: int = None + """Number of experts to use for MoE layer. When set, it replaces MLP with MoE layer. Set to None + for no MoE.""" + + rotary_interleaved: bool = False + """True is rotate pairs of even and odd dimensions (RoFormer style), False is rotate pairs of + first half and second half (LLaMa style). Default to False.""" + + window_size: Optional[Tuple[int, int]] = None + """If not None, then will use sliding window attention. The size of the window is specified by + the numbers inside the tuple; -1 is special value meaning "infinite window size".""" + + normalization: bool = "LayerNorm" + """Which norm to use for normalization layers, valid options are `LayerNorm` and `RMSNorm`.""" + + qk_layernorm: bool = False + """Whether to apply LayerNorm to the query and key embeddings.""" + + test_mode: bool = False + """Whether to run real-time tests.""" + + calculate_per_token_loss: bool = False + """Whether cross entropy loss is calculated over the actual number of non-padded tokens in the + global batch, versus the default behavior of assuming all tokens are non-padded.""" + + multi_latent_attention: bool = False + """Whether to use multi-latent attention.""" + + #################### + # initialization + #################### + init_method: Callable = None + """Method to initialize weights. Note that bias is always set to zero. Should be a function that + takes a single Tensor and initializes it. If None, will be set to + megatron.core.utils.init_method_normal(init_method_std) which is torch nn init normal with + mean=0.0 and std=init_method_std.""" + + output_layer_init_method: Callable = None + """Method to initialize weights of the output layer of both attention and MLP blocks. 
If None, + will be set to megatron.core.utils.scaled_init_method_normal(init_method_std) which is torch nn + init normal with mean=0.0 and std=init_method_std / math.sqrt(2.0 * num_layers).""" + + init_method_std: float = 0.02 + """Standard deviation of the zero mean normal for the default initialization method, not used if + init_method and output_layer_init_method are provided.""" + + #################### + # mixed-precision + #################### + apply_query_key_layer_scaling: bool = False + """If true, scale Q * K^T by 1 / layer-number. This improve numeric stability when training with + fp16.""" + + attention_softmax_in_fp32: bool = True + """If True, run attention masking and softmax in fp32. This should be True if + apply_query_key_layer_scaling is True.""" + + #################### + # fusion + #################### + bias_activation_fusion: bool = False + """If True, fuses bias addition and the activation function when possible.""" + + masked_softmax_fusion: bool = False + """If True, uses softmax fusion.""" + + persist_layer_norm: bool = False + """If True, uses the persistent fused layer norm kernel. This kernel only supports a fixed set + of hidden sizes.""" + + memory_efficient_layer_norm: bool = False + """If True, and using local layers (not from TransformerEngine), tells Apex to use the memory + efficient fused LayerNorm kernel. Ignored if not using LayerNorm.""" + + bias_dropout_fusion: bool = False # TODO: this should be bias_dropout_add_fusion? + """If True, uses bias dropout fusion.""" + + apply_rope_fusion: bool = False + """If True, use fused RoPE kernel.""" + + #################### + # activation recomputation + #################### + recompute_granularity: str = None + """Determines which type of activation recompute to use. Megatron-core supports 'selective' + activation checkpointing where only the memory intensive part of attention is checkpointed. + These memory intensive activations are also less compute intensive which makes activation + checkpointing more efficient for LLMs (20B+). See Reducing Activation Recomputation in Large + Transformer Models (https://arxiv.org/abs/2205.05198) for more details. 'full' will checkpoint + the entire transformer layer. If None, no recompute is performed and all activations are saved. + If set, must be 'selective' or 'full'. 'selective' always uses all layers. + """ + + recompute_method: str = None + """Determines which transformer layers will be recomputed. uniform will uniformly divide the + total number of transformer layers in a transformer block and recompute the input activation of + each divided chunk at the specified granularity. block will recompute the input activations for + only a set number of transformer layers per pipeline stage. The rest of the layers in the + pipeline stage will not have any activations recomputed. If None, and recompute is enabled, all + layers will do recomputation. If set, must be 'uniform' or 'block'.""" + + recompute_num_layers: int = None + """When recompute_method is uniform, recompute_num_layers is the number of transformer layers in + each uniformly divided recompute unit. When recompute_method is block, recompute_num_layers is + the number of transformer layers to recompute within each pipeline stage. 
Must be None for + 'selective' activation checkpointing.""" + + distribute_saved_activations: bool = None + """If True, distribute recomputed activations across the model parallel group.""" + + #################### + # fp8 related + #################### + fp8: str = None + """If set, enables the use of FP8 precision through Transformer Engine. There are 2 predefined + choices (1) 'e4m3' uniformly uses e4m3 for all FP8 tensors, (2) 'hybrid' uses e4m3 for all FP8 + activation and weight tensors and e5m2 for all FP8 output activation gradient tensors.""" + + fp8_margin: int = 0 + """Margin for the scaling factor computation.""" + + fp8_interval: int = 1 + """DEPRECATED from TransformerEngine v1.8.0. This flag is ignored. + Controls how often the scaling factor is recomputed. + """ + + fp8_amax_history_len: int = 1 + """The length of the amax history window used for scaling factor computation.""" + + fp8_amax_compute_algo: str = "most_recent" + """Algorithm used for choosing the `amax` value for the scaling factor computation. There are 2 + predefined choices: `max` chooses the largest `amax` in the history window, while `most_recent` + always chooses the most recently seen value. + + """ + + fp8_wgrad: bool = True + """When set to False, override FP8 config options and do the wgrad computation + in higher precision.""" + + fp8_dot_product_attention: bool = False + """When set to True, use the FP8 implementation of Dot Product Attention.""" + + fp8_multi_head_attention: bool = False + """When set to True, use the FP8 implementation of Multi Head Attention.""" + + tp_only_amax_red: bool = False + """When set to True, reduce the FP8 AMAX only in the TP or TP-CP domain""" + + #################### + # MoE related + #################### + moe_shared_expert_intermediate_size: int = None + """Shared expert total ffn hidden size. + It should be equal to 'num_shared_experts * ffn_size_of_each_shared_expert' if + there are multiple shared experts. + None means no shared expert.""" + + moe_shared_expert_overlap: bool = False + """Enable overlapping between shared expert computations and dispatcher communications. + Without this, the shared epxerts execute after the routed experts.""" + + moe_router_load_balancing_type: str = "aux_loss" + """Determines the load balancing strategy for the router. "aux_loss" corresponds to the load + balancing loss used in GShard and SwitchTransformer, "sinkhorn" corresponds to the balancing + algorithm used in S-BASE, and "none" implies no load balancing.""" + + moe_router_topk: int = 2 + """Number of experts to route to for each token.""" + + moe_router_pre_softmax: bool = False + """Enable pre-softmax routing for MoE, which means softmax is before the top-k selection. + By default, softmax is done after top-k.""" + + moe_grouped_gemm: bool = False + """When there are multiple experts per rank, compress multiple local (potentially small) gemms + in a single kernel launch to improve the utilization and performance by leveraging the Grouped + GEMM feature introduced since CUTLASS 2.8 (https://github.com/fanshiqing/grouped_gemm). + """ + + moe_aux_loss_coeff: float = 0 # 1e-2 would be a good start value for load balance loss. + """Scaling coefficient for the aux loss. A starting value of 1e-2 is recommended.""" + + moe_z_loss_coeff: float = None # 1e-3 would be a good start value for z-loss + """Scaling coefficient for the z-loss. 
A starting value of 1e-3 is recommended.""" + + moe_input_jitter_eps: float = None + """Add noise to the input tensor by applying jitter with a specified epsilon value.""" + + moe_token_dropping: bool = False # TODO: Support token dropping. + """This feature involves selectively dropping and padding tokens for each expert to achieve a + specified capacity, similar to GShard, Switch-Transformer, and DeepSpeed-MoE. Note that this is + currently unsupported so should remain False.""" + + moe_token_dispatcher_type: str = "allgather" + """The type of token dispatcher to use. The default is 'allgather'. + Options are 'allgather' and 'alltoall'.""" + + moe_per_layer_logging: bool = False + """Enable per-layer logging for MoE, currently supports auxiliary loss and z loss.""" + + moe_expert_capacity_factor: float = None + """moe_expert_capacity_factor (float): The capacity factor for each expert, None means no token + will be dropped. The default is None.""" + + moe_pad_expert_input_to_capacity: bool = False + """moe_pad_expert_input_to_capacity (bool): If True, pads the input for each expert to match + the expert capacity length, effective only after the moe_expert_capacity_factor is set. The + default setting is False.""" + + moe_token_drop_policy: str = 'probs' + """The policy to drop tokens. Can be either "probs" or "position". If "probs", the tokens with + the lowest probabilities will be dropped. If "position", tokens at the end of each batch will + be dropped. + """ + + moe_layer_recompute: bool = False + """Memory optimization: checkpointing moe_layer to save actiavtion memory.""" + + #################### + # miscellaneous + #################### + clone_scatter_output_in_embedding: bool = True + """When set to True, clone the output of scatter_to_sequence_parallel_region in embedding layer + to facilitate garbage collection of input.""" + + disable_parameter_transpose_cache: bool = False + """When set to true, the parameter transposes are not cached for subsequent iterations.""" + + enable_cuda_graph: bool = False + """When set to true, TransformerLayer layers are swapped with a CUDA graphed version.""" + + external_cuda_graph: bool = False + """When set to true, TransformerLayer layers are swapped with user provided CUDA graphs.""" + + config_logger_dir: str = "" + """When non-empty, dumps entry-point configs to config_logger_dir""" + + def __post_init__(self): + """Python dataclass method that is used to modify attributes after initialization. + See https://docs.python.org/3/library/dataclasses.html#post-init-processing for more + details. + """ + super().__post_init__() + if self.fp16 and self.bf16: + raise ValueError( + f'Only one of self.fp16: {self.fp16} and self.bf16 {self.bf16} should be True.' + ) + + if self.num_attention_heads % self.tensor_model_parallel_size != 0: + raise ValueError( + f"num_attention_heads ({self.num_attention_heads}) must be a multiple of " + f"tensor_model_parallel_size ({self.tensor_model_parallel_size})." + ) + + if self.ffn_hidden_size is None: + self.ffn_hidden_size = 4 * self.hidden_size + + if self.kv_channels is None: + self.kv_channels = self.hidden_size // self.num_attention_heads + + if self.num_query_groups is None: + self.num_query_groups = self.num_attention_heads + + if self.num_query_groups % self.tensor_model_parallel_size != 0: + raise ValueError( + f"num_query_groups ({self.num_query_groups}) must be a multiple of " + f"tensor_model_parallel_size ({self.tensor_model_parallel_size})." 
+ ) + + if self.apply_query_key_layer_scaling: + self.attention_softmax_in_fp32 = True + + if self.expert_model_parallel_size > 1 and self.num_moe_experts is None: + raise ValueError('num_moe_experts must be non None to use expert-parallel.') + + if self.num_moe_experts is not None and self.num_moe_experts <= 0: + raise ValueError('num_moe_experts must be non-negative.') + + if self.moe_shared_expert_intermediate_size is not None: + if self.moe_shared_expert_intermediate_size <= 0: + raise ValueError( + f'moe_shared_expert_intermediate_size must be ' + f'num_shared_experts * ffn_size_of_each_shared_expert, ' + f'but got {self.moe_shared_expert_intermediate_size}' + ) + if self.moe_shared_expert_overlap and self.moe_token_dispatcher_type not in [ + "alltoall" + ]: + raise ValueError( + f'moe_shared_expert_overlap only works with alltoall token dispatcher.' + ) + + if self.moe_expert_capacity_factor is not None: + if self.moe_token_dispatcher_type not in ["alltoall", "alltoall_seq"]: + raise ValueError( + 'moe_expert_capacity_factor only works with alltoall token dispatcher' + ) + if self.moe_expert_capacity_factor < 0: + self.moe_expert_capacity_factor = None + if self.moe_router_load_balancing_type not in ["aux_loss", "none"]: + raise ValueError( + 'moe_expert_capacity_factor only works with aux_loss or none load balancing' + ) + + if self.moe_pad_expert_input_to_capacity: + if self.moe_expert_capacity_factor is None: + raise ValueError( + 'moe_expert_capacity_factor must be set to use moe_pad_expert_input_to_capacity' + ) + + if self.cpu_offloading and ( + self.cpu_offloading_num_layers < 0 or self.cpu_offloading_num_layers >= self.num_layers + ): + raise ValueError( + f'CPU offloading can be done only for layers less than {self.num_layers}' + ) + + if self.cpu_offloading and self.pipeline_model_parallel_size > 1: + raise ValueError( + 'Currently there is no support for Pipeline parallelism with CPU offloading' + ) + + if self.cpu_offloading and self.recompute_granularity is not None: + raise ValueError( + 'CPU offloading does not work when activation recomputation is enabled' + ) + + if self.recompute_granularity is not None: + if self.recompute_granularity not in ['full', 'selective']: + raise ValueError( + f'When using recompute_granuarlity: {self.recompute_granularity} must be "full"' + 'or "selective".' + ) + + if self.recompute_method is not None: + if self.recompute_method not in ['block', 'uniform']: + raise ValueError( + f'recompute_method: {self.recompute_method} must be "block" or "uniform".' + ) + elif self.recompute_granularity != 'selective': + raise ValueError( + f'Using recompute_granularity: {self.recompute_granularity} so ' + 'recompute_method must be "block" or "uniform"' + ) + + if self.recompute_granularity != 'selective' and self.recompute_num_layers is None: + raise ValueError( + f'When using recompute_granularity: {self.recompute_granularity} ' + 'recompute_num_layers must be between ' + '1 and num_layers_per_pipeline_rank: ' + f'{self.num_layers // self.pipeline_model_parallel_size}' + ) + elif ( + self.recompute_granularity == 'selective' and self.recompute_num_layers is not None + ): + raise ValueError( + f'When using recompute_granularity: {self.recompute_granularity} ' + 'recompute_num_layers must be None.' 
+ ) + + if self.distribute_saved_activations and self.sequence_parallel: + raise ValueError( + f'distribute_saved_activations: {self.distribute_saved_activations} must be ' + f'false when sequence parallel is enabled: {self.sequence_parallel}' + ) + + if self.virtual_pipeline_model_parallel_size is not None: + if not self.num_layers % self.virtual_pipeline_model_parallel_size == 0: + raise ValueError( + f'num_layers: {self.num_layers} must be divisible by ' + f'virtual_model_parallel_size {self.virtual_pipeline_model_parallel_size}' + ) + + if self.apply_query_key_layer_scaling: + self.attention_softmax_in_fp32 = True + + if self.bias_activation_fusion: + if self.activation_func not in [F.gelu, F.silu]: + raise ValueError( + "When bias_activation_fusion is True, activation function should be either " + "gelu or swiglu" + ) + if ( + self.activation_func == F.gelu + and not self.gated_linear_unit + and not self.add_bias_linear + ): + raise ValueError( + "When bias_activation_fusion is True, gated_linear_unit is False, " + "and activation function is gelu, add_bias_linear must also be True." + ) + if self.activation_func_fp8_input_store: + if self.activation_func != F.silu or not self.gated_linear_unit: + raise ValueError("Storing activation input in FP8 is supported only for SwiGLU.") + if self.apply_rope_fusion and self.rotary_interleaved: + raise ValueError('rotary_interleaved does not work with apply_rope_fusion.') + + if self.init_method is None: + self.init_method = init_method_normal(self.init_method_std) + + if self.output_layer_init_method is None: + self.output_layer_init_method = scaled_init_method_normal( + self.init_method_std, self.num_layers + ) + + if self.moe_extended_tp: + if self.moe_token_dispatcher_type != 'allgather': + raise ValueError( + "Moe extended TP parallelism only applies to allgather based token dispatcher." + ) + extended_tp_size = self.tensor_model_parallel_size * self.expert_model_parallel_size + if self.ffn_hidden_size % extended_tp_size != 0: + raise ValueError( + f'ffn_hidden_size: {self.ffn_hidden_size} must be divisible by ' + f'extended_tp_size {extended_tp_size}' + ) + + if self.num_moe_experts and self.fp8: + # TE version below 1.7.0 will raise Error when handle zeros tokens for expert + if not is_te_min_version("1.7.0.dev0"): + raise ValueError( + "Only transformer-engine>=1.7.0 supports MoE FP8 training, " + f"but your version is {get_te_version()}." + ) + + if self.moe_grouped_gemm: + raise ValueError("Grouped GEMM of MoE not support fp8 for now.") + + +@dataclass +class MLATransformerConfig(TransformerConfig): + """Configuration object for megatron-core Multi-Latent Attention (MLA) transformers. + + The initialization function has an argument for each parameter, including those in + ModelParallelConfig. Included YaRN RoPE parameters that is fused in MLA. + """ + + multi_latent_attention: bool = True + """Whether to use Multi-Latent Attention.""" + + q_lora_rank: int = 512 + """Rank of Query tensor's low rank representation.""" + + kv_lora_rank: int = 512 + """Rank of Key and Value tensors' low rank representation.""" + + qk_head_dim: int = 128 + """Dimension of the head in the QK projection. 
q_head_dim = qk_head_dim + qk_pos_emb_head_dim""" + + qk_pos_emb_head_dim: int = 64 + """Dimension of the position embedding in the QK projection.""" + + v_head_dim: int = 128 + """Dimension of the head in the V projection.""" + + rotary_base: float = 10000 + """Rotary base for the rotary embeddings.""" + + rotary_scaling_factor: float = 40 + """Rotary scaling factor for the rotary embeddings.""" + + normalization: str = "RMSNorm" + """Default normalization layer for MLA models is RMSNorm.""" + + max_position_embeddings: int = 163840 + """Maximum position embeddings for the original model.""" + + beta_fast: float = 32 + """Beta fast for YaRN RoPE.""" + + beta_slow: float = 1 + """Beta slow for YaRN RoPE.""" + + mscale: float = 0.707 + """Mscale for YaRN RoPE in Multi-Latent Attention.""" + + mscale_all_dim: float = 0.707 + """Mscale all dimensions for YaRN RoPE in Multi-Latent Attention.""" diff --git a/megatron/core/transformer/transformer_layer.py b/megatron/core/transformer/transformer_layer.py new file mode 100644 index 0000000000..7f5f14944e --- /dev/null +++ b/megatron/core/transformer/transformer_layer.py @@ -0,0 +1,377 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +from abc import ABC +from dataclasses import dataclass, field +from typing import Dict, Optional, Union + +import torch + +from megatron.core import parallel_state +from megatron.core.dist_checkpointing.mapping import ShardedStateDict +from megatron.core.dist_checkpointing.utils import apply_prefix_mapping +from megatron.core.transformer.cuda_graphs import CudaGraphManager +from megatron.core.transformer.identity_op import IdentityFuncOp, IdentityOp +from megatron.core.transformer.module import MegatronModule +from megatron.core.transformer.spec_utils import ModuleSpec, build_module +from megatron.core.transformer.transformer_config import TransformerConfig +from megatron.core.utils import make_viewless_tensor + + +@dataclass +class TransformerLayerSubmodules: + """ + Configuration class for specifying the submodules of a transformer layer. + + This class defines the structure and default implementations for various + components of a transformer layer, allowing for flexible customization + of the layer's architecture. + + Args: + input_layernorm (Union[ModuleSpec, type]): Specification for the input layer normalization. + self_attention (Union[ModuleSpec, type]): Specification for the self-attention mechanism. + self_attn_bda (Union[ModuleSpec, type]): Specification for the bias-dropout-add operation + after self-attention. + pre_cross_attn_layernorm (Union[ModuleSpec, type]): Specification for the layer + normalization before cross-attention. + cross_attention (Union[ModuleSpec, type]): Specification for the cross-attention mechanism. + cross_attn_bda (Union[ModuleSpec, type]): Specification for the bias-dropout-add operation + after cross-attention. + pre_mlp_layernorm (Union[ModuleSpec, type]): Specification for the layer normalization + before the MLP. + mlp (Union[ModuleSpec, type]): Specification for the MLP. + mlp_bda (Union[ModuleSpec, type]): Specification for the bias-dropout-add operation + after the MLP. + sharded_state_dict_keys_map (Dict[str, str]): Mapping for sharded tensor keys to be applied + in the `sharded_state_dict` method. 
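+
+    Example:
+        Illustrative sketch of how these submodules are typically wired into a
+        layer spec; the attention and MLP specs below are placeholders, not
+        defaults:
+
+            layer_spec = ModuleSpec(
+                module=TransformerLayer,
+                submodules=TransformerLayerSubmodules(
+                    self_attention=my_self_attention_spec,
+                    mlp=my_mlp_spec,
+                ),
+            )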
+ """ + + input_layernorm: Union[ModuleSpec, type] = IdentityOp + self_attention: Union[ModuleSpec, type] = IdentityOp + self_attn_bda: Union[ModuleSpec, type] = IdentityFuncOp + + pre_cross_attn_layernorm: Union[ModuleSpec, type] = IdentityOp + cross_attention: Union[ModuleSpec, type] = IdentityOp + cross_attn_bda: Union[ModuleSpec, type] = IdentityFuncOp + + pre_mlp_layernorm: Union[ModuleSpec, type] = IdentityOp + mlp: Union[ModuleSpec, type] = IdentityOp + mlp_bda: Union[ModuleSpec, type] = IdentityFuncOp + + # Mapping for sharded tensor keys to be applied in `sharded_state_dict` method + sharded_state_dict_keys_map: Dict[str, str] = field(default_factory=dict) + + +class BaseTransformerLayer(ABC): + """A common parent class for `TransformerLayer` like implementations. + + A dummy class that is subclassed by similar `TransformerLayer`s e.g. the + `TransformerLayer` in this file and possibly other `TransformerLayer` + implementations that aim to use `TransformerBlock` as the base module. + The main purpose is to check if any layer (or module) provided in the spec + is a subclass of this class to allow fanning-out of that spec for all the + layers in the `TransformerBlock`. See `_get_block_submodules` method + implementation in `transformer_block.py` file for more details. + """ + + def __init__(self): + pass + + +class TransformerLayer(MegatronModule, BaseTransformerLayer): + """A single transformer layer. + + Transformer layer takes input with size [s, b, h] and returns an + output of the same size. + """ + + def __init__( + self, + config: TransformerConfig, + submodules: TransformerLayerSubmodules, + layer_number: int = 1, + hidden_dropout: float = None, + ): + super().__init__(config=config) + + if config.enable_cuda_graph and self.training: + assert ( + not config.cpu_offloading and config.recompute_granularity is None + ), "Cudagraphs not supported" + self.cudagraph_manager = CudaGraphManager() + + self.submodules_config = submodules + self.layer_number = layer_number + self._get_layer_offset() + self.hidden_dropout = config.hidden_dropout if hidden_dropout is None else hidden_dropout + + # [Module 1: Input Layernorm] Optional Layernorm on the input data + # TODO: add pytorch only layernorm + self.input_layernorm = build_module( + submodules.input_layernorm, + config=self.config, + hidden_size=self.config.hidden_size, + eps=self.config.layernorm_epsilon, + ) + + # [Module 2: SelfAttention] + self.self_attention = build_module( + submodules.self_attention, config=self.config, layer_number=layer_number + ) + + # [Module 3: BiasDropoutFusion] + self.self_attn_bda = build_module(submodules.self_attn_bda) + + # [Module 4: Post SelfAttention] Optional Layernorm after self-attn + self.pre_cross_attn_layernorm = build_module( + submodules.pre_cross_attn_layernorm, + config=self.config, + hidden_size=self.config.hidden_size, + eps=self.config.layernorm_epsilon, + ) + + # [Module 5: CrossAttention] + self.cross_attention = build_module( + submodules.cross_attention, config=self.config, layer_number=layer_number + ) + + # [Module 6: BiasDropoutFusion] + self.cross_attn_bda = build_module(submodules.cross_attn_bda, config=self.config) + + # [Module 7: Pre MLP] Optional Layernorm before MLP + self.pre_mlp_layernorm = build_module( + submodules.pre_mlp_layernorm, + config=self.config, + hidden_size=self.config.hidden_size, + eps=self.config.layernorm_epsilon, + ) + + # [Module 8: MLP block] + # TODO how to set the gpt_layer_spec.py when we have moe_frequency > 1, + # where MLP and MoE layer 
both appear alternately? + self.mlp = build_module(submodules.mlp, config=self.config) + if hasattr(self.mlp, 'set_layer_number'): + self.mlp.set_layer_number(self.layer_number) + + # [Module 9: BiasDropoutFusion] + self.mlp_bda = build_module(submodules.mlp_bda) + + # @jcasper how should we handle nvfuser? + # Set bias+dropout+add fusion grad_enable execution handler. + # TORCH_MAJOR = int(torch.__version__.split('.')[0]) + # TORCH_MINOR = int(torch.__version__.split('.')[1]) + # use_nvfuser = TORCH_MAJOR > 1 or (TORCH_MAJOR == 1 and TORCH_MINOR >= 10) + # self.bias_dropout_add_exec_handler = nullcontext if use_nvfuser else torch.enable_grad + self.bias_dropout_add_exec_handler = torch.enable_grad + + def _get_layer_offset(self): + """Get the index number of this layer, given the level of pipelining.""" + pipeline_rank = parallel_state.get_pipeline_model_parallel_rank() + + num_layers_per_pipeline_rank = ( + self.config.num_layers // self.config.pipeline_model_parallel_size + ) + + if parallel_state.get_virtual_pipeline_model_parallel_world_size() is not None: + vp_rank = parallel_state.get_virtual_pipeline_model_parallel_rank() + vp_size = parallel_state.get_virtual_pipeline_model_parallel_world_size() + + total_num_layers = self.config.num_layers + num_layers_per_virtual_rank = num_layers_per_pipeline_rank // vp_size + total_virtual_chunks = total_num_layers // vp_size + offset = vp_rank * total_virtual_chunks + (pipeline_rank * num_layers_per_virtual_rank) + + else: + # Each stage gets a contiguous set of layers. + if parallel_state.get_pipeline_model_parallel_world_size() > 1: + if ( + self.config.first_pipeline_num_layers is not None + or self.config.last_pipeline_num_layers is not None + ): + # Calculate number of pipelines for distributing layers + middle_pipeline_stages = parallel_state.get_pipeline_model_parallel_world_size() + middle_pipeline_stages -= sum( + [ + 1 if x is not None else 0 + for x in ( + self.config.first_pipeline_num_layers, + self.config.last_pipeline_num_layers, + ) + ] + ) + + # Calculate layers to distribute + first_pipeline_offset = ( + 0 + if self.config.first_pipeline_num_layers is None + else self.config.first_pipeline_num_layers + ) + last_pipeline_offset = ( + 0 + if self.config.first_pipeline_num_layers is None + else self.config.last_pipeline_num_layers + ) + + middle_num_layers = ( + self.config.num_layers - first_pipeline_offset - last_pipeline_offset + ) + + if middle_pipeline_stages > 0: + num_layers_per_pipeline_rank = middle_num_layers // middle_pipeline_stages + else: + num_layers_per_pipeline_rank = 0 + + middle_pipeline_rank = ( + pipeline_rank + if self.config.first_pipeline_num_layers is None + else pipeline_rank - 1 + ) + + if pipeline_rank == 0: + offset = 0 + else: + offset = ( + middle_pipeline_rank * num_layers_per_pipeline_rank + ) + first_pipeline_offset + else: + offset = pipeline_rank * num_layers_per_pipeline_rank + else: + offset = 0 + + return offset + + def forward( + self, + hidden_states, + attention_mask=None, + context=None, + context_mask=None, + rotary_pos_emb=None, + inference_params=None, + packed_seq_params=None, + ): + """ + Perform a forward pass through the transformer layer. + + This method implements the core computation of a transformer layer, including + self-attention, cross-attention (if applicable), and feed-forward operations. + + Args: + hidden_states (Tensor): Input tensor of shape [s, b, h] where s is sequence length, + b is batch size, and h is hidden size. 
+ attention_mask (Tensor): Mask tensor for self-attention. + context (Tensor, optional): Context tensor for cross-attention. + context_mask (Tensor, optional): Mask tensor for cross-attention. + rotary_pos_emb (Tensor, optional): Rotary positional embeddings. + inference_params (object, optional): Parameters for inference-time optimizations. + packed_seq_params (object, optional): Parameters for packed sequence processing. + + Returns: + Tuple[Tensor, Tensor]: A tuple containing: + output (Tensor): Transformed hidden states of shape [s, b, h]. + context (Tensor): Updated context tensor if cross-attention is used, + otherwise None. + """ + + # Residual connection. + residual = hidden_states + + # Optional Input Layer norm + input_layernorm_output = self.input_layernorm(hidden_states) + + # Self attention. + attention_output_with_bias = self.self_attention( + input_layernorm_output, + attention_mask=attention_mask, + inference_params=inference_params, + rotary_pos_emb=rotary_pos_emb, + packed_seq_params=packed_seq_params, + ) + + # TODO: could we move `bias_dropout_add_exec_handler` itself + # inside the module provided in the `bias_dropout_add_spec` module? + with self.bias_dropout_add_exec_handler(): + hidden_states = self.self_attn_bda(self.training, self.config.bias_dropout_fusion)( + attention_output_with_bias, residual, self.hidden_dropout + ) + + # Residual connection. + residual = hidden_states + + # Optional Layer norm after self-attention + pre_cross_attn_layernorm_output = self.pre_cross_attn_layernorm(hidden_states) + + # Cross attention. + attention_output_with_bias = self.cross_attention( + pre_cross_attn_layernorm_output, + attention_mask=context_mask, + key_value_states=context, + inference_params=inference_params, + ) + + if isinstance(attention_output_with_bias, dict) and "context" in attention_output_with_bias: + context = attention_output_with_bias["context"] + + # TODO: could we move `bias_dropout_add_exec_handler` itself + # inside the module provided in the `bias_dropout_add_spec` module? + with self.bias_dropout_add_exec_handler(): + hidden_states = self.cross_attn_bda(self.training, self.config.bias_dropout_fusion)( + attention_output_with_bias, residual, self.hidden_dropout + ) + + # Residual connection. + residual = hidden_states + + # Optional Layer norm post the cross-attention. + pre_mlp_layernorm_output = self.pre_mlp_layernorm(hidden_states) + + # MLP. + mlp_output_with_bias = self.mlp(pre_mlp_layernorm_output) + + # TODO: could we move `bias_dropout_add_exec_handler` itself + # inside the module provided in the `bias_dropout_add_spec` module? + with self.bias_dropout_add_exec_handler(): + hidden_states = self.mlp_bda(self.training, self.config.bias_dropout_fusion)( + mlp_output_with_bias, residual, self.hidden_dropout + ) + + # Jit compiled function creates 'view' tensor. This tensor + # potentially gets saved in the MPU checkpoint function context, + # which rejects view tensors. While making a viewless tensor here + # won't result in memory savings (like the data loader, or + # p2p_communication), it serves to document the origin of this + # 'view' tensor. 
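+        # keep_graph=True routes this through the MakeViewlessTensor autograd
+        # function, so the computation graph is still propagated through 'output'.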
+ output = make_viewless_tensor( + inp=hidden_states, requires_grad=hidden_states.requires_grad, keep_graph=True + ) + + # CUDA graph requires returned values to be Tensors + if self.config.external_cuda_graph and self.training: + return output + return output, context + + def sharded_state_dict( + self, prefix: str = '', sharded_offsets: tuple = (), metadata: Optional[dict] = None + ) -> ShardedStateDict: + """ + Generate a sharded state dictionary for the transformer layer. + + Args: + prefix (str, optional): Prefix to be added to all keys in the state dict. + sharded_offsets (tuple, optional): Tuple of sharding offsets. + metadata (Optional[dict], optional): Additional metadata for sharding. + + Returns: + ShardedStateDict: A dictionary containing the sharded state of the transformer layer. + """ + sharded_state_dict = super().sharded_state_dict(prefix, sharded_offsets, metadata) + prefixed_map = { + f'{prefix}{k}': f'{prefix}{v}' + for k, v in self.submodules_config.sharded_state_dict_keys_map.items() + } + if prefixed_map: + apply_prefix_mapping(sharded_state_dict, prefixed_map) + return sharded_state_dict + + def __call__(self, *args, **kwargs): + if hasattr(self, 'cudagraph_manager'): + return self.cudagraph_manager(self, args, kwargs) + return super(MegatronModule, self).__call__(*args, **kwargs) diff --git a/megatron/core/transformer/utils.py b/megatron/core/transformer/utils.py new file mode 100644 index 0000000000..4781b68d2a --- /dev/null +++ b/megatron/core/transformer/utils.py @@ -0,0 +1,188 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +"""Utilities for transformer layers.""" +from functools import lru_cache +from operator import itemgetter +from typing import Any, Dict, Iterable, Iterator, Optional, Tuple, Union + +import torch + +from megatron.core import parallel_state +from megatron.core.dist_checkpointing.mapping import ShardedObject, ShardedStateDict, StateDict +from megatron.core.jit import jit_fuser +from megatron.core.utils import ( + make_sharded_tensor_for_checkpoint, + make_tp_sharded_tensor_for_checkpoint, +) + + +def get_linear_layer(rows, columns, init_method, perform_initialization=True): + """Simple linear layer with weight initialization.""" + layer = torch.nn.Linear(rows, columns) + if perform_initialization: # Take from modelparallel config + init_method(layer.weight) + with torch.no_grad(): + layer.bias.zero_() + return layer + + +@lru_cache(maxsize=32) +def get_default_causal_mask(sq: int) -> torch.Tensor: + """Return the causal upper triangular mask for softmax input.""" + return torch.triu(torch.ones(sq, sq, device="cuda"), diagonal=1).bool() + + +def attention_mask_func(attention_scores, attention_mask): + attention_scores.masked_fill_(attention_mask, -10000.0) + return attention_scores + + +@jit_fuser +def gelu_impl(x): + """OpenAI's gelu implementation.""" + return 0.5 * x * (1.0 + torch.tanh(0.7978845608028654 * x * (1.0 + 0.044715 * x * x))) + + +def openai_gelu(x): + return gelu_impl(x) + + +# This is actually Python equivalent of torch.nn.functional.gelu(), also with type hints for ONNX exporter +@jit_fuser +def erf_gelu(x): + return ( + x * 0.5 * (torch.erf(x / 1.41421).to(dtype=x.dtype) + torch.ones_like(x).to(dtype=x.dtype)) + ) + + +def make_sharded_tensors_for_checkpoint( + state_dict: StateDict, + prefix: str, + tensor_parallel_layers_axis_map: Optional[Dict[str, int]] = None, + sharded_offsets: Iterable[Tuple[int, int, int]] = (), + extra_state_suffix: str = '_extra_state', +): + """Wraps tensors from transformer 
layers with ShardedTensor or ShardedObject. + + For a given `state_dict`, wraps: + - all _extra_states with ShardedObject + - all tensors specified in tensor_parallel_layers_axis_map with TP and DP sharded ShardedTensor + - other values with DP sharded ShardedTensor + + Args: + state_dict (StateDict): state_dict to convert + prefix (str): prefix appended to keys in final state dict + tensor_parallel_layers_axis_map (Dict[str, int], optional): dict mapping layer + names to the axis for TP sharding + sharded_offsets (Iterable[Tuple[int, int, int]], optional): sharding already + applied (e.g. PP related), passed along to ShardedTensor + extra_state_suffix (str, default = '_extra_state'): layers with this + suffix will be wrapped with ShardedObject instead of ShardedTensor. + + """ + + if tensor_parallel_layers_axis_map is None: + tensor_parallel_layers_axis_map = {} + + sharded_state_dict = {} + for layer_name in state_dict.keys(): + tensor = state_dict[layer_name] + layer_key = f'{prefix}{layer_name}' + + if layer_name.endswith(extra_state_suffix): + sharded_state_dict[layer_key] = make_sharded_object_for_checkpoint( + tensor, layer_key, sharded_offsets + ) + + elif layer_name in tensor_parallel_layers_axis_map: + tp_axis = tensor_parallel_layers_axis_map[layer_name] + sharded_state_dict[layer_key] = make_tp_sharded_tensor_for_checkpoint( + tensor, layer_key, tp_axis, prepend_offsets=sharded_offsets + ) + + else: + sharded_state_dict[layer_key] = make_sharded_tensor_for_checkpoint( + tensor, layer_key, prepend_offsets=sharded_offsets + ) + + return sharded_state_dict + + +def make_sharded_object_for_checkpoint( + obj: Any, + key: str, + sharded_offsets: Iterable[Tuple[int, int, int]] = (), + replica_id: Union[None, int, Tuple[int, ...]] = None, + **kwargs, +): + """Helper for instantiating a non-sharded ShardedObject (replicated across TP and DP group). + + Args: + obj (object): any object to be sharded + key (str): unique identifier of the object + sharded_offsets (Iterable[Tuple[int, int, int]]): offsets normally + prepended to ShardedTensors, will be used as global offsets for + ShardedObject + replica_id (Union[None, int, Tuple[int, ...]]): replica id + """ + if replica_id is None: + replica_id = ( + 0, + parallel_state.get_tensor_model_parallel_rank(), + parallel_state.get_data_parallel_rank(with_context_parallel=True), + ) + + return ShardedObject(key, obj, *_get_extra_state_offsets(sharded_offsets), replica_id, **kwargs) + + +def _get_extra_state_offsets( + sharded_offsets: Iterable[Tuple[int, int, int]] +) -> Tuple[Tuple[int, ...], Tuple[int, ...]]: + """Turns ShardedTensor offsets into offsets suitable for ShardedObject.""" + if sharded_offsets: + sharded_offsets = sorted(sharded_offsets, key=itemgetter(0)) # sort by axis + axis, extra_state_offset, extra_state_shape = zip(*sharded_offsets) + assert list(axis) == list( + range(len(axis)) + ), f'Expected contiguous axis for offsets: {sharded_offsets}' + else: + extra_state_shape = (1,) + extra_state_offset = (0,) + return extra_state_shape, extra_state_offset + + +def sharded_state_dict_default( + module: torch.nn.Module, + prefix: str = '', + sharded_offsets: Tuple[Tuple[int, int, int]] = (), + metadata: Optional[dict] = None, +) -> ShardedStateDict: + """Provides implementation for sharded_state_dict method for non-MegatronModules. + + Tries to call `module.sharded_state_dict` when possible, + otherwise uses regular state dict and assumes tensors are replicated across TP and DP. 
+ + `keep_vars=True` is passed to module.state_dict so that optimizer states + can be sharded later on. + + Args: + module (torch.nn.Module): module which sharded state dict we want to obtain + prefix (str): prefix for the state dict keys + sharded_offsets (Tuple[Tuple[int, int, int]], optional): sharding already + applied (e.g. PP related) by sup-modules. Passed along to ShardedTensor + metadata (dict, optional): metadata passed to module sharded_state_dict method + + Returns: + dict: dictionary of state dict keys mapped to ShardedTensors + """ + + if hasattr(module, 'sharded_state_dict'): + module_sharded_sd = module.sharded_state_dict( + prefix=prefix, sharded_offsets=sharded_offsets, metadata=metadata + ) + else: + module_sd = module.state_dict(prefix='', keep_vars=True) + module_sharded_sd = make_sharded_tensors_for_checkpoint( + module_sd, prefix, {}, sharded_offsets + ) + return module_sharded_sd diff --git a/megatron/core/utils.py b/megatron/core/utils.py index 7214b0c271..f3910926ab 100644 --- a/megatron/core/utils.py +++ b/megatron/core/utils.py @@ -1,19 +1,63 @@ # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. """Utility functions used throughout Megatron core""" -from functools import reduce +import array +import hashlib +import logging +import math import operator +import queue +import socket +import sys +import threading +import time +import traceback +from dataclasses import dataclass +from datetime import datetime +from functools import reduce +from importlib.metadata import version +from types import TracebackType +from typing import Any, Dict, List, Optional, Tuple, Type, Union import torch +from packaging.version import Version as PkgVersion from megatron.core import parallel_state +from megatron.core.dist_checkpointing.mapping import ShardedTensor + +logger = logging.getLogger(__name__) + + +_te_version = None + + +def get_te_version(): + """Get TE version from __version__; if not available use pip's. Use caching.""" + + def get_te_version_str(): + import transformer_engine as te + + if hasattr(te, '__version__'): + return str(te.__version__) + else: + return version("transformer-engine") + + global _te_version + if _te_version is None: + _te_version = PkgVersion(get_te_version_str()) + return _te_version + + +def is_te_min_version(version, check_equality=True): + """Check if minimum version of `transformer-engine` is installed.""" + if check_equality: + return get_te_version() >= PkgVersion(version) + return get_te_version() > PkgVersion(version) def ensure_divisibility(numerator, denominator): """Ensure that numerator is divisible by the denominator.""" - assert numerator % denominator == 0, "{} is not divisible by {}".format( - numerator, denominator - ) + assert numerator % denominator == 0, "{} is not divisible by {}".format(numerator, denominator) def divide(numerator, denominator): @@ -22,22 +66,53 @@ def divide(numerator, denominator): ensure_divisibility(numerator, denominator) return numerator // denominator -def get_attr_wrapped_model(model, attr): - """Get an attribute from a wrapped model""" + +def get_attr_wrapped_model(model, attr, allow_none=True, return_model_obj=False): + """Get an attribute from a wrapped model. 
+ If return_model_obj is true, return the object that has the 'attr' attribute; + otherwise, return the attribute directly.""" if isinstance(model, list): raise RuntimeError("_get_attr_wrapped_model given a list of models") - while not hasattr(model, attr): + if allow_none: + + def condition(model, attr): + return not hasattr(model, attr) + + else: + + def condition(model, attr): + return getattr(model, attr, None) is None + + while condition(model, attr): if not hasattr(model, "module"): raise RuntimeError(f"_get_attr_wrapped_model couldn't find attribute {attr}") model = model.module + + if return_model_obj: + return model return getattr(model, attr) + def get_model_type(model): + """Returns model_type attribute""" return get_attr_wrapped_model(model, 'model_type') +def get_model_xattn(model): + """Returns whether the model has the xattn_needed attribute""" + try: + return get_attr_wrapped_model(model, 'xattn_needed') + except RuntimeError: + return False + + +def get_model_config(model): + """Returns the config attribute, allowed to return None""" + return get_attr_wrapped_model(model, 'config', allow_none=False) + + class GlobalMemoryBuffer: """Global buffer to avoid dynamic memory allocations. Caller should ensure that buffers of the same name @@ -47,60 +122,65 @@ def __init__(self): self.buffer = {} def get_tensor(self, tensor_shape, dtype, name): + """ + Returns (potentially) a sub-tensor from the self.buffer for the given shape. + """ required_len = reduce(operator.mul, tensor_shape, 1) - if self.buffer.get((name, dtype), None) is None or \ - self.buffer[(name, dtype)].numel() < required_len: - self.buffer[(name, dtype)] = \ - torch.empty(required_len, - dtype=dtype, - device=torch.cuda.current_device(), - requires_grad=False) + if ( + self.buffer.get((name, dtype), None) is None + or self.buffer[(name, dtype)].numel() < required_len + ): + self.buffer[(name, dtype)] = torch.empty( + required_len, dtype=dtype, device=torch.cuda.current_device(), requires_grad=False + ) return self.buffer[(name, dtype)][0:required_len].view(*tensor_shape) + def _kernel_make_viewless_tensor(inp, requires_grad): - '''Make a viewless tensor. + """Make a viewless tensor. View tensors have the undesirable side-affect of retaining a reference to the originally-viewed tensor, even after manually setting the '.data' field. This method creates a new tensor that links to the old tensor's data, without linking the viewed tensor, referenced via the '._base' field. - ''' - out = torch.empty( - (1,), - dtype = inp.dtype, - device = inp.device, - requires_grad = requires_grad, - ) + """ + out = torch.empty((1,), dtype=inp.dtype, device=inp.device, requires_grad=requires_grad) out.data = inp.data return out + class MakeViewlessTensor(torch.autograd.Function): - ''' + """ Autograd function to make a viewless tensor. This function should be used in cases where the computation graph needs to be propagated, but we only want a viewless tensor (e.g., ParallelTransformer's hidden_states). Call this function by passing 'keep_graph = True' to 'make_viewless_tensor()'. - ''' + """ + @staticmethod def forward(ctx, inp, requires_grad): + """Runs the fwd pass of _kernel_make_viewless_tensor""" return _kernel_make_viewless_tensor(inp, requires_grad) + @staticmethod def backward(ctx, grad_output): + """No-op""" return grad_output, None + def make_viewless_tensor(inp, requires_grad, keep_graph): - ''' + """ Entry-point for creating viewless tensors. 
This method should be used, rather than calling 'MakeViewlessTensor' or '_kernel_make_viewless_tensor' directly. This method acts as a switch for determining if an autograd function or a regular method should be used to create the tensor. - ''' + """ # return tensor as-is, if not a 'view' if inp._base is None: @@ -112,11 +192,12 @@ def make_viewless_tensor(inp, requires_grad, keep_graph): else: return _kernel_make_viewless_tensor(inp, requires_grad) -def assert_viewless_tensor(tensor, extra_msg = None): - '''Assert that a tensor is not a view (i.e., its '._base' field is - not set).''' + +def assert_viewless_tensor(tensor, extra_msg=None): + """Assert that a tensor is not a view (i.e., its '._base' field is + not set).""" if isinstance(tensor, list): - [ assert_viewless_tensor(t) for t in tensor ] + [assert_viewless_tensor(t) for t in tensor] return tensor if not isinstance(tensor, torch.Tensor): return tensor @@ -127,11 +208,1100 @@ def assert_viewless_tensor(tensor, extra_msg = None): ) % extra_msg return tensor + def safely_set_viewless_tensor_data(tensor, new_data_tensor): - '''Safely set tensor's '.data' field. + """Safely set tensor's '.data' field. Check first that the tensor is viewless (i.e., '._base' not set). If not, raise an exception. - ''' - assert_viewless_tensor(tensor, extra_msg = "FYI, tensor._base has shape %s, and new_data_tensor has shape %s." % ("--" if tensor._base is None else tensor._base.shape, new_data_tensor.shape)) + """ + assert_viewless_tensor( + tensor, + extra_msg="FYI, tensor._base has shape %s, and new_data_tensor has shape %s." + % ("--" if tensor._base is None else tensor._base.shape, new_data_tensor.shape), + ) tensor.data = new_data_tensor + + +def init_method_normal(sigma): + """Init method based on N(0, sigma).""" + + def init_(tensor): + return torch.nn.init.normal_(tensor, mean=0.0, std=sigma) + + return init_ + + +def scaled_init_method_normal(sigma, num_layers): + """Init method based on N(0, sigma/sqrt(2*num_layers).""" + std = sigma / math.sqrt(2.0 * num_layers) + + def init_(tensor): + return torch.nn.init.normal_(tensor, mean=0.0, std=std) + + return init_ + + +def log_single_rank(logger: logging.Logger, *args: Any, rank: int = 0, **kwargs: Any): + """If torch distributed is initialized, log only on rank + + Args: + logger (logging.Logger): The logger to write the logs + + args (Tuple[Any]): All logging.Logger.log positional arguments + + rank (int, optional): The rank to write on. Defaults to 0. 
+ + kwargs (Dict[str, Any]): All logging.Logger.log keyword arguments + """ + if torch.distributed.is_initialized(): + if torch.distributed.get_rank() == rank: + logger.log(*args, **kwargs) + else: + logger.log(*args, **kwargs) + + +def log_on_each_pipeline_stage(logger: logging.Logger, *args: Any, **kwargs: Any): + """Log on first rank in each pipeline stage + + Args: + logger (logging.Logger): The logger to write the logs + + args (Tuple[Any]): All logging.Logger.log positional arguments + + kwargs (Dict[str, Any]): All logging.Logger.log keyword arguments + """ + assert torch.distributed.is_initialized() + + if ( + parallel_state.get_data_parallel_rank(with_context_parallel=True) == 0 + and parallel_state.get_tensor_model_parallel_rank() == 0 + ): + logger.log(*args, **kwargs) + + +def check_param_hashes_across_dp_replicas( + model: List[torch.nn.Module], cross_check: bool = False +) -> bool: + """Computes hashes of all parameters in model, all-gathers hashes across DP replicas, + and then checks for equality between the locally-computed hashes and those of other ranks. + + NOTE: This function computes SHA-1 hashes on the CPU and thus needs to move all param + tensors from GPU to CPU first; as a result, this function is not intended to be called + very frequently in the main training loop. + + Args: + model (List[torch.nn.Module]): List of model chunks whose parameter hashes need to + be checked. + cross_check (bool): If true, will check whether hashes match across all DP replicas. + + Returns: + True if all param hashes match with corresponding hash on DP replica 0 or + across all replicas if cross_check is enabled, False otherwise. + """ + + # Compute per-parameter hashes on this rank. + params = [] + local_param_hashes = [] + for model_chunk_id, model_chunk in enumerate(model): + for param_name, param in model_chunk.named_parameters(): + param_hash = torch.frombuffer( + array.array( + 'B', hashlib.sha1(param.data.to("cpu").float().numpy(force=True)).digest() + ), + dtype=torch.uint8, + ) + params.append((model_chunk_id, param_name, param)) + local_param_hashes.append(param_hash) + local_param_hashes = torch.stack(local_param_hashes) + + # Collect per-parameter hashes across all ranks in DP group. + all_param_hashes = [ + torch.zeros_like(local_param_hashes) + for _ in range(parallel_state.get_data_parallel_world_size()) + ] + torch.distributed.all_gather( + all_param_hashes, local_param_hashes, group=parallel_state.get_data_parallel_group_gloo() + ) + + # Make sure local per-parameter hash matches DP rank 0. + param_hashes_match = torch.equal(local_param_hashes, all_param_hashes[0]) + if not param_hashes_match: + for i, (model_chunk_id, param_name, param) in enumerate(params): + if not torch.equal(local_param_hashes[i], all_param_hashes[0][i]): + rank = torch.distributed.get_rank() + logger.info( + f"[Rank {rank}] Hash not matching for {param_name} in model chunk" + f"{model_chunk_id}" + ) + if cross_check: + # Make sure all ranks have the same hash. + return all(map(lambda x: torch.equal(local_param_hashes, x), all_param_hashes)) + else: + return param_hashes_match + + +def make_tp_sharded_tensor_for_checkpoint( + tensor, key, tp_axis=0, replica_id=None, prepend_offsets=(), **kwargs +): + """Helper for instantiating a ShardedTensor where the `tp_axis` dimension + is sharded across TP group. + + Optionally, can provide offsets which prepend new dimensions to the tensor. 
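+
+    Example (illustrative; the module and checkpoint key are placeholders):
+
+        sharded_weight = make_tp_sharded_tensor_for_checkpoint(
+            linear_fc1.weight, 'decoder.layers.0.mlp.linear_fc1.weight', tp_axis=0
+        )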
+ """ + + prepend_axis_num = len(prepend_offsets) + + if replica_id is None: + replica_id = (0, 0, parallel_state.get_data_parallel_rank(with_context_parallel=True)) + + return ShardedTensor.from_rank_offsets( + key, + tensor, + *prepend_offsets, + ( + tp_axis + prepend_axis_num, + parallel_state.get_tensor_model_parallel_rank(), + parallel_state.get_tensor_model_parallel_world_size(), + ), + replica_id=replica_id, + prepend_axis_num=prepend_axis_num, + **kwargs, + ) + + +def make_sharded_tensor_for_checkpoint(tensor, key, prepend_offsets=(), replica_id=None, **kwargs): + """Helper for instantiating a non-sharded ShardedTensor (replicated across TP and DP group). + + Optionally, can provide offsets which prepend new dimensions to the tensor. + """ + + prepend_axis_num = len(prepend_offsets) + + if replica_id is None: + replica_id = ( + 0, + parallel_state.get_tensor_model_parallel_rank(), + parallel_state.get_data_parallel_rank(with_context_parallel=True), + ) + + return ShardedTensor.from_rank_offsets( + key, + tensor, + *prepend_offsets, + replica_id=replica_id, + prepend_axis_num=prepend_axis_num, + **kwargs, + ) + + +def prepare_input_tensors_for_wgrad_compute(grad_output, all_gathered_input): + """Ensure grad_output is stored in a contiguous buffer.""" + # Doing gather + slicing during the NeMo forward pass can make this tensor + # not be contiguous. PyTorch only checks if the tensor is contiguous, and only + # clones it if it's not contiguous: + # https://github.com/pytorch/pytorch/blob/c47cf9bc7f9e02f649ab4ed53fe4d35732c92ab6/torch/_refs/__init__.py#L2761 + grad_output = grad_output.contiguous() + # Convert the tensor shapes to 2D for execution compatibility + if grad_output.dim() == 3: + grad_output = grad_output.view( + grad_output.shape[0] * grad_output.shape[1], grad_output.shape[2] + ) + all_gathered_input = all_gathered_input.view( + all_gathered_input.shape[0] * all_gathered_input.shape[1], all_gathered_input.shape[2] + ) + + return grad_output, all_gathered_input + + +def drain_embedding_wgrad_compute(config, embedding_activation_buffer, grad_output_buffer, weight): + """Helper for performing embedding wgrad GEMM's during the pipeline drain phase, pipelines the + AllGather and GEMM's. + + Should only be used when pipeline model parallelism and gradient accumulation + fusion are enabled. + """ + + assert len(embedding_activation_buffer) == len( + grad_output_buffer + ), "Length of activation and gradient buffers need to be equal!" 
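+
+    # The first activation is all-gathered synchronously below to prime the
+    # double-buffered pipeline; the all-gathers issued inside the loop are
+    # asynchronous so they overlap with the wgrad GEMMs.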
+ + import fused_weight_gradient_mlp_cuda + + from megatron.core.parallel_state import ( + get_global_memory_buffer, + get_tensor_model_parallel_group, + get_tensor_model_parallel_world_size, + ) + + input = embedding_activation_buffer.pop(0) + world_size = get_tensor_model_parallel_world_size() + dim_size = list(input.size()) + dim_size[0] = dim_size[0] * world_size + + all_gathered_input = [None, None] + if config.sequence_parallel: + all_gather_buffer = get_global_memory_buffer().get_tensor(dim_size, input.dtype, "mpu_0") + handle = torch.distributed._all_gather_base( + all_gather_buffer, input, group=get_tensor_model_parallel_group(), async_op=False + ) + + all_gathered_input[0] = all_gather_buffer + all_gather_buffer = None + else: + all_gathered_input[0] = input + + input = None + + def wgrad_compute(all_gathered_input, grad_output, weight): + + grad_output, all_gathered_input = prepare_input_tensors_for_wgrad_compute( + grad_output, all_gathered_input + ) + + if config.gradient_accumulation_fusion: + if weight.main_grad.dtype == torch.float32: + fused_weight_gradient_mlp_cuda.wgrad_gemm_accum_fp32( + all_gathered_input, grad_output, weight.main_grad + ) + elif weight.main_grad.dtype in (torch.float16, torch.bfloat16): + fused_weight_gradient_mlp_cuda.wgrad_gemm_accum_fp16( + all_gathered_input, grad_output, weight.main_grad + ) + else: + raise RuntimeError("Unsupported gradient type for gradient accumulation fusion") + + # We have all_gathered_input list acting as a double buffer here, + # since we are pipelining the AllGather and GEMM,one buffer all gathers + # the input while the other buffer reads from it for the GEMM. We use i + # and (i+1) for indexing to enable this double buffering. + for i in range(len(embedding_activation_buffer)): + input = embedding_activation_buffer.pop(0) + if config.sequence_parallel: + name = "mpu_" + str((i + 1) % 2) + all_gather_buffer = get_global_memory_buffer().get_tensor(dim_size, input.dtype, name) + handle = torch.distributed._all_gather_base( + all_gather_buffer, input, group=get_tensor_model_parallel_group(), async_op=True + ) + + all_gathered_input[(i + 1) % 2] = all_gather_buffer + all_gather_buffer = None + else: + all_gathered_input[(i + 1) % 2] = input + + grad_output = grad_output_buffer.pop(0) + wgrad_compute(all_gathered_input[i % 2], grad_output, weight) + drain_idx = (i + 1) % 2 + input, all_gathered_input[i % 2], grad_output = None, None, None + + if config.sequence_parallel: + handle.wait() + + grad_output = grad_output_buffer.pop(0) + wgrad_compute(all_gathered_input[drain_idx], grad_output, weight) + input, all_gathered_input[drain_idx], grad_output = None, None, None + + +def local_multi_tensor_applier(op, noop_flag_buffer, tensor_lists, *args): + """Multi tensor op applier""" + return op(2048 * 32, noop_flag_buffer, tensor_lists, *args) + + +# computes l2 norm for a list of contiguous tensors +# works as a drop-in replacement for amp_C.multi_tensor_l2norm +def local_multi_tensor_l2_norm(chunk_size, noop_flag, tensor_lists, per_tensor, *args): + """ + Computes l2 norm for a list of contiguous tensors + works as a drop-in replacement for amp_C.multi_tensor_l2norm + """ + l2 = [[(torch.norm(tensor)) for tensor in tensor_list] for tensor_list in tensor_lists] + l2_reduced = torch.norm(torch.tensor(l2)) + l2_cuda = torch.tensor([float(l2_reduced)], dtype=torch.float, device='cuda') + return l2_cuda, None + + +# works as a drop-in replacement for amp_C.multi_tensor_scale +def local_multi_tensor_scale(chunk_size, noop_flag, 
tensor_lists, scale): + """Works as a drop-in replacement for amp_C.multi_tensor_scale.""" + inputs, targets = tensor_lists[0], tensor_lists[1] + if inputs == targets: + for i in range(len(targets)): + # for parity with apex implementation + targets[i] *= scale + else: + for i in range(len(targets)): + targets[i] = inputs[i] * scale + + +class _ValueWithRank: + """This is an internal class, not for use outside this module + + Attributes: + _rank (int): rank for the value + _value (float) : the value it stores, eg elapsed time + _unit (str) : unit for the value + """ + + def __init__(self, value: float, rank: int, unit: str = "") -> None: + """Initializer + + Args: + _value (float): the initial value with which it is inited + _rank (int): the rank number + _unit (str) : the unit of the value, eg ms or flops + """ + self._rank = rank + self._value = value + self._unit = unit + + def __lt__(self, other) -> bool: + """Check if value of self is smaller than other's value + + Args: + other (_ValueWithRank): The other object to compare with + + Returns: + bool: True if lhs._value of operand is less than rhs._value, else False + """ + return self._value < other._value + + def __gt__(self, other) -> bool: + """Check if value of self is larger than other's value + + Args: + other (_ValueWithRank): The other object to compare with + + Returns: + bool: True if lhs._value of operand is greater than rhs._value, else False + """ + return self._value > other._value + + def __call__(self) -> Tuple[float, int, str]: + """Returns the value, the rank, and unit as a Tuple + + Returns: + Tuple[float, int, str]: value, rank, unit + """ + return self._value, self._rank, self._unit + + def __str__(self) -> str: + """String representation of the object + + Returns: + str: strigified object + """ + + return f"{self._value:.2f}{self._unit}/{self._rank}" + + +@dataclass +class _StragglerData: + """This is an internal dataclass, not for use outside this module + + Attributes: + min_elapsed (_ValueWithRank) min iteration time across all ranks + max_elapsed (_ValueWithRank) max iteration time across all ranks + min_btime (_ValueWithRank) min cpu time across all ranks + max_btime (_ValueWithRank) max cpu time across all ranks + min_temp (_ValueWithRank): min gpu temp across all ranks + max_temp (_ValueWithRank): max gpu temp across all ranks + min_power (_ValueWithRank) min gpu power across all ranks + max_power (_ValueWithRank) max gpu power across all ranks + min_util (_ValueWithRank): min gpu util across all ranks + max_util (_ValueWithRank): max gpu util across all ranks + min_clock (_ValueWithRank): min gpu clock across all ranks + max_clock (_ValueWithRank) max gpu clock across all ranks + aflops (List[_ValueWithRank]): sorted array of (_ValueWithRank) + """ + + # gemm time + min_elapsed = _ValueWithRank(sys.float_info.max, 0, "ms") + max_elapsed = _ValueWithRank(sys.float_info.min, 0, "ms") + # get_batch time + min_btime = _ValueWithRank(sys.float_info.max, 0, "us") + max_btime = _ValueWithRank(sys.float_info.min, 0, "us") + # temp + min_temp = _ValueWithRank(sys.float_info.max, 0, "C") + max_temp = _ValueWithRank(sys.float_info.min, 0, "C") + # power + min_power = _ValueWithRank(sys.float_info.max, 0, "W") + max_power = _ValueWithRank(sys.float_info.min, 0, "W") + # util + min_util = _ValueWithRank(sys.float_info.max, 0, "%") + max_util = _ValueWithRank(sys.float_info.min, 0, "%") + # clock + min_clock = _ValueWithRank(sys.float_info.max, 0, "MHz") + max_clock = _ValueWithRank(sys.float_info.min, 0, "MHz") + 
aflops: Union[List[_ValueWithRank], None] = None + + +class StragglerDetector: + """Singleton Class implementing per rank Straggler Detector + + It use cuda events to time operation of choice using the + start and stop methods which can be directly invoked using + the class instance or can be used like a python context. + After collection, a report() method is available to display + the collected metrics. It is only supported if CUDA is + available. megatron/core/README_STRAGGLER.md for more info + + Note: + The instance and class attributes mentioned below are all + private to the class and has no use outside the class + + Attributes: + _off (bool): current state of the toggle + start (FunctionType): start method + stop (FunctionType): stop method + world (int): world size + rank (int): rank for this instance + mmcnt (int): number of ranks to report + port (int): control port + amp (float): amplification factor for TFLOPs, default 3.0 + toggle (bool): whether to start/stop detector collection + bdata (bool): when true, just collect get_batch + dev (int): cuda device + evt_q (LifoQueue): cuda event queue + start_gemm_ev (list[torch.cuda.Event]): cuda start event + stop_gemm_ev (list[torch.cuda.Event]): cuda stop event + start_data_ev (list[torch.cuda.Event]): cuda start event + stop_data_ev (list[torch.cuda.Event]): cuda stop event + start_gemm_tm (list[int]): start time (wallclock) + stop_gemm_tm (list[int]): stop time (wallclock) + start_data_tm (list[int]): start time for get_batch + stop_data_tm (list[int]): stop time for get_batch + sock (socket): the controller socket + ctrlr (Thread): the controller thread + """ + + _configured = False + """Indicates if the singleton instance is configured or not + """ + + def __new__(cls: Type["StragglerDetector"]) -> "StragglerDetector": + """Constructor + Creates an instance of the class if not created + + Args: + cls (Type['StragglerDetector']): The class type + + Returns: + StragglerDetector: the class instance + """ + + if not hasattr(cls, "_instance"): + cls._instance = super(StragglerDetector, cls).__new__(cls) + return cls._instance + + def __init__(self) -> None: + """Initializer + + The inital state of the StragglerDetector instance is disabled. + The enabled state is indicated using self._off member variable + and the proerty enabled. + """ + self._off: bool = True + self.start = self.null_method + self.stop = self.null_method + self.world: int = 0 + self.rank: int = 0 + self.mmcnt: int = 1 + self.port: int = 0 + self.amp: float = 3.0 + self.toggle: bool = False + self.bdata: bool = False + self.dev: Union[torch.device, int, None] = None + self.evt_q: Union[queue.LifoQueue, None] = None + self.start_gemm_ev: List[torch.cuda.Event] = [] + self.stop_gemm_ev: List[torch.cuda.Event] = [] + self.start_data_ev: List[torch.cuda.Event] = [] + self.stop_data_ev: List[torch.cuda.Event] = [] + self.start_gemm_tm: List[int] = [] + self.stop_gemm_tm: List[int] = [] + self.start_data_tm: List[int] = [] + self.stop_data_tm: List[int] = [] + self.sock: Union[socket.socket, None] = None + self.ctrlr: Union[threading.Thread, None] = None + + def configure( + self, + world: int, + rank: int, + mmcnt: int = 1, + amp: float = 3.0, + port: int = 65535, + prefill: int = 1024, + enabled: bool = False, + ) -> None: + """This method is called to configure the Singleton instance + + It should be called once per instantiation per process. 
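+
+        Example (illustrative; world and rank come from torch.distributed):
+
+            stimer = StragglerDetector()
+            stimer.configure(world, rank, enabled=True)
+            with stimer:
+                ...  # timed section, e.g. the forward/backward step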
+ + Note: + The constructor keeps the state of instance disabled + i.e no collection will happen even when start/stop methods are + called. Only when enabled is True (self._off is True), the + start/stop method pointers get assigned the real collection + methods, otherwise they are initialized with null_method + + Args: + world (int): World Size + rank (int): The rank of this trainer + mmcnt (int, optional): Number of ranks to print for showing Min/Max Etpt. + Defaults to 1. + amp (float, optional): Set to 3.0 if we only use timers in fwd pass. + Defaults to 3.0. + port (int, optional): Control port, useful only for rank-0. Defaults to 65535. + prefill (int, optional): Howmany Events to pre-populate. Defaults to 1024. + enabled (bool, optional): Whether or not collection is enabled on startup. + Defaults to False. + """ + if StragglerDetector._configured: + # don't throw + return + StragglerDetector._configured = True + self.bdata = False + self.start = self.null_method + self.stop = self.null_method + self._off = True + # No CUDA, No Support + if torch.cuda.is_available(): + self._off = not enabled + self.world = world + self.rank = rank + self.mmcnt = mmcnt if mmcnt > 1 else 1 + self.amp = amp + self.port = port + self.toggle = False + self.bdata = False + self.evt_q = queue.LifoQueue() + self.start_gemm_ev = [] + self.stop_gemm_ev = [] + self.start_data_ev = [] + self.stop_data_ev = [] + self.start_gemm_tm = [] + self.stop_gemm_tm = [] + self.start_data_tm = [] + self.stop_data_tm = [] + backend = torch.distributed.get_backend() + if backend == "nccl": + self.dev = torch.cuda.current_device() + else: + self.dev = torch.device("cpu") + # cache some events + for _ in range(prefill): + self.evt_q.put(torch.cuda.Event(enable_timing=True)) + if self.rank == 0: + # Start the controller + self._controller() + if not self._off: + self.start = self.start_method + self.stop = self.stop_method + + def reset(self) -> None: + """This method is called to reset the metrics state of the instance + + It is generally called from within elapsed() after extracting per rank metrics. + """ + if self._off: + return + # Pool them + if self.evt_q is not None: + _ = [self.evt_q.put(ev) for ev in self.start_gemm_ev] + _ = [self.evt_q.put(ev) for ev in self.stop_gemm_ev] + _ = [self.evt_q.put(ev) for ev in self.start_data_ev] + _ = [self.evt_q.put(ev) for ev in self.stop_data_ev] + self.start_gemm_ev = [] + self.stop_gemm_ev = [] + self.start_data_ev = [] + self.stop_data_ev = [] + # Use regular timers + self.start_gemm_tm = [] + self.stop_gemm_tm = [] + self.start_data_tm = [] + self.stop_data_tm = [] + self.bdata = False + + def start_method(self) -> None: + """This method adds the start timers. + + Both cuda event and perf_counter are added. If bdata is set to + true from __call__, this method skips inserting cuda + timer. 
This way it can be used to measure time spent on + CPU - generally useful for timing get_batch() + """ + # Not reentrant + if self.evt_q is not None and self.evt_q.qsize() > 1: + sev = self.evt_q.get() # no try-catch + eev = self.evt_q.get() # no try-catch + else: + sev = torch.cuda.Event(enable_timing=True) + eev = torch.cuda.Event(enable_timing=True) + # First check if this start is for data + if self.bdata: + self.start_data_ev.append(sev) + self.stop_data_ev.append(eev) + self.start_data_tm.append(0) + self.stop_data_tm.append(0) + idx = len(self.stop_data_tm) - 1 + self.start_data_tm[idx] = time.perf_counter_ns() + self.start_data_ev[idx].record() + self.bdata = False + return + self.start_gemm_ev.append(sev) + self.stop_gemm_ev.append(eev) + self.start_gemm_tm.append(0) + self.stop_gemm_tm.append(0) + idx = len(self.stop_gemm_tm) - 1 + self.start_gemm_tm[idx] = time.perf_counter_ns() + self.start_gemm_ev[idx].record() + + def stop_method(self) -> None: + """This method adds the stop timers. + + Both cuda event and perf_counter are added. If bdata is set to + true from __call__, this method skips inserting cuda + timer. Also see start_method() + """ + # Not reentrant + # First check if this stop is for data + idx = len(self.stop_data_tm) - 1 + if idx >= 0 and self.stop_data_tm[idx] == 0: + self.stop_data_tm[idx] = time.perf_counter_ns() + self.stop_data_ev[idx].record() + return + idx = len(self.stop_gemm_tm) - 1 + if idx >= 0 and self.stop_gemm_tm[idx] == 0: + self.stop_gemm_tm[idx] = time.perf_counter_ns() + self.stop_gemm_ev[idx].record() + + def elapsed(self) -> Tuple[float, float, int, int, int, int]: + """This method is called from report(), or can be called directly + + It is called to collect all the elapsed time since last reset(). + It finally calls reset() + + Returns: + Tuple[float, float, int, int, int, int]: see below for returns + delta : time spent in kernel + batch_delta : time spent in get_batch + temp : observed gpu temp + power : observed gpu power + util : observed gpu utilization + clock : observed gpu clock + """ + if self._off: + # match with return below + return 0, 0, 0, 0, 0, 0 + ls_ev = len(self.start_gemm_ev) + le_ev = len(self.stop_gemm_ev) + ls_bs = len(self.start_data_ev) + ls_be = len(self.stop_data_ev) + delta = 0.0 + batch_delta = 0.0 + temp = 0 + power = 0 + clock = 0 + if ls_ev != le_ev: + logger.warning(f"Event Start/Stop out of sync {ls_ev}/{le_ev}") + elif ls_bs != ls_be: + logger.warning(f"get_batch Start/Stop out of sync {ls_bs}/{ls_be}") + else: + temp = torch.cuda.temperature() + power = torch.cuda.power_draw() + util = torch.cuda.utilization() + clock = torch.cuda.clock_rate() + torch.cuda.synchronize() + # Process Events + for i in range(ls_ev): + e_ev = self.start_gemm_ev[i].elapsed_time(self.stop_gemm_ev[i]) + e_tm = (self.stop_gemm_tm[i] - self.start_gemm_tm[i]) / 1e6 # ns to ms + # Pick the larger of Event and perf_counter time? 
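+                # The CUDA event time (GPU-side) and the perf_counter time
+                # (wall-clock) can disagree; taking the larger of the two gives
+                # a conservative per-interval estimate.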
+ delta += max(e_ev, e_tm) + # Process get_batch + for i in range(ls_bs): + b_ev = self.start_data_ev[i].elapsed_time(self.stop_data_ev[i]) + b_tm = (self.stop_data_tm[i] - self.start_data_tm[i]) / 1e6 # ns to ms + # data fetching has prefetch, hence take the max, instead of avg + batch_delta = max(batch_delta, max(b_ev, b_tm)) + self.reset() # Prepare for next round + # time in ms, batch_delta in ms, check return above + return delta, batch_delta, temp, power, util, clock + + def report(self, total_flops: float = 0.0, log_interval: int = 0) -> bool: + """Function to log the min/max metircs and the associated rank over a time period + + It finds the slowest and fastest rank among all ranks. It should be + called by all ranks, but only rank-0 prints the analysis + At the end it checks, if the straggler detector should + remain active or if it should be deactivated. + + Args: + total_flops (float, optional): The theoretical flops over the period. Defaults to 0.0. + log_interval (int, optional): The training interval over which reporting is called(ms) + Defaults to 0. + + Returns: + bool: True if reported, else False + """ + ret = False + if not self._off and total_flops > 0.0 and log_interval > 0: + elapsed, btime, temp, power, util, clock = self.elapsed() # get raw time + # btime (get_batch time is max in the iteration) + ptime = elapsed / (log_interval * 1.0) # avg per iteration elapsed time, ms + api_flops = total_flops / (log_interval * 1.0) # avg per iteration flops, ms + apir_flops = api_flops / ( + ptime * 10**9 * self.world + ) # this is avg per iteration this rank's thruput, TFLOP/s (note 10**9), + et_flops = apir_flops / self.amp # Estimated TFLOPs, not tracing backward + + o_dt = self._min_max( + ptime, btime, float(temp), float(power), float(util), float(clock), et_flops + ) + if self.rank == 0 and o_dt is not None and o_dt.aflops is not None: + now = f"[{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}]" + min_flops, min_frank, _ = o_dt.aflops[0]() + max_flops, max_frank, _ = o_dt.aflops[-1]() + logger.info( + f"{now} | " + f"MnRtt/Rnk: {o_dt.min_elapsed} | " + f"MxRtt/Rnk: {o_dt.max_elapsed} | " + f"MnPwr/Rnk: {o_dt.min_power} | " + f"MxPwr/Rnk: {o_dt.max_power} | " + f"MnTmp/Rnk: {o_dt.min_temp} | " + f"MxTmp/Rnk: {o_dt.max_temp} | " + f"MnUtl/Rnk: {o_dt.min_util} | " + f"MxUtl/Rnk: {o_dt.max_util} | " + f"MnClk/Rnk: {o_dt.min_clock} | " + f"MxClk/Rnk: {o_dt.max_clock} | " + f"MnDRtt/Rnk: {o_dt.min_btime} | " + f"MxDRtt/Rnk: {o_dt.max_btime} | " + f"MnEtpt/Rnk: {min_flops:.2f}TF/{min_frank} | " + f"MxEtpt/Rnk: {max_flops:.2f}TF/{max_frank}" + ) + if self.mmcnt > 1 and self.mmcnt < self.world: + line = f"^^^^ Bottom {self.mmcnt} Ranks with lowest Etpt(TF):" + for i in range(self.mmcnt): + line += f" {o_dt.aflops[i]}," + logger.info(line) + line = f"^^^^ Top {self.mmcnt} Ranks with highest Etpt(TF):" + shift = self.world - self.mmcnt + for i in range(self.mmcnt): + line += f" {o_dt.aflops[i+shift]}," + logger.info(line) + ret = True + + # Check/Communicate if tracking is turned off or on + self._check_toggle() + return ret + + def _check_toggle(self) -> None: + """Helper method to check if a request to toggle the collection state was made + + It checks iof collection state toggle req was made via the server listening on + rank-0 since last call to report(). Called by report(). 
Calling this method + indirectly from report() is the only way to activate the change that is made + via rank-0 + """ + # If no change just commnunicate the current + off = self._off + if self.rank == 0 and self.toggle: + off = not self._off + self.toggle = False + st = torch.tensor(off, dtype=torch.bool, device=self.dev) + torch.distributed.broadcast(st, 0) # Blocking + # save old switch + off = self._off + self._off = bool(st.item()) + if off != self._off: + if not self._off: + self.start = self.start_method + self.stop = self.stop_method + state = "ON" + else: + self.start = self.null_method + self.stop = self.null_method + state = "OFF" + if self.rank == 0: + logger.info(f"Toggling StragglerDetector State {state}") + + def _handler(self) -> None: + """Thread function for the controller. + + It is a tcp-server that listens on a port. Uses HTTP protocol. + If connected to it using curl, it indicates a toggle of the + collection state. The actual toggling happens at the end of + calling report() when _check_toggle() is called. + """ + resp = r"HTTP/1.0 200 OK\r\nConnection: Close\r\nContent-length: " + + if self.rank == 0: + state = "OFF" if self._off else "ON" + logger.info( + f"Controller ready to recv " f"commands on port {self.port}. Current state {state}" + ) + while True and self.sock is not None: + try: + conn, _ = self.sock.accept() + _ = conn.recv(1024) + self.toggle = True + state = "ON" if self._off else "OFF" + msg = f"Will turn StragglerDetector {state} at next logging interval" + msg_len = len(msg) + final_resp = f"{resp}{msg_len}\r\n\r\n{msg}" + conn.send(final_resp.encode()) + conn.close() + logger.info(msg) + except Exception as err: + logger.error(f"Error in stragler handler.. {str(err)}") + return + + def _controller(self): + """Installs a controller listener that is used to toggle collection state. + + Called from configure(). Ignored for all ranks other than rank-0 + """ + try: + if self.rank == 0: + neth = "0.0.0.0" + netp = self.port + self.sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + self.sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) + self.sock.bind((neth, netp)) + self.sock.listen(128) + self.ctrlr = threading.Thread( + target=self._handler, args=(), name="straggler", daemon=True + ) + self.ctrlr.start() + except Exception as err: + logger.warning(f"StragglerDetector cannot be controlled.. 
{str(err)}") + + def _min_max( + self, + ptime: float, + btime: float, + temp: float, + power: float, + util: float, + clock: float, + flops: float, + ) -> Union[_StragglerData, None]: + """Helper function to find the min/max values + + Args: + ptime (float): avg per iteration gpu time + btime (float): avg per iteration cpu time + temp (float): gpu temp at the time of reporting + power (float): gpu power at the time of reporting + util (float): gpu util at the time of reporting + clock (float): gpu clock at the time of reporting + flops (float): estimated flops for the rank + + Returns: + Union[_StragglerData, None]: It contains the min/max of few metrics and the + corresponding rank it also has sorted list of + all (flops, rank) sorted by flops (aflops) + or returns None if collecton is disabled + """ + if self._off: + return None + # initialize output data object + o_dt = _StragglerData() + + prof_data: Dict[str, Union[int, float]] = {} + data_list: List[Dict[str, Union[int, float]]] = [] + prof_data["rank"] = self.rank + prof_data["time"] = ptime + prof_data["btime"] = btime + prof_data["temp"] = temp + prof_data["power"] = power + prof_data["util"] = util + prof_data["clock"] = clock + prof_data["flops"] = flops + + if self.rank == 0: + data_list = [prof_data] * self.world + + # this is blocking by default + torch.distributed.gather_object(prof_data, object_gather_list=data_list, dst=0) + + if self.rank == 0: + min_ctime = min(data_list, key=lambda k: k["time"]) # elapsed + max_ctime = max(data_list, key=lambda k: k["time"]) # elapsed + + min_cbatch = min(data_list, key=lambda k: k["btime"]) # batch time + max_cbatch = max(data_list, key=lambda k: k["btime"]) # batch time + + min_ctemp = min(data_list, key=lambda k: k["temp"]) # temp + max_ctemp = max(data_list, key=lambda k: k["temp"]) # temp + + min_cpower = min(data_list, key=lambda k: k["power"]) # power + max_cpower = max(data_list, key=lambda k: k["power"]) # power + + min_cutil = min(data_list, key=lambda k: k["util"]) # gpu util + max_cutil = max(data_list, key=lambda k: k["util"]) # gpu util + + min_cclock = min(data_list, key=lambda k: k["clock"]) # gpu clock + max_cclock = max(data_list, key=lambda k: k["clock"]) # gpu clock + + min_val = min_ctime["time"] + min_rank = min_ctime["rank"] + max_val = max_ctime["time"] + max_rank = max_ctime["rank"] + o_dt.min_elapsed = _ValueWithRank(min_val, int(min_rank), "ms") + o_dt.max_elapsed = _ValueWithRank(max_val, int(max_rank), "ms") + + min_val = min_cbatch["btime"] + min_rank = min_cbatch["rank"] + max_val = max_cbatch["btime"] + max_rank = max_cbatch["rank"] + o_dt.min_btime = _ValueWithRank(min_val, int(min_rank), "ms") + o_dt.max_btime = _ValueWithRank(max_val, int(max_rank), "ms") + + min_val = min_ctemp["temp"] + min_rank = min_ctemp["rank"] + max_val = max_ctemp["temp"] + max_rank = max_ctemp["rank"] + o_dt.min_temp = _ValueWithRank(min_val, int(min_rank), "C") + o_dt.max_temp = _ValueWithRank(max_val, int(max_rank), "C") + + min_val = min_cpower["power"] + min_rank = min_cpower["rank"] + max_val = max_cpower["power"] + max_rank = max_cpower["rank"] + o_dt.min_power = _ValueWithRank(min_val, int(min_rank), "W") + o_dt.max_power = _ValueWithRank(max_val, int(max_rank), "W") + + min_val = min_cutil["util"] + min_rank = min_cutil["rank"] + max_val = max_cutil["util"] + max_rank = max_cutil["rank"] + o_dt.min_util = _ValueWithRank(min_val, int(min_rank), "%") + o_dt.max_util = _ValueWithRank(max_val, int(max_rank), "%") + + min_val = min_cclock["clock"] + min_rank = 
min_cclock["rank"] + max_val = max_cclock["clock"] + max_rank = max_cclock["rank"] + o_dt.min_clock = _ValueWithRank(min_val, int(min_rank), "MHz") + o_dt.max_clock = _ValueWithRank(max_val, int(max_rank), "MHz") + + o_dt.aflops = [ + _ValueWithRank(d.get("flops", 0.0), int(d.get("rank", -1))) + for _, d in enumerate(data_list) + ] + o_dt.aflops.sort(key=lambda val_with_rank: val_with_rank()[0]) + # wait for everyone here + torch.distributed.barrier() + + return o_dt + + @property + def enabled(self) -> bool: + """Can be called to check the enabled state of the instance + + Note: + After the request to toggle the state, the + actual state change happens at end of call + to report() + """ + return not self._off + + @property + def configured(self) -> bool: + """Can be called to check if the the instance is already configured + + Returns: + bool: returns True if configure was called and was a success, else False + """ + return StragglerDetector._configured + + @property + def my_rank(self): + """Can be called to get configured rank of this instance + + Returns: + int: Configured rank for this instance + """ + return self.rank + + @property + def world_size(self) -> int: + """Can be called to get configured world of this instance + + Returns: + int: World size configured for this instance + """ + return self.world + + def null_method(self) -> None: + """Default method to initialize start/stop method ptrs""" + pass + + def __enter__(self) -> "StragglerDetector": + """Define context/instance entry + + Returns: + StragglerDetector: the instance + """ + self.start() + return self + + def __call__(self, bdata: bool = False) -> "StragglerDetector": + """Callable for the instance. Set context state, + + Useful when the context is used for cpu timers only when bdata=True + + Args: + bdata (bool, optional): when true, only enables cpu timers. Defaults to False. + + Returns: + StragglerDetector: the instance + """ + self.bdata = bdata + return self + + def __exit__( + self, + ex_type: Optional[Type[BaseException]], + ex_val: Optional[BaseException], + ex_tb: Optional[TracebackType], + ) -> bool: + """Define context/instance exit, calls the stop method + + Args: + ex_type (Optional[Type[BaseException]]): Exception type + ex_val (Optional[BaseException]): _description_ + ex_tb (Optional[TracebackType]): _description_ + + Returns: + bool: True if the exception was handled + """ + # Should not suppress errors even if turned off + if ex_type is not None: + err = traceback.format_exception(ex_type, ex_val, ex_tb) + logger.warning(f"{str(ex_val)}\n{err}") + self.stop() + return False + + +# Singleton, global visibility +__straggler__ = StragglerDetector() +"""StragglerDetector: private module variable, not be directly accessed +""" + + +# Check if Transformer Engine has Float8Tensor class +HAVE_TE_FLOAT8TENSOR = False +try: + from transformer_engine.pytorch.float8_tensor import Float8Tensor + + HAVE_TE_FLOAT8TENSOR = True +except (ImportError, ModuleNotFoundError): + # Float8Tensor not found + pass + + +def is_float8tensor(tensor: torch.Tensor) -> bool: + """Check if a tensor is a Transformer Engine Float8Tensor""" + return HAVE_TE_FLOAT8TENSOR and isinstance(tensor, Float8Tensor) diff --git a/megatron/data/__init__.py b/megatron/data/__init__.py deleted file mode 100644 index cd5f898c6b..0000000000 --- a/megatron/data/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from . 
import indexed_dataset diff --git a/megatron/data/bert_dataset.py b/megatron/data/bert_dataset.py deleted file mode 100644 index 036e6bccc9..0000000000 --- a/megatron/data/bert_dataset.py +++ /dev/null @@ -1,183 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. - -"""BERT Style dataset.""" - -import numpy as np -import torch - -from megatron import ( - get_args, - get_tokenizer, - mpu, - print_rank_0 -) -from megatron.data.dataset_utils import ( - get_samples_mapping, - get_a_and_b_segments, - truncate_segments, - create_tokens_and_tokentypes, - create_masked_lm_predictions -) - -class BertDataset(torch.utils.data.Dataset): - - def __init__(self, name, indexed_dataset, data_prefix, - num_epochs, max_num_samples, masked_lm_prob, - max_seq_length, short_seq_prob, seed, binary_head): - - # Params to store. - self.name = name - self.seed = seed - self.masked_lm_prob = masked_lm_prob - self.max_seq_length = max_seq_length - self.binary_head = binary_head - - # Dataset. - self.indexed_dataset = indexed_dataset - - # Build the samples mapping. - self.samples_mapping = get_samples_mapping(self.indexed_dataset, - data_prefix, - num_epochs, - max_num_samples, - self.max_seq_length - 3, # account for added tokens - short_seq_prob, - self.seed, - self.name, - self.binary_head) - - # Vocab stuff. - tokenizer = get_tokenizer() - self.vocab_id_list = list(tokenizer.inv_vocab.keys()) - self.vocab_id_to_token_dict = tokenizer.inv_vocab - self.cls_id = tokenizer.cls - self.sep_id = tokenizer.sep - self.mask_id = tokenizer.mask - self.pad_id = tokenizer.pad - - def __len__(self): - return self.samples_mapping.shape[0] - - def __getitem__(self, idx): - start_idx, end_idx, seq_length = self.samples_mapping[idx] - sample = [self.indexed_dataset[i] for i in range(start_idx, end_idx)] - # Note that this rng state should be numpy and not python since - # python randint is inclusive whereas the numpy one is exclusive. - # We % 2**32 since numpy requres the seed to be between 0 and 2**32 - 1 - np_rng = np.random.RandomState(seed=((self.seed + idx) % 2**32)) - return build_training_sample(sample, seq_length, - self.max_seq_length, # needed for padding - self.vocab_id_list, - self.vocab_id_to_token_dict, - self.cls_id, self.sep_id, - self.mask_id, self.pad_id, - self.masked_lm_prob, np_rng, - self.binary_head) - - - - -def build_training_sample(sample, - target_seq_length, max_seq_length, - vocab_id_list, vocab_id_to_token_dict, - cls_id, sep_id, mask_id, pad_id, - masked_lm_prob, np_rng, binary_head): - """Biuld training sample. - - Arguments: - sample: A list of sentences in which each sentence is a list token ids. - target_seq_length: Desired sequence length. - max_seq_length: Maximum length of the sequence. All values are padded to - this length. - vocab_id_list: List of vocabulary ids. Used to pick a random id. - vocab_id_to_token_dict: A dictionary from vocab ids to text tokens. - cls_id: Start of example id. - sep_id: Separator id. - mask_id: Mask token id. - pad_id: Padding token id. - masked_lm_prob: Probability to mask tokens. - np_rng: Random number genenrator. Note that this rng state should be - numpy and not python since python randint is inclusive for - the opper bound whereas the numpy one is exclusive. - """ - - if binary_head: - # We assume that we have at least two sentences in the sample - assert len(sample) > 1 - assert target_seq_length <= max_seq_length - - # Divide sample into two segments (A and B). 
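Editorial sketch: the removed BERT builder assembled each sample as two segments plus special tokens before masking and padding. The token ids and the [CLS]/[SEP] layout below are assumptions based on the helper names (create_tokens_and_tokentypes lives in dataset_utils and is not shown here).

# Hypothetical illustration, not part of the patch.
tokens_a, tokens_b = [5, 6, 7], [8, 9]
cls_id, sep_id = 101, 102
tokens = [cls_id] + tokens_a + [sep_id] + tokens_b + [sep_id]
tokentypes = [0] * (len(tokens_a) + 2) + [1] * (len(tokens_b) + 1)
# After masking and padding, build_training_sample returns a dict with keys:
# 'text', 'types', 'labels', 'is_random', 'loss_mask', 'padding_mask', 'truncated'.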
- if binary_head: - tokens_a, tokens_b, is_next_random = get_a_and_b_segments(sample, - np_rng) - else: - tokens_a = [] - for j in range(len(sample)): - tokens_a.extend(sample[j]) - tokens_b = [] - is_next_random = False - - # Truncate to `target_sequence_length`. - max_num_tokens = target_seq_length - truncated = truncate_segments(tokens_a, tokens_b, len(tokens_a), - len(tokens_b), max_num_tokens, np_rng) - - # Build tokens and toketypes. - tokens, tokentypes = create_tokens_and_tokentypes(tokens_a, tokens_b, - cls_id, sep_id) - - # Masking. - max_predictions_per_seq = masked_lm_prob * max_num_tokens - (tokens, masked_positions, masked_labels, _, _) = create_masked_lm_predictions( - tokens, vocab_id_list, vocab_id_to_token_dict, masked_lm_prob, - cls_id, sep_id, mask_id, max_predictions_per_seq, np_rng) - - # Padding. - tokens_np, tokentypes_np, labels_np, padding_mask_np, loss_mask_np \ - = pad_and_convert_to_numpy(tokens, tokentypes, masked_positions, - masked_labels, pad_id, max_seq_length) - - train_sample = { - 'text': tokens_np, - 'types': tokentypes_np, - 'labels': labels_np, - 'is_random': int(is_next_random), - 'loss_mask': loss_mask_np, - 'padding_mask': padding_mask_np, - 'truncated': int(truncated)} - return train_sample - - -def pad_and_convert_to_numpy(tokens, tokentypes, masked_positions, - masked_labels, pad_id, max_seq_length): - """Pad sequences and convert them to numpy.""" - - # Some checks. - num_tokens = len(tokens) - padding_length = max_seq_length - num_tokens - assert padding_length >= 0, \ - f"num_tokens ({num_tokens}) is greater than " \ - "max_seq_length ({max_seq_length})." - assert len(tokentypes) == num_tokens - assert len(masked_positions) == len(masked_labels) - - # Tokens and token types. - filler = [pad_id] * padding_length - tokens_np = np.array(tokens + filler, dtype=np.int64) - tokentypes_np = np.array(tokentypes + filler, dtype=np.int64) - - # Padding mask. - padding_mask_np = np.array([1] * num_tokens + [0] * padding_length, - dtype=np.int64) - - # Lables and loss mask. - labels = [-1] * max_seq_length - loss_mask = [0] * max_seq_length - for i in range(len(masked_positions)): - assert masked_positions[i] < num_tokens - labels[masked_positions[i]] = masked_labels[i] - loss_mask[masked_positions[i]] = 1 - labels_np = np.array(labels, dtype=np.int64) - loss_mask_np = np.array(loss_mask, dtype=np.int64) - - return tokens_np, tokentypes_np, labels_np, padding_mask_np, loss_mask_np diff --git a/megatron/data/blendable_dataset.py b/megatron/data/blendable_dataset.py deleted file mode 100644 index 453b362f3e..0000000000 --- a/megatron/data/blendable_dataset.py +++ /dev/null @@ -1,64 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. - -"""Blendable dataset.""" - -import time - -import numpy as np -import torch - -from megatron import print_rank_0 - -class BlendableDataset(torch.utils.data.Dataset): - - - def __init__(self, datasets, weights, size): - - self.datasets = datasets - num_datasets = len(datasets) - assert num_datasets == len(weights) - - self.size = size - - # Normalize weights. - weights = np.array(weights, dtype=np.float64) - sum_weights = np.sum(weights) - assert sum_weights > 0.0 - weights /= sum_weights - - # Build indicies. 
- start_time = time.time() - assert num_datasets < 255 - self.dataset_index = np.zeros(self.size, dtype=np.uint8) - self.dataset_sample_index = np.zeros(self.size, dtype=np.int64) - - from megatron.data import helpers - helpers.build_blending_indices(self.dataset_index, - self.dataset_sample_index, - weights, num_datasets, self.size, - torch.distributed.get_rank() == 0) - print_rank_0('> elapsed time for building blendable dataset indices: ' - '{:.2f} (sec)'.format(time.time() - start_time)) - - # Check size - _ = self.__getitem__(self.size - 1) - try: - _ = self.__getitem__(self.size) - raise RuntimeError('BlendedDataset size is improperly bounded') - except IndexError: - pass - print_rank_0('> size of blendable dataset: ' - '{} samples'.format(self.size)) - - - def __len__(self): - return self.size - - - def __getitem__(self, idx): - dataset_idx = self.dataset_index[idx] - sample_idx = self.dataset_sample_index[idx] - return { - "dataset_idx" : dataset_idx, - **self.datasets[dataset_idx][sample_idx], - } diff --git a/megatron/data/gpt_dataset.py b/megatron/data/gpt_dataset.py deleted file mode 100644 index 602e511678..0000000000 --- a/megatron/data/gpt_dataset.py +++ /dev/null @@ -1,524 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. - -"""GPT style dataset.""" - -import os -import time - -import numpy as np -import torch - -from megatron import print_rank_0 -from megatron.core import mpu -from megatron.data.blendable_dataset import BlendableDataset -from megatron.data.dataset_utils import get_datasets_weights_and_num_samples -from megatron.data.dataset_utils import get_train_valid_test_split_ -from megatron.data.indexed_dataset import make_dataset as make_indexed_dataset - - -def build_train_valid_test_datasets(data_prefix, data_impl, splits_string, - train_valid_test_num_samples, - seq_length, seed, skip_warmup, - train_data_prefix=None, - valid_data_prefix=None, - test_data_prefix=None, - return_doc_ids=False): - """Build train, valid, and test datasets.""" - - if data_prefix: - print_rank_0("Single data path provided for train, valid & test") - - # Single dataset. - if len(data_prefix) == 1: - return _build_train_valid_test_datasets(data_prefix[0], - data_impl, splits_string, - train_valid_test_num_samples, - seq_length, seed, skip_warmup) - - # Blending dataset. - # Parse the values. - output = get_datasets_weights_and_num_samples(data_prefix, - train_valid_test_num_samples) - prefixes, weights, datasets_train_valid_test_num_samples = output - train_num_samples, valid_num_samples, test_num_samples = map( - sum, - zip(*datasets_train_valid_test_num_samples) - ) - - # Build individual datasets. - train_datasets = [] - valid_datasets = [] - test_datasets = [] - for i in range(len(prefixes)): - train_ds, valid_ds, test_ds = _build_train_valid_test_datasets( - prefixes[i], data_impl, splits_string, - datasets_train_valid_test_num_samples[i], - seq_length, seed, skip_warmup, - return_doc_ids) - if train_ds: - train_datasets.append(train_ds) - if valid_ds: - valid_datasets.append(valid_ds) - if test_ds: - test_datasets.append(test_ds) - - # Blend. 
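Editorial sketch with invented sizes of how the per-dataset sample counts computed above feed the blending step just below:

# Illustrative numbers only: two dataset prefixes, per-split (train, valid, test) counts.
datasets_train_valid_test_num_samples = [(600, 30, 10), (400, 20, 5)]
train_num_samples, valid_num_samples, test_num_samples = map(
    sum, zip(*datasets_train_valid_test_num_samples))  # -> 1000, 50, 15
# Each split is then wrapped as BlendableDataset(per_split_datasets, weights, total);
# its __getitem__ (shown above) returns {"dataset_idx": source, **underlying_sample}.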
- blending_train_dataset = None - if train_datasets: - blending_train_dataset = BlendableDataset(train_datasets, weights, train_num_samples) - blending_valid_dataset = None - if valid_datasets: - blending_valid_dataset = BlendableDataset(valid_datasets, weights, valid_num_samples) - blending_test_dataset = None - if test_datasets: - blending_test_dataset = BlendableDataset(test_datasets, weights, test_num_samples) - - return (blending_train_dataset, blending_valid_dataset, - blending_test_dataset) - - else: - print_rank_0("Separate data paths provided for train, valid & test. Split string will be ignored.") - - train_dataset, valid_dataset, test_dataset = None, None, None - # Single dataset. - if train_data_prefix is not None: - train_dataset = build_dataset("train", train_data_prefix, data_impl, - train_valid_test_num_samples[0], - seq_length, seed, skip_warmup) - - if valid_data_prefix is not None: - valid_dataset = build_dataset("valid", valid_data_prefix, data_impl, - train_valid_test_num_samples[1], - seq_length, seed, False) - - if test_data_prefix is not None: - test_dataset = build_dataset("test", test_data_prefix, data_impl, - train_valid_test_num_samples[2], - seq_length, seed, False) - - return (train_dataset, valid_dataset, test_dataset) - - -def _build_train_valid_test_datasets(data_prefix, data_impl, splits_string, - train_valid_test_num_samples, - seq_length, seed, skip_warmup, - return_doc_ids=False): - """Build train, valid, and test datasets.""" - - # Indexed dataset. - indexed_dataset = get_indexed_dataset_(data_prefix, - data_impl, - skip_warmup) - - total_num_of_documents = indexed_dataset.sizes.shape[0] - splits = get_train_valid_test_split_(splits_string, total_num_of_documents) - - # Print stats about the splits. - print_rank_0(' > dataset split:') - - def print_split_stats(name, index): - print_rank_0(' {}:'.format(name)) - print_rank_0(' document indices in [{}, {}) total of {} ' - 'documents'.format(splits[index], splits[index + 1], - splits[index + 1] - splits[index])) - print_split_stats('train', 0) - print_split_stats('validation', 1) - print_split_stats('test', 2) - - def build_dataset(index, name): - dataset = None - if splits[index + 1] > splits[index]: - documents = np.arange(start=splits[index], stop=splits[index + 1], - step=1, dtype=np.int32) - dataset = GPTDataset(name, data_prefix, - documents, indexed_dataset, - train_valid_test_num_samples[index], - seq_length, seed, - return_doc_ids) - return dataset - - train_dataset = build_dataset(0, 'train') - valid_dataset = build_dataset(1, 'valid') - test_dataset = build_dataset(2, 'test') - - return (train_dataset, valid_dataset, test_dataset) - - -def build_dataset(dataset_name, data_prefix, data_impl, num_samples, - seq_length, seed, skip_warmup): - dataset = None - if len(data_prefix) == 1: - dataset = _build_dataset(dataset_name, - data_prefix[0], data_impl, - num_samples, seq_length, - seed, skip_warmup) - else: - # Blending dataset. - # Parse the values. - output = get_datasets_weights_and_num_samples(data_prefix, num_samples) - prefixes, weights, dataset_num_samples = output - num_samples = sum(dataset_num_samples) - - # Build individual datasets. 
- datasets = [] - for i in range(len(prefixes)): - ds = _build_dataset(dataset_name, prefixes[i], - data_impl, dataset_num_samples[i], - seq_length, seed, skip_warmup) - if ds: - datasets.append(ds) - - if datasets: - dataset = BlendableDataset(datasets, weights, num_samples) - - return dataset - - -def _build_dataset(dataset_name, data_prefix, data_impl, - num_samples, seq_length, seed, skip_warmup): - """ - Build dataset. This method is called when individual - train, valid, test datasets are provided - """ - - # Indexed dataset. - indexed_dataset = get_indexed_dataset_(data_prefix, - data_impl, - skip_warmup) - - total_num_of_documents = indexed_dataset.sizes.shape[0] - - print_rank_0(' {}:'.format(dataset_name)) - print_rank_0(' document indices in [0, {}) total of {} ' - 'documents'.format(total_num_of_documents, total_num_of_documents)) - - documents = np.arange(start=0, stop=total_num_of_documents, - step=1, dtype=np.int32) - - dataset = GPTDataset(dataset_name, data_prefix, - documents, indexed_dataset, - num_samples, seq_length, seed) - - return dataset - - -def get_indexed_dataset_(data_prefix, data_impl, skip_warmup): - """Build indexed dataset.""" - print_rank_0(' > building dataset index ...') - - start_time = time.time() - indexed_dataset = make_indexed_dataset(data_prefix, - data_impl, - skip_warmup) - print_rank_0(' > finished creating indexed dataset in {:4f} ' - 'seconds'.format(time.time() - start_time)) - print_rank_0(' number of documents: {}'.format( - indexed_dataset.sizes.shape[0])) - - return indexed_dataset - - -class GPTDataset(torch.utils.data.Dataset): - - def __init__(self, name, data_prefix, documents, indexed_dataset, - num_samples, seq_length, seed, - return_doc_ids=False): - - self.name = name - self.indexed_dataset = indexed_dataset - self.return_doc_ids = return_doc_ids - - # Checks - assert np.min(documents) >= 0 - assert np.max(documents) < indexed_dataset.sizes.shape[0] - - # Build index mappings. - self.doc_idx, self.sample_idx, self.shuffle_idx, self.index_prefix = \ - _build_index_mappings(self.name, data_prefix, - documents, self.indexed_dataset.sizes, - num_samples, seq_length, seed) - - - def __len__(self): - # -1 is due to data structure used to retieve the index: - # sample i --> [sample_idx[i], sample_idx[i+1]) - return self.sample_idx.shape[0] - 1 - - def __getitem__(self, idx): - # Get the shuffled index. - idx = self.shuffle_idx[idx] - # Start and end documents and offsets. - doc_index_f = self.sample_idx[idx][0] - doc_index_l = self.sample_idx[idx + 1][0] - offset_f = self.sample_idx[idx][1] - offset_l = self.sample_idx[idx + 1][1] - # If we are within the same document, just extract the chunk. - doc_ids = [] - if doc_index_f == doc_index_l: - doc_ids.append(self.doc_idx[doc_index_f]) - sample = self.indexed_dataset.get(self.doc_idx[doc_index_f], - offset=offset_f, - length=offset_l - offset_f + 1) - else: - # Otherwise, get the rest of the initial document. - doc_ids.append(self.doc_idx[doc_index_f]) - sample_list = [self.indexed_dataset.get(self.doc_idx[doc_index_f], - offset=offset_f)] - # Loop over all in between documents and add the entire document. - for i in range(doc_index_f + 1, doc_index_l): - doc_ids.append(self.doc_idx[i]) - sample_list.append(self.indexed_dataset.get(self.doc_idx[i])) - # And finally add the relevant portion of last document. 
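A small worked example, with invented indices, of the sample stitching performed by GPTDataset.__getitem__ above:

# Editorial illustration only. Suppose seq_length = 4, so a sample needs 5 tokens, and
#   sample_idx[idx]     == (doc_index_f, offset_f) == (2, 7)
#   sample_idx[idx + 1] == (doc_index_l, offset_l) == (4, 1)
# The sample is stitched from three pieces:
#   indexed_dataset.get(doc_idx[2], offset=7)       # rest of the first document
#   indexed_dataset.get(doc_idx[3])                 # every in-between document, whole
#   indexed_dataset.get(doc_idx[4], length=1 + 1)   # offset_l + 1 tokens of the last one
# By construction the concatenation yields seq_length + 1 tokens.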
- doc_ids.append(self.doc_idx[doc_index_l]) - sample_list.append(self.indexed_dataset.get( - self.doc_idx[doc_index_l], - length=offset_l + 1)) - sample = np.concatenate(sample_list) - - if self.return_doc_ids: # for retro preprocessing - return {'text': np.array(sample, dtype=np.int64), - 'doc_ids': np.array(doc_ids, dtype=np.int64)} - else: - return {'text': np.array(sample, dtype=np.int64)} - - -def _build_index_mappings(name, data_prefix, documents, sizes, - num_samples, seq_length, seed): - """Build doc-idx, sample-idx, and shuffle-idx. - doc-idx: is an array (ordered) of documents to be used in training. - sample-idx: is the start document index and document offset for each - training sample. - shuffle-idx: maps the sample index into a random index into sample-idx. - """ - # Number of tokens in each epoch and number of required epochs. - tokens_per_epoch = _num_tokens(documents, sizes) - num_epochs = _num_epochs(tokens_per_epoch, seq_length, num_samples) - - # rng state - np_rng = np.random.RandomState(seed=seed) - - # Filename of the index mappings. - index_prefix = '{}_indexmap'.format(name) - index_prefix += '_{}ns'.format(num_samples) - index_prefix += '_{}sl'.format(seq_length) - index_prefix += '_{}s'.format(seed) - _filename = data_prefix + '_' + index_prefix - doc_idx_filename = _filename + '_doc_idx.npy' - sample_idx_filename = _filename + '_sample_idx.npy' - shuffle_idx_filename = _filename + '_shuffle_idx.npy' - - # Build the indexed mapping if not exist. - if torch.distributed.get_rank() == 0 and \ - (not os.path.isfile(doc_idx_filename) or - not os.path.isfile(sample_idx_filename) or - not os.path.isfile(shuffle_idx_filename)): - - print_rank_0(' > WARNING: could not find index map files, building ' - 'the indices on rank 0 ...') - - # For the last epoch, decide whether include the entire epoch - # in the global shuffle or not. - - # If we need only one epoch, then separating last epoch does - # not mean anything. - if num_epochs == 1: - separate_last_epoch = False - print(' > only one epoch required, setting ' - 'separate_last_epoch to False', flush=True) - - else: - # Get the number of samples for the last epoch - num_samples_from_epochs_minus_one = ( - (num_epochs - 1) * tokens_per_epoch - 1) // seq_length - last_epoch_num_samples = num_samples - \ - num_samples_from_epochs_minus_one - assert last_epoch_num_samples >= 0, \ - 'last epoch number of samples should be non-negative.' - num_samples_per_epoch = (tokens_per_epoch - 1) // seq_length - assert last_epoch_num_samples < (num_samples_per_epoch + 1), \ - 'last epoch number of samples exceeded max value.' - # If we have less than 80% of the samples for the last epoch, - # seperate out the epoch and treat it differently. - # Note: the 80% number is just based on common sense and can - # be adjusted if needed. - separate_last_epoch = (last_epoch_num_samples < - int(0.80 * num_samples_per_epoch)) - if separate_last_epoch: - string = ' > last epoch number of samples ({}) is smaller '\ - 'than 80% of number of samples per epoch ({}), '\ - 'setting separate_last_epoch to True' - else: - string = ' > last epoch number of samples ({}) is larger '\ - 'than 80% of number of samples per epoch ({}), '\ - 'setting separate_last_epoch to False' - print(string.format(last_epoch_num_samples, - num_samples_per_epoch), flush=True) - - # doc-idx. 
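To make the 80% heuristic above concrete, here is a small worked example with invented numbers:

# Editorial illustration. Suppose tokens_per_epoch = 10_001, seq_length = 100,
# num_samples = 250, so num_epochs = 3 (two epochs yield only 200 samples).
# samples from the first two epochs:  (2 * 10_001 - 1) // 100 = 200
# samples needed from the last epoch: 250 - 200               = 50
# full-epoch capacity:                (10_001 - 1) // 100     = 100
# Since 50 < 0.8 * 100, separate_last_epoch = True and the final epoch
# is shuffled separately from the first two.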
- start_time = time.time() - doc_idx = _build_doc_idx(documents, num_epochs, np_rng, - separate_last_epoch) - np.save(doc_idx_filename, doc_idx, allow_pickle=True) - print_rank_0(' > elasped time to build and save doc-idx mapping ' - '(seconds): {:4f}'.format(time.time() - start_time)) - # sample-idx. - start_time = time.time() - # Use C++ implementation for speed. - # First compile and then import. - from megatron.data import helpers - assert doc_idx.dtype == np.int32 - assert sizes.dtype == np.int32 - sample_idx = helpers.build_sample_idx(sizes, doc_idx, seq_length, - num_epochs, tokens_per_epoch) - np.save(sample_idx_filename, sample_idx, allow_pickle=True) - print_rank_0(' > elasped time to build and save sample-idx mapping ' - '(seconds): {:4f}'.format(time.time() - start_time)) - # shuffle-idx. - start_time = time.time() - # -1 is due to data structure used to retieve the index: - # sample i --> [sample_idx[i], sample_idx[i+1]) - if separate_last_epoch: - num_samples_ = num_samples_from_epochs_minus_one - else: - num_samples_ = sample_idx.shape[0] - 1 - shuffle_idx = _build_shuffle_idx(num_samples_, - sample_idx.shape[0] - 1, np_rng) - np.save(shuffle_idx_filename, shuffle_idx, allow_pickle=True) - print_rank_0(' > elasped time to build and save shuffle-idx mapping' - ' (seconds): {:4f}'.format(time.time() - start_time)) - - # This should be a barrier but nccl barrier assumes - # device_index=rank which is not the case for model - # parallel case - counts = torch.cuda.LongTensor([1]) - torch.distributed.all_reduce(counts, group=mpu.get_data_parallel_group()) - torch.distributed.all_reduce(counts, group=mpu.get_pipeline_model_parallel_group()) - assert counts[0].item() == ( - torch.distributed.get_world_size() // - torch.distributed.get_world_size(group=mpu.get_tensor_model_parallel_group())) - - # Load mappings. - start_time = time.time() - print_rank_0(' > loading doc-idx mapping from {}'.format( - doc_idx_filename)) - doc_idx = np.load(doc_idx_filename, allow_pickle=True, mmap_mode='r') - print_rank_0(' > loading sample-idx mapping from {}'.format( - sample_idx_filename)) - sample_idx = np.load(sample_idx_filename, allow_pickle=True, mmap_mode='r') - print_rank_0(' > loading shuffle-idx mapping from {}'.format( - shuffle_idx_filename)) - shuffle_idx = np.load(shuffle_idx_filename, allow_pickle=True, mmap_mode='r') - print_rank_0(' loaded indexed file in {:3.3f} seconds'.format( - time.time() - start_time)) - print_rank_0(' total number of samples: {}'.format( - sample_idx.shape[0])) - print_rank_0(' total number of epochs: {}'.format(num_epochs)) - - return doc_idx, sample_idx, shuffle_idx, index_prefix - - -def _num_tokens(documents, sizes): - """Total number of tokens in the dataset.""" - return np.sum(sizes[documents]) - - -def _num_epochs(tokens_per_epoch, seq_length, num_samples): - """Based on number of samples and sequence lenght, calculate how many - epochs will be needed.""" - num_epochs = 0 - total_tokens = 0 - while True: - num_epochs += 1 - total_tokens += tokens_per_epoch - # -1 is because we need to retrieve seq_length + 1 token each time - # but the last token will overlap with the first token of the next - # sample except for the last sample. - if ((total_tokens - 1) // seq_length) >= num_samples: - return num_epochs - - -def _build_doc_idx(documents, num_epochs, np_rng, separate_last_epoch): - """Build an array with length = number-of-epochs * number-of-dcuments. 
- Each index is mapped to a corresponding document.""" - if not separate_last_epoch or num_epochs == 1: - doc_idx = np.mgrid[0:num_epochs, 0:len(documents)][1] - doc_idx[:] = documents - doc_idx = doc_idx.reshape(-1) - doc_idx = doc_idx.astype(np.int32) - np_rng.shuffle(doc_idx) - return doc_idx - - doc_idx_first = _build_doc_idx(documents, num_epochs-1, np_rng, False) - doc_idx_last = _build_doc_idx(documents, 1, np_rng, False) - return np.concatenate((doc_idx_first, doc_idx_last)) - - -def _build_sample_idx(sizes, doc_idx, seq_length, - num_epochs, tokens_per_epoch): - """Sample index mapping is a 2D array with sizes - [number-of-samples + 1, 2] where [..., 0] contains - the index into `doc_idx` and [..., 1] is the - starting offset in that document.""" - - # Total number of samples. For -1 see comments in `_num_epochs`. - num_samples = (num_epochs * tokens_per_epoch - 1) // seq_length - sample_idx = np.zeros([num_samples + 1, 2], dtype=np.int32) - - # Index into sample_idx. - sample_index = 0 - # Index into doc_idx. - doc_idx_index = 0 - # Begining offset for each document. - doc_offset = 0 - # Start with first document and no offset. - sample_idx[sample_index][0] = doc_idx_index - sample_idx[sample_index][1] = doc_offset - sample_index += 1 - while sample_index <= num_samples: - # Start with a fresh sequence. - remaining_seq_length = seq_length + 1 - while remaining_seq_length != 0: - # Get the document length. - doc_id = doc_idx[doc_idx_index] - doc_length = sizes[doc_id] - doc_offset - # And add it to the current sequence. - remaining_seq_length -= doc_length - # If we have more than a full sequence, adjust offset and set - # remaining length to zero so we return from the while loop. - # Note that -1 here is for the same reason we have -1 in - # `_num_epochs` calculations. - if remaining_seq_length <= 0: - doc_offset += (remaining_seq_length + doc_length - 1) - remaining_seq_length = 0 - else: - # Otherwise, start from the begining of the next document. - doc_idx_index += 1 - doc_offset = 0 - # Record the sequence. - sample_idx[sample_index][0] = doc_idx_index - sample_idx[sample_index][1] = doc_offset - sample_index += 1 - - return sample_idx - - -def _build_shuffle_idx(num_samples, total_size, np_rng): - """Build the range [0, size) and shuffle.""" - print(' > building shuffle index with split [0, {}) and [{}, {}) ' - '...'.format(num_samples, num_samples, total_size), flush=True) - - dtype_ = np.uint32 - if total_size >= (np.iinfo(np.uint32).max - 1): - dtype_ = np.int64 - - shuffle_idx_first = np.arange(start=0, stop=num_samples, - step=1, dtype=dtype_) - np_rng.shuffle(shuffle_idx_first) - if num_samples == total_size: - return shuffle_idx_first - - shuffle_idx_last = np.arange(start=num_samples, stop=total_size, - step=1, dtype=dtype_) - np_rng.shuffle(shuffle_idx_last) - - return np.concatenate((shuffle_idx_first, shuffle_idx_last)) diff --git a/megatron/data/helpers.cpp b/megatron/data/helpers.cpp deleted file mode 100644 index 09f5f97626..0000000000 --- a/megatron/data/helpers.cpp +++ /dev/null @@ -1,701 +0,0 @@ -/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
*/ - -/* Helper methods for fast index mapping builds */ - -#include -#include -#include -#include -#include -#include -#include -#include - -namespace py = pybind11; -using namespace std; - -const int32_t LONG_SENTENCE_LEN = 512; - - -void build_blending_indices(py::array_t& dataset_index, - py::array_t& dataset_sample_index, - const py::array_t& weights, - const int32_t num_datasets, - const int64_t size, const bool verbose) { - /* Given multiple datasets and a weighting array, build samples - such that it follows those wieghts.*/ - - if (verbose) { - std::cout << "> building indices for blendable datasets ..." << std::endl; - } - - // Get the pointer access without the checks. - auto dataset_index_ptr = dataset_index.mutable_unchecked<1>(); - auto dataset_sample_index_ptr = dataset_sample_index.mutable_unchecked<1>(); - auto weights_ptr = weights.unchecked<1>(); - - // Initialize buffer for number of samples used for each dataset. - int64_t current_samples[num_datasets]; - for(int64_t i = 0; i < num_datasets; ++i) { - current_samples[i] = 0; - } - - // For each sample: - for(int64_t sample_idx = 0; sample_idx < size; ++sample_idx) { - - // Determine where the max error in sampling is happening. - auto sample_idx_double = std::max(static_cast(sample_idx), 1.0); - int64_t max_error_index = 0; - double max_error = weights_ptr[0] * sample_idx_double - - static_cast(current_samples[0]); - for (int64_t dataset_idx = 1; dataset_idx < num_datasets; ++dataset_idx) { - double error = weights_ptr[dataset_idx] * sample_idx_double - - static_cast(current_samples[dataset_idx]); - if (error > max_error) { - max_error = error; - max_error_index = dataset_idx; - } - } - - // Populate the indices. - dataset_index_ptr[sample_idx] = static_cast(max_error_index); - dataset_sample_index_ptr[sample_idx] = current_samples[max_error_index]; - - // Update the total samples. - current_samples[max_error_index] += 1; - - } - - // print info - if (verbose) { - std::cout << " > sample ratios:" << std::endl; - for (int64_t dataset_idx = 0; dataset_idx < num_datasets; ++dataset_idx) { - auto ratio = static_cast(current_samples[dataset_idx]) / - static_cast(size); - std::cout << " dataset " << dataset_idx << ", input: " << - weights_ptr[dataset_idx] << ", achieved: " << ratio << std::endl; - } - } - -} - - -py::array build_sample_idx(const py::array_t& sizes_, - const py::array_t& doc_idx_, - const int32_t seq_length, - const int32_t num_epochs, - const int64_t tokens_per_epoch) { - /* Sample index (sample_idx) is used for gpt2 like dataset for which - the documents are flattened and the samples are built based on this - 1-D flatten array. It is a 2D array with sizes [number-of-samples + 1, 2] - where [..., 0] contains the index into `doc_idx` and [..., 1] is the - starting offset in that document.*/ - - // Consistency checks. - assert(seq_length > 1); - assert(num_epochs > 0); - assert(tokens_per_epoch > 1); - - // Remove bound checks. - auto sizes = sizes_.unchecked<1>(); - auto doc_idx = doc_idx_.unchecked<1>(); - - // Mapping and it's length (1D). 
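Editorial sketch: the build_blending_indices routine defined earlier in this file picks, for every output sample, the dataset whose realized share lags its target weight the most. A rough Python rendering of that greedy loop, not part of the patch:

def blend(weights, size):
    # weights are assumed normalized to sum to 1.0
    current = [0] * len(weights)
    dataset_index, dataset_sample_index = [], []
    for i in range(size):
        n = max(i, 1)
        # dataset whose achieved sample count lags its target share the most
        errors = [w * n - c for w, c in zip(weights, current)]
        d = errors.index(max(errors))
        dataset_index.append(d)
        dataset_sample_index.append(current[d])
        current[d] += 1
    return dataset_index, dataset_sample_index

# e.g. blend([0.5, 0.25, 0.25], 8)[0] -> [0, 1, 2, 0, 0, 1, 2, 0]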
- int64_t num_samples = (num_epochs * tokens_per_epoch - 1) / seq_length; - int32_t* sample_idx = new int32_t[2*(num_samples+1)]; - - cout << " using:" << endl << std::flush; - cout << " number of documents: " << - doc_idx_.shape(0) / num_epochs << endl << std::flush; - cout << " number of epochs: " << num_epochs << - endl << std::flush; - cout << " sequence length: " << seq_length << - endl << std::flush; - cout << " total number of samples: " << num_samples << - endl << std::flush; - - // Index into sample_idx. - int64_t sample_index = 0; - // Index into doc_idx. - int64_t doc_idx_index = 0; - // Begining offset for each document. - int32_t doc_offset = 0; - // Start with first document and no offset. - sample_idx[2 * sample_index] = doc_idx_index; - sample_idx[2 * sample_index + 1] = doc_offset; - ++sample_index; - - while (sample_index <= num_samples) { - // Start with a fresh sequence. - int32_t remaining_seq_length = seq_length + 1; - while (remaining_seq_length != 0) { - // Get the document length. - auto doc_id = doc_idx[doc_idx_index]; - auto doc_length = sizes[doc_id] - doc_offset; - // And add it to the current sequence. - remaining_seq_length -= doc_length; - // If we have more than a full sequence, adjust offset and set - // remaining length to zero so we return from the while loop. - // Note that -1 here is for the same reason we have -1 in - // `_num_epochs` calculations. - if (remaining_seq_length <= 0) { - doc_offset += (remaining_seq_length + doc_length - 1); - remaining_seq_length = 0; - } else { - // Otherwise, start from the begining of the next document. - ++doc_idx_index; - doc_offset = 0; - } - } - // Record the sequence. - sample_idx[2 * sample_index] = doc_idx_index; - sample_idx[2 * sample_index + 1] = doc_offset; - ++sample_index; - } - - // Method to deallocate memory. - py::capsule free_when_done(sample_idx, [](void *mem_) { - int32_t *mem = reinterpret_cast(mem_); - delete[] mem; - }); - - // Return the numpy array. - const auto byte_size = sizeof(int32_t); - return py::array(std::vector{num_samples+1, 2}, // shape - {2*byte_size, byte_size}, // C-style contiguous strides - sample_idx, // the data pointer - free_when_done); // numpy array references - -} - - -inline int32_t get_target_sample_len(const int32_t short_seq_ratio, - const int32_t max_length, - std::mt19937& rand32_gen) { - /* Training sample length. */ - if (short_seq_ratio == 0) { - return max_length; - } - const auto random_number = rand32_gen(); - if ((random_number % short_seq_ratio) == 0) { - return 2 + random_number % (max_length - 1); - } - return max_length; -} - - -template -py::array build_mapping_impl(const py::array_t& docs_, - const py::array_t& sizes_, - const int32_t num_epochs, - const uint64_t max_num_samples, - const int32_t max_seq_length, - const double short_seq_prob, - const int32_t seed, - const bool verbose, - const int32_t min_num_sent) { - /* Build a mapping of (start-index, end-index, sequence-length) where - start and end index are the indices of the sentences in the sample - and sequence-length is the target sequence length. - */ - - // Consistency checks. - assert(num_epochs > 0); - assert(max_seq_length > 1); - assert(short_seq_prob >= 0.0); - assert(short_seq_prob <= 1.0); - assert(seed > 0); - - // Remove bound checks. - auto docs = docs_.unchecked<1>(); - auto sizes = sizes_.unchecked<1>(); - - // For efficiency, convert probability to ratio. Note: rand() generates int. 
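For orientation, the probability-to-ratio conversion noted above and implemented just below works out as follows (editorial note):

# Python mirror of the ratio logic, illustrative only.
short_seq_prob = 0.1
short_seq_ratio = round(1.0 / short_seq_prob)  # -> 10
# get_target_sample_len(ratio, max_length, rng) returns max_length unless
# rng() % ratio == 0 (roughly 1 draw in 10), in which case it returns a
# random target length in [2, max_length].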
- int32_t short_seq_ratio = 0; - if (short_seq_prob > 0) { - short_seq_ratio = static_cast(round(1.0 / short_seq_prob)); - } - - if (verbose) { - const auto sent_start_index = docs[0]; - const auto sent_end_index = docs[docs_.shape(0) - 1]; - const auto num_sentences = sent_end_index - sent_start_index; - cout << " using:" << endl << std::flush; - cout << " number of documents: " << docs_.shape(0) - 1 << - endl << std::flush; - cout << " sentences range: [" << sent_start_index << - ", " << sent_end_index << ")" << endl << std::flush; - cout << " total number of sentences: " << num_sentences << - endl << std::flush; - cout << " number of epochs: " << num_epochs << - endl << std::flush; - cout << " maximum number of samples: " << max_num_samples << - endl << std::flush; - cout << " maximum sequence length: " << max_seq_length << - endl << std::flush; - cout << " short sequence probability: " << short_seq_prob << - endl << std::flush; - cout << " short sequence ration (1/prob): " << short_seq_ratio << - endl << std::flush; - cout << " seed: " << seed << endl << - std::flush; - } - - // Mapping and it's length (1D). - int64_t num_samples = -1; - DocIdx* maps = NULL; - - // Perform two iterations, in the first iteration get the size - // and allocate memory and in the second iteration populate the map. - bool second = false; - for (int32_t iteration=0; iteration<2; ++iteration) { - - // Set the seed so both iterations produce the same results. - std::mt19937 rand32_gen(seed); - - // Set the flag on second iteration. - second = (iteration == 1); - - // Counters: - uint64_t empty_docs = 0; - uint64_t one_sent_docs = 0; - uint64_t long_sent_docs = 0; - - // Current map index. - uint64_t map_index = 0; - - // For each epoch: - for (int32_t epoch=0; epoch= max_num_samples) { - if (verbose && (!second)) { - cout << " reached " << max_num_samples << " samples after " - << epoch << " epochs ..." << endl << std::flush; - } - break; - } - // For each document: - for (int32_t doc=0; doc<(docs.shape(0) - 1); ++doc) { - - // Document sentences are in [sent_index_first, sent_index_last) - const auto sent_index_first = docs[doc]; - const auto sent_index_last = docs[doc + 1]; - - // At the begining of the document previous index is the - // start index. - auto prev_start_index = sent_index_first; - - // Remaining documents. - auto num_remain_sent = sent_index_last - sent_index_first; - - // Some bookkeeping - if ((epoch == 0) && (!second)) { - if (num_remain_sent == 0) { - ++empty_docs; - } - if (num_remain_sent == 1) { - ++one_sent_docs; - } - } - - // Detect documents with long sentences. - bool contains_long_sentence = false; - if (num_remain_sent > 1) { - for (auto sent_index=sent_index_first; - sent_index < sent_index_last; ++sent_index) { - if (sizes[sent_index] > LONG_SENTENCE_LEN){ - if ((epoch == 0) && (!second)) { - ++long_sent_docs; - } - contains_long_sentence = true; - break; - } - } - } - - // If we have more than two sentences. - if ((num_remain_sent >= min_num_sent) && (!contains_long_sentence)) { - - // Set values. - auto seq_len = int32_t{0}; - auto num_sent = int32_t{0}; - auto target_seq_len = get_target_sample_len(short_seq_ratio, - max_seq_length, - rand32_gen); - - // Loop through sentences. - for (auto sent_index=sent_index_first; - sent_index < sent_index_last; ++sent_index) { - - // Add the size and number of sentences. - seq_len += sizes[sent_index]; - ++num_sent; - --num_remain_sent; - - // If we have reached the target length. 
- // and if not only one sentence is left in the document. - // and if we have at least two sentneces. - // and if we have reached end of the document. - if (((seq_len >= target_seq_len) && - (num_remain_sent > 1) && - (num_sent >= min_num_sent) ) || (num_remain_sent == 0)) { - - // Check for overflow. - if ((3 * map_index + 2) > - std::numeric_limits::max()) { - cout << "number of samples exceeded maximum " - << "allowed by type int64: " - << std::numeric_limits::max() - << endl; - throw std::overflow_error("Number of samples"); - } - - // Populate the map. - if (second) { - const auto map_index_0 = 3 * map_index; - maps[map_index_0] = static_cast(prev_start_index); - maps[map_index_0 + 1] = static_cast(sent_index + 1); - maps[map_index_0 + 2] = static_cast(target_seq_len); - } - - // Update indices / counters. - ++map_index; - prev_start_index = sent_index + 1; - target_seq_len = get_target_sample_len(short_seq_ratio, - max_seq_length, - rand32_gen); - seq_len = 0; - num_sent = 0; - } - - } // for (auto sent_index=sent_index_first; ... - } // if (num_remain_sent > 1) { - } // for (int doc=0; doc < num_docs; ++doc) { - } // for (int epoch=0; epoch < num_epochs; ++epoch) { - - if (!second) { - if (verbose) { - cout << " number of empty documents: " << empty_docs << - endl << std::flush; - cout << " number of documents with one sentence: " << - one_sent_docs << endl << std::flush; - cout << " number of documents with long sentences: " << - long_sent_docs << endl << std::flush; - cout << " will create mapping for " << map_index << - " samples" << endl << std::flush; - } - assert(maps == NULL); - assert(num_samples < 0); - maps = new DocIdx[3*map_index]; - num_samples = static_cast(map_index); - } - - } // for (int iteration=0; iteration < 2; ++iteration) { - - // Shuffle. - // We need a 64 bit random number generator as we might have more - // than 2 billion samples. - std::mt19937_64 rand64_gen(seed + 1); - for (auto i=(num_samples - 1); i > 0; --i) { - const auto j = static_cast(rand64_gen() % (i + 1)); - const auto i0 = 3 * i; - const auto j0 = 3 * j; - // Swap values. - swap(maps[i0], maps[j0]); - swap(maps[i0 + 1], maps[j0 + 1]); - swap(maps[i0 + 2], maps[j0 + 2]); - } - - // Method to deallocate memory. - py::capsule free_when_done(maps, [](void *mem_) { - DocIdx *mem = reinterpret_cast(mem_); - delete[] mem; - }); - - // Return the numpy array. - const auto byte_size = sizeof(DocIdx); - return py::array(std::vector{num_samples, 3}, // shape - {3*byte_size, byte_size}, // C-style contiguous strides - maps, // the data pointer - free_when_done); // numpy array references - -} - - -py::array build_mapping(const py::array_t& docs_, - const py::array_t& sizes_, - const int num_epochs, - const uint64_t max_num_samples, - const int max_seq_length, - const double short_seq_prob, - const int seed, - const bool verbose, - const int32_t min_num_sent) { - - if (sizes_.size() > std::numeric_limits::max()) { - if (verbose) { - cout << " using uint64 for data mapping..." << endl << std::flush; - } - return build_mapping_impl(docs_, sizes_, num_epochs, - max_num_samples, max_seq_length, - short_seq_prob, seed, verbose, - min_num_sent); - } else { - if (verbose) { - cout << " using uint32 for data mapping..." 
<< endl << std::flush; - } - return build_mapping_impl(docs_, sizes_, num_epochs, - max_num_samples, max_seq_length, - short_seq_prob, seed, verbose, - min_num_sent); - } -} - -template -py::array build_blocks_mapping_impl(const py::array_t& docs_, - const py::array_t& sizes_, - const py::array_t& titles_sizes_, - const int32_t num_epochs, - const uint64_t max_num_samples, - const int32_t max_seq_length, - const int32_t seed, - const bool verbose, - const bool use_one_sent_blocks) { - /* Build a mapping of (start-index, end-index, sequence-length) where - start and end index are the indices of the sentences in the sample - and sequence-length is the target sequence length. - */ - - // Consistency checks. - assert(num_epochs > 0); - assert(max_seq_length > 1); - assert(seed > 0); - - // Remove bound checks. - auto docs = docs_.unchecked<1>(); - auto sizes = sizes_.unchecked<1>(); - auto titles_sizes = titles_sizes_.unchecked<1>(); - - if (verbose) { - const auto sent_start_index = docs[0]; - const auto sent_end_index = docs[docs_.shape(0) - 1]; - const auto num_sentences = sent_end_index - sent_start_index; - cout << " using:" << endl << std::flush; - cout << " number of documents: " << docs_.shape(0) - 1 << - endl << std::flush; - cout << " sentences range: [" << sent_start_index << - ", " << sent_end_index << ")" << endl << std::flush; - cout << " total number of sentences: " << num_sentences << - endl << std::flush; - cout << " number of epochs: " << num_epochs << - endl << std::flush; - cout << " maximum number of samples: " << max_num_samples << - endl << std::flush; - cout << " maximum sequence length: " << max_seq_length << - endl << std::flush; - cout << " seed: " << seed << endl << - std::flush; - } - - // Mapping and its length (1D). - int64_t num_samples = -1; - DocIdx* maps = NULL; - - // Acceptable number of sentences per block. - int min_num_sent = 2; - if (use_one_sent_blocks) { - min_num_sent = 1; - } - - // Perform two iterations, in the first iteration get the size - // and allocate memory and in the second iteration populate the map. - bool second = false; - for (int32_t iteration=0; iteration<2; ++iteration) { - - // Set the flag on second iteration. - second = (iteration == 1); - - // Current map index. - uint64_t map_index = 0; - - uint64_t empty_docs = 0; - uint64_t one_sent_docs = 0; - uint64_t long_sent_docs = 0; - // For each epoch: - for (int32_t epoch=0; epoch= max_num_samples) { - if (verbose && (!second)) { - cout << " reached " << max_num_samples << " samples after " - << epoch << " epochs ..." << endl << std::flush; - } - break; - } - // For each document: - for (int32_t doc=0; doc<(docs.shape(0) - 1); ++doc) { - - // Document sentences are in [sent_index_first, sent_index_last) - const auto sent_index_first = docs[doc]; - const auto sent_index_last = docs[doc + 1]; - const auto target_seq_len = max_seq_length - titles_sizes[doc]; - - // At the begining of the document previous index is the - // start index. - auto prev_start_index = sent_index_first; - - // Remaining documents. - auto num_remain_sent = sent_index_last - sent_index_first; - - // Some bookkeeping - if ((epoch == 0) && (!second)) { - if (num_remain_sent == 0) { - ++empty_docs; - } - if (num_remain_sent == 1) { - ++one_sent_docs; - } - } - // Detect documents with long sentences. 
- bool contains_long_sentence = false; - if (num_remain_sent >= min_num_sent) { - for (auto sent_index=sent_index_first; - sent_index < sent_index_last; ++sent_index) { - if (sizes[sent_index] > LONG_SENTENCE_LEN){ - if ((epoch == 0) && (!second)) { - ++long_sent_docs; - } - contains_long_sentence = true; - break; - } - } - } - // If we have enough sentences and no long sentences. - if ((num_remain_sent >= min_num_sent) && (!contains_long_sentence)) { - - // Set values. - auto seq_len = int32_t{0}; - auto num_sent = int32_t{0}; - - // Loop through sentences. - for (auto sent_index=sent_index_first; - sent_index < sent_index_last; ++sent_index) { - - // Add the size and number of sentences. - seq_len += sizes[sent_index]; - ++num_sent; - --num_remain_sent; - - // If we have reached the target length. - // and there are an acceptable number of sentences left - // and if we have at least the minimum number of sentences. - // or if we have reached end of the document. - if (((seq_len >= target_seq_len) && - (num_remain_sent >= min_num_sent) && - (num_sent >= min_num_sent) ) || (num_remain_sent == 0)) { - - // Populate the map. - if (second) { - const auto map_index_0 = 4 * map_index; - // Each sample has 4 items: the starting sentence index, ending sentence index, - // the index of the document from which the block comes (used for fetching titles) - // and the unique id of the block (used for creating block indexes) - - maps[map_index_0] = static_cast(prev_start_index); - maps[map_index_0 + 1] = static_cast(sent_index + 1); - maps[map_index_0 + 2] = static_cast(doc); - maps[map_index_0 + 3] = static_cast(block_id); - } - - // Update indices / counters. - ++map_index; - ++block_id; - prev_start_index = sent_index + 1; - seq_len = 0; - num_sent = 0; - } - } // for (auto sent_index=sent_index_first; ... - } // if (num_remain_sent > 1) { - } // for (int doc=0; doc < num_docs; ++doc) { - } // for (int epoch=0; epoch < num_epochs; ++epoch) { - - if (!second) { - if (verbose) { - cout << " number of empty documents: " << empty_docs << - endl << std::flush; - cout << " number of documents with one sentence: " << - one_sent_docs << endl << std::flush; - cout << " number of documents with long sentences: " << - long_sent_docs << endl << std::flush; - cout << " will create mapping for " << map_index << - " samples" << endl << std::flush; - } - assert(maps == NULL); - assert(num_samples < 0); - maps = new DocIdx[4*map_index]; - num_samples = static_cast(map_index); - } - - } // for (int iteration=0; iteration < 2; ++iteration) { - - // Shuffle. - // We need a 64 bit random number generator as we might have more - // than 2 billion samples. - std::mt19937_64 rand64_gen(seed + 1); - for (auto i=(num_samples - 1); i > 0; --i) { - const auto j = static_cast(rand64_gen() % (i + 1)); - const auto i0 = 4 * i; - const auto j0 = 4 * j; - // Swap values. - swap(maps[i0], maps[j0]); - swap(maps[i0 + 1], maps[j0 + 1]); - swap(maps[i0 + 2], maps[j0 + 2]); - swap(maps[i0 + 3], maps[j0 + 3]); - } - - // Method to deallocate memory. - py::capsule free_when_done(maps, [](void *mem_) { - DocIdx *mem = reinterpret_cast(mem_); - delete[] mem; - }); - - // Return the numpy array. 
- const auto byte_size = sizeof(DocIdx); - return py::array(std::vector{num_samples, 4}, // shape - {4*byte_size, byte_size}, // C-style contiguous strides - maps, // the data pointer - free_when_done); // numpy array references - -} - -py::array build_blocks_mapping(const py::array_t& docs_, - const py::array_t& sizes_, - const py::array_t& titles_sizes_, - const int num_epochs, - const uint64_t max_num_samples, - const int max_seq_length, - const int seed, - const bool verbose, - const bool use_one_sent_blocks) { - - if (sizes_.size() > std::numeric_limits::max()) { - if (verbose) { - cout << " using uint64 for data mapping..." << endl << std::flush; - } - return build_blocks_mapping_impl(docs_, sizes_, titles_sizes_, - num_epochs, max_num_samples, max_seq_length, seed, verbose, use_one_sent_blocks); - } else { - if (verbose) { - cout << " using uint32 for data mapping..." << endl << std::flush; - } - return build_blocks_mapping_impl(docs_, sizes_, titles_sizes_, - num_epochs, max_num_samples, max_seq_length, seed, verbose, use_one_sent_blocks); - } -} - -PYBIND11_MODULE(helpers, m) { - m.def("build_mapping", &build_mapping); - m.def("build_blocks_mapping", &build_blocks_mapping); - m.def("build_sample_idx", &build_sample_idx); - m.def("build_blending_indices", &build_blending_indices); -} diff --git a/megatron/data/indexed_dataset.py b/megatron/data/indexed_dataset.py deleted file mode 100644 index 4286e69b45..0000000000 --- a/megatron/data/indexed_dataset.py +++ /dev/null @@ -1,584 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - - -# copied from fairseq/fairseq/data/indexed_dataset.py -# Removed IndexedRawTextDataset since it relied on Fairseq dictionary -# other slight modifications to remove fairseq dependencies -# Added document index to index file and made it accessible. -# An empty sentence no longer separates documents. 
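Editorial sketch: the bindings exported above were consumed from the Python side (also deleted in this change) roughly as follows; doc_idx and sizes stand for the arrays built earlier in gpt_dataset.py.

# Condensed from the deleted gpt_dataset.py: the extension was compiled on first
# use and then imported like a regular module.
from megatron.data import helpers  # removed along with this directory
assert doc_idx.dtype == np.int32 and sizes.dtype == np.int32
sample_idx = helpers.build_sample_idx(sizes, doc_idx, seq_length,
                                      num_epochs, tokens_per_epoch)
# sample_idx has shape [num_samples + 1, 2]; column 0 indexes into doc_idx,
# column 1 is the starting token offset inside that document.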
- -from functools import lru_cache -import os -import shutil -import struct -from itertools import accumulate - -import numpy as np -import torch -from megatron import print_rank_0 - - -def __best_fitting_dtype(vocab_size=None): - if vocab_size is not None and vocab_size < 65500: - return np.uint16 - else: - return np.int32 - - -def get_available_dataset_impl(): - return ['lazy', 'cached', 'mmap'] - - -def infer_dataset_impl(path): - if IndexedDataset.exists(path): - with open(index_file_path(path), 'rb') as f: - magic = f.read(8) - if magic == IndexedDataset._HDR_MAGIC: - return 'cached' - elif magic == MMapIndexedDataset.Index._HDR_MAGIC[:8]: - return 'mmap' - else: - return None - else: - print(f"Dataset does not exist: {path}") - print("Path should be a basename that both .idx and .bin can be appended to get full filenames.") - return None - - -def make_builder(out_file, impl, vocab_size=None): - if impl == 'mmap': - return MMapIndexedDatasetBuilder(out_file, dtype=__best_fitting_dtype(vocab_size)) - else: - return IndexedDatasetBuilder(out_file) - - -def make_dataset(path, impl, skip_warmup=False): - if not IndexedDataset.exists(path): - print(f"Dataset does not exist: {path}") - print("Path should be a basename that both .idx and .bin can be appended to get full filenames.") - return None - if impl == 'infer': - impl = infer_dataset_impl(path) - if impl == 'lazy' and IndexedDataset.exists(path): - return IndexedDataset(path) - elif impl == 'cached' and IndexedDataset.exists(path): - return IndexedCachedDataset(path) - elif impl == 'mmap' and MMapIndexedDataset.exists(path): - return MMapIndexedDataset(path, skip_warmup) - print(f"Unknown dataset implementation: {impl}") - return None - - -def dataset_exists(path, impl): - if impl == 'mmap': - return MMapIndexedDataset.exists(path) - else: - return IndexedDataset.exists(path) - - -def read_longs(f, n): - a = np.empty(n, dtype=np.int64) - f.readinto(a) - return a - - -def write_longs(f, a): - f.write(np.array(a, dtype=np.int64)) - - -dtypes = { - 1: np.uint8, - 2: np.int8, - 3: np.int16, - 4: np.int32, - 5: np.int64, - 6: np.float32, - 7: np.double, - 8: np.uint16 -} - - -def code(dtype): - for k in dtypes.keys(): - if dtypes[k] == dtype: - return k - raise ValueError(dtype) - - -def index_file_path(prefix_path): - return prefix_path + '.idx' - - -def data_file_path(prefix_path): - return prefix_path + '.bin' - - -def create_doc_idx(sizes): - doc_idx = [0] - for i, s in enumerate(sizes): - if s == 0: - doc_idx.append(i + 1) - return doc_idx - - -class IndexedDataset(torch.utils.data.Dataset): - """Loader for IndexedDataset""" - _HDR_MAGIC = b'TNTIDX\x00\x00' - - def __init__(self, path): - super().__init__() - self.path = path - self.data_file = None - self.read_index(path) - - def read_index(self, path): - with open(index_file_path(path), 'rb') as f: - magic = f.read(8) - assert magic == self._HDR_MAGIC, ( - 'Index file doesn\'t match expected format. ' - 'Make sure that --dataset-impl is configured properly.' 
- ) - version = f.read(8) - assert struct.unpack('= self._len: - raise IndexError('index out of range') - - def __del__(self): - if self.data_file: - self.data_file.close() - - # @lru_cache(maxsize=8) - def __getitem__(self, idx): - if not self.data_file: - self.read_data(self.path) - if isinstance(idx, int): - i = idx - self.check_index(i) - tensor_size = self.sizes[self.dim_offsets[i]:self.dim_offsets[i + 1]] - a = np.empty(tensor_size, dtype=self.dtype) - self.data_file.seek(self.data_offsets[i] * self.element_size) - self.data_file.readinto(a) - return a - elif isinstance(idx, slice): - start, stop, step = idx.indices(len(self)) - if step != 1: - raise ValueError("Slices into indexed_dataset must be contiguous") - sizes = self.sizes[self.dim_offsets[start]:self.dim_offsets[stop]] - size = sum(sizes) - a = np.empty(size, dtype=self.dtype) - self.data_file.seek(self.data_offsets[start] * self.element_size) - self.data_file.readinto(a) - offsets = list(accumulate(sizes)) - sents = np.split(a, offsets[:-1]) - return sents - - def __len__(self): - return self._len - - def num_tokens(self, index): - return self.sizes[index] - - def size(self, index): - return self.sizes[index] - - @staticmethod - def exists(path): - return ( - os.path.exists(index_file_path(path)) and os.path.exists(data_file_path(path)) - ) - - @property - def supports_prefetch(self): - return False # avoid prefetching to save memory - - -class IndexedCachedDataset(IndexedDataset): - - def __init__(self, path): - super().__init__(path) - self.cache = None - self.cache_index = {} - - @property - def supports_prefetch(self): - return True - - def prefetch(self, indices): - if all(i in self.cache_index for i in indices): - return - if not self.data_file: - self.read_data(self.path) - indices = sorted(set(indices)) - total_size = 0 - for i in indices: - total_size += self.data_offsets[i + 1] - self.data_offsets[i] - self.cache = np.empty(total_size, dtype=self.dtype) - ptx = 0 - self.cache_index.clear() - for i in indices: - self.cache_index[i] = ptx - size = self.data_offsets[i + 1] - self.data_offsets[i] - a = self.cache[ptx: ptx + size] - self.data_file.seek(self.data_offsets[i] * self.element_size) - self.data_file.readinto(a) - ptx += size - if self.data_file: - # close and delete data file after prefetch so we can pickle - self.data_file.close() - self.data_file = None - - # @lru_cache(maxsize=8) - def __getitem__(self, idx): - if isinstance(idx, int): - i = idx - self.check_index(i) - tensor_size = self.sizes[self.dim_offsets[i]:self.dim_offsets[i + 1]] - a = np.empty(tensor_size, dtype=self.dtype) - ptx = self.cache_index[i] - np.copyto(a, self.cache[ptx: ptx + a.size]) - return a - elif isinstance(idx, slice): - # Hack just to make this work, can optimizer later if necessary - sents = [] - for i in range(*idx.indices(len(self))): - sents.append(self[i]) - return sents - - -class IndexedDatasetBuilder(object): - element_sizes = { - np.uint8: 1, - np.int8: 1, - np.int16: 2, - np.int32: 4, - np.int64: 8, - np.float: 4, - np.double: 8 - } - - def __init__(self, out_file, dtype=np.int32): - self.out_file = open(out_file, 'wb') - self.dtype = dtype - self.data_offsets = [0] - self.dim_offsets = [0] - self.sizes = [] - self.element_size = self.element_sizes[self.dtype] - self.doc_idx = [0] - - def add_item(self, tensor): - bytes = self.out_file.write(np.array(tensor.numpy(), dtype=self.dtype)) - self.data_offsets.append(self.data_offsets[-1] + bytes / self.element_size) - for s in tensor.size(): - self.sizes.append(s) - 
self.dim_offsets.append(self.dim_offsets[-1] + len(tensor.size())) - - def end_document(self): - self.doc_idx.append(len(self.sizes)) - - def merge_file_(self, another_file): - index = IndexedDataset(another_file) - assert index.dtype == self.dtype - - doc_offset = len(self.sizes) - - begin = self.data_offsets[-1] - for data_offset in index.data_offsets[1:]: - self.data_offsets.append(begin + data_offset) - self.sizes.extend(index.sizes) - - begin = self.dim_offsets[-1] - for dim_offset in index.dim_offsets[1:]: - self.dim_offsets.append(begin + dim_offset) - - self.doc_idx.extend((doc_offset + index.doc_idx)[1:]) - - with open(data_file_path(another_file), 'rb') as f: - while True: - data = f.read(1024) - if data: - self.out_file.write(data) - else: - break - - def finalize(self, index_file): - self.out_file.close() - index = open(index_file, 'wb') - index.write(b'TNTIDX\x00\x00') - index.write(struct.pack(' 0, "Provide the argument --vocab-extra-ids 100 to the script" - - def __len__(self): - return self.samples_mapping.shape[0] - - def __getitem__(self, idx): - - start_index, end_index, seq_length = self.samples_mapping[idx] - sample = [] - for index in range(start_index, end_index): - sample.append(self.indexed_dataset[index]) - # Note that this rng state should be numpy and not python since - # python randint is inclusive whereas the numpy one is exclusive. - np_rng = np.random.RandomState(seed=(self.seed + idx)) - return build_training_sample(sample, seq_length, - self.max_seq_length, # needed for padding - self.max_seq_length_dec, - self.vocab_id_list, - self.vocab_id_to_token_dict, - self.cls_id, self.sep_id, - self.mask_id, self.pad_id, - self.masked_lm_prob, np_rng, - self.bos_id, self.eos_id, - self.sentinel_tokens) - - -def build_training_sample(sample, target_seq_length, - max_seq_length, max_seq_length_dec, - vocab_id_list, vocab_id_to_token_dict, - cls_id, sep_id, mask_id, pad_id, - masked_lm_prob, np_rng, bos_id=None, - eos_id=None, sentinel_tokens=None): - """Build training sample. - - Arguments: - sample: A list of sentences in which each sentence is a list token ids. - target_seq_length: Desired sequence length. - max_seq_length: Maximum length of the sequence. All values are padded to - this length. - vocab_id_list: List of vocabulary ids. Used to pick a random id. - vocab_id_to_token_dict: A dictionary from vocab ids to text tokens. - cls_id: Start of example id. - sep_id: Separator id. - mask_id: Mask token id. - pad_id: Padding token id. - masked_lm_prob: Probability to mask tokens. - np_rng: Random number genenrator. Note that this rng state should be - numpy and not python since python randint is inclusive for - the opper bound whereas the numpy one is exclusive. - bos_id: start of decoder example id - eos_id: end of generation id - sentinel_tokens: unique value to be substituted for every replaced span - """ - - assert target_seq_length <= max_seq_length - - # flatten sentences into one list - tokens = [token for sentence in sample for token in sentence] - - # Truncate to `target_sequence_length`. - max_num_tokens = target_seq_length - truncated = len(tokens) > max_num_tokens - tokens = tokens[:max_num_tokens] - - # Masking. 
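# Illustrative aside, not part of the patch: __getitem__ above seeds a NumPy
# RandomState with (seed + idx) so each sample is reproducible, and the docstring
# notes it must be NumPy rather than Python's random because randint treats the
# upper bound differently. Seed and bound values here are made up.
import random
import numpy as np

np_rng = np.random.RandomState(seed=1234 + 7)               # per-sample generator
assert np_rng.randint(0, 10) in range(0, 10)                # NumPy: upper bound excluded
assert random.Random(1234).randint(0, 10) in range(0, 11)   # Python: upper bound included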
- max_predictions_per_seq = masked_lm_prob * max_num_tokens - (tokens, masked_positions, masked_labels, _, masked_spans) = create_masked_lm_predictions( - tokens, vocab_id_list, vocab_id_to_token_dict, masked_lm_prob, - cls_id, sep_id, mask_id, max_predictions_per_seq, np_rng, - max_ngrams=10, geometric_dist=True, masking_style="t5") - - # Padding. - tokens_enc, tokens_dec_in, labels, enc_mask, \ - dec_mask, enc_dec_mask, loss_mask \ - = pad_and_convert_to_numpy(tokens, masked_positions, - masked_labels, pad_id, max_seq_length, - max_seq_length_dec, masked_spans, - bos_id, eos_id, sentinel_tokens) - - train_sample = { - 'text_enc': tokens_enc, - 'text_dec': tokens_dec_in, - 'labels': labels, - 'loss_mask': loss_mask, - 'truncated': int(truncated), - 'enc_mask': enc_mask, - 'dec_mask': dec_mask, - 'enc_dec_mask': enc_dec_mask, - } - return train_sample - - -def pad_and_convert_to_numpy(tokens, masked_positions, - masked_labels, pad_id, - max_seq_length, max_seq_length_dec, - masked_spans=None, bos_id=None, - eos_id=None, sentinel_tokens=None): - """Pad sequences and convert them to numpy.""" - - sentinel_tokens = collections.deque(sentinel_tokens) - t5_input = [] - (t5_decoder_in, t5_decoder_out) = ([bos_id], []) - (start_index, end_index) = (0, None) - for span in masked_spans: - flag = sentinel_tokens.popleft() - - # Append the same tokens in decoder input and output - t5_decoder_in.append(flag) - t5_decoder_in.extend(span.label) - t5_decoder_out.append(flag) - t5_decoder_out.extend(span.label) - - end_index = span.index[0] - t5_input.extend(tokens[start_index: end_index]) - t5_input.append(flag) - - # the next start index is the token after the last span token - start_index = span.index[-1] + 1 - - # Add token to the t5_decoder_out - t5_decoder_out.append(eos_id) - - # Add the remaining tokens to the t5 input - t5_input.extend(tokens[start_index:]) - - # assert (len(t5_input) - len(masked_spans)) + \ - # (len(t5_decoder_in) - (len(masked_spans) + 1)) == len(tokens) - - # Some checks. - - # Encoder-side padding mask. - num_tokens = len(t5_input) - padding_length = max_seq_length - num_tokens - assert padding_length >= 0 - assert len(masked_positions) == len(masked_labels) - - # Tokens.. - filler = [pad_id] * padding_length - tokens_enc = np.array(t5_input + filler, dtype=np.int64) - - # Decoder-side padding mask. - num_tokens_dec = len(t5_decoder_in) - padding_length_dec = max_seq_length_dec - num_tokens_dec - assert padding_length_dec >= 0 - filler_dec = [pad_id] * padding_length_dec - tokens_dec_in = np.array(t5_decoder_in + filler_dec, dtype=np.int64) - - # Create attention masks - enc_mask = make_attention_mask(tokens_enc, tokens_enc) - enc_dec_mask = make_attention_mask(tokens_dec_in, tokens_enc) - dec_mask = make_attention_mask(tokens_dec_in, tokens_dec_in) - dec_mask = dec_mask * make_history_mask(tokens_dec_in) - - # Labels mask. 
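# Illustrative worked example, not part of the patch: the span loop in
# pad_and_convert_to_numpy above replaces each masked span with one sentinel in
# the encoder input and emits sentinel + original tokens on the decoder side.
# Token ids and the sentinel id below are made up.
tokens = [11, 12, 13, 14, 15]
bos_id, eos_id, sentinel = 1, 2, 250000
span_index, span_label = [2, 3], [13, 14]        # tokens 13 and 14 were masked

t5_input       = tokens[:span_index[0]] + [sentinel] + tokens[span_index[-1] + 1:]
t5_decoder_in  = [bos_id, sentinel] + span_label
t5_decoder_out = [sentinel] + span_label + [eos_id]

assert t5_input == [11, 12, 250000, 15]
assert t5_decoder_in == [1, 250000, 13, 14]
assert t5_decoder_out == [250000, 13, 14, 2]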
- labels = t5_decoder_out + ([-1] * padding_length_dec) - labels = np.array(labels, dtype=np.int64) - - # Loss mask - loss_mask = ([1] * num_tokens_dec) + ([0] * padding_length_dec) - loss_mask = np.array(loss_mask, dtype=np.int64) - - return tokens_enc, tokens_dec_in, labels, enc_mask, \ - dec_mask, enc_dec_mask, loss_mask - - -def make_attention_mask(source_block, target_block): - """ - Returns a 2-dimensional (2-D) attention mask - :param source_block: 1-D array - :param target_block: 1-D array - """ - mask = (target_block[None, :] >= 1) * (source_block[:, None] >= 1) - mask = mask.astype(np.int64) - # (source_length, target_length) - return mask - - -def make_attention_mask_3d(source_block, target_block): - """ - Returns a 3-dimensional (3-D) attention mask - :param source_block: 1-D array - :param target_block: 1-D array - """ - mask = (target_block[:, None, :] >= 1) * (source_block[:, :, None] >= 1) - # (batch, source_length, target_length) - # mask = mask.astype(np.int64) - return mask - - -def make_history_mask(block): - length = block.shape[0] - arange = np.arange(length) - history_mask = (arange[None, ] <= arange[:, None]) - history_mask = history_mask.astype(np.int64) - return history_mask - - -def make_history_mask_3d(block): - batch, length = block.shape - arange = torch.arange(length, device=block.device) - history_mask = (arange[None, ] <= arange[:, None])[None, ] - history_mask = history_mask.expand(batch, length, length) - return history_mask diff --git a/megatron/data/test/test_indexed_dataset.py b/megatron/data/test/test_indexed_dataset.py deleted file mode 100644 index 12fec8d819..0000000000 --- a/megatron/data/test/test_indexed_dataset.py +++ /dev/null @@ -1,125 +0,0 @@ -# This file isn't really a formal automated test, it's just a place to -# put some code used during development and manual testing of -# indexed_dataset. 
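# Illustrative aside, not part of the patch: the make_history_mask helper above
# builds a lower-triangular matrix, so decoder position i attends only to
# positions j <= i. A length of 4 is used here purely as an example.
import numpy as np

arange = np.arange(4)
history_mask = (arange[None, :] <= arange[:, None]).astype(np.int64)
assert history_mask.tolist() == [[1, 0, 0, 0],
                                 [1, 1, 0, 0],
                                 [1, 1, 1, 0],
                                 [1, 1, 1, 1]]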
- -from megatron.data import indexed_dataset -from megatron.tokenizer import build_tokenizer -import argparse -import os -import sys - -import torch - -script_dir = os.path.dirname(os.path.realpath(__file__)) -sys.path.append(os.path.join(script_dir, "../../../")) - - -def test_indexed_dataset(args): - ds = indexed_dataset.make_dataset(args.data, args.dataset_impl) - tokenizer = build_tokenizer(args) - print(len(ds.doc_idx)) - print(len(ds)) - print(ds.doc_idx[-1]) - if ds.supports_prefetch: - # just prefetch the whole thing in test (so assume it is small) - ds.prefetch(range(len(ds))) - if args.count > len(ds.doc_idx) - 1: - args.count = len(ds.doc_idx) - 1 - - for i in range(args.count): - start = ds.doc_idx[i] - end = ds.doc_idx[i + 1] - ids = ds[start:end] - print(f"Document {i}:") - print("--------------") - for s in ids: - assert len(s) > 0 - l = s.data.tolist() - text = tokenizer.detokenize(l) - print(text) - print("---") - - -def test_indexed_dataset_get(args): - ds = indexed_dataset.make_dataset(args.data, args.dataset_impl) - tokenizer = build_tokenizer(args) - size = ds.sizes[0] - print(f"size: {size}") - full = ds.get(0) - print(full) - # print(tokenizer.detokenize(full.data.tolist())) - print("---") - end = ds.get(0, offset=size - 10) - print(end) - # print(tokenizer.detokenize(end.data.tolist())) - - start = ds.get(0, length=10) - print(start) - # print(tokenizer.detokenize(start.data.tolist())) - - part = ds.get(0, offset=2, length=8) - print(part) - # print(tokenizer.detokenize(part.data.tolist())) - -# def test_albert_dataset(args): -# # tokenizer = FullBertTokenizer(args.vocab, do_lower_case=True) -# # idataset = indexed_dataset.make_dataset(args.data, args.dataset_impl) -# # ds = AlbertDataset(idataset, tokenizer) -# ds = AlbertDataset.from_paths(args.vocab, args.data, args.dataset_impl, -# args.epochs, args.max_num_samples, -# args.masked_lm_prob, args.seq_length, -# args.short_seq_prob, args.seed) -# truncated = 0 -# total = 0 -# for i, s in enumerate(ds): -# ids = s['text'] -# tokens = ds.tokenizer.convert_ids_to_tokens(ids) -# print(tokens) -# if i >= args.count-1: -# exit() - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument('--data', type=str, help='prefix to data files') - parser.add_argument('--dataset-impl', type=str, default='infer', - choices=['lazy', 'cached', 'mmap', 'infer']) - parser.add_argument('--count', type=int, default=10, - help='Number of samples/documents to print') - - group = parser.add_argument_group(title='tokenizer') - group.add_argument('--tokenizer-type', type=str, required=True, - choices=['BertWordPieceLowerCase', - 'GPT2BPETokenizer'], - help='What type of tokenizer to use.') - group.add_argument('--vocab-file', type=str, default=None, - help='Path to the vocab file') - group.add_argument('--merge-file', type=str, default=None, - help='Path to the BPE merge file (if necessary).') - - parser.add_argument('--epochs', type=int, default=5, - help='Number of epochs to plan for') - parser.add_argument('--max-num-samples', type=int, default=None, - help='Maximum number of samples to plan for') - parser.add_argument('--masked-lm-prob', type=float, default=0.15, - help='probability of masking tokens') - parser.add_argument('--seq-length', type=int, default=512, - help='maximum sequence length') - parser.add_argument('--short-seq-prob', type=float, default=0.1, - help='probability of creating a short sequence') - parser.add_argument('--seed', type=int, default=1234, - help='random seed') - args = parser.parse_args() - 
args.rank = 0 - args.make_vocab_size_divisible_by = 128 - args.tensor_model_parallel_size = 1 - - if args.dataset_impl == "infer": - args.dataset_impl = indexed_dataset.infer_dataset_impl(args.data) - -# test_albert_dataset(args) - test_indexed_dataset_get(args) - - -if __name__ == "__main__": - main() diff --git a/megatron/data/test/test_preprocess_data.sh b/megatron/data/test/test_preprocess_data.sh deleted file mode 100755 index d121c85958..0000000000 --- a/megatron/data/test/test_preprocess_data.sh +++ /dev/null @@ -1,10 +0,0 @@ -#!/bin/bash - -IMPL=cached -python ../preprocess_data.py \ - --input test_samples.json \ - --vocab vocab.txt \ - --dataset-impl ${IMPL} \ - --output-prefix test_samples_${IMPL} \ - --workers 1 \ - --log-interval 2 diff --git a/megatron/fused_kernels/scaled_masked_softmax.cpp b/megatron/fused_kernels/scaled_masked_softmax.cpp deleted file mode 100644 index 4c8a8c2ee3..0000000000 --- a/megatron/fused_kernels/scaled_masked_softmax.cpp +++ /dev/null @@ -1,83 +0,0 @@ -/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. */ - -#include -#include -#include - -namespace multihead_attn { -namespace fused_softmax { -namespace scaled_masked_softmax { - -torch::Tensor fwd_cuda( - torch::Tensor const& input, - torch::Tensor const& mask, - float scale_factor); - -torch::Tensor bwd_cuda( - torch::Tensor const& output_grads, - torch::Tensor const& softmax_results, - float scale_factor); - -int get_batch_per_block_cuda( - int query_seq_len, - int key_seq_len, - int batches, - int attn_heads); - -torch::Tensor fwd( - torch::Tensor const& input, - torch::Tensor const& mask, - float scale_factor) { - AT_ASSERTM(input.dim() == 4, "expected 4D tensor"); - AT_ASSERTM((input.scalar_type() == at::ScalarType::Half) || - (input.scalar_type() == at::ScalarType::BFloat16), - "Only fp16 and bf16 are supported"); - AT_ASSERTM(mask.dim() == 4, "expected 4D tensor"); - - return fwd_cuda(input, mask, scale_factor); -} - -torch::Tensor bwd( - torch::Tensor const& output_grads, - torch::Tensor const& softmax_results, - float scale_factor) { - - AT_ASSERTM(output_grads.dim() == 4, "expected 3D tensor"); - AT_ASSERTM(softmax_results.dim() == 4, "expected 3D tensor"); - - AT_ASSERTM((output_grads.scalar_type() == at::ScalarType::Half) || - (output_grads.scalar_type() == at::ScalarType::BFloat16), - "Only fp16 and bf16 are supported"); - AT_ASSERTM((softmax_results.scalar_type() == at::ScalarType::Half) || - (softmax_results.scalar_type() == at::ScalarType::BFloat16), - "Only fp16 and bf16 are supported"); - - return bwd_cuda(output_grads, softmax_results, scale_factor); -} - -int get_batch_per_block( - int query_seq_len, - int key_seq_len, - int batches, - int attn_heads) { - return get_batch_per_block_cuda(query_seq_len, key_seq_len, batches, attn_heads); -} - -} // end namespace scaled_masked_softmax -} // end namespace fused_softmax -} // end namespace multihead_attn - -PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { - m.def("forward", - &multihead_attn::fused_softmax::scaled_masked_softmax::fwd, - "Self Multihead Attention scaled, time masked softmax -- Forward."); - - m.def("backward", - &multihead_attn::fused_softmax::scaled_masked_softmax::bwd, - "Self Multihead Attention scaled, time masked softmax -- Backward."); - - m.def("get_batch_per_block", - &multihead_attn::fused_softmax::scaled_masked_softmax::get_batch_per_block, - "Return Batch per block size." 
- ); -} diff --git a/megatron/fused_kernels/scaled_masked_softmax.h b/megatron/fused_kernels/scaled_masked_softmax.h deleted file mode 100644 index 21ebbd5228..0000000000 --- a/megatron/fused_kernels/scaled_masked_softmax.h +++ /dev/null @@ -1,710 +0,0 @@ -/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. */ - -#pragma once - -#include -#include -#include -#include -#include -#include -#include - -namespace { - -template -__device__ __inline__ void copy_vector(Datatype *dst, const Datatype *src); - -template <> -__device__ __inline__ void copy_vector(c10::BFloat16 *dst, const c10::BFloat16 *src) { *dst = *src; } - -template <> -__device__ __inline__ void copy_vector(c10::BFloat16 *dst, const c10::BFloat16 *src) { *((float2*) dst) = *((float2*) src); } - -template <> -__device__ __inline__ void copy_vector(c10::Half *dst, const c10::Half *src) { *dst = *src; } - -template <> -__device__ __inline__ void copy_vector(c10::Half *dst, const c10::Half *src) { *((float2*) dst) = *((float2*) src); } - -template <> -__device__ __inline__ void copy_vector(uint8_t *dst, const uint8_t *src) { *dst = *src; } - -template <> -__device__ __inline__ void copy_vector(uint8_t *dst, const uint8_t *src) {*((half2*) dst) = *((half2*) src); } - -int log2_ceil(int value) { - int log2_value = 0; - while ((1 << log2_value) < value) ++log2_value; - return log2_value; -} - -template -struct Add { - __device__ __forceinline__ T operator()(T a, T b) const { - return a + b; - } -}; - -template -struct Max { - __device__ __forceinline__ T operator()(T a, T b) const { - return a < b ? b : a; - } -}; - -template -__device__ __forceinline__ T WARP_SHFL_XOR_NATIVE(T value, int laneMask, int width = warpSize, unsigned int mask = 0xffffffff) -{ -#if CUDA_VERSION >= 9000 - return __shfl_xor_sync(mask, value, laneMask, width); -#else - return __shfl_xor(value, laneMask, width); -#endif -} - -template class ReduceOp> -__device__ __forceinline__ void warp_reduce(acc_t* sum) { - ReduceOp r; - #pragma unroll - for (int offset = WARP_SIZE / 2; offset > 0; offset /= 2) { - #pragma unroll - for (int i = 0; i < WARP_BATCH; ++i) { - acc_t b = WARP_SHFL_XOR_NATIVE(sum[i], offset, WARP_SIZE); - sum[i] = r(sum[i], b); - } - } -} - - -/* - * Extended softmax (from native aten pytorch) with following additional features - * 1) input scaling - */ -template -__global__ void scaled_softmax_warp_forward( - output_t *dst, - const input_t *src, - const acc_t scale, - int micro_batch_size, - int element_count) -{ - // WARP_SIZE and WARP_BATCH must match the return values batches_per_warp and - // warp_size of method warp_softmax_forward_kernel. - constexpr int next_power_of_two = 1 << log2_elements; - constexpr int WARP_SIZE = (next_power_of_two < C10_WARP_SIZE) ? next_power_of_two : C10_WARP_SIZE; - constexpr int WARP_ITERATIONS = next_power_of_two / WARP_SIZE; - constexpr int WARP_BATCH = (next_power_of_two <= 128) ? 2 : 1; - constexpr int ELEMENTS_PER_LDG_STG = (WARP_ITERATIONS < 4) ? 1 : 4; - - // blockDim/threadIdx = (WARP_SIZE, WARPS_PER_BLOCK, ) - // gridDim/blockIdx = (seq_len, attn_heads, batches) - int first_batch = (blockDim.y * (blockIdx.x + gridDim.x * (blockIdx.y + gridDim.y * blockIdx.z))+ threadIdx.y) * WARP_BATCH; - - // micro_batch_size might not be a multiple of WARP_BATCH. Check how - // many batches have to computed within this WARP. - int local_batches = micro_batch_size - first_batch; - if (local_batches > WARP_BATCH) - local_batches = WARP_BATCH; - - // there might be multiple batches per warp. 
compute the index within the batch - int local_idx = threadIdx.x; - - src += first_batch * element_count + ELEMENTS_PER_LDG_STG * local_idx; - dst += first_batch * element_count + ELEMENTS_PER_LDG_STG * local_idx; - - // load data from global memory - acc_t elements[WARP_BATCH][WARP_ITERATIONS]; - input_t temp_data[ELEMENTS_PER_LDG_STG]; - #pragma unroll - for (int i = 0; i < WARP_BATCH; ++i) { - int batch_element_count = (i >= local_batches) ? 0 : element_count; - - #pragma unroll - for (int it = 0; it < WARP_ITERATIONS; it+=ELEMENTS_PER_LDG_STG) { - int element_index = ELEMENTS_PER_LDG_STG * local_idx + it * WARP_SIZE; - - if (element_index < batch_element_count) { - int itr_idx = i*element_count+it*WARP_SIZE; - copy_vector(temp_data, src + itr_idx); - - #pragma unroll - for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) { - elements[i][it + element] = (acc_t)temp_data[element] * scale; - } - } else { - #pragma unroll - for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) { - elements[i][it + element] = -std::numeric_limits::infinity(); - } - } - } - } - - // compute max_value - acc_t max_value[WARP_BATCH]; - #pragma unroll - for (int i = 0; i < WARP_BATCH; ++i) { - max_value[i] = elements[i][0]; - #pragma unroll - for (int it = 1; it < WARP_ITERATIONS; ++it) { - max_value[i] = (max_value[i] > elements[i][it]) ? max_value[i] : elements[i][it]; - } - } - warp_reduce(max_value); - - acc_t sum[WARP_BATCH] { 0.0f }; - #pragma unroll - for (int i = 0; i < WARP_BATCH; ++i) { - #pragma unroll - for (int it = 0; it < WARP_ITERATIONS; ++it) { - elements[i][it] = std::exp((elements[i][it] - max_value[i])); - sum[i] += elements[i][it]; - } - } - warp_reduce(sum); - - // store result - output_t out[ELEMENTS_PER_LDG_STG]; - #pragma unroll - for (int i = 0; i < WARP_BATCH; ++i) { - if (i >= local_batches) - break; - #pragma unroll - for (int it = 0; it < WARP_ITERATIONS; it+=ELEMENTS_PER_LDG_STG) { - int element_index = ELEMENTS_PER_LDG_STG * local_idx + it * WARP_SIZE; - if (element_index < element_count) { - #pragma unroll - for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) { - out[element] = elements[i][it + element] / sum[i]; - } - copy_vector(dst + i * element_count + it * WARP_SIZE, out); - } else { - break; - } - } - } -} - - -/* - * Extended softmax (from native aten pytorch) with following additional features - * 1) input scaling - * 2) Explicit masking - */ -template -__global__ void scaled_masked_softmax_warp_forward( - output_t *dst, - const input_t *src, - const uint8_t *mask, - const acc_t scale, - int micro_batch_size, - int element_count, - int pad_batches) -{ - // WARP_SIZE and WARP_BATCH must match the return values batches_per_warp and - // warp_size of method warp_softmax_forward_kernel. - constexpr int next_power_of_two = 1 << log2_elements; - constexpr int WARP_SIZE = (next_power_of_two < C10_WARP_SIZE) ? next_power_of_two : C10_WARP_SIZE; - constexpr int WARP_ITERATIONS = next_power_of_two / WARP_SIZE; - constexpr int WARP_BATCH = (next_power_of_two <= 128) ? 2 : 1; - constexpr int ELEMENTS_PER_LDG_STG = (WARP_ITERATIONS < 4) ? 
1 : 4; - - // blockDim/threadIdx = (WARP_SIZE, WARPS_PER_BLOCK, ) - // gridDim/blockIdx = (seq_len, attn_heads, batches) - int first_batch = (blockDim.y * (blockIdx.x + gridDim.x * (blockIdx.y + gridDim.y * blockIdx.z))+ threadIdx.y) * WARP_BATCH; - int pad_first_batch = 0; - if (pad_batches != 1) { // bert style - pad_first_batch = (blockDim.y * (blockIdx.x + gridDim.x * blockIdx.z) + threadIdx.y) * WARP_BATCH; - } else { // gpt2 style - pad_first_batch = (blockDim.y * blockIdx.x + threadIdx.y) * WARP_BATCH; - } - - // micro_batch_size might not be a multiple of WARP_BATCH. Check how - // many batches have to computed within this WARP. - int local_batches = micro_batch_size - first_batch; - if (local_batches > WARP_BATCH) - local_batches = WARP_BATCH; - - // there might be multiple batches per warp. compute the index within the batch - int local_idx = threadIdx.x; - - src += first_batch * element_count + ELEMENTS_PER_LDG_STG * local_idx; - dst += first_batch * element_count + ELEMENTS_PER_LDG_STG * local_idx; - mask += pad_first_batch * element_count + ELEMENTS_PER_LDG_STG * local_idx; - - // load data from global memory - acc_t elements[WARP_BATCH][WARP_ITERATIONS]; - input_t temp_data[ELEMENTS_PER_LDG_STG]; - uint8_t temp_mask[ELEMENTS_PER_LDG_STG]; - #pragma unroll - for (int i = 0; i < WARP_BATCH; ++i) { - int batch_element_count = (i >= local_batches) ? 0 : element_count; - - #pragma unroll - for (int it = 0; it < WARP_ITERATIONS; it+=ELEMENTS_PER_LDG_STG) { - int element_index = ELEMENTS_PER_LDG_STG * local_idx + it * WARP_SIZE; - - if (element_index < batch_element_count) { - int itr_idx = i*element_count+it*WARP_SIZE; - copy_vector(temp_data, src + itr_idx); - copy_vector(temp_mask, mask + itr_idx); - - #pragma unroll - for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) { - if (temp_mask[element] != 1) { - elements[i][it + element] = (acc_t)temp_data[element] * scale; - } else { - elements[i][it + element] = -10000.0; - } - } - } else { - #pragma unroll - for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) { - elements[i][it + element] = -std::numeric_limits::infinity(); - } - } - } - } - - // compute max_value - acc_t max_value[WARP_BATCH]; - #pragma unroll - for (int i = 0; i < WARP_BATCH; ++i) { - max_value[i] = elements[i][0]; - #pragma unroll - for (int it = 1; it < WARP_ITERATIONS; ++it) { - max_value[i] = (max_value[i] > elements[i][it]) ? max_value[i] : elements[i][it]; - } - } - warp_reduce(max_value); - - // compute scale value to account for full mask - acc_t scale_value[WARP_BATCH]; - #pragma unroll - for (int i = 0; i < WARP_BATCH; ++i) { - scale_value[i] = (max_value[i] == -10000.0) ? 
0.0 : 1.0; - } - - acc_t sum[WARP_BATCH] { 0.0f }; - #pragma unroll - for (int i = 0; i < WARP_BATCH; ++i) { - #pragma unroll - for (int it = 0; it < WARP_ITERATIONS; ++it) { - elements[i][it] = std::exp((elements[i][it] - max_value[i])); - sum[i] += elements[i][it]; - } - } - warp_reduce(sum); - - // store result - output_t out[ELEMENTS_PER_LDG_STG]; - #pragma unroll - for (int i = 0; i < WARP_BATCH; ++i) { - if (i >= local_batches) - break; - #pragma unroll - for (int it = 0; it < WARP_ITERATIONS; it+=ELEMENTS_PER_LDG_STG) { - int element_index = ELEMENTS_PER_LDG_STG * local_idx + it * WARP_SIZE; - if (element_index < element_count) { - #pragma unroll - for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) { - out[element] = elements[i][it + element] * scale_value[i] / sum[i]; - } - copy_vector(dst + i * element_count + it * WARP_SIZE, out); - } else { - break; - } - } - } -} - -template -__global__ void scaled_masked_softmax_warp_backward( - output_t *gradInput, - input_t *grad, - const input_t *output, - acc_t scale, - int micro_batch_size, - int element_count) -{ - // WARP_SIZE and WARP_BATCH must match the return values batches_per_warp and - // warp_size of method warp_softmax_backward_kernel. - constexpr int next_power_of_two = 1 << log2_elements; - constexpr int WARP_SIZE = (next_power_of_two < C10_WARP_SIZE) ? next_power_of_two : C10_WARP_SIZE; - constexpr int WARP_ITERATIONS = next_power_of_two / WARP_SIZE; - constexpr int WARP_BATCH = (next_power_of_two <= 128) ? 2 : 1; - constexpr int ELEMENTS_PER_LDG_STG = (WARP_ITERATIONS < 4) ? 1 : 4; - - // blockDim/threadIdx = (WARP_SIZE, WARPS_PER_BLOCK, ) - // gridDim/blockIdx = (seq_len, attn_heads, batches) - int first_batch = (blockDim.y * blockIdx.x + threadIdx.y) * WARP_BATCH; - - // micro_batch_size might not be a multiple of WARP_BATCH. Check how - // many batches have to computed within this WARP. - int local_batches = micro_batch_size - first_batch; - if (local_batches > WARP_BATCH) - local_batches = WARP_BATCH; - - // there might be multiple batches per warp. compute the index within the batch - int local_idx = threadIdx.x; - - // the first element to process by the current thread - int thread_offset = first_batch * element_count + ELEMENTS_PER_LDG_STG * local_idx; - grad += thread_offset; - output += thread_offset; - gradInput += thread_offset; - - // load data from global memory - acc_t grad_reg[WARP_BATCH][WARP_ITERATIONS] { 0.0f }; - acc_t output_reg[WARP_BATCH][WARP_ITERATIONS] { 0.0f }; - input_t temp_grad[ELEMENTS_PER_LDG_STG]; - input_t temp_output[ELEMENTS_PER_LDG_STG]; - #pragma unroll - for (int i = 0; i < WARP_BATCH; ++i) { - int batch_element_count = (i >= local_batches) ? 
0 : element_count; - - #pragma unroll - for (int it = 0; it < WARP_ITERATIONS; it+=ELEMENTS_PER_LDG_STG) { - int element_index = ELEMENTS_PER_LDG_STG * local_idx + it * WARP_SIZE; - if (element_index < batch_element_count) { - copy_vector(temp_grad, grad + i * element_count + it * WARP_SIZE); - copy_vector(temp_output, output + i * element_count + it * WARP_SIZE); - - #pragma unroll - for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) { - output_reg[i][it + element] = (acc_t)temp_output[element]; - } - #pragma unroll - for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) { - grad_reg[i][it + element] = (acc_t)temp_grad[element] * output_reg[i][it + element]; - } - } - } - } - - acc_t sum[WARP_BATCH]; - #pragma unroll - for (int i = 0; i < WARP_BATCH; ++i) { - sum[i] = grad_reg[i][0]; - #pragma unroll - for (int it = 1; it < WARP_ITERATIONS; ++it) { - sum[i] += grad_reg[i][it]; - } - } - warp_reduce(sum); - - // store result - #pragma unroll - for (int i = 0; i < WARP_BATCH; ++i) { - if (i >= local_batches) - break; - #pragma unroll - for (int it = 0; it < WARP_ITERATIONS; it+=ELEMENTS_PER_LDG_STG) { - int element_index = ELEMENTS_PER_LDG_STG * local_idx + it * WARP_SIZE; - if (element_index < element_count) { - // compute gradients - output_t out[ELEMENTS_PER_LDG_STG]; - #pragma unroll - for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) { - out[element] = (output_t)(scale * (grad_reg[i][it + element] - output_reg[i][it + element] * sum[i])); - } - copy_vector(gradInput + i * element_count + it * WARP_SIZE, out); - } - } - } -} -} // end of anonymous namespace - -int get_batch_per_block(int query_seq_len, int key_seq_len, int batches, int attn_heads){ - int log2_elements = log2_ceil(key_seq_len); - const int next_power_of_two = 1 << log2_elements; - - int warp_size = (next_power_of_two < C10_WARP_SIZE) ? next_power_of_two : C10_WARP_SIZE; - int batches_per_warp = (next_power_of_two <= 128) ? 2 : 1; - - constexpr int threads_per_block = 128; - int warps_per_block = (threads_per_block / warp_size); - int batches_per_block = warps_per_block * batches_per_warp; - - return batches_per_block; -} - -template -void dispatch_scaled_softmax_forward( - output_t *dst, - const input_t *src, - const input_t scale, - int query_seq_len, - int key_seq_len, - int batches, - int attn_heads) -{ - TORCH_INTERNAL_ASSERT(key_seq_len >= 0 && key_seq_len <= 4096 ); - if (key_seq_len == 0) { - return; - } else { - int log2_elements = log2_ceil(key_seq_len); - const int next_power_of_two = 1 << log2_elements; - int batch_count = batches * attn_heads * query_seq_len; - - // This value must match the WARP_SIZE constexpr value computed inside softmax_warp_forward. - int warp_size = (next_power_of_two < C10_WARP_SIZE) ? next_power_of_two : C10_WARP_SIZE; - - // This value must match the WARP_BATCH constexpr value computed inside softmax_warp_forward. - int batches_per_warp = (next_power_of_two <= 128) ? 
2 : 1; - - // use 128 threads per block to maximimize gpu utilization - constexpr int threads_per_block = 128; - - int warps_per_block = (threads_per_block / warp_size); - int batches_per_block = warps_per_block * batches_per_warp; - TORCH_INTERNAL_ASSERT(query_seq_len%batches_per_block == 0); - dim3 blocks(query_seq_len/batches_per_block, attn_heads, batches); - dim3 threads(warp_size, warps_per_block, 1); - // Launch code would be more elegant if C++ supported FOR CONSTEXPR - switch (log2_elements) { - case 0: // 1 - scaled_softmax_warp_forward - <<>>(dst, src, scale, batch_count, key_seq_len); - break; - case 1: // 2 - scaled_softmax_warp_forward - <<>>(dst, src, scale, batch_count, key_seq_len); - break; - case 2: // 4 - scaled_softmax_warp_forward - <<>>(dst, src, scale, batch_count, key_seq_len); - break; - case 3: // 8 - scaled_softmax_warp_forward - <<>>(dst, src, scale, batch_count, key_seq_len); - break; - case 4: // 16 - scaled_softmax_warp_forward - <<>>(dst, src, scale, batch_count, key_seq_len); - break; - case 5: // 32 - scaled_softmax_warp_forward - <<>>(dst, src, scale, batch_count, key_seq_len); - break; - case 6: // 64 - scaled_softmax_warp_forward - <<>>(dst, src, scale, batch_count, key_seq_len); - break; - case 7: // 128 - scaled_softmax_warp_forward - <<>>(dst, src, scale, batch_count, key_seq_len); - break; - case 8: // 256 - scaled_softmax_warp_forward - <<>>(dst, src, scale, batch_count, key_seq_len); - break; - case 9: // 512 - scaled_softmax_warp_forward - <<>>(dst, src, scale, batch_count, key_seq_len); - break; - case 10: // 1024 - scaled_softmax_warp_forward - <<>>(dst, src, scale, batch_count, key_seq_len); - break; - case 11: // 2048 - scaled_softmax_warp_forward - <<>>(dst, src, scale, batch_count, key_seq_len); - break; - case 12: // 4096 - scaled_softmax_warp_forward - <<>>(dst, src, scale, batch_count, key_seq_len); - break; - default: - break; - } - } -} - -template -void dispatch_scaled_masked_softmax_forward( - output_t *dst, - const input_t *src, - const uint8_t *mask, - const input_t scale, - int query_seq_len, - int key_seq_len, - int batches, - int attn_heads, - int pad_batches) -{ - TORCH_INTERNAL_ASSERT(key_seq_len >= 0 && key_seq_len <= 4096 ); - if (key_seq_len == 0) { - return; - } else { - int log2_elements = log2_ceil(key_seq_len); - const int next_power_of_two = 1 << log2_elements; - int batch_count = batches * attn_heads * query_seq_len; - - // This value must match the WARP_SIZE constexpr value computed inside softmax_warp_forward. - int warp_size = (next_power_of_two < C10_WARP_SIZE) ? next_power_of_two : C10_WARP_SIZE; - - // This value must match the WARP_BATCH constexpr value computed inside softmax_warp_forward. - int batches_per_warp = (next_power_of_two <= 128) ? 
2 : 1; - - // use 128 threads per block to maximimize gpu utilization - constexpr int threads_per_block = 128; - - int warps_per_block = (threads_per_block / warp_size); - int batches_per_block = warps_per_block * batches_per_warp; - TORCH_INTERNAL_ASSERT(query_seq_len%batches_per_block == 0); - dim3 blocks(query_seq_len/batches_per_block, attn_heads, batches); - dim3 threads(warp_size, warps_per_block, 1); - // Launch code would be more elegant if C++ supported FOR CONSTEXPR - switch (log2_elements) { - case 0: // 1 - scaled_masked_softmax_warp_forward - <<>>(dst, src, mask, scale, batch_count, key_seq_len, pad_batches); - break; - case 1: // 2 - scaled_masked_softmax_warp_forward - <<>>(dst, src, mask, scale, batch_count, key_seq_len, pad_batches); - break; - case 2: // 4 - scaled_masked_softmax_warp_forward - <<>>(dst, src, mask, scale, batch_count, key_seq_len, pad_batches); - break; - case 3: // 8 - scaled_masked_softmax_warp_forward - <<>>(dst, src, mask, scale, batch_count, key_seq_len, pad_batches); - break; - case 4: // 16 - scaled_masked_softmax_warp_forward - <<>>(dst, src, mask, scale, batch_count, key_seq_len, pad_batches); - break; - case 5: // 32 - scaled_masked_softmax_warp_forward - <<>>(dst, src, mask, scale, batch_count, key_seq_len, pad_batches); - break; - case 6: // 64 - scaled_masked_softmax_warp_forward - <<>>(dst, src, mask, scale, batch_count, key_seq_len, pad_batches); - break; - case 7: // 128 - scaled_masked_softmax_warp_forward - <<>>(dst, src, mask, scale, batch_count, key_seq_len, pad_batches); - break; - case 8: // 256 - scaled_masked_softmax_warp_forward - <<>>(dst, src, mask, scale, batch_count, key_seq_len, pad_batches); - break; - case 9: // 512 - scaled_masked_softmax_warp_forward - <<>>(dst, src, mask, scale, batch_count, key_seq_len, pad_batches); - break; - case 10: // 1024 - scaled_masked_softmax_warp_forward - <<>>(dst, src, mask, scale, batch_count, key_seq_len, pad_batches); - break; - case 11: // 2048 - scaled_masked_softmax_warp_forward - <<>>(dst, src, mask, scale, batch_count, key_seq_len, pad_batches); - break; - case 12: // 4096 - scaled_masked_softmax_warp_forward - <<>>(dst, src, mask, scale, batch_count, key_seq_len, pad_batches); - break; - default: - break; - } - } -} - -template -void dispatch_scaled_masked_softmax_backward( - output_t *grad_input, - input_t *grad, - const input_t *output, - const acc_t scale, - int query_seq_len, - int key_seq_len, - int batches, - int attn_heads) -{ - TORCH_INTERNAL_ASSERT( key_seq_len >= 0 && key_seq_len <= 4096 ); - if (key_seq_len == 0) { - return; - } else { - int log2_elements = log2_ceil(key_seq_len); - const int next_power_of_two = 1 << log2_elements; - int batch_count = batches * attn_heads * query_seq_len; - - // This value must match the WARP_SIZE constexpr value computed inside softmax_warp_backward. - int warp_size = (next_power_of_two < C10_WARP_SIZE) ? next_power_of_two : C10_WARP_SIZE; - - // This value must match the WARP_BATCH constexpr value computed inside softmax_warp_backward. - int batches_per_warp = (next_power_of_two <= 128) ? 
2 : 1; - - // use 128 threads per block to maximimize gpu utilization - constexpr int threads_per_block = 128; - - int warps_per_block = (threads_per_block / warp_size); - int batches_per_block = warps_per_block * batches_per_warp; - int blocks = batch_count/batches_per_block; - dim3 threads(warp_size, warps_per_block, 1); - // Launch code would be more elegant if C++ supported FOR CONSTEXPR - switch (log2_elements) { - case 0: // 1 - scaled_masked_softmax_warp_backward - <<>>(grad_input, grad, output, scale, batch_count, key_seq_len); - break; - case 1: // 2 - scaled_masked_softmax_warp_backward - <<>>(grad_input, grad, output, scale, batch_count, key_seq_len); - break; - case 2: // 4 - scaled_masked_softmax_warp_backward - <<>>(grad_input, grad, output, scale, batch_count, key_seq_len); - break; - case 3: // 8 - scaled_masked_softmax_warp_backward - <<>>(grad_input, grad, output, scale, batch_count, key_seq_len); - break; - case 4: // 16 - scaled_masked_softmax_warp_backward - <<>>(grad_input, grad, output, scale, batch_count, key_seq_len); - break; - case 5: // 32 - scaled_masked_softmax_warp_backward - <<>>(grad_input, grad, output, scale, batch_count, key_seq_len); - break; - case 6: // 64 - scaled_masked_softmax_warp_backward - <<>>(grad_input, grad, output, scale, batch_count, key_seq_len); - break; - case 7: // 128 - scaled_masked_softmax_warp_backward - <<>>(grad_input, grad, output, scale, batch_count, key_seq_len); - break; - case 8: // 256 - scaled_masked_softmax_warp_backward - <<>>(grad_input, grad, output, scale, batch_count, key_seq_len); - break; - case 9: // 512 - scaled_masked_softmax_warp_backward - <<>>(grad_input, grad, output, scale, batch_count, key_seq_len); - break; - case 10: // 1024 - scaled_masked_softmax_warp_backward - <<>>(grad_input, grad, output, scale, batch_count, key_seq_len); - break; - case 11: // 2048 - scaled_masked_softmax_warp_backward - <<>>(grad_input, grad, output, scale, batch_count, key_seq_len); - break; - case 12: // 4096 - scaled_masked_softmax_warp_backward - <<>>(grad_input, grad, output, scale, batch_count, key_seq_len); - break; - - default: - break; - } - } -} diff --git a/megatron/fused_kernels/scaled_masked_softmax_cuda.cu b/megatron/fused_kernels/scaled_masked_softmax_cuda.cu deleted file mode 100644 index a8be57c052..0000000000 --- a/megatron/fused_kernels/scaled_masked_softmax_cuda.cu +++ /dev/null @@ -1,107 +0,0 @@ -/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
*/ - -#include -#include -#include -#include -#include -#include -#include -#include "scaled_masked_softmax.h" -#include "type_shim.h" - -namespace multihead_attn { -namespace fused_softmax { -namespace scaled_masked_softmax { - -int get_batch_per_block_cuda(int query_seq_len, int key_seq_len, int batches, int attn_heads){ - return get_batch_per_block(query_seq_len, key_seq_len, batches, attn_heads); -} - - -torch::Tensor fwd_cuda( - torch::Tensor const& input, - torch::Tensor const& mask, - float scale_factor) -{ - // input is a 4d tensor with dimensions [batches, attn_heads, seq_len, seq_len] - const int batches = input.size(0); - const int pad_batches = mask.size(0); - const int attn_heads = input.size(1); - const int query_seq_len = input.size(2); - const int key_seq_len = input.size(3); - TORCH_INTERNAL_ASSERT(key_seq_len <= 4096); - TORCH_INTERNAL_ASSERT(query_seq_len > 1); - TORCH_INTERNAL_ASSERT(pad_batches == 1 || pad_batches == batches); - TORCH_INTERNAL_ASSERT(mask.size(1) == 1); - TORCH_INTERNAL_ASSERT(mask.size(2) == query_seq_len); - TORCH_INTERNAL_ASSERT(mask.size(3) == key_seq_len); - - // Output - auto act_options = input.options().requires_grad(false); - torch::Tensor softmax_results = - torch::empty({batches, attn_heads, query_seq_len, key_seq_len}, act_options); - - // Softmax Intermediate Result Ptr - void* input_ptr = static_cast(input.data_ptr()); - void* mask_ptr = static_cast(mask.data_ptr()); - void* softmax_results_ptr = static_cast(softmax_results.data_ptr()); - - DISPATCH_HALF_AND_BFLOAT( - input.scalar_type(), - "dispatch_scaled_masked_softmax_forward", - dispatch_scaled_masked_softmax_forward( - reinterpret_cast(softmax_results_ptr), - reinterpret_cast(input_ptr), - reinterpret_cast(mask_ptr), - scale_factor, - query_seq_len, - key_seq_len, - batches, - attn_heads, - pad_batches); - ); - return softmax_results; -} - -torch::Tensor bwd_cuda( - torch::Tensor const& output_grads_, - torch::Tensor const& softmax_results_, - float scale_factor) { - - auto output_grads = output_grads_.contiguous(); - auto softmax_results = softmax_results_.contiguous(); - - //output grads is a 4d tensor with dimensions [batches, attn_heads, seq_len, seq_len] - const int batches = output_grads.size(0); - const int attn_heads = output_grads.size(1); - const int query_seq_len = output_grads.size(2); - const int key_seq_len = output_grads.size(3); - - auto act_options = output_grads.options().requires_grad(false); - torch::Tensor input_grads = - torch::empty({batches, attn_heads, query_seq_len, key_seq_len}, act_options); - - void* output_grads_ptr = static_cast(output_grads.data_ptr()); - void* input_grads_ptr = static_cast(input_grads.data_ptr()); - - //Softmax Grad - DISPATCH_HALF_AND_BFLOAT( - output_grads_.scalar_type(), - "dispatch_scaled_masked_softmax_backward", - dispatch_scaled_masked_softmax_backward( - reinterpret_cast(input_grads_ptr), - reinterpret_cast(output_grads_ptr), - reinterpret_cast(softmax_results.data_ptr()), - scale_factor, - query_seq_len, - key_seq_len, - batches, - attn_heads); - ); - - return input_grads; -} -} -} -} diff --git a/megatron/fused_kernels/scaled_softmax.cpp b/megatron/fused_kernels/scaled_softmax.cpp deleted file mode 100644 index e10cd77e7f..0000000000 --- a/megatron/fused_kernels/scaled_softmax.cpp +++ /dev/null @@ -1,61 +0,0 @@ -/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
*/ - -#include -#include -#include - -namespace multihead_attn { -namespace fused_softmax { -namespace scaled_softmax { - -torch::Tensor fwd_cuda( - torch::Tensor const& input, - float scale_factor); - -torch::Tensor bwd_cuda( - torch::Tensor const& output_grads, - torch::Tensor const& softmax_results, - float scale_factor); - -torch::Tensor fwd( - torch::Tensor const& input, - float scale_factor) { - AT_ASSERTM(input.dim() == 4, "expected 4D tensor"); - AT_ASSERTM((input.scalar_type() == at::ScalarType::Half) || - (input.scalar_type() == at::ScalarType::BFloat16), - "Only fp16 and bf16 are supported"); - - return fwd_cuda(input, scale_factor); -} - -torch::Tensor bwd( - torch::Tensor const& output_grads, - torch::Tensor const& softmax_results, - float scale_factor) { - - AT_ASSERTM(output_grads.dim() == 4, "expected 3D tensor"); - AT_ASSERTM(softmax_results.dim() == 4, "expected 3D tensor"); - - AT_ASSERTM((output_grads.scalar_type() == at::ScalarType::Half) || - (output_grads.scalar_type() == at::ScalarType::BFloat16), - "Only fp16 and bf16 are supported"); - AT_ASSERTM((softmax_results.scalar_type() == at::ScalarType::Half) || - (softmax_results.scalar_type() == at::ScalarType::BFloat16), - "Only fp16 and bf16 are supported"); - - return bwd_cuda(output_grads, softmax_results, scale_factor); -} - -} // end namespace scaled_softmax -} // end namespace fused_softmax -} // end namespace multihead_attn - -PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { - m.def("forward", - &multihead_attn::fused_softmax::scaled_softmax::fwd, - "Self Multihead Attention scaled, softmax -- Forward."); - m.def("backward", - &multihead_attn::fused_softmax::scaled_softmax::bwd, - "Self Multihead Attention scaled, softmax -- Backward."); -} - diff --git a/megatron/fused_kernels/scaled_softmax_cuda.cu b/megatron/fused_kernels/scaled_softmax_cuda.cu deleted file mode 100644 index ecc6eb06e8..0000000000 --- a/megatron/fused_kernels/scaled_softmax_cuda.cu +++ /dev/null @@ -1,90 +0,0 @@ -/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
*/ - -#include -#include -#include -#include -#include -#include -#include -#include "scaled_masked_softmax.h" -#include "type_shim.h" - -namespace multihead_attn { -namespace fused_softmax { -namespace scaled_softmax { - -torch::Tensor fwd_cuda( - torch::Tensor const& input, - float scale_factor) -{ - // input is a 4d tensor with dimensions [batches, attn_heads, seq_len, seq_len] - const int batches = input.size(0); - const int attn_heads = input.size(1); - const int query_seq_len = input.size(2); - const int key_seq_len = input.size(3); - TORCH_INTERNAL_ASSERT(key_seq_len <= 4096); - TORCH_INTERNAL_ASSERT(query_seq_len > 1); - - // Output - auto act_options = input.options().requires_grad(false); - torch::Tensor softmax_results = - torch::empty({batches, attn_heads, query_seq_len, key_seq_len}, act_options); - - // Softmax Intermediate Result Ptr - void* input_ptr = static_cast(input.data_ptr()); - void* softmax_results_ptr = static_cast(softmax_results.data_ptr()); - - DISPATCH_HALF_AND_BFLOAT( - input.scalar_type(), - "dispatch_scaled_softmax_forward", - dispatch_scaled_softmax_forward( - reinterpret_cast(softmax_results_ptr), - reinterpret_cast(input_ptr), - scale_factor, - query_seq_len, - key_seq_len, - batches, - attn_heads); - ); - return softmax_results; -} - -torch::Tensor bwd_cuda( - torch::Tensor const& output_grads_, - torch::Tensor const& softmax_results_, - float scale_factor) { - - auto output_grads = output_grads_.contiguous(); - auto softmax_results = softmax_results_.contiguous(); - - //output grads is a 4d tensor with dimensions [batches, attn_heads, seq_len, seq_len] - const int batches = output_grads.size(0); - const int attn_heads = output_grads.size(1); - const int query_seq_len = output_grads.size(2); - const int key_seq_len = output_grads.size(3); - - void* output_grads_ptr = static_cast(output_grads.data_ptr()); - - //Softmax Grad - DISPATCH_HALF_AND_BFLOAT( - output_grads_.scalar_type(), - "dispatch_scaled_masked_softmax_backward", - dispatch_scaled_masked_softmax_backward( - reinterpret_cast(output_grads_ptr), - reinterpret_cast(output_grads_ptr), - reinterpret_cast(softmax_results.data_ptr()), - scale_factor, - query_seq_len, - key_seq_len, - batches, - attn_heads); - ); - - //backward pass is completely in-place - return output_grads; -} -} -} -} - diff --git a/megatron/fused_kernels/scaled_upper_triang_masked_softmax.cpp b/megatron/fused_kernels/scaled_upper_triang_masked_softmax.cpp deleted file mode 100644 index ddfc8646a3..0000000000 --- a/megatron/fused_kernels/scaled_upper_triang_masked_softmax.cpp +++ /dev/null @@ -1,58 +0,0 @@ -/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
*/ - -#include -#include -#include - -namespace multihead_attn { -namespace fused_softmax { -namespace scaled_upper_triang_masked_softmax { - -torch::Tensor fwd_cuda( - torch::Tensor const& input, - float scale_factor); - -torch::Tensor bwd_cuda( - torch::Tensor const& output_grads, - torch::Tensor const& softmax_results, - float scale_factor); - -torch::Tensor fwd(torch::Tensor const& input, float scale_factor) { - AT_ASSERTM(input.dim() == 3, "expected 3D tensor"); - AT_ASSERTM((input.scalar_type() == at::ScalarType::Half) || - (input.scalar_type() == at::ScalarType::BFloat16), - "Only fp16 and bf16 are supported"); - - return fwd_cuda(input, scale_factor); -} - -torch::Tensor bwd( - torch::Tensor const& output_grads, - torch::Tensor const& softmax_results, - float scale_factor) { - - AT_ASSERTM(output_grads.dim() == 3, "expected 3D tensor"); - AT_ASSERTM(softmax_results.dim() == 3, "expected 3D tensor"); - - AT_ASSERTM((output_grads.scalar_type() == at::ScalarType::Half) || - (output_grads.scalar_type() == at::ScalarType::BFloat16), - "Only fp16 and bf16 are supported"); - AT_ASSERTM((softmax_results.scalar_type() == at::ScalarType::Half) || - (softmax_results.scalar_type() == at::ScalarType::BFloat16), - "Only fp16 and bf16 are supported"); - - return bwd_cuda(output_grads, softmax_results, scale_factor); -} - -} // end namespace scaled_upper_triang_masked_softmax -} // end namespace fused_softmax -} // end namespace multihead_attn - -PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { - m.def("forward", - &multihead_attn::fused_softmax::scaled_upper_triang_masked_softmax::fwd, - "Self Multihead Attention scaled, time masked softmax -- Forward."); - m.def("backward", - &multihead_attn::fused_softmax::scaled_upper_triang_masked_softmax::bwd, - "Self Multihead Attention scaled, time masked softmax -- Backward."); -} diff --git a/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h b/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h deleted file mode 100644 index 5711f0fbf4..0000000000 --- a/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h +++ /dev/null @@ -1,524 +0,0 @@ -/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
*/ - -#pragma once - -#include -#include -#include -#include -#include -#include - -namespace { - -template -__device__ __inline__ void copy_vector(Datatype *dst, const Datatype *src); - -template <> -__device__ __inline__ void copy_vector(c10::BFloat16 *dst, const c10::BFloat16 *src) { *dst = *src; } - -template <> -__device__ __inline__ void copy_vector(c10::BFloat16 *dst, const c10::BFloat16 *src) { *((float2*) dst) = *((float2*) src); } - -template <> -__device__ __inline__ void copy_vector(c10::Half *dst, const c10::Half *src) { *dst = *src; } - -template <> -__device__ __inline__ void copy_vector(c10::Half *dst, const c10::Half *src) { *((float2*) dst) = *((float2*) src); } - -template <> -__device__ __inline__ void copy_vector(uint8_t *dst, const uint8_t *src) { *dst = *src; } - -template <> -__device__ __inline__ void copy_vector(uint8_t *dst, const uint8_t *src) {*((half2*) dst) = *((half2*) src); } - -template -__device__ __inline__ void copy_zero_vector(Datatype *dst); - -template <> -__device__ __inline__ void copy_zero_vector(c10::BFloat16 *dst) { *dst = 0.0; } - -template <> -__device__ __inline__ void copy_zero_vector(c10::BFloat16 *dst) { *((float2*) dst) = make_float2(0.0f, 0.0f); } - -template <> -__device__ __inline__ void copy_zero_vector(c10::Half *dst) { *dst = 0.0; } - -template <> -__device__ __inline__ void copy_zero_vector(c10::Half *dst) { *((float2*) dst) = make_float2(0.0f, 0.0f); } - - -int log2_ceil(int value) { - int log2_value = 0; - while ((1 << log2_value) < value) ++log2_value; - return log2_value; -} - -template -struct Add { - __device__ __forceinline__ T operator()(T a, T b) const { - return a + b; - } -}; - -template -struct Max { - __device__ __forceinline__ T operator()(T a, T b) const { - return a < b ? b : a; - } -}; - -template -__device__ __forceinline__ T WARP_SHFL_XOR_NATIVE(T value, int laneMask, int width = warpSize, unsigned int mask = 0xffffffff) -{ -#if CUDA_VERSION >= 9000 - return __shfl_xor_sync(mask, value, laneMask, width); -#else - return __shfl_xor(value, laneMask, width); -#endif -} - -template class ReduceOp> -__device__ __forceinline__ void warp_reduce(acc_t* sum) { - ReduceOp r; - #pragma unroll - for (int offset = WARP_SIZE / 2; offset > 0; offset /= 2) { - #pragma unroll - for (int i = 0; i < WARP_BATCH; ++i) { - acc_t b = WARP_SHFL_XOR_NATIVE(sum[i], offset, WARP_SIZE); - sum[i] = r(sum[i], b); - } - } -} - -/* - * Extended softmax (from native aten pytorch) with following additional features - * 1) input scaling - * 2) Implicit time (diagonal masking) - */ -template -__global__ void scaled_upper_triang_masked_softmax_warp_forward( - output_t *dst, - const input_t *src, - const acc_t scale, - int micro_batch_size, - int stride, - int element_count) -{ - // WARP_SIZE and WARP_BATCH must match the return values batches_per_warp and - // warp_size of method warp_softmax_forward_kernel. - constexpr int next_power_of_two = 1 << log2_elements; - constexpr int WARP_SIZE = (next_power_of_two < C10_WARP_SIZE) ? next_power_of_two : C10_WARP_SIZE; - constexpr int WARP_ITERATIONS = next_power_of_two / WARP_SIZE; - constexpr int WARP_BATCH = (next_power_of_two <= 128) ? 2 : 1; - constexpr int ELEMENTS_PER_LDG_STG = (WARP_ITERATIONS < 4) ? 1 : 4; - - int first_batch = (blockDim.y * blockIdx.y + threadIdx.y) * gridDim.x * WARP_BATCH + blockIdx.x; - int local_seq = blockIdx.x + 1; - int warp_iteration_limit = (local_seq + ELEMENTS_PER_LDG_STG * WARP_SIZE - 1)/ WARP_SIZE; - - // micro_batch_size might not be a multiple of WARP_BATCH. 
Check how - // many batches have to computed within this WARP. - int local_batches = micro_batch_size - first_batch; - if (local_batches > WARP_BATCH) - local_batches = WARP_BATCH; - - // there might be multiple batches per warp. compute the index within the batch - int local_idx = threadIdx.x; - - src += first_batch * stride + ELEMENTS_PER_LDG_STG * local_idx; - dst += first_batch * stride + ELEMENTS_PER_LDG_STG * local_idx; - - // load data from global memory - acc_t elements[WARP_BATCH][WARP_ITERATIONS]; - input_t temp_data[ELEMENTS_PER_LDG_STG]; - #pragma unroll - for (int i = 0; i < WARP_BATCH; ++i) { - int batch_element_count = (i >= local_batches) ? 0 : local_seq; - - #pragma unroll - for (int it = 0; it < WARP_ITERATIONS; it+=ELEMENTS_PER_LDG_STG) { - int element_index = ELEMENTS_PER_LDG_STG * local_idx + it * WARP_SIZE; - - if (element_index < batch_element_count) { - copy_vector(temp_data, src + i*element_count*stride + it*WARP_SIZE); - - #pragma unroll - for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) { - if ((element_index + element) < batch_element_count) { - elements[i][it+element] = (acc_t)temp_data[element] * scale; - } else { - elements[i][it + element] = -std::numeric_limits::infinity(); - } - } - } else { - #pragma unroll - for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) { - elements[i][it + element] = -std::numeric_limits::infinity(); - } - } - } - } - - // compute max_value - acc_t max_value[WARP_BATCH]; - #pragma unroll - for (int i = 0; i < WARP_BATCH; ++i) { - max_value[i] = elements[i][0]; - #pragma unroll - for (int it = 1; it < WARP_ITERATIONS; ++it) { - max_value[i] = (max_value[i] > elements[i][it]) ? max_value[i] : elements[i][it]; - } - } - warp_reduce(max_value); - - acc_t sum[WARP_BATCH] { 0.0f }; - #pragma unroll - for (int i = 0; i < WARP_BATCH; ++i) { - #pragma unroll - for (int it = 0; it < WARP_ITERATIONS; ++it) { - if (it < warp_iteration_limit) { - elements[i][it] = std::exp((elements[i][it] - max_value[i])); - sum[i] += elements[i][it]; - } - } - } - warp_reduce(sum); - - // store result - output_t out[ELEMENTS_PER_LDG_STG]; - #pragma unroll - for (int i = 0; i < WARP_BATCH; ++i) { - if (i >= local_batches) - break; - #pragma unroll - for (int it = 0; it < WARP_ITERATIONS; it+=ELEMENTS_PER_LDG_STG) { - int element_index = ELEMENTS_PER_LDG_STG * local_idx + it * WARP_SIZE; - - if (element_index < local_seq) { - - #pragma unroll - for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) { - if (element_index + element < local_seq) { - out[element] = elements[i][it + element] / sum[i]; - } else { - out[element] = 0; - } - } - copy_vector(dst + i * element_count * stride + it * WARP_SIZE, out); - } else if (element_index < element_count) { - copy_zero_vector(dst + i * element_count * stride + it * WARP_SIZE); - } else { - break; - } - } - } -} - -template -__global__ void scaled_upper_triang_masked_softmax_warp_backward( - output_t *gradInput, - input_t *grad, - const input_t *output, - acc_t scale, - int micro_batch_size, - int stride, - int element_count) -{ - // WARP_SIZE and WARP_BATCH must match the return values batches_per_warp and - // warp_size of method warp_softmax_backward_kernel. - constexpr int next_power_of_two = 1 << log2_elements; - constexpr int WARP_SIZE = (next_power_of_two < C10_WARP_SIZE) ? next_power_of_two : C10_WARP_SIZE; - constexpr int WARP_ITERATIONS = next_power_of_two / WARP_SIZE; - constexpr int WARP_BATCH = (next_power_of_two <= 128) ? 
2 : 1; - constexpr int ELEMENTS_PER_LDG_STG = (WARP_ITERATIONS < 4) ? 1 : 4; - - int first_batch = (blockDim.y * blockIdx.y + threadIdx.y) * gridDim.x * WARP_BATCH + blockIdx.x; - int local_seq = blockIdx.x + 1; - - // micro_batch_size might not be a multiple of WARP_BATCH. Check how - // many batches have to computed within this WARP. - int local_batches = micro_batch_size - first_batch; - if (local_batches > WARP_BATCH) - local_batches = WARP_BATCH; - - // there might be multiple batches per warp. compute the index within the batch - int local_idx = threadIdx.x; - - // the first element to process by the current thread - int thread_offset = first_batch * stride + ELEMENTS_PER_LDG_STG * local_idx; - grad += thread_offset; - output += thread_offset; - gradInput += thread_offset; - - // load data from global memory - acc_t grad_reg[WARP_BATCH][WARP_ITERATIONS] { 0.0f }; - acc_t output_reg[WARP_BATCH][WARP_ITERATIONS] { 0.0f }; - input_t temp_grad[ELEMENTS_PER_LDG_STG]; - input_t temp_output[ELEMENTS_PER_LDG_STG]; - #pragma unroll - for (int i = 0; i < WARP_BATCH; ++i) { - int batch_element_count = (i >= local_batches) ? 0 : local_seq; - - #pragma unroll - for (int it = 0; it < WARP_ITERATIONS; it+=ELEMENTS_PER_LDG_STG) { - int element_index = ELEMENTS_PER_LDG_STG * local_idx + it * WARP_SIZE; - if (element_index < batch_element_count) { - copy_vector(temp_grad, grad + i * element_count * stride + it * WARP_SIZE); - copy_vector(temp_output, output + i * element_count * stride + it * WARP_SIZE); - - #pragma unroll - for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) { - if (element_index + element < batch_element_count) { - output_reg[i][it + element] = (acc_t)temp_output[element]; - } - } - #pragma unroll - for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) { - if (element_index + element < batch_element_count) { - grad_reg[i][it + element] = (acc_t)temp_grad[element] * output_reg[i][it + element]; - } - } - } - } - } - - acc_t sum[WARP_BATCH]; - #pragma unroll - for (int i = 0; i < WARP_BATCH; ++i) { - sum[i] = grad_reg[i][0]; - #pragma unroll - for (int it = 1; it < WARP_ITERATIONS; ++it) { - sum[i] += grad_reg[i][it]; - } - } - warp_reduce(sum); - - // store result - #pragma unroll - for (int i = 0; i < WARP_BATCH; ++i) { - if (i >= local_batches) - break; - #pragma unroll - for (int it = 0; it < WARP_ITERATIONS; it+=ELEMENTS_PER_LDG_STG) { - int element_index = ELEMENTS_PER_LDG_STG * local_idx + it * WARP_SIZE; - if (element_index < element_count) { - // compute gradients - output_t out[ELEMENTS_PER_LDG_STG]; - #pragma unroll - for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) { - out[element] = (output_t)(scale * (grad_reg[i][it + element] - output_reg[i][it + element] * sum[i])); - } - copy_vector(gradInput + i * element_count * stride + it * WARP_SIZE, out); - } - } - } -} - -} // end of anonymous namespace - -template -void dispatch_scaled_upper_triang_masked_softmax_forward( - output_t *dst, - const input_t *src, - const input_t scale, - int softmax_elements, - int softmax_elements_stride, - int attn_batches) -{ - TORCH_INTERNAL_ASSERT(softmax_elements >= 0 && softmax_elements <= 16384 ); - if (softmax_elements == 0) { - return; - } else { - int log2_elements = log2_ceil(softmax_elements); - const int next_power_of_two = 1 << log2_elements; - int seq_len = softmax_elements; - int batch_count = attn_batches * seq_len; - - // This value must match the WARP_SIZE constexpr value computed inside softmax_warp_forward. 
- int warp_size = (next_power_of_two < C10_WARP_SIZE) ? next_power_of_two : C10_WARP_SIZE; - - // This value must match the WARP_BATCH constexpr value computed inside softmax_warp_forward. - int batches_per_warp = (next_power_of_two <= 128) ? 2 : 1; - - // use 128 threads per block to maximimize gpu utilization - constexpr int threads_per_block = 128; - - int warps_per_block = (threads_per_block / warp_size); - int batches_per_block = warps_per_block * batches_per_warp; - TORCH_INTERNAL_ASSERT(attn_batches % batches_per_block == 0); - - int blocks_per_seq = attn_batches / batches_per_block; - dim3 blocks(seq_len, blocks_per_seq, 1); - dim3 threads(warp_size, warps_per_block, 1); - // Launch code would be more elegant if C++ supported FOR CONSTEXPR - switch (log2_elements) { - case 0: // 1 - scaled_upper_triang_masked_softmax_warp_forward - <<>>(dst, src, scale, batch_count, softmax_elements_stride, softmax_elements); - break; - case 1: // 2 - scaled_upper_triang_masked_softmax_warp_forward - <<>>(dst, src, scale, batch_count, softmax_elements_stride, softmax_elements); - break; - case 2: // 4 - scaled_upper_triang_masked_softmax_warp_forward - <<>>(dst, src, scale, batch_count, softmax_elements_stride, softmax_elements); - break; - case 3: // 8 - scaled_upper_triang_masked_softmax_warp_forward - <<>>(dst, src, scale, batch_count, softmax_elements_stride, softmax_elements); - break; - case 4: // 16 - scaled_upper_triang_masked_softmax_warp_forward - <<>>(dst, src, scale, batch_count, softmax_elements_stride, softmax_elements); - break; - case 5: // 32 - scaled_upper_triang_masked_softmax_warp_forward - <<>>(dst, src, scale, batch_count, softmax_elements_stride, softmax_elements); - break; - case 6: // 64 - scaled_upper_triang_masked_softmax_warp_forward - <<>>(dst, src, scale, batch_count, softmax_elements_stride, softmax_elements); - break; - case 7: // 128 - scaled_upper_triang_masked_softmax_warp_forward - <<>>(dst, src, scale, batch_count, softmax_elements_stride, softmax_elements); - break; - case 8: // 256 - scaled_upper_triang_masked_softmax_warp_forward - <<>>(dst, src, scale, batch_count, softmax_elements_stride, softmax_elements); - break; - case 9: // 512 - scaled_upper_triang_masked_softmax_warp_forward - <<>>(dst, src, scale, batch_count, softmax_elements_stride, softmax_elements); - break; - case 10: // 1024 - scaled_upper_triang_masked_softmax_warp_forward - <<>>(dst, src, scale, batch_count, softmax_elements_stride, softmax_elements); - break; - case 11: // 2048 - scaled_upper_triang_masked_softmax_warp_forward - <<>>(dst, src, scale, batch_count, softmax_elements_stride, softmax_elements); - break; - case 12: // 4096 - scaled_upper_triang_masked_softmax_warp_forward - <<>>(dst, src, scale, batch_count, softmax_elements_stride, softmax_elements); - break; - case 13: // 8192 - scaled_upper_triang_masked_softmax_warp_forward - <<>>(dst, src, scale, batch_count, softmax_elements_stride, softmax_elements); - break; - case 14: // 16384 - scaled_upper_triang_masked_softmax_warp_forward - <<>>(dst, src, scale, batch_count, softmax_elements_stride, softmax_elements); - break; - - default: - break; - } - } -} - -template -void dispatch_scaled_upper_triang_masked_softmax_backward( - output_t *grad_input, - input_t *grad, - const input_t *output, - const acc_t scale, - int softmax_elements, - int softmax_elements_stride, - int attn_batches) -{ - TORCH_INTERNAL_ASSERT( softmax_elements >= 0 && softmax_elements <= 16384 ); - if (softmax_elements == 0) { - return; - } else { - int 
log2_elements = log2_ceil(softmax_elements); - const int next_power_of_two = 1 << log2_elements; - int seq_len = softmax_elements; - int batch_count = attn_batches * seq_len; - - // This value must match the WARP_SIZE constexpr value computed inside softmax_warp_backward. - int warp_size = (next_power_of_two < C10_WARP_SIZE) ? next_power_of_two : C10_WARP_SIZE; - - // This value must match the WARP_BATCH constexpr value computed inside softmax_warp_backward. - int batches_per_warp = (next_power_of_two <= 128) ? 2 : 1; - - // use 128 threads per block to maximimize gpu utilization - constexpr int threads_per_block = 128; - - int warps_per_block = (threads_per_block / warp_size); - int batches_per_block = warps_per_block * batches_per_warp; - TORCH_INTERNAL_ASSERT(attn_batches % batches_per_block == 0); - - int blocks_per_seq = attn_batches / batches_per_block; - dim3 blocks(seq_len, blocks_per_seq, 1); - dim3 threads(warp_size, warps_per_block, 1); - // Launch code would be more elegant if C++ supported FOR CONSTEXPR - switch (log2_elements) { - case 0: // 1 - scaled_upper_triang_masked_softmax_warp_backward - <<>>(grad_input, grad, output, scale, batch_count, softmax_elements_stride, softmax_elements); - break; - case 1: // 2 - scaled_upper_triang_masked_softmax_warp_backward - <<>>(grad_input, grad, output, scale, batch_count, softmax_elements_stride, softmax_elements); - break; - case 2: // 4 - scaled_upper_triang_masked_softmax_warp_backward - <<>>(grad_input, grad, output, scale, batch_count, softmax_elements_stride, softmax_elements); - break; - case 3: // 8 - scaled_upper_triang_masked_softmax_warp_backward - <<>>(grad_input, grad, output, scale, batch_count, softmax_elements_stride, softmax_elements); - break; - case 4: // 16 - scaled_upper_triang_masked_softmax_warp_backward - <<>>(grad_input, grad, output, scale, batch_count, softmax_elements_stride, softmax_elements); - break; - case 5: // 32 - scaled_upper_triang_masked_softmax_warp_backward - <<>>(grad_input, grad, output, scale, batch_count, softmax_elements_stride, softmax_elements); - break; - case 6: // 64 - scaled_upper_triang_masked_softmax_warp_backward - <<>>(grad_input, grad, output, scale, batch_count, softmax_elements_stride, softmax_elements); - break; - case 7: // 128 - scaled_upper_triang_masked_softmax_warp_backward - <<>>(grad_input, grad, output, scale, batch_count, softmax_elements_stride, softmax_elements); - break; - case 8: // 256 - scaled_upper_triang_masked_softmax_warp_backward - <<>>(grad_input, grad, output, scale, batch_count, softmax_elements_stride, softmax_elements); - break; - case 9: // 512 - scaled_upper_triang_masked_softmax_warp_backward - <<>>(grad_input, grad, output, scale, batch_count, softmax_elements_stride, softmax_elements); - break; - case 10: // 1024 - scaled_upper_triang_masked_softmax_warp_backward - <<>>(grad_input, grad, output, scale, batch_count, softmax_elements_stride, softmax_elements); - break; - case 11: // 2048 - scaled_upper_triang_masked_softmax_warp_backward - <<>>(grad_input, grad, output, scale, batch_count, softmax_elements_stride, softmax_elements); - break; - case 12: // 4096 - scaled_upper_triang_masked_softmax_warp_backward - <<>>(grad_input, grad, output, scale, batch_count, softmax_elements_stride, softmax_elements); - break; - case 13: // 8192 - scaled_upper_triang_masked_softmax_warp_backward - <<>>(grad_input, grad, output, scale, batch_count, softmax_elements_stride, softmax_elements); - break; - case 14: // 16384 - 
scaled_upper_triang_masked_softmax_warp_backward - <<>>(grad_input, grad, output, scale, batch_count, softmax_elements_stride, softmax_elements); - break; - default: - break; - } - } -} diff --git a/megatron/fused_kernels/scaled_upper_triang_masked_softmax_cuda.cu b/megatron/fused_kernels/scaled_upper_triang_masked_softmax_cuda.cu deleted file mode 100644 index 30bcf8d4ca..0000000000 --- a/megatron/fused_kernels/scaled_upper_triang_masked_softmax_cuda.cu +++ /dev/null @@ -1,84 +0,0 @@ -/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. */ - -#include -#include -#include -#include -#include -#include -#include -#include "scaled_upper_triang_masked_softmax.h" -#include "type_shim.h" - -namespace multihead_attn { -namespace fused_softmax { -namespace scaled_upper_triang_masked_softmax { - -torch::Tensor fwd_cuda( - torch::Tensor const& input, - float scale_factor) -{ - // input is a 3d tensor with dimensions [attn_batches, seq_len, seq_len] - const int attn_batches = input.size(0); - const int seq_len = input.size(1); - TORCH_INTERNAL_ASSERT(seq_len <= 16384); - - // Output - auto act_options = input.options().requires_grad(false); - torch::Tensor softmax_results = - torch::empty({attn_batches, seq_len, seq_len}, act_options); - - // Softmax Intermediate Result Ptr - void* input_ptr = static_cast(input.data_ptr()); - void* softmax_results_ptr = static_cast(softmax_results.data_ptr()); - - DISPATCH_HALF_AND_BFLOAT( - input.scalar_type(), - "dispatch_scaled_upper_triang_masked_softmax_forward", - dispatch_scaled_upper_triang_masked_softmax_forward( - reinterpret_cast(softmax_results_ptr), - reinterpret_cast(input_ptr), - scale_factor, - seq_len, - seq_len, - attn_batches); - ); - return softmax_results; -} - - -torch::Tensor bwd_cuda( - torch::Tensor const& output_grads_, - torch::Tensor const& softmax_results_, - float scale_factor) { - - auto output_grads = output_grads_.contiguous(); - auto softmax_results = softmax_results_.contiguous(); - - //output grads is a 3d tensor with dimensions [attn_batches, seq_len, seq_len] - const int attn_batches = output_grads.size(0); - const int seq_len = output_grads.size(1); - TORCH_INTERNAL_ASSERT(output_grads.size(1) == output_grads.size(2)); - - void* output_grads_ptr = static_cast(output_grads.data_ptr()); - - //Softmax Grad - DISPATCH_HALF_AND_BFLOAT( - output_grads_.scalar_type(), - "dispatch_scaled_upper_triang_masked_softmax_backward", - dispatch_scaled_upper_triang_masked_softmax_backward( - reinterpret_cast(output_grads_ptr), - reinterpret_cast(output_grads_ptr), - reinterpret_cast(softmax_results.data_ptr()), - scale_factor, - seq_len, - seq_len, - attn_batches); - ); - - //backward pass is completely in-place - return output_grads; -} -} -} -} diff --git a/megatron/inference/__init__.py b/megatron/inference/__init__.py new file mode 100644 index 0000000000..f8011007a5 --- /dev/null +++ b/megatron/inference/__init__.py @@ -0,0 +1 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. diff --git a/megatron/inference/arguments.py b/megatron/inference/arguments.py new file mode 100644 index 0000000000..7fcd7a7dc3 --- /dev/null +++ b/megatron/inference/arguments.py @@ -0,0 +1,26 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+ + +def add_modelopt_args(parser): + """Add additional arguments for using TensorRT Model Optimizer (modelopt) features.""" + group = parser.add_argument_group(title="modelopt-generic") + + group.add_argument( + "--export-legacy-megatron", + action="store_true", + help="Export a legacy megatron-lm checkpoint.", + ) + group.add_argument( + "--export-te-mcore-model", + action="store_true", + help="Export a megatron-core transformer-engine checkpoint.", + ) + group.add_argument( + "--export-quant-cfg", + type=str, + default=None, + choices=["int8", "int8_sq", "fp8", "int4_awq", "w4a8_awq", "int4", "None"], + help="Specify a quantization config from the supported choices.", + ) + + return parser diff --git a/megatron/inference/checkpointing.py b/megatron/inference/checkpointing.py new file mode 100644 index 0000000000..f8d3e2dd59 --- /dev/null +++ b/megatron/inference/checkpointing.py @@ -0,0 +1,135 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +import os +from pathlib import Path +from typing import Optional, Dict + +from megatron.core import dist_checkpointing +from megatron.training import get_args +from megatron.training.checkpointing import _load_base_checkpoint, load_checkpoint +from megatron.training.utils import print_rank_0, unwrap_model + +try: + from modelopt.torch.opt.plugins import ( + get_sharded_modelopt_state, + restore_modelopt_state_metadata, + ) +except ImportError as e: + raise ImportError("Required `\"nvidia-modelopt[torch]\"` is not installed!") from e + + +def load_modelopt_state(load_dir: Optional[str] = None) -> Dict: + """Loading modelopt_state without a model. + + If --use-dist-ckpt, we try to load from the sharded modelopt_state. This will not load the model + state_dict. Otherwise, if the checkpoint is not sharded, we load the base checkpoint (that + contains the model state as well) and extract the modelopt_state. + + Args: + load_dir: optionally provide a different loading path + """ + args = get_args() + + if load_dir is None: + load_dir = args.load + + if args.use_dist_ckpt: + # Read the tracker file and set the iteration. + tracker_filename = os.path.join(load_dir, 'latest_checkpointed_iteration.txt') + # If no tracker file, assuming that it is a .nemo checkpoint. 
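Editor's note: a hedged usage sketch of the argument group defined in megatron/inference/arguments.py above. In practice these flags are attached to Megatron's own parser via an extra_args_provider; the standalone argparse usage below is purely illustrative.

import argparse
from megatron.inference.arguments import add_modelopt_args

parser = argparse.ArgumentParser("modelopt-example")
parser = add_modelopt_args(parser)
args = parser.parse_args(["--export-quant-cfg", "fp8", "--export-te-mcore-model"])
# args.export_quant_cfg == "fp8"; args.export_te_mcore_model is True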
+ if not os.path.isfile(tracker_filename): + sharded_load_dir = Path(load_dir) / "model_weights" + else: + with open(tracker_filename, 'r') as f: + metastring = f.read().strip() + try: + iteration = int(metastring) + sharded_load_dir = Path(load_dir) / 'iter_{:07d}'.format(iteration) + except ValueError: + sharded_load_dir = Path(load_dir) / metastring + modelopt_state_dir = sharded_load_dir / "modelopt_state" + if modelopt_state_dir.exists(): + print_rank_0("Loading sharded modelopt_state ({})".format(modelopt_state_dir)) + modelopt_state = restore_modelopt_state_metadata( + dist_checkpointing.load( + get_sharded_modelopt_state(args.num_layers), modelopt_state_dir, + ) + ) + return modelopt_state + else: + print_rank_0( + "sharded modelopt_state ({}) does not exist!".format(modelopt_state_dir) + ) + return {} + else: + print_rank_0("Loading modelopt_state from base checkpoint ({})".format(load_dir)) + try: + state_dict, _, _ = _load_base_checkpoint(args.load, rank0=False) + except Exception: + print_rank_0("Failed to load base checkpoint via megatron _load_base_checkpoint!") + return {} + if state_dict is None: + return {} + return state_dict.get("modelopt_state", {}) + + +def load_modelopt_checkpoint( + model, + optimizer=None, + opt_param_scheduler=None, + strict: bool = True, + additional_sharded_prefix: str = "model.", + load_arg: str = "load", +) -> None: + """Load a sharded (untar .nemo or megatron --use-dist-ckpt) or unsharded checkpoint. + + Essentially, the function is detecting whether the checkpoint is a .nemo sharded checkpoint. + If so, we load the sharded state_dict with additional_sharded_prefix `model.`. + This additional prefix is tha artifact of the lightning module wrapper. Once the sharded + state_dict is loaded, we use a state_dict pre_hook to pop this additional prefix (`model.`) + from all state_dict keys. + + If this is not a .nemo sharded checkpoint, then this function will simply call + load_checkpoint. See megatron.checkpointing.load_checkpoint for explanation. + + Args: + additional_sharded_prefix: append additional prefix to align the sharded checkpoint keys. + When loading an .nemo sharded checkpoint, this is usually `model.`. Otherwise, this is + typically an empty string. + """ + + def _remove_prefix_state_dict_pre_hook( + state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs, + ): + """Pytorch state_dict pre_hook to remove prefix of the state_dict keys.""" + if additional_sharded_prefix is None: + return + key_rewrite_list = [] + for key, _ in state_dict.items(): + if key.startswith(additional_sharded_prefix): + key_rewrite_list.append(key) + for old_key in key_rewrite_list: + new_key = old_key[len(additional_sharded_prefix) :] + state_dict[new_key] = state_dict.pop(old_key) + + args = get_args() + load_dir = getattr(args, load_arg) + + sharded_load_dir = Path(load_dir) / "model_weights" + + if sharded_load_dir.exists() and optimizer is None and opt_param_scheduler is None: + unwrapped_model = unwrap_model(model) + # Set this attribute will alter the sharded_offsets of transformer_block. 
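Editor's note: a minimal illustration of what the _remove_prefix_state_dict_pre_hook described above does to a .nemo-style sharded state_dict before it is loaded into the MCore model. The keys below are hypothetical.

state_dict = {
    "model.decoder.layers.0.self_attention.linear_qkv.weight": "tensor-0",
    "model.embedding.word_embeddings.weight": "tensor-1",
}
prefix = "model."  # additional_sharded_prefix left by the lightning wrapper
for key in [k for k in state_dict if k.startswith(prefix)]:
    state_dict[key[len(prefix):]] = state_dict.pop(key)
# keys become "decoder.layers.0..." and "embedding.word_embeddings.weight"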
+ unwrapped_model[0].decoder.config.non_homogeneous_layers = False + sharded_state_dict = unwrapped_model[0].sharded_state_dict(prefix=additional_sharded_prefix) + if additional_sharded_prefix: + unwrapped_model[0]._register_load_state_dict_pre_hook( + _remove_prefix_state_dict_pre_hook + ) + unwrapped_model[0].load_state_dict( + dist_checkpointing.load(sharded_state_dict, sharded_load_dir) + ) + # Set the attribute to True such that by-default we are storing the heterogenous arch. + unwrapped_model[0].decoder.config.non_homogeneous_layers = True + else: + _ = load_checkpoint(model, optimizer, opt_param_scheduler, strict=strict, load_arg=load_arg) diff --git a/megatron/inference/gpt/__init__.py b/megatron/inference/gpt/__init__.py new file mode 100644 index 0000000000..f8011007a5 --- /dev/null +++ b/megatron/inference/gpt/__init__.py @@ -0,0 +1 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. diff --git a/megatron/inference/gpt/model_provider.py b/megatron/inference/gpt/model_provider.py new file mode 100644 index 0000000000..0df0168fa5 --- /dev/null +++ b/megatron/inference/gpt/model_provider.py @@ -0,0 +1,88 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +"""ModelOpt GPT model provider.""" + +import modelopt.torch.opt as mto +from megatron.core.inference.modelopt_support.gpt.model_specs import get_gpt_layer_modelopt_spec +from megatron.core.inference.modelopt_support.gpt.state_dict_hooks import ( + mcore_gpt_load_legacy_state_dict_pre_hook, + mcore_gpt_load_te_state_dict_pre_hook, +) +from megatron.core.models.gpt import GPTModel as MCoreGPTModel +from megatron.core.parallel_state import get_tensor_model_parallel_rank +from megatron.core.transformer.spec_utils import import_module +from megatron.inference.checkpointing import load_modelopt_state +from megatron.training import get_args, print_rank_0 +from megatron.training.arguments import core_transformer_config_from_args + + +def model_provider(pre_process=True, post_process=True, parallel_output=True) -> MCoreGPTModel: + """Builds the model. + + If you set the use_legacy_models to True, it will return the legacy GPT model and if not the core GPT model. + + Args: + pre_process (bool, optional): Set to true if you need to compute embedings. Defaults to True. + post_process (bool, optional): Set to true if you need to want to compute output logits/loss. Defaults to True. + parallel_output (bool): whether to allgather the output logits? This must be + True if `model_provider` is called in text_generation_server. + + Returns: + MCoreGPTModel: The returned model + """ + args = get_args() + + print_rank_0("building GPT model ...") + + # ModelOpt by default assumes none homogenous layers. This affect the storage format of the sharded checkpoint. + config = core_transformer_config_from_args(args) + config.non_homogeneous_layers = True + + if args.use_legacy_models: + raise ValueError( + "ModelOpt integration only support MCore models. Use --use-mcore-modules instead." 
+ ) + + if args.spec is not None: + transformer_layer_spec = import_module(args.spec) + else: + transformer_layer_spec = get_gpt_layer_modelopt_spec( + remap_te_layernorm=args.export_te_mcore_model, qk_layernorm=False, + ) + + model_type = MCoreGPTModel + model_kwargs = { + "config": config, + "transformer_layer_spec": transformer_layer_spec, + "vocab_size": args.padded_vocab_size, + "max_sequence_length": args.max_position_embeddings, + "pre_process": pre_process, + "post_process": post_process, + "fp16_lm_cross_entropy": args.fp16_lm_cross_entropy, + "parallel_output": parallel_output, + "share_embeddings_and_output_weights": not args.untie_embeddings_and_output_weights, + "position_embedding_type": args.position_embedding_type, + "rotary_percent": args.rotary_percent, + "rotary_base": args.rotary_base, + "rope_scaling": args.use_rope_scaling, + } + + model = model_type(**model_kwargs) + + # Load modelopt_state + modelopt_state = load_modelopt_state() if args.load else {} + if modelopt_state: + model = mto.restore_from_modelopt_state(model, modelopt_state) + + # Register some load_state_dict prehooks to handle some known state_dict key mismatch. + # (legacy <-> modelopt) and (default te <-> modelopt) + if args.export_legacy_megatron: + model._register_load_state_dict_pre_hook(mcore_gpt_load_legacy_state_dict_pre_hook) + if args.export_te_mcore_model: + model._register_load_state_dict_pre_hook(mcore_gpt_load_te_state_dict_pre_hook) + + # Print models on all pp ranks. + if get_tensor_model_parallel_rank() == 0: + print(str(model)) + + return model diff --git a/megatron/static/index.html b/megatron/inference/static/index.html similarity index 100% rename from megatron/static/index.html rename to megatron/inference/static/index.html diff --git a/megatron/text_generation/__init__.py b/megatron/inference/text_generation/__init__.py similarity index 100% rename from megatron/text_generation/__init__.py rename to megatron/inference/text_generation/__init__.py diff --git a/megatron/text_generation/api.py b/megatron/inference/text_generation/api.py similarity index 77% rename from megatron/text_generation/api.py rename to megatron/inference/text_generation/api.py index 090b630a5f..06dad2e519 100644 --- a/megatron/text_generation/api.py +++ b/megatron/inference/text_generation/api.py @@ -14,8 +14,10 @@ from .tokenization import ( tokenize_prompts, detokenize_generations) +from .forward_step import ForwardStep def generate_and_post_process(model, + forward_step=ForwardStep, prompts=None, tokens_to_generate=0, return_output_log_probs=False, @@ -29,13 +31,23 @@ def generate_and_post_process(model, stop_on_double_eol=False, stop_on_eol=False, prevent_newline_after_colon=False, - random_seed=-1): + random_seed=-1, + return_logits=False, + detokenize_segments=True, + data_parallel=False): """Run inference and post-process outputs, i.e., detokenize, - move to cpu and convert to list.""" + move to cpu and convert to list. + + Args: + data_parallel (bool): Enable data parallel text generation. Note: Caller must ensure + that 1) different data parallel model replicas are provided different prompts and + 2) outputs from the different model replicas are gathered. + """ # Main inference. 
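Editor's note: a hedged caller-side sketch of the data_parallel=True contract stated in the docstring above; each data-parallel replica receives a disjoint shard of prompts and the caller gathers the results afterwards. The sharding scheme and gather strategy are illustrative, not part of this patch.

from megatron.core import parallel_state
# from megatron.inference.text_generation import generate_and_post_process

def generate_data_parallel(model, all_prompts, **kwargs):
    dp_rank = parallel_state.get_data_parallel_rank()
    dp_size = parallel_state.get_data_parallel_world_size()
    my_prompts = all_prompts[dp_rank::dp_size]   # disjoint shard per replica
    result = generate_and_post_process(
        model, prompts=my_prompts, data_parallel=True, **kwargs)
    # Caller must gather `result` across data-parallel ranks, e.g. with
    # torch.distributed.all_gather_object on the data-parallel group.
    return result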
- tokens, lengths, output_log_probs = generate( + tokens, lengths, output_log_probs, logits = generate( model, + forward_step=forward_step, prompts=prompts, tokens_to_generate=tokens_to_generate, return_output_log_probs=return_output_log_probs, @@ -49,24 +61,32 @@ def generate_and_post_process(model, stop_on_double_eol=stop_on_double_eol, stop_on_eol=stop_on_eol, prevent_newline_after_colon=prevent_newline_after_colon, - random_seed=random_seed) + random_seed=random_seed, + data_parallel=data_parallel) # Only post-process on first stage. if mpu.is_pipeline_first_stage(): tokens, prompts_plus_generations, prompts_plus_generations_segments = \ - detokenize_generations(tokens, lengths, True) + detokenize_generations(tokens, lengths, detokenize_segments) if return_output_log_probs: output_log_probs = output_log_probs.cpu().numpy().tolist() for i, (prob, seg) in enumerate(zip(output_log_probs, prompts_plus_generations_segments)): output_log_probs[i] = prob[:len(seg)-1] - return prompts_plus_generations, prompts_plus_generations_segments, \ + if return_logits: + assert(tokens_to_generate == 0) + assert(mpu.get_pipeline_model_parallel_world_size() == 1) + return prompts_plus_generations, prompts_plus_generations_segments, \ + output_log_probs, tokens, logits + else: + return prompts_plus_generations, prompts_plus_generations_segments, \ output_log_probs, tokens return None def generate(model, + forward_step=None, prompts=None, tokens_to_generate=0, return_output_log_probs=False, @@ -80,15 +100,20 @@ def generate(model, stop_on_double_eol=False, stop_on_eol=False, prevent_newline_after_colon=False, - random_seed=-1): - """Given prompts and input parameters, run inference and return: + random_seed=-1, + data_parallel=False): + """Given prompts and input parameters, run inference. + + Args: + data_parallel (bool): Enable data parallel text generation. + + Returns: tokens: prompts plus the generated tokens. lengths: length of the prompt + generations. Note that we can discard tokens in the tokens tensor that are after the corresponding length. output_log_probs: log probs of the tokens. """ - # Make sure input params are avaialble to all ranks. values = [tokens_to_generate, return_output_log_probs, @@ -98,7 +123,8 @@ def generate(model, stop_on_eol, prevent_newline_after_colon, random_seed] - values_float_tensor = broadcast_float_list(len(values), float_list=values) + + values_float_tensor = broadcast_float_list(len(values), float_list=values, data_parallel=data_parallel) tokens_to_generate = int(values_float_tensor[0].item()) return_output_log_probs = bool(values_float_tensor[1].item()) top_k_sampling = int(values_float_tensor[2].item()) @@ -117,21 +143,22 @@ def generate(model, torch.random.manual_seed(random_seed) # Tokenize prompts and get the batch. - # Note that these tensors are broadcaseted to all ranks. + # Note that these tensors are broadcasted to all ranks. if torch.distributed.get_rank() == 0: assert prompts is not None - + context_tokens_tensor, context_length_tensor = tokenize_prompts( - prompts=prompts, tokens_to_generate=tokens_to_generate, add_BOS=add_BOS) + prompts=prompts, tokens_to_generate=tokens_to_generate, add_BOS=add_BOS, + data_parallel=data_parallel) if tokens_to_generate == 0: return score_and_return_on_first_stage( model, context_tokens_tensor, context_length_tensor) - + # Main inference function. # Note that the outputs are available on the first stage. 
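Editor's note: a hedged usage sketch of the scoring-only path taken above when tokens_to_generate is 0. return_logits is only valid in that mode with a single pipeline stage (see the asserts in generate_and_post_process); the prompt text is illustrative, and only the first pipeline stage returns a value.

outputs = generate_and_post_process(
    model,
    prompts=["Megatron-LM is"],   # hypothetical prompt
    tokens_to_generate=0,         # score the prompt, generate nothing
    return_output_log_probs=True,
    return_logits=True,
    detokenize_segments=False,
)
if outputs is not None:
    generations, segments, log_probs, tokens, logits = outputs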
return generate_tokens_probs_and_return_on_first_stage( - model, context_tokens_tensor, context_length_tensor, + model, forward_step, context_tokens_tensor, context_length_tensor, return_output_log_probs=return_output_log_probs, top_k=top_k_sampling, top_p=top_p_sampling, @@ -144,6 +171,7 @@ def generate(model, prevent_newline_after_colon=prevent_newline_after_colon) def beam_search_and_post_process(model, + forward_step=ForwardStep, prompts=None, tokens_to_generate=0, beam_size=0, @@ -151,12 +179,14 @@ def beam_search_and_post_process(model, stop_token=50256, num_return_gen=1, length_penalty=1, - prevent_newline_after_colon=False): + prevent_newline_after_colon=False, + detokenize_segments=True): """Run beam search and post-process outputs, i.e., detokenize, move to cpu and convert to list.""" # Main inference. tokens, scores = beam_search(model, + forward_step=forward_step, prompts=prompts, tokens_to_generate=tokens_to_generate, beam_size=beam_size, @@ -167,14 +197,14 @@ def beam_search_and_post_process(model, prevent_newline_after_colon=prevent_newline_after_colon) # Only post-process on first stage. if mpu.is_pipeline_first_stage(): - lengths = tokens.size(1)*torch.ones(beam_size, dtype=torch.int64, device=torch.cuda.current_device()) - tokens, prompts_plus_generations, prompts_plus_generations_segments = detokenize_generations(tokens, lengths, True) + lengths = tokens.size(1)*torch.ones(beam_size, dtype=torch.int64, device=torch.cuda.current_device()) + tokens, prompts_plus_generations, prompts_plus_generations_segments = detokenize_generations(tokens, lengths, detokenize_segments) scores = scores.cpu().numpy().tolist() return prompts_plus_generations, prompts_plus_generations_segments, scores return None -def beam_search(model, prompts=None, tokens_to_generate=0, beam_size=0, add_BOS=False, stop_token=50256, num_return_gen=1, length_penalty=1, prevent_newline_after_colon=False): +def beam_search(model, forward_step, prompts=None, tokens_to_generate=0, beam_size=0, add_BOS=False, stop_token=50256, num_return_gen=1, length_penalty=1, prevent_newline_after_colon=False): # Make sure input params are avaialble to all ranks. values = [tokens_to_generate, beam_size, @@ -194,7 +224,7 @@ def beam_search(model, prompts=None, tokens_to_generate=0, beam_size=0, add_BOS= context_tokens_tensor, context_length_tensor = tokenize_prompts( prompts=prompts, tokens_to_generate=tokens_to_generate, add_BOS=add_BOS) - - return beam_search_and_return_on_first_stage(model, context_tokens_tensor, context_length_tensor, + + return beam_search_and_return_on_first_stage(model, forward_step, context_tokens_tensor, context_length_tensor, beam_size, stop_token=stop_token, num_return_gen=num_return_gen, length_penalty=length_penalty, prevent_newline_after_colon=prevent_newline_after_colon) diff --git a/megatron/text_generation/beam_utils.py b/megatron/inference/text_generation/beam_utils.py similarity index 97% rename from megatron/text_generation/beam_utils.py rename to megatron/inference/text_generation/beam_utils.py index 911a64143a..ab6ffe0952 100644 --- a/megatron/text_generation/beam_utils.py +++ b/megatron/inference/text_generation/beam_utils.py @@ -1,3 +1,4 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. # coding=utf-8 # Copyright 2018 The Google AI Language Team Authors, Facebook AI Research authors and The HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 
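Editor's note: a hedged usage sketch of the beam_search_and_post_process entry point shown above. The prompt, beam size, and stop token are illustrative; only the first pipeline stage returns a value.

result = beam_search_and_post_process(
    model,
    prompts=["Translate to French: hello"],  # beam search runs on a single prompt
    tokens_to_generate=20,
    beam_size=4,
    stop_token=50256,
    num_return_gen=2,
    length_penalty=1.0,
)
if result is not None:
    generations, segments, scores = result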
diff --git a/megatron/text_generation/communication.py b/megatron/inference/text_generation/communication.py similarity index 73% rename from megatron/text_generation/communication.py rename to megatron/inference/text_generation/communication.py index dee32077f3..a67e0a5e42 100644 --- a/megatron/text_generation/communication.py +++ b/megatron/inference/text_generation/communication.py @@ -5,6 +5,7 @@ import torch +from megatron.core import parallel_state from megatron.core import mpu @@ -141,10 +142,15 @@ def copy_from_last_to_first_pipeline_stage(size, dtype, tensor=None): -def broadcast_tensor(size, dtype, tensor=None, rank=0): - """ Given size and type of a tensor on all ranks and the tensor value - only on a specific rank, broadcast from that rank to all other ranks. +def broadcast_tensor(size, dtype, tensor=None, rank=0, data_parallel=False): + """Given size and type of a tensor on all ranks and the tensor value + only on a specific rank, broadcast from that rank to all other ranks. + + Args: + data_parallel (bool): Broadcast across a single data parallel model replica. """ + if data_parallel: + rank = parallel_state.get_tensor_model_parallel_src_rank() if torch.distributed.get_rank() == rank: _is_cuda_contiguous(tensor) @@ -153,33 +159,58 @@ def broadcast_tensor(size, dtype, tensor=None, rank=0): dtype=dtype, device=torch.cuda.current_device()) - torch.distributed.broadcast(tensor, rank) + group = None + if data_parallel: + group = parallel_state.get_tensor_model_parallel_group() + + torch.distributed.broadcast(tensor, rank, group=group) return tensor -def broadcast_list(size, dtype, list_values=None, rank=0): - """Broadcast a list of values with a given type.""" +def broadcast_list(size, dtype, list_values=None, rank=0, data_parallel=False): + """Broadcast a list of values with a given type. + + Args: + data_parallel (bool): Broadcast across a single data parallel model replica. + """ tensor = None - if torch.distributed.get_rank() == rank: - tensor = torch.tensor(list_values, dtype=dtype, - device=torch.cuda.current_device()) - return broadcast_tensor(size, dtype, tensor=tensor, rank=rank) + if data_parallel: + src_rank = parallel_state.get_data_parallel_src_rank() + if src_rank == 0: + tensor = torch.tensor(list_values, dtype=dtype, + device=torch.cuda.current_device()) + + rank = parallel_state.get_tensor_model_parallel_src_rank() + else: + if torch.distributed.get_rank() == rank: + tensor = torch.tensor(list_values, dtype=dtype, + device=torch.cuda.current_device()) + + return broadcast_tensor(size, dtype, tensor=tensor, rank=rank, data_parallel=data_parallel) + +def broadcast_int_list(size, int_list=None, rank=0, data_parallel=False): + """Broadcast a list of integer values. -def broadcast_int_list(size, int_list=None, rank=0): - """Broadcast a list of interger values.""" + Args: + data_parallel (bool): Broadcast across a single data parallel model replica. + """ + + return broadcast_list(size, torch.int64, list_values=int_list, rank=rank, data_parallel=data_parallel) - return broadcast_list(size, torch.int64, list_values=int_list, rank=rank) +def broadcast_float_list(size, float_list=None, rank=0, data_parallel=False): + """Broadcast a list of float values. -def broadcast_float_list(size, float_list=None, rank=0): - """Broadcast a list of float values.""" + Args: + data_parallel (bool): Broadcast across a single data parallel model replica. 
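Editor's note: a short sketch of the rank/group selection that the data_parallel broadcasts above rely on: within one model replica the tensor-model-parallel source rank broadcasts over the tensor-model-parallel group, instead of global rank 0 broadcasting over the world group. The helper below only restates that choice and is not part of the patch.

from megatron.core import parallel_state

def _broadcast_src_and_group(data_parallel: bool, rank: int = 0):
    if data_parallel:
        src = parallel_state.get_tensor_model_parallel_src_rank()
        group = parallel_state.get_tensor_model_parallel_group()
    else:
        src, group = rank, None  # None selects the default (world) process group
    return src, group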
+ """ return broadcast_list(size, torch.float32, list_values=float_list, - rank=rank) + rank=rank, data_parallel=data_parallel) diff --git a/megatron/inference/text_generation/forward_step.py b/megatron/inference/text_generation/forward_step.py new file mode 100644 index 0000000000..4d4878d337 --- /dev/null +++ b/megatron/inference/text_generation/forward_step.py @@ -0,0 +1,164 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +"""Forward step utilities.""" + +from collections.abc import Iterable + +import torch + +from megatron.training import get_args +from megatron.core import mpu, InferenceParams +from .communication import ( + send_to_next_pipeline_rank, + recv_from_prev_pipeline_rank_) + + +class ForwardStep: + """Forward step function with all the communications. + We use a class here to hide the inference parameters + from the outside caller.""" + + def __init__(self, model, max_batch_size, max_sequence_length): + """Set values so we don't need to do it multiple times.""" + # Make sure model is in eval mode. + assert not isinstance(model, Iterable), \ + 'interleaving schedule is not supported for inference' + model.eval() + self.model = model + # Initialize inference parameters. + self.inference_params = InferenceParams(max_batch_size, + max_sequence_length) + # Pipelining arguments. + args = get_args() + self.pipeline_size_larger_than_one = ( + args.pipeline_model_parallel_size > 1) + # Threshold of pipelining. + self.pipelining_batch_x_seqlen = \ + args.inference_batch_times_seqlen_threshold + + def _forward(self, tokens, position_ids, attention_mask): + return self.model(tokens, position_ids, attention_mask, inference_params=self.inference_params) + + def __call__(self, tokens, position_ids, attention_mask): + """Invocation of the forward methods. Note that self.inference_params + is being modified by the forward step.""" + # Pipelining case. + if self.pipeline_size_larger_than_one: + current_batch_x_seqlen = tokens.size(0) * tokens.size(1) + if current_batch_x_seqlen >= self.pipelining_batch_x_seqlen: + micro_batch_size = \ + max(1, self.pipelining_batch_x_seqlen // tokens.size(1)) + return self._with_pipelining_forward_step(tokens, + position_ids, + attention_mask, + micro_batch_size) + + return self._no_pipelining_forward_step(tokens, + position_ids, + attention_mask) + + + def _forward_step_helper(self, tokens, position_ids, attention_mask, recv_buffer=None): + """Single forward step. Update the allocate memory flag so + only the first time the memory is allocated.""" + batch_size = tokens.size(0) + sequence_length = tokens.size(1) + if recv_buffer is None: + recv_buffer = _allocate_recv_buffer(batch_size, sequence_length) + + # Receive from previous stage. + recv_from_prev_pipeline_rank_(recv_buffer) + + # Forward pass through the model. + self.model.set_input_tensor(recv_buffer) + output_tensor = self._forward(tokens, position_ids, attention_mask) + + # Send output to the next stage. + send_to_next_pipeline_rank(output_tensor) + + return output_tensor + + + + def _no_pipelining_forward_step(self, tokens, position_ids, attention_mask, + recv_buffer=None): + """If recv_buffer is none, we will allocate one on the fly.""" + # Run a simple forward pass. + output_tensor = self._forward_step_helper(tokens, position_ids, + attention_mask, recv_buffer=recv_buffer) + # Update the sequence length offset. 
+ self.inference_params.sequence_len_offset += tokens.size(1) + + logits = None + if mpu.is_pipeline_last_stage(): + logits = output_tensor + + return logits + + + def _with_pipelining_forward_step(self, tokens, position_ids, attention_mask, micro_batch_size): + """No interleaving is supported.""" + sequence_length = tokens.size(1) + batch_size = tokens.size(0) + + # Divide the batch dimension into micro batches. + num_micro_batches, last_chunk = divmod(batch_size, + micro_batch_size) + if last_chunk > 0: + num_micro_batches += 1 + + # Preallocate memory for output logits. + logits = None + if mpu.is_pipeline_last_stage(): + args = get_args() + logits = torch.empty( + (batch_size, sequence_length, args.padded_vocab_size), + dtype=torch.float32, device=torch.cuda.current_device()) + + # Preallocate recv buffer. + recv_buffer = _allocate_recv_buffer(micro_batch_size, sequence_length) + + for micro_batch_index in range(num_micro_batches): + # Slice among the batch dimenion. + start = micro_batch_index * micro_batch_size + end = min(start + micro_batch_size, batch_size) + this_micro_batch_size = end - start + tokens2use = tokens[start:end, ...] + position_ids2use = position_ids[start:end, ...] + + # Run a simple forward pass. + if this_micro_batch_size != micro_batch_size: + recv_buffer = None + output = self._forward_step_helper(tokens2use, position_ids2use, attention_mask, recv_buffer=recv_buffer) + + # Adjust the batch size offset to account for the micro-batch. + self.inference_params.batch_size_offset += this_micro_batch_size + + # Copy logits. + if mpu.is_pipeline_last_stage(): + logits[start:end, ...] = output + + # Once we are done with all the micro-batches, we can + # adjust the sequence length offset. + self.inference_params.sequence_len_offset += sequence_length + # and reset the batch size offset + self.inference_params.batch_size_offset = 0 + + return logits + + +def _get_recv_buffer_dtype(args): + """Receive happens between the layers.""" + if args.fp32_residual_connection: + return torch.float + return args.params_dtype + +def _allocate_recv_buffer(batch_size, sequence_length): + """Receive happens between the layers with size [s, b, h].""" + if mpu.is_pipeline_first_stage(): + return None + args = get_args() + recv_size = (sequence_length, batch_size, args.hidden_size) + return torch.empty(recv_size, + dtype=_get_recv_buffer_dtype(args), + device=torch.cuda.current_device()) diff --git a/megatron/text_generation/generation.py b/megatron/inference/text_generation/generation.py similarity index 94% rename from megatron/text_generation/generation.py rename to megatron/inference/text_generation/generation.py index 098706ee6d..5e4c238758 100644 --- a/megatron/text_generation/generation.py +++ b/megatron/inference/text_generation/generation.py @@ -5,9 +5,9 @@ import torch import torch.nn.functional as F -from megatron import get_args, get_tokenizer +from megatron.training import get_args, get_tokenizer from megatron.core import mpu -from megatron.utils import get_ltor_masks_and_position_ids +from megatron.training.utils import get_ltor_masks_and_position_ids from .communication import ( copy_from_last_to_first_pipeline_stage, broadcast_from_last_pipeline_stage, @@ -18,13 +18,15 @@ def score_and_return_on_first_stage(model, tokens, lengths): """Function for just scoring. - Arguments: + + Args: model: no interleaving is supported. 
tokens: prompt tokens extended to be of size [b, max_prompt_length] lengths: original prompt length, size: [b] Note: Outside of model, other parameters only need to be available on rank 0. - Outputs: + + Returns: output_log_probs: log probability of the selected tokens. size: [b, s] """ @@ -33,10 +35,10 @@ def score_and_return_on_first_stage(model, tokens, lengths): batch_size = tokens.size(0) max_prompt_length = lengths.max().item() assert max_prompt_length == tokens.size(1) - + if max_prompt_length > args.max_position_embeddings: raise ValueError("Length of prompt + tokens_to_generate longer than allowed") - + if max_prompt_length * batch_size > args.max_tokens_to_oom: raise ValueError("Too many tokens. " + str(max_prompt_length*batch_size)+ " is greater than "+str(args.max_tokens_to_oom)) @@ -50,18 +52,18 @@ def score_and_return_on_first_stage(model, tokens, lengths): # Log probability of the sequence (prompt + generated tokens). output_log_probs = None output_log_probs_size = (batch_size, max_prompt_length - 1) - + if mpu.is_pipeline_last_stage(): output_log_probs = torch.empty(output_log_probs_size, dtype=torch.float32, device=torch.cuda.current_device()) - + # ============= # Run infernece # ============= with torch.no_grad(): attention_mask, position_ids = _build_attention_mask_and_position_ids(tokens) - + # logits will be meanigful only in the last pipeline stage. logits = forward_step(tokens, position_ids, attention_mask) @@ -69,24 +71,24 @@ def score_and_return_on_first_stage(model, tokens, lengths): # Always the last stage should have an output. assert logits is not None log_probs = F.log_softmax(logits, dim=2) - + # Pick the tokens that we need to get the log # probabilities for. Note that next input token is # the token which we selected in the current logits, # so shift by 1. indices = torch.unsqueeze(tokens[:, 1:], 2) output_log_probs = torch.gather(log_probs, 2, indices).squeeze(2) - + # ====================================== # Broadcast to the first pipeline stage. # ====================================== output_log_probs = broadcast_from_last_to_first_pipeline_stage( output_log_probs_size, torch.float32, output_log_probs) - - return tokens, lengths, output_log_probs + + return tokens, lengths, output_log_probs, logits def generate_tokens_probs_and_return_on_first_stage( - model, tokens, lengths, + model, forward_step, tokens, lengths, return_output_log_probs=False, top_k=0, top_p=0.0, top_p_decay=0.0, top_p_bound=0.0, temperature=1.0, @@ -96,8 +98,10 @@ def generate_tokens_probs_and_return_on_first_stage( prevent_newline_after_colon=True ): """Main token generation function. - Arguments: + + Args: model: no interleaving is supported. + forward_step (ForwardStep): Class for running the model forward step. tokens: prompt tokens extended to be of size [b, max-sequence-length] lengths: original prompt length, size: [b] return_output_log_probs: flag to calculate the log probability of @@ -114,7 +118,8 @@ def generate_tokens_probs_and_return_on_first_stage( prevent_newline_after_colon: if True, it will disable generating new line \n after : Note: Outside of model, other parameters only need to be available on rank 0. - Outputs: Note that is size is adjusted to a lower value than + + Returns: Note that is size is adjusted to a lower value than max-sequence-length if generation is terminated early. tokens: prompt and generated tokens. 
size: [b, :] generated_sequence_lengths: total length (including prompt) of @@ -131,19 +136,23 @@ def generate_tokens_probs_and_return_on_first_stage( if max_sequence_length > args.max_position_embeddings: raise ValueError("Length of prompt + tokens_to_generate longer than allowed") - + if max_sequence_length * batch_size > args.max_tokens_to_oom: raise ValueError("Too many tokens. " + str(max_sequence_length*batch_size)+ " is greater than "+str(args.max_tokens_to_oom)) # forward step. - forward_step = ForwardStep(model, batch_size, max_sequence_length) + forward_step = forward_step(model, batch_size, max_sequence_length) # Added termination_id to support the case that we want to terminate the # generation once that id is generated. if hasattr(args, 'eos_id'): termination_id = args.eos_id - else: + elif hasattr(tokenizer, 'eod'): termination_id = tokenizer.eod + elif hasattr(tokenizer, 'eos_id'): + termination_id = tokenizer.eos_id + else: + raise AttributeError('No eod token found in tokenizer or args') # =================== # Pre-allocate memory @@ -162,7 +171,7 @@ def generate_tokens_probs_and_return_on_first_stage( generated_sequence_lengths = torch.ones( batch_size, dtype=torch.int64, device=torch.cuda.current_device()) * max_sequence_length - + # Whether we have reached a termination id. is_generation_done = torch.zeros(batch_size, dtype=torch.uint8, device=torch.cuda.current_device()) @@ -248,10 +257,10 @@ def generate_tokens_probs_and_return_on_first_stage( hit_double_eol = (new_sample == 628).byte() & started.byte() hit_eol = (new_sample == 198).byte() & started.byte() done_token = hit_double_eol | hit_eol - else: + else: done_token = (new_sample == termination_id).byte() & \ started.byte() - + just_finished = (done_token & ~is_generation_done).bool() generated_sequence_lengths[just_finished.view(-1)] = \ context_length + 1 @@ -261,7 +270,7 @@ def generate_tokens_probs_and_return_on_first_stage( tensor=done) if use_eod_token_for_early_termination and done: break - + # =================================================== # Update the length of based on max generated length. # =================================================== @@ -282,9 +291,9 @@ def generate_tokens_probs_and_return_on_first_stage( output_log_probs = broadcast_from_last_to_first_pipeline_stage( output_log_probs_size, torch.float32, output_log_probs) - return tokens, generated_sequence_lengths, output_log_probs + return tokens, generated_sequence_lengths, output_log_probs, None -def beam_search_and_return_on_first_stage(model, tokens, lengths, beam_size, stop_token, num_return_gen, length_penalty, prevent_newline_after_colon=True): +def beam_search_and_return_on_first_stage(model, forward_step, tokens, lengths, beam_size, stop_token, num_return_gen, length_penalty, prevent_newline_after_colon=True): args = get_args() tokenizer = get_tokenizer() @@ -293,13 +302,13 @@ def beam_search_and_return_on_first_stage(model, tokens, lengths, beam_size, sto prompt_length = lengths.item() final_sequence_length = tokens.size(1) final_sequence_length = min(final_sequence_length, args.max_position_embeddings) - + # If the context is too big, this happens if prompt_length >= final_sequence_length: raise ValueError("context length + tokens_to_generate too large") # forward step. 
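Editor's note: a hedged sketch of the forward_step injection point used above. The generation loop now instantiates whatever class is passed in as forward_step(model, batch_size, max_sequence_length), so callers can substitute a ForwardStep subclass; the subclass below is hypothetical.

from megatron.inference.text_generation.forward_step import ForwardStep

class LoggingForwardStep(ForwardStep):
    def _forward(self, tokens, position_ids, attention_mask):
        output = super()._forward(tokens, position_ids, attention_mask)
        # e.g. inspect or post-process the raw output tensor here
        return output

# generate_and_post_process(model, forward_step=LoggingForwardStep, prompts=[...], ...)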
- forward_step = ForwardStep(model, beam_size, final_sequence_length) + forward_step = forward_step(model, beam_size, final_sequence_length) beam_hyp = BeamHypotheses(beam_size, length_penalty) best_batches = None @@ -365,12 +374,12 @@ def beam_search_and_return_on_first_stage(model, tokens, lengths, beam_size, sto if beam_hyp.is_done(best_scores.max().item(), context_length + 1 - prompt_length): done = torch.ones(1, dtype=torch.uint8, device=torch.cuda.current_device()) - + best_batches = tokens.new([item[2] for item in next_beams]) tokens = tokens[best_batches,:] tokens[:, context_length] = tokens.new([item[0] for item in next_beams]) scores = scores.new([item[1] for item in next_beams]).unsqueeze(1) - + # torch.distributed.barrier() done = broadcast_from_last_pipeline_stage(1, torch.uint8, done) if done: diff --git a/megatron/text_generation/sampling.py b/megatron/inference/text_generation/sampling.py similarity index 100% rename from megatron/text_generation/sampling.py rename to megatron/inference/text_generation/sampling.py diff --git a/megatron/text_generation/tokenization.py b/megatron/inference/text_generation/tokenization.py similarity index 63% rename from megatron/text_generation/tokenization.py rename to megatron/inference/text_generation/tokenization.py index accead319a..32d3b50c6b 100644 --- a/megatron/text_generation/tokenization.py +++ b/megatron/inference/text_generation/tokenization.py @@ -6,53 +6,55 @@ import torch -from megatron import get_tokenizer, get_args +from megatron.core import parallel_state +from megatron.training import get_args, get_tokenizer from .communication import broadcast_int_list, broadcast_tensor def detokenize_generations(tokens_gpu_tensor, lengths_gpu_tensor, - return_segments): + detokenize_segments): """Detokenize the generated tokens.""" - tokenizer = get_tokenizer() args = get_args() + tokenizer = get_tokenizer() prompts_plus_generations = [] - if return_segments: - prompts_plus_generations_segments = [] + prompts_plus_generations_segments = [] tokens = tokens_gpu_tensor.cpu().numpy().tolist() lengths = lengths_gpu_tensor.cpu().numpy().tolist() for sequence_tokens, length in zip(tokens, lengths): sequence_tokens = sequence_tokens[:length] - prompts_plus_generations.append( - tokenizer.detokenize(sequence_tokens)) - if return_segments: - words = [] - for token in sequence_tokens: - if args.tokenizer_type in ['SentencePieceTokenizer', - 'GPTSentencePieceTokenizer']: - word = tokenizer.decoder[token] - elif args.tokenizer_type == 'NullTokenizer': - word = str(token) - else: + detok_str = tokenizer.detokenize(sequence_tokens) + prompts_plus_generations.append(detok_str) + if detokenize_segments: + try: + offsets = tokenizer.offsets(sequence_tokens, detok_str) + words = [ + detok_str[start:end] + for start, end in zip(offsets, offsets[1:] + [len(detok_str)]) + ] + except NotImplementedError: + words = [] + for token in sequence_tokens: word = tokenizer.tokenizer.decoder[token] - word = bytearray( - [tokenizer.tokenizer.byte_decoder[c] for c in word]).decode( - 'utf-8', errors='replace') - words.append(word) - prompts_plus_generations_segments.append(words) + word = bytearray([tokenizer.tokenizer.byte_decoder[c] for c in word]).decode( + "utf-8", errors="replace" + ) + words.append(word) - if return_segments: - return tokens, prompts_plus_generations, \ - prompts_plus_generations_segments + prompts_plus_generations_segments.append(words) - return tokens, prompts_plus_generations + return tokens, prompts_plus_generations, 
prompts_plus_generations_segments def tokenize_prompts(prompts=None, tokens_to_generate=None, - add_BOS=None, rank=0): - """Tokenize prompts and make them avaiable on all ranks.""" + add_BOS=None, rank=0, data_parallel=False): + """Tokenize prompts and make them avaiable on all ranks. + + Args: + data_parallel (bool): Broadcast tokens across a single data parallel model replica. + """ # On all ranks set to None so we can pass them to functions sizes_list = None @@ -60,7 +62,11 @@ def tokenize_prompts(prompts=None, tokens_to_generate=None, prompts_length_cuda_long_tensor = None # On the specified rank, build the above. - if torch.distributed.get_rank() == rank: + src_rank = torch.distributed.get_rank() + if data_parallel: + src_rank = parallel_state.get_data_parallel_src_rank() + + if src_rank == rank: assert prompts is not None assert tokens_to_generate is not None # Tensor of tokens padded and their unpadded length. @@ -71,16 +77,16 @@ def tokenize_prompts(prompts=None, tokens_to_generate=None, prompts_tokens_cuda_long_tensor.size(1)] # Sequence lenght # First, broadcast the sizes. - sizes_tensor = broadcast_int_list(2, int_list=sizes_list, rank=rank) + sizes_tensor = broadcast_int_list(2, int_list=sizes_list, rank=rank, data_parallel=data_parallel) # Now that we have the sizes, we can boradcast the tokens # and length tensors. sizes = sizes_tensor.tolist() prompts_tokens_cuda_long_tensor = broadcast_tensor( - sizes, torch.int64, tensor=prompts_tokens_cuda_long_tensor, rank=rank) + sizes, torch.int64, tensor=prompts_tokens_cuda_long_tensor, rank=rank, data_parallel=data_parallel) prompts_length_cuda_long_tensor = broadcast_tensor( sizes[0], torch.int64, tensor=prompts_length_cuda_long_tensor, - rank=rank) + rank=rank, data_parallel=data_parallel) return prompts_tokens_cuda_long_tensor, prompts_length_cuda_long_tensor @@ -95,9 +101,16 @@ def _tokenize_prompts_and_batch(prompts, tokens_to_generate, add_BOS): """ # Tokenize all the prompts. + args = get_args() tokenizer = get_tokenizer() + if hasattr(tokenizer, 'eod'): + eod_token = tokenizer.eod + elif hasattr(tokenizer, 'eos_id'): + eod_token = tokenizer.eos_id + else: + raise AttributeError('No eod token found in Tokenizer') if add_BOS: - prompts_tokens = [[tokenizer.eod] + tokenizer.tokenize(prompt) + prompts_tokens = [[eod_token] + tokenizer.tokenize(prompt) for prompt in prompts] else: prompts_tokens = [tokenizer.tokenize(prompt) for prompt in prompts] @@ -115,10 +128,10 @@ def _tokenize_prompts_and_batch(prompts, tokens_to_generate, add_BOS): # Now update the list of list to be of the same size: samples_length. for prompt_tokens, prompt_length in zip(prompts_tokens, prompts_length): padding_size = samples_length - prompt_length - prompt_tokens.extend([tokenizer.eod] * padding_size) + prompt_tokens.extend([eod_token] * padding_size) # Now we are in a structured format, we can convert to tensors. 
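Editor's note: a small worked example (hypothetical token ids) of the padding scheme used in _tokenize_prompts_and_batch: every prompt is right-padded with the eod token up to max(prompt length) + tokens_to_generate, while the original lengths are kept separately.

prompts_tokens = [[5, 17, 42], [7, 9]]
prompts_length = [3, 2]
eod_token, tokens_to_generate = 0, 2
samples_length = max(prompts_length) + tokens_to_generate   # 5
for toks, length in zip(prompts_tokens, prompts_length):
    toks.extend([eod_token] * (samples_length - length))
# prompts_tokens -> [[5, 17, 42, 0, 0], [7, 9, 0, 0, 0]]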
- prompts_tokens_tensor = torch.cuda.LongTensor(prompts_tokens) - prompts_length_tensor = torch.cuda.LongTensor(prompts_length) + prompts_tokens_tensor = torch.tensor(prompts_tokens, dtype=torch.long, device='cuda') + prompts_length_tensor = torch.tensor(prompts_length, dtype=torch.long, device='cuda') return prompts_tokens_tensor, prompts_length_tensor diff --git a/megatron/text_generation_server.py b/megatron/inference/text_generation_server.py similarity index 95% rename from megatron/text_generation_server.py rename to megatron/inference/text_generation_server.py index 58550f2e63..2eba2e259e 100644 --- a/megatron/text_generation_server.py +++ b/megatron/inference/text_generation_server.py @@ -5,9 +5,9 @@ import threading from flask import Flask, request, jsonify, current_app from flask_restful import Resource, Api -from megatron import get_args -from megatron.text_generation import generate_and_post_process -from megatron.text_generation import beam_search_and_post_process +from megatron.training import get_args +from megatron.inference.text_generation import generate_and_post_process +from megatron.inference.text_generation import beam_search_and_post_process GENERATE_NUM = 0 @@ -20,12 +20,12 @@ def __init__(self, model): @staticmethod def send_do_generate(): - choice = torch.cuda.LongTensor([GENERATE_NUM]) + choice = torch.tensor([GENERATE_NUM], dtype=torch.long, device='cuda') torch.distributed.broadcast(choice, 0) @staticmethod def send_do_beam_search(): - choice = torch.cuda.LongTensor([BEAM_NUM]) + choice = torch.tensor([BEAM_NUM], dtype=torch.long, device='cuda') torch.distributed.broadcast(choice, 0) def put(self): @@ -237,5 +237,5 @@ def __init__(self, model): api = Api(self.app) api.add_resource(MegatronGenerate, '/api', resource_class_args=[model]) - def run(self, url): - self.app.run(url, threaded=True, debug=False) + def run(self, url, port): + self.app.run(url, threaded=True, debug=False, port=port) diff --git a/megatron/initialize.py b/megatron/initialize.py deleted file mode 100644 index fdb312068c..0000000000 --- a/megatron/initialize.py +++ /dev/null @@ -1,300 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. - -"""Megatron initialization.""" - -import random -import os -import time - -import numpy as np -import torch -from datetime import timedelta - -from megatron import fused_kernels -from megatron import get_adlr_autoresume -from megatron import get_args -from megatron import get_tensorboard_writer -from megatron.core import mpu, tensor_parallel -from megatron.arguments import (parse_args, validate_args) -from megatron.checkpointing import load_args_from_checkpoint -from megatron.global_vars import set_global_variables -from megatron.model.transformer import bias_dropout_add_fused_train -from megatron.model.fused_bias_gelu import bias_gelu - - -def initialize_megatron(extra_args_provider=None, args_defaults={}, - ignore_unknown_args=False, allow_no_cuda=False): - """Set global variables, initialize distributed, and - set autoresume and random seeds. - `allow_no_cuda` should not be set unless using megatron for cpu only - data processing. In general this arg should not be set unless you know - what you are doing. - Returns a function to finalize distributed env initialization - (optionally, only when args.lazy_mpu_init == True) - """ - if not allow_no_cuda: - # Make sure cuda is available. - assert torch.cuda.is_available(), 'Megatron requires CUDA.' 
- - # Parse arguments - args = parse_args(extra_args_provider, ignore_unknown_args) - - if args.use_checkpoint_args or args_defaults.get('use_checkpoint_args', False): - assert args.load is not None, '--use-checkpoints-args requires --load argument' - load_args_from_checkpoint(args) - - validate_args(args, args_defaults) - - # set global args, build tokenizer, and set adlr-autoresume, - # tensorboard-writer, and timers. - set_global_variables(args) - - # torch.distributed initialization - def finish_mpu_init(): - args = get_args() - # Pytorch distributed. - _initialize_distributed() - - # Random seeds for reproducibility. - if args.rank == 0: - print('> setting random seeds to {} ...'.format(args.seed)) - _set_random_seed(args.seed, args.data_parallel_random_init) - - args = get_args() - if args.lazy_mpu_init: - # TODO is this still a necessary option? - args.use_cpu_initialization=True - # delayed initialization of DDP-related stuff - # We only set basic DDP globals - mpu.set_tensor_model_parallel_world_size(args.tensor_model_parallel_size) - # and return function for external DDP manager - # to call when it has DDP initialized - mpu.set_tensor_model_parallel_rank(args.rank) - return finish_mpu_init - else: - # Megatron's MPU is the master. Complete initialization right away. - finish_mpu_init() - - # Autoresume. - _init_autoresume() - - # Compile dependencies. - _compile_dependencies() - - # No continuation function - return None - - -def _compile_dependencies(): - - args = get_args() - - # ========================= - # Compile dataset C++ code. - # ========================= - # TODO: move this to ninja - if torch.distributed.get_rank() == 0: - start_time = time.time() - print('> compiling dataset index builder ...') - from megatron.data.dataset_utils import compile_helper - compile_helper() - print('>>> done with dataset index builder. Compilation time: {:.3f} ' - 'seconds'.format(time.time() - start_time), flush=True) - - # ================== - # Load fused kernels - # ================== - - # Custom kernel constraints check. - seq_len = args.seq_length - attn_batch_size = \ - (args.num_attention_heads / args.tensor_model_parallel_size) * \ - args.micro_batch_size - # Constraints on sequence length and attn_batch_size to enable warp based - # optimization and upper triangular optimization (for causal mask) - custom_kernel_constraint = seq_len > 16 and seq_len <=4096 and \ - seq_len % 4 == 0 and attn_batch_size % 4 == 0 - # Print a warning. - if not ((args.fp16 or args.bf16) and - custom_kernel_constraint and - args.masked_softmax_fusion): - if args.rank == 0: - print('WARNING: constraints for invoking optimized' - ' fused softmax kernel are not met. We default' - ' back to unfused kernel invocations.', flush=True) - - # Always build on rank zero first. - if torch.distributed.get_rank() == 0: - start_time = time.time() - print('> compiling and loading fused kernels ...', flush=True) - fused_kernels.load(args) - torch.distributed.barrier() - else: - torch.distributed.barrier() - fused_kernels.load(args) - # Simple barrier to make sure all ranks have passed the - # compilation phase successfully before moving on to the - # rest of the program. We think this might ensure that - # the lock is released. - torch.distributed.barrier() - if torch.distributed.get_rank() == 0: - print('>>> done with compiling and loading fused kernels. 
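The deleted `_compile_dependencies` above gates the optimized fused masked-softmax kernel on a shape check and otherwise warns and falls back to the unfused path. That check is plain arithmetic and can be restated standalone; the shapes below are examples, not values from the patch.

# Restatement of the fused-softmax constraint check from the removed _compile_dependencies().
def fused_softmax_constraints_met(seq_length, num_attention_heads,
                                  tensor_model_parallel_size, micro_batch_size):
    attn_batch_size = (num_attention_heads / tensor_model_parallel_size) * micro_batch_size
    return (seq_length > 16 and seq_length <= 4096
            and seq_length % 4 == 0
            and attn_batch_size % 4 == 0)

assert fused_softmax_constraints_met(2048, 32, 8, 4)
assert not fused_softmax_constraints_met(8192, 32, 8, 4)  # sequence too long for the kernel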
' - 'Compilation time: {:.3f} seconds'.format( - time.time() - start_time), flush=True) - - - -def _initialize_distributed(): - """Initialize torch.distributed and core model parallel.""" - args = get_args() - - device_count = torch.cuda.device_count() - if torch.distributed.is_initialized(): - - if args.rank == 0: - print('torch distributed is already initialized, ' - 'skipping initialization ...', flush=True) - args.rank = torch.distributed.get_rank() - args.world_size = torch.distributed.get_world_size() - - else: - - if args.rank == 0: - print('> initializing torch distributed ...', flush=True) - # Manually set the device ids. - if device_count > 0: - device = args.rank % device_count - if args.local_rank is not None: - assert args.local_rank == device, \ - 'expected local-rank to be the same as rank % device-count.' - else: - args.local_rank = device - torch.cuda.set_device(device) - # Call the init process - torch.distributed.init_process_group( - backend=args.distributed_backend, - world_size=args.world_size, rank=args.rank, - timeout=timedelta(minutes=args.distributed_timeout_minutes)) - - # Set the tensor model-parallel, pipeline model-parallel, and - # data-parallel communicators. - if device_count > 0: - if mpu.model_parallel_is_initialized(): - print('model parallel is already initialized') - else: - mpu.initialize_model_parallel(args.tensor_model_parallel_size, - args.pipeline_model_parallel_size, - args.virtual_pipeline_model_parallel_size, - args.pipeline_model_parallel_split_rank) - if args.rank == 0: - print(f'> initialized tensor model parallel with size ' - f'{mpu.get_tensor_model_parallel_world_size()}') - print(f'> initialized pipeline model parallel with size ' - f'{mpu.get_pipeline_model_parallel_world_size()}') - - -def _init_autoresume(): - """Set autoresume start time.""" - autoresume = get_adlr_autoresume() - if autoresume: - torch.distributed.barrier() - autoresume.init() - torch.distributed.barrier() - - -def _set_random_seed(seed_, data_parallel_random_init=False): - """Set random seed for reproducability.""" - if seed_ is not None and seed_ > 0: - # Ensure that different pipeline MP stages get different seeds. 
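The `_initialize_distributed` function being deleted binds each rank to GPU `rank % device_count` and cross-checks the launcher-provided local rank against it. A pure-Python restatement of that rule, with made-up rank numbers:

# Sketch of the device-selection rule from the removed _initialize_distributed().
def pick_device(rank, device_count, local_rank=None):
    device = rank % device_count
    if local_rank is not None:
        assert local_rank == device, \
            'expected local-rank to be the same as rank % device-count.'
    return device

assert pick_device(rank=11, device_count=8) == 3
assert pick_device(rank=3, device_count=8, local_rank=3) == 3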
- seed = seed_ + (100 * mpu.get_pipeline_model_parallel_rank()) - # Ensure different data parallel ranks get different seeds - if data_parallel_random_init: - seed = seed + (10 * mpu.get_data_parallel_rank()) - random.seed(seed) - np.random.seed(seed) - torch.manual_seed(seed) - if torch.cuda.device_count() > 0: - tensor_parallel.model_parallel_cuda_manual_seed(seed) - else: - raise ValueError('Seed ({}) should be a positive integer.'.format(seed)) - - -def write_args_to_tensorboard(): - """Write arguments to tensorboard.""" - args = get_args() - writer = get_tensorboard_writer() - if writer: - for arg in vars(args): - writer.add_text(arg, str(getattr(args, arg)), - global_step=args.iteration) - - -def set_jit_fusion_options(): - """Set PyTorch JIT layer fusion options.""" - # flags required to enable jit fusion kernels - TORCH_MAJOR = int(torch.__version__.split('.')[0]) - TORCH_MINOR = int(torch.__version__.split('.')[1]) - if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR >= 10): - # nvfuser - torch._C._jit_set_profiling_executor(True) - torch._C._jit_set_profiling_mode(True) - torch._C._jit_override_can_fuse_on_cpu(False) - torch._C._jit_override_can_fuse_on_gpu(False) - torch._C._jit_set_texpr_fuser_enabled(False) - torch._C._jit_set_nvfuser_enabled(True) - torch._C._debug_set_autodiff_subgraph_inlining(False) - else: - # legacy pytorch fuser - torch._C._jit_set_profiling_mode(False) - torch._C._jit_set_profiling_executor(False) - torch._C._jit_override_can_fuse_on_cpu(True) - torch._C._jit_override_can_fuse_on_gpu(True) - - _warmup_jit_function() - - -def _warmup_jit_function(): - """ Compilie JIT functions before the main training steps """ - args = get_args() - if args.bf16: - dtype = torch.bfloat16 - elif args.fp16: - dtype = torch.float16 - else: - dtype = torch.float32 - - # Warmup fused bias+gelu - bias = torch.rand(args.ffn_hidden_size // args.tensor_model_parallel_size, - dtype=dtype, device='cuda') - input = torch.rand((args.seq_length, args.micro_batch_size, - args.ffn_hidden_size // args.tensor_model_parallel_size), - dtype=dtype, device='cuda') - # Warmup JIT fusions with the input grad_enable state of both forward - # prop and recomputation - for bias_grad, input_grad in zip([True, True], [False, True]): - bias.requires_grad, input.requires_grad = bias_grad, input_grad - for _ in range(5): - output = bias_gelu(bias, input) - del bias, input, output - - # Warmup fused bias+dropout+add - if args.sequence_parallel: - seq_length = args.seq_length // mpu.get_tensor_model_parallel_world_size() - else: - seq_length = args.seq_length - input = torch.rand((seq_length, args.micro_batch_size, args.hidden_size), - dtype=dtype, device='cuda') - residual = torch.rand((seq_length, args.micro_batch_size, args.hidden_size), - dtype=dtype, device='cuda') - bias = torch.rand((args.hidden_size), dtype=dtype, device='cuda').expand_as(residual) - dropout_rate = 0.1 - # Warmup JIT fusions with the input grad_enable state of both forward - # prop and recomputation - for input_grad, bias_grad, residual_grad in zip([False, True], [True, True], [True, True]): - input.requires_grad = input_grad - bias.requires_grad = bias_grad - residual.requires_grad = residual_grad - for _ in range(5): - output = bias_dropout_add_fused_train(input, bias, residual, dropout_rate) - del bias, input, residual, output - torch.cuda.empty_cache() diff --git a/megatron/legacy/data/__init__.py b/megatron/legacy/data/__init__.py new file mode 100644 index 0000000000..f8011007a5 --- /dev/null +++ 
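The `_set_random_seed` logic shown above offsets the base seed per pipeline stage, and optionally per data-parallel replica, so different model chunks draw different random streams. A self-contained restatement of the seed derivation:

# Sketch of the seed derivation from the removed _set_random_seed().
def derive_seed(base_seed, pipeline_rank, data_parallel_rank=0,
                data_parallel_random_init=False):
    if base_seed is None or base_seed <= 0:
        raise ValueError('Seed ({}) should be a positive integer.'.format(base_seed))
    seed = base_seed + 100 * pipeline_rank
    if data_parallel_random_init:
        seed += 10 * data_parallel_rank
    return seed

assert derive_seed(1234, pipeline_rank=0) == 1234
assert derive_seed(1234, pipeline_rank=2) == 1434
assert derive_seed(1234, 2, 3, data_parallel_random_init=True) == 1464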
b/megatron/legacy/data/__init__.py @@ -0,0 +1 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. diff --git a/megatron/data/autoaugment.py b/megatron/legacy/data/autoaugment.py similarity index 99% rename from megatron/data/autoaugment.py rename to megatron/legacy/data/autoaugment.py index 585a4fa6a5..d86127a60b 100644 --- a/megatron/data/autoaugment.py +++ b/megatron/legacy/data/autoaugment.py @@ -1,3 +1,4 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. """AutoAugment data augmentation policy for ImageNet. -- Begin license text. @@ -193,7 +194,7 @@ def __init__( "rotate": np.linspace(0, 30, num_levels), "color": np.linspace(0.0, 0.9, num_levels), "posterize": np.round(np.linspace(8, 4, num_levels), 0).astype( - np.int + np.int32 ), "solarize": np.linspace(256, 0, num_levels), # range [0, 256] "contrast": np.linspace(0.0, 0.9, num_levels), diff --git a/megatron/data/biencoder_dataset_utils.py b/megatron/legacy/data/biencoder_dataset_utils.py similarity index 91% rename from megatron/data/biencoder_dataset_utils.py rename to megatron/legacy/data/biencoder_dataset_utils.py index c08f067923..05e5ff0ca9 100644 --- a/megatron/data/biencoder_dataset_utils.py +++ b/megatron/legacy/data/biencoder_dataset_utils.py @@ -1,14 +1,15 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. import os import time import numpy as np import torch -from megatron import get_args, get_tokenizer, print_rank_0 +from megatron.training import get_args, get_tokenizer, print_rank_0 from megatron.core import mpu, tensor_parallel -from megatron.data.dataset_utils import create_masked_lm_predictions, \ +from megatron.legacy.data.dataset_utils import create_masked_lm_predictions, \ pad_and_convert_to_numpy -from megatron.data.data_samplers import MegatronPretrainingSampler +from megatron.legacy.data.data_samplers import MegatronPretrainingSampler def make_attention_mask(source_block, target_block): """ @@ -154,8 +155,8 @@ def get_block_samples_mapping(block_dataset, title_dataset, data_prefix, num_epo 'the indices on rank 0 ...'.format(indexmap_filename)) # Make sure the types match the helpers input types. 
- assert block_dataset.doc_idx.dtype == np.int64 - assert block_dataset.sizes.dtype == np.int32 + assert block_dataset.document_indices.dtype == np.int64 + assert block_dataset.sequence_lengths.dtype == np.int32 # Build samples mapping verbose = torch.distributed.get_rank() == 0 @@ -163,11 +164,11 @@ def get_block_samples_mapping(block_dataset, title_dataset, data_prefix, num_epo print_rank_0(' > building samples index mapping for {} ...'.format( name)) - from megatron.data import helpers + from megatron.core.datasets import helpers mapping_array = helpers.build_blocks_mapping( - block_dataset.doc_idx, - block_dataset.sizes, - title_dataset.sizes, + block_dataset.document_indices, + block_dataset.sequence_lengths, + title_dataset.sequence_lengths, num_epochs, max_num_samples, max_seq_length - 3, # account for added tokens @@ -188,7 +189,7 @@ def get_block_samples_mapping(block_dataset, title_dataset, data_prefix, num_epo # This should be a barrier but nccl barrier assumes # device_index=rank which is not the case for model # parallel case - counts = torch.cuda.LongTensor([1]) + counts = torch.tensor([1], dtype=torch.long, device='cuda') torch.distributed.all_reduce(counts, group=mpu.get_data_parallel_group()) assert counts[0].item() == torch.distributed.get_world_size( group=mpu.get_data_parallel_group()) diff --git a/megatron/data/data_samplers.py b/megatron/legacy/data/data_samplers.py similarity index 93% rename from megatron/data/data_samplers.py rename to megatron/legacy/data/data_samplers.py index 8dec2c1922..78c7e1af41 100644 --- a/megatron/data/data_samplers.py +++ b/megatron/legacy/data/data_samplers.py @@ -7,12 +7,12 @@ import torch import numpy as np from torch.utils.data import Dataset -from megatron import get_args +from megatron.training import get_args from megatron.core import mpu def build_pretraining_data_loader(dataset, consumed_samples): - """Buld dataloader given an input dataset.""" + """Build dataloader given an input dataset.""" if dataset is None: return None @@ -35,6 +35,10 @@ def build_pretraining_data_loader(dataset, consumed_samples): data_parallel_rank=mpu.get_data_parallel_rank(), data_parallel_size=mpu.get_data_parallel_world_size(), data_sharding=args.data_sharding) + elif args.dataloader_type == "external": + # External dataloaders are passed through. User is expected to provide a + # torch-compatible dataloader and define samplers, if needed. + return dataset else: raise Exception('{} dataloader type is not supported.'.format( args.dataloader_type)) @@ -43,7 +47,9 @@ def build_pretraining_data_loader(dataset, consumed_samples): return torch.utils.data.DataLoader(dataset, batch_sampler=batch_sampler, num_workers=args.num_workers, - pin_memory=True) + pin_memory=True, + persistent_workers=True if args.num_workers > 0 else False, + ) class MegatronPretrainingSampler: @@ -160,7 +166,7 @@ def __iter__(self): * self.micro_batch_size bucket_offset = current_epoch_samples // self.data_parallel_size start_idx = self.data_parallel_rank * bucket_size - + g = torch.Generator() g.manual_seed(self.epoch) random_idx = torch.randperm(bucket_size, generator=g).tolist() diff --git a/megatron/data/dataset_utils.py b/megatron/legacy/data/dataset_utils.py similarity index 77% rename from megatron/data/dataset_utils.py rename to megatron/legacy/data/dataset_utils.py index 2f6f3e2fe9..067f87ccea 100644 --- a/megatron/data/dataset_utils.py +++ b/megatron/legacy/data/dataset_utils.py @@ -1,3 +1,4 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
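The `data_samplers` hunk above adds an "external" dataloader type that is passed through untouched and enables `persistent_workers` whenever worker processes are used. A runnable sketch of the resulting construction logic, with a toy dataset and a plain batch sampler standing in for the Megatron dataset and sampler classes:

# Sketch of the dataloader construction after this change (toy dataset, illustrative args).
import torch
from torch.utils.data import DataLoader, TensorDataset

def build_loader(dataset, batch_sampler, dataloader_type, num_workers):
    if dataset is None:
        return None
    if dataloader_type == "external":
        # User supplies a ready-made, torch-compatible dataloader.
        return dataset
    return DataLoader(dataset,
                      batch_sampler=batch_sampler,
                      num_workers=num_workers,
                      pin_memory=True,
                      persistent_workers=num_workers > 0)

ds = TensorDataset(torch.arange(16))
sampler = torch.utils.data.BatchSampler(torch.utils.data.SequentialSampler(ds),
                                        batch_size=4, drop_last=False)
loader = build_loader(ds, sampler, dataloader_type="single", num_workers=0)
assert len(list(loader)) == 4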
# coding=utf-8 # Copyright 2018 The Google AI Language Team Authors, and NVIDIA. # @@ -26,19 +27,20 @@ import numpy as np import torch -from megatron import ( +from megatron.training import ( get_args, print_rank_0 ) from megatron.core import mpu -from megatron.data.blendable_dataset import BlendableDataset -from megatron.data.indexed_dataset import make_dataset as make_indexed_dataset +from megatron.core.datasets.indexed_dataset import IndexedDataset + DSET_TYPE_BERT = 'standard_bert' DSET_TYPE_ICT = 'ict' DSET_TYPE_T5 = 't5' +DSET_TYPE_MULTIMODAL = 'multimodal' -DSET_TYPES = [DSET_TYPE_BERT, DSET_TYPE_ICT, DSET_TYPE_T5] +DSET_TYPES = [DSET_TYPE_BERT, DSET_TYPE_ICT, DSET_TYPE_T5, DSET_TYPE_MULTIMODAL] def get_datasets_weights_and_num_samples(data_prefix, @@ -79,19 +81,6 @@ def get_datasets_weights_and_num_samples(data_prefix, return prefixes, weights, datasets_train_valid_test_num_samples -def compile_helper(): - """Compile helper function ar runtime. Make sure this - is invoked on a single process.""" - import os - import subprocess - path = os.path.abspath(os.path.dirname(__file__)) - ret = subprocess.run(['make', '-C', path]) - if ret.returncode != 0: - print("Making C++ dataset helpers module failed, exiting.") - import sys - sys.exit(1) - - def get_a_and_b_segments(sample, np_rng): """Divide sample into a and b segments.""" @@ -419,93 +408,77 @@ def pad_and_convert_to_numpy(tokens, tokentypes, masked_positions, return tokens_np, tokentypes_np, labels_np, padding_mask_np, loss_mask_np -def build_train_valid_test_datasets(data_prefix, data_impl, splits_string, +def build_train_valid_test_datasets_with_prefixes(train_valid_test_num_samples, + max_seq_length, + seed, + train_data_prefix=None, + valid_data_prefix=None, + test_data_prefix=None, + binary_head=False, + max_seq_length_dec=None, + dataset_type='standard_bert'): + print_rank_0("Separate data paths provided for train, valid & test.") + + train_dataset, valid_dataset, test_dataset = None, None, None + # Single dataset. + if train_data_prefix is not None: + train_dataset = build_dataset("train", train_data_prefix, + train_valid_test_num_samples[0], + max_seq_length, seed, + binary_head, max_seq_length_dec, + dataset_type=dataset_type) + + if valid_data_prefix is not None: + valid_dataset = build_dataset("valid", valid_data_prefix, + train_valid_test_num_samples[1], + max_seq_length, seed, False, + binary_head, max_seq_length_dec, + dataset_type=dataset_type) + + if test_data_prefix is not None: + test_dataset = build_dataset("test", test_data_prefix, + train_valid_test_num_samples[2], + max_seq_length, seed, False, + binary_head, max_seq_length_dec, + dataset_type=dataset_type) + + return (train_dataset, valid_dataset, test_dataset) + + +def build_train_valid_test_datasets(data_prefix, splits_string, train_valid_test_num_samples, - max_seq_length, - masked_lm_prob, short_seq_prob, seed, - skip_warmup, binary_head=False, + max_seq_length, seed, + binary_head=False, max_seq_length_dec=None, dataset_type='standard_bert'): if len(data_prefix) == 1: return _build_train_valid_test_datasets(data_prefix[0], - data_impl, splits_string, + splits_string, train_valid_test_num_samples, - max_seq_length, masked_lm_prob, - short_seq_prob, seed, - skip_warmup, + max_seq_length, seed, binary_head, max_seq_length_dec, dataset_type=dataset_type) - # Blending dataset. - # Parse the values. 
- output = get_datasets_weights_and_num_samples(data_prefix, - train_valid_test_num_samples) - prefixes, weights, datasets_train_valid_test_num_samples = output - train_num_samples, valid_num_samples, test_num_samples = map( - sum, - zip(*datasets_train_valid_test_num_samples) - ) - # Build individual datasets. - train_datasets = [] - valid_datasets = [] - test_datasets = [] - for i in range(len(prefixes)): - train_ds, valid_ds, test_ds = _build_train_valid_test_datasets( - prefixes[i], data_impl, splits_string, - datasets_train_valid_test_num_samples[i], - max_seq_length, masked_lm_prob, short_seq_prob, - seed, skip_warmup, binary_head, max_seq_length_dec, - dataset_type=dataset_type) - if train_ds: - train_datasets.append(train_ds) - if valid_ds: - valid_datasets.append(valid_ds) - if test_ds: - test_datasets.append(test_ds) - - # Blend. - blending_train_dataset = None - if train_datasets: - blending_train_dataset = BlendableDataset(train_datasets, weights, train_num_samples) - blending_valid_dataset = None - if valid_datasets: - blending_valid_dataset = BlendableDataset(valid_datasets, weights, valid_num_samples) - blending_test_dataset = None - if test_datasets: - blending_test_dataset = BlendableDataset(test_datasets, weights, test_num_samples) - - return (blending_train_dataset, blending_valid_dataset, - blending_test_dataset) - - -def _build_train_valid_test_datasets(data_prefix, data_impl, splits_string, + raise NotImplementedError("Blending currently unsupported for non-GPT dataset instances") + + +def _build_train_valid_test_datasets(data_prefix, splits_string, train_valid_test_num_samples, - max_seq_length, - masked_lm_prob, short_seq_prob, seed, - skip_warmup, binary_head, + max_seq_length, seed, + binary_head, max_seq_length_dec, dataset_type='standard_bert'): - if dataset_type not in DSET_TYPES: - raise ValueError("Invalid dataset_type: ", dataset_type) - # Indexed dataset. indexed_dataset = get_indexed_dataset_(data_prefix, - data_impl, - skip_warmup) - - if dataset_type == DSET_TYPE_ICT: - args = get_args() - title_dataset = get_indexed_dataset_(args.titles_data_path, - data_impl, - skip_warmup) + dataset_type) # Get start and end indices of train/valid/train into doc-idx # Note that doc-idx is desinged to be num-docs + 1 so we can # easily iterate over it. - total_num_of_documents = indexed_dataset.doc_idx.shape[0] - 1 + total_num_of_documents = indexed_dataset.document_indices.shape[0] - 1 splits = get_train_valid_test_split_(splits_string, total_num_of_documents) # Print stats about the splits. 
@@ -516,8 +489,8 @@ def print_split_stats(name, index): print_rank_0(' document indices in [{}, {}) total of {} ' 'documents'.format(splits[index], splits[index + 1], splits[index + 1] - splits[index])) - start_index = indexed_dataset.doc_idx[splits[index]] - end_index = indexed_dataset.doc_idx[splits[index + 1]] + start_index = indexed_dataset.document_indices[splits[index]] + end_index = indexed_dataset.document_indices[splits[index + 1]] print_rank_0(' sentence indices in [{}, {}) total of {} ' 'sentences'.format(start_index, end_index, end_index - start_index)) @@ -525,91 +498,115 @@ def print_split_stats(name, index): print_split_stats('validation', 1) print_split_stats('test', 2) - def build_dataset(index, name): - from megatron.data.bert_dataset import BertDataset - from megatron.data.ict_dataset import ICTDataset - from megatron.data.t5_dataset import T5Dataset + def build_split_dataset(index, name): dataset = None if splits[index + 1] > splits[index]: # Get the pointer to the original doc-idx so we can set it later. - doc_idx_ptr = indexed_dataset.get_doc_idx() + doc_idx_ptr = indexed_dataset.get_document_indices() # Slice the doc-idx start_index = splits[index] # Add +1 so we can index into the dataset to get the upper bound. end_index = splits[index + 1] + 1 # New doc_idx view. - indexed_dataset.set_doc_idx(doc_idx_ptr[start_index:end_index]) - # Build the dataset accordingly. - kwargs = dict( - name=name, - data_prefix=data_prefix, - num_epochs=None, - max_num_samples=train_valid_test_num_samples[index], - max_seq_length=max_seq_length, - seed=seed, - ) - - if dataset_type == DSET_TYPE_ICT: - args = get_args() - dataset = ICTDataset( - block_dataset=indexed_dataset, - title_dataset=title_dataset, - query_in_block_prob=args.query_in_block_prob, - use_one_sent_docs=args.use_one_sent_docs, - binary_head=binary_head, - **kwargs - ) - elif dataset_type == DSET_TYPE_T5: - dataset = T5Dataset( - indexed_dataset=indexed_dataset, - masked_lm_prob=masked_lm_prob, - max_seq_length_dec=max_seq_length_dec, - short_seq_prob=short_seq_prob, - **kwargs - ) - elif dataset_type == DSET_TYPE_BERT: - dataset = BertDataset( - indexed_dataset=indexed_dataset, - masked_lm_prob=masked_lm_prob, - short_seq_prob=short_seq_prob, - binary_head=binary_head, - **kwargs - ) - else: - raise NotImplementedError("Dataset type not fully implemented.") + indexed_dataset.set_document_indices(doc_idx_ptr[start_index:end_index]) + + dataset = build_dataset( + name, data_prefix, + train_valid_test_num_samples[index], max_seq_length, + seed, binary_head, max_seq_length_dec, + dataset_type, indexed_dataset) # Set the original pointer so dataset remains the main dataset. - indexed_dataset.set_doc_idx(doc_idx_ptr) + indexed_dataset.set_document_indices(doc_idx_ptr) # Checks. 
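In `build_split_dataset` above, each split temporarily narrows the indexed dataset to `document_indices[splits[i] : splits[i+1] + 1]`; the extra entry keeps the upper bound of the last document addressable. A small numeric example of that slicing (the arrays are invented):

# Sketch of the per-split view over document_indices.
import numpy as np

def slice_split(document_indices, splits, index):
    start_index = splits[index]
    end_index = splits[index + 1] + 1  # +1 so the upper bound stays indexable
    return document_indices[start_index:end_index]

document_indices = np.array([0, 3, 7, 9, 12], dtype=np.int64)  # 4 documents
splits = [0, 2, 3, 4]  # train: docs 0-1, valid: doc 2, test: doc 3
train_view = slice_split(document_indices, splits, 0)
assert list(train_view) == [0, 3, 7]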
- assert indexed_dataset.doc_idx[0] == 0 - assert indexed_dataset.doc_idx.shape[0] == \ + assert indexed_dataset.document_indices[0] == 0 + assert indexed_dataset.document_indices.shape[0] == \ (total_num_of_documents + 1) return dataset - - train_dataset = build_dataset(0, 'train') - valid_dataset = build_dataset(1, 'valid') - test_dataset = build_dataset(2, 'test') + + train_dataset = build_split_dataset(0, 'train') + valid_dataset = build_split_dataset(1, 'valid') + test_dataset = build_split_dataset(2, 'test') return (train_dataset, valid_dataset, test_dataset) -def get_indexed_dataset_(data_prefix, data_impl, skip_warmup): +def build_dataset(name, data_prefix, max_num_samples, + max_seq_length, seed, binary_head, + max_seq_length_dec, dataset_type='standard_bert', + indexed_dataset=None): + + from megatron.legacy.data.ict_dataset import ICTDataset + from megatron.legacy.data.multimodal_dataset import MultiModalDataset + + if dataset_type == DSET_TYPE_BERT or dataset_type == DSET_TYPE_T5: + raise ValueError("The Megatron-LM BERT and T5 datasets are deprecated.") + + if dataset_type not in DSET_TYPES: + raise ValueError("Invalid dataset_type: ", dataset_type) + + if indexed_dataset is None: + indexed_dataset = get_indexed_dataset_(data_prefix, + dataset_type) + + kwargs = dict( + name=name, + data_prefix=data_prefix, + num_epochs=None, + max_num_samples=max_num_samples, + max_seq_length=max_seq_length, + seed=seed, + ) + + if dataset_type == DSET_TYPE_ICT: + args = get_args() + + title_dataset = get_indexed_dataset_( + args.titles_data_path, + dataset_type) + + dataset = ICTDataset( + block_dataset=indexed_dataset, + title_dataset=title_dataset, + query_in_block_prob=args.query_in_block_prob, + use_one_sent_docs=args.use_one_sent_docs, + binary_head=binary_head, + **kwargs + ) + elif dataset_type == DSET_TYPE_MULTIMODAL: + args = get_args() + dataset = MultiModalDataset( + name=name, + data_prefix=data_prefix, + indexed_dataset=indexed_dataset, + num_samples=max_num_samples, + seq_length=max_seq_length, + seed=seed, + img_h=args.img_h, + img_w=args.img_w, + ) + else: + raise NotImplementedError("Dataset type not fully implemented.") + + return dataset + + +def get_indexed_dataset_(data_prefix, dataset_type): print_rank_0(' > building dataset index ...') start_time = time.time() - indexed_dataset = make_indexed_dataset(data_prefix, - data_impl, - skip_warmup) - assert indexed_dataset.sizes.shape[0] == indexed_dataset.doc_idx[-1] + multimodal = dataset_type == DSET_TYPE_MULTIMODAL + indexed_dataset = IndexedDataset(data_prefix, multimodal) + assert indexed_dataset.sequence_lengths.shape[0] == indexed_dataset.document_indices[-1] print_rank_0(' > finished creating indexed dataset in {:4f} ' 'seconds'.format(time.time() - start_time)) print_rank_0(' > indexed dataset stats:') print_rank_0(' number of documents: {}'.format( - indexed_dataset.doc_idx.shape[0] - 1)) + indexed_dataset.document_indices.shape[0] - 1)) print_rank_0(' number of sentences: {}'.format( - indexed_dataset.sizes.shape[0])) + indexed_dataset.sequence_lengths.shape[0])) return indexed_dataset @@ -679,8 +676,8 @@ def get_samples_mapping(indexed_dataset, 'the indices on rank 0 ...'.format(indexmap_filename)) # Make sure the types match the helpers input types. 
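`get_indexed_dataset_` now checks the renamed fields directly: the number of sequence lengths must equal the final document index, since documents partition the flat list of sequences. Illustrated with toy arrays rather than a real `IndexedDataset`:

# Sketch of the invariant checked after building the indexed dataset.
import numpy as np

sequence_lengths = np.array([5, 9, 4, 7, 3], dtype=np.int32)   # 5 sequences
document_indices = np.array([0, 2, 5], dtype=np.int64)         # 2 documents

assert sequence_lengths.shape[0] == document_indices[-1]
num_documents = document_indices.shape[0] - 1
assert num_documents == 2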
- assert indexed_dataset.doc_idx.dtype == np.int64 - assert indexed_dataset.sizes.dtype == np.int32 + assert indexed_dataset.document_indices.dtype == np.int64 + assert indexed_dataset.sequence_lengths.dtype == np.int32 # Build samples mapping verbose = torch.distributed.get_rank() == 0 @@ -688,10 +685,10 @@ def get_samples_mapping(indexed_dataset, print_rank_0(' > building samples index mapping for {} ...'.format( name)) # First compile and then import. - from megatron.data import helpers + from megatron.core.datasets import helpers samples_mapping = helpers.build_mapping( - indexed_dataset.doc_idx, - indexed_dataset.sizes, + indexed_dataset.document_indices, + indexed_dataset.sequence_lengths, num_epochs, max_num_samples, max_seq_length, @@ -710,7 +707,7 @@ def get_samples_mapping(indexed_dataset, # This should be a barrier but nccl barrier assumes # device_index=rank which is not the case for model # parallel case - counts = torch.cuda.LongTensor([1]) + counts = torch.tensor([1], dtype=torch.long, device='cuda') torch.distributed.all_reduce(counts, group=mpu.get_data_parallel_group()) torch.distributed.all_reduce(counts, group=mpu.get_pipeline_model_parallel_group()) assert counts[0].item() == ( diff --git a/megatron/data/ict_dataset.py b/megatron/legacy/data/ict_dataset.py similarity index 95% rename from megatron/data/ict_dataset.py rename to megatron/legacy/data/ict_dataset.py index 6dac35ff9d..9af552d636 100644 --- a/megatron/data/ict_dataset.py +++ b/megatron/legacy/data/ict_dataset.py @@ -1,13 +1,14 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. import itertools import random import numpy as np from torch.utils.data import Dataset -from megatron import get_tokenizer -from megatron import get_args -from megatron.data.dataset_utils import get_indexed_dataset_ -from megatron.data.realm_dataset_utils import get_block_samples_mapping +from megatron.training import get_tokenizer +from megatron.training import get_args +from megatron.legacy.data.dataset_utils import get_indexed_dataset_ +from megatron.legacy.data.realm_dataset_utils import get_block_samples_mapping def make_attention_mask(source_block, target_block): """ diff --git a/megatron/data/image_folder.py b/megatron/legacy/data/image_folder.py similarity index 100% rename from megatron/data/image_folder.py rename to megatron/legacy/data/image_folder.py diff --git a/megatron/legacy/data/multimodal_dataset.py b/megatron/legacy/data/multimodal_dataset.py new file mode 100644 index 0000000000..93ea790329 --- /dev/null +++ b/megatron/legacy/data/multimodal_dataset.py @@ -0,0 +1,54 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
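Both samples-mapping helpers end with the same pseudo-barrier seen above: an all-reduce of a ones tensor whose sum is checked against the group size, used because an NCCL barrier would assume `device_index == rank`. The sketch below reproduces the pattern on a single-process gloo group on CPU purely so it runs anywhere; the real code uses CUDA tensors on the data-parallel group.

# Sketch of the all-reduce pseudo-barrier used after index building.
import os
import torch
import torch.distributed as dist

os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
os.environ.setdefault("MASTER_PORT", "29512")
dist.init_process_group("gloo", rank=0, world_size=1)

counts = torch.tensor([1], dtype=torch.long)
dist.all_reduce(counts)
assert counts[0].item() == dist.get_world_size()

dist.destroy_process_group()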
+ +from PIL import Image, UnidentifiedImageError +import numpy as np +import io +import torch + +try: + from torchvision.transforms import InterpolationMode + BICUBIC = InterpolationMode.BICUBIC +except ImportError: + BICUBIC = Image.BICUBIC + +from torchvision.transforms import Compose, ToTensor, Normalize, ToPILImage, RandomResizedCrop, Resize + +def _convert_image_to_rgb(image): + return image.convert("RGB") + +def _transform(img_h, img_w): + return Compose([ + ToPILImage(), + RandomResizedCrop((img_h, img_w), scale=(0.5, 1.0), interpolation=BICUBIC), + _convert_image_to_rgb, + ToTensor(), + Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)), + ]) + +class MultiModalDataset(torch.utils.data.Dataset): + + def __init__(self, name, data_prefix, indexed_dataset, + num_samples, seq_length, seed, img_h, img_w): + + self.name = name + self.indexed_dataset = indexed_dataset + self.doc_idx = indexed_dataset.get_document_indices() + self.visual_transform = _transform(img_h, img_w) + + def __len__(self): + return self.indexed_dataset.sequence_lengths.shape[0] + + def __getitem__(self, idx): + text_sample, mode = self.indexed_dataset.get(self.doc_idx[idx]) + assert mode == 0 + img_sample, mode = self.indexed_dataset.get(self.doc_idx[idx]+1) + assert mode == 1 + img_pad = img_sample[0].item() + xs = img_sample[1:].tobytes(order='C') + xs = xs[:len(xs)-img_pad] + + img_sample = np.array(Image.open(io.BytesIO(xs))) + img_sample = self.visual_transform(img_sample).reshape(-1) + + return {'text': np.array(text_sample, dtype=np.int64), + 'img': np.array(img_sample, dtype=np.float32)} diff --git a/megatron/data/orqa_wiki_dataset.py b/megatron/legacy/data/orqa_wiki_dataset.py similarity index 97% rename from megatron/data/orqa_wiki_dataset.py rename to megatron/legacy/data/orqa_wiki_dataset.py index 4019cd764c..99217d64b0 100644 --- a/megatron/data/orqa_wiki_dataset.py +++ b/megatron/legacy/data/orqa_wiki_dataset.py @@ -9,9 +9,9 @@ import torch from torch.utils.data import Dataset -from megatron import print_rank_0, get_args, get_tokenizer +from megatron.training import print_rank_0, get_args, get_tokenizer from megatron.core import tensor_parallel -from megatron.data.biencoder_dataset_utils import make_attention_mask +from megatron.legacy.data.biencoder_dataset_utils import make_attention_mask def get_open_retrieval_wiki_dataset(): args = get_args() diff --git a/megatron/data/realm_dataset_utils.py b/megatron/legacy/data/realm_dataset_utils.py similarity index 90% rename from megatron/data/realm_dataset_utils.py rename to megatron/legacy/data/realm_dataset_utils.py index 21445573e3..d8ebc450dd 100644 --- a/megatron/data/realm_dataset_utils.py +++ b/megatron/legacy/data/realm_dataset_utils.py @@ -1,13 +1,14 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
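`MultiModalDataset.__getitem__` above reads an image record whose first element is a pad count and whose remaining bytes are the encoded image plus padding. The sketch below rebuilds and decodes such a record in memory; the uint8 dtype and the 3-byte pad are assumptions for illustration, not guarantees about the on-disk format. Requires Pillow.

# Sketch of the image-record decoding done in MultiModalDataset.__getitem__.
import io
import numpy as np
from PIL import Image

# Build a toy record: [pad_count, encoded image bytes..., pad bytes...].
buf = io.BytesIO()
Image.new("RGB", (8, 8), color=(255, 0, 0)).save(buf, format="PNG")
raw = buf.getvalue()
pad = 3  # pretend the writer appended 3 pad bytes
record = np.frombuffer(bytes([pad]) + raw + b"\x00" * pad, dtype=np.uint8)

# Decode exactly as the dataset does: strip the pad, reopen the image.
img_pad = record[0].item()
xs = record[1:].tobytes(order="C")
xs = xs[: len(xs) - img_pad]
img = np.array(Image.open(io.BytesIO(xs)))
assert img.shape == (8, 8, 3)

The decoded array would then go through the `_transform(img_h, img_w)` pipeline before being flattened into the sample dict.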
import os import time import numpy as np import torch -from megatron import print_rank_0 +from megatron.training import print_rank_0 from megatron.core import mpu, tensor_parallel -from megatron.data.dataset_utils import create_masked_lm_predictions, pad_and_convert_to_numpy -from megatron import get_args, get_tokenizer, print_rank_0 +from megatron.legacy.data.dataset_utils import create_masked_lm_predictions, pad_and_convert_to_numpy +from megatron.training import get_args, get_tokenizer, print_rank_0 def get_one_epoch_dataloader(dataset, micro_batch_size=None): @@ -24,7 +25,7 @@ def get_one_epoch_dataloader(dataset, micro_batch_size=None): sampler = torch.utils.data.SequentialSampler(dataset) # importantly, drop_last must be False to get all the data. assert False, 'DistributedBatchSampler deprecated, change the implementation' - from megatron.data.samplers import DistributedBatchSampler + from megatron.legacy.data.samplers import DistributedBatchSampler batch_sampler = DistributedBatchSampler(sampler, batch_size=global_batch_size, drop_last=False, @@ -144,8 +145,8 @@ def get_block_samples_mapping(block_dataset, title_dataset, data_prefix, num_epo 'the indices on rank 0 ...'.format(indexmap_filename)) # Make sure the types match the helpers input types. - assert block_dataset.doc_idx.dtype == np.int64 - assert block_dataset.sizes.dtype == np.int32 + assert block_dataset.document_indices.dtype == np.int64 + assert block_dataset.sequence_lengths.dtype == np.int32 # Build samples mapping verbose = torch.distributed.get_rank() == 0 @@ -153,11 +154,11 @@ def get_block_samples_mapping(block_dataset, title_dataset, data_prefix, num_epo print_rank_0(' > building samples index mapping for {} ...'.format( name)) - from megatron.data import helpers + from megatron.core.datasets import helpers mapping_array = helpers.build_blocks_mapping( - block_dataset.doc_idx, - block_dataset.sizes, - title_dataset.sizes, + block_dataset.document_indices, + block_dataset.sequence_lengths, + title_dataset.sequence_lengths, num_epochs, max_num_samples, max_seq_length - 3, # account for added tokens @@ -178,7 +179,7 @@ def get_block_samples_mapping(block_dataset, title_dataset, data_prefix, num_epo # This should be a barrier but nccl barrier assumes # device_index=rank which is not the case for model # parallel case - counts = torch.cuda.LongTensor([1]) + counts = torch.tensor([1], dtype=torch.long, device='cuda') torch.distributed.all_reduce(counts, group=mpu.get_data_parallel_group()) assert counts[0].item() == torch.distributed.get_world_size( group=mpu.get_data_parallel_group()) diff --git a/megatron/data/realm_index.py b/megatron/legacy/data/realm_index.py similarity index 98% rename from megatron/data/realm_index.py rename to megatron/legacy/data/realm_index.py index 1fa4a309ed..dbe924a52a 100644 --- a/megatron/data/realm_index.py +++ b/megatron/legacy/data/realm_index.py @@ -1,3 +1,4 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
import itertools import os import pickle @@ -6,7 +7,7 @@ import numpy as np import torch -from megatron import get_args +from megatron.training import get_args from megatron.core import mpu diff --git a/megatron/data/vit_dataset.py b/megatron/legacy/data/vit_dataset.py similarity index 97% rename from megatron/data/vit_dataset.py rename to megatron/legacy/data/vit_dataset.py index 82391e9157..e65c536c89 100644 --- a/megatron/data/vit_dataset.py +++ b/megatron/legacy/data/vit_dataset.py @@ -5,10 +5,10 @@ import torch import torchvision.transforms as T from torchvision import datasets -from megatron import get_args -from megatron.data.image_folder import ImageFolder -from megatron.data.autoaugment import ImageNetPolicy -from megatron.data.data_samplers import RandomSeedDataset +from megatron.training import get_args +from megatron.legacy.data.image_folder import ImageFolder +from megatron.legacy.data.autoaugment import ImageNetPolicy +from megatron.legacy.data.data_samplers import RandomSeedDataset from PIL import Image, ImageFilter, ImageOps diff --git a/megatron/fp16_deprecated/loss_scaler.py b/megatron/legacy/fp16_deprecated/loss_scaler.py similarity index 100% rename from megatron/fp16_deprecated/loss_scaler.py rename to megatron/legacy/fp16_deprecated/loss_scaler.py diff --git a/megatron/fused_kernels/__init__.py b/megatron/legacy/fused_kernels/__init__.py similarity index 50% rename from megatron/fused_kernels/__init__.py rename to megatron/legacy/fused_kernels/__init__.py index dcbf24cb3f..87cceac3e3 100644 --- a/megatron/fused_kernels/__init__.py +++ b/megatron/legacy/fused_kernels/__init__.py @@ -19,17 +19,18 @@ def load(args): # Check if cuda 11 is installed for compute capability 8.0 cc_flag = [] _, bare_metal_major, bare_metal_minor = _get_cuda_bare_metal_version( - cpp_extension.CUDA_HOME) + cpp_extension.CUDA_HOME + ) if int(bare_metal_major) >= 11: cc_flag.append('-gencode') cc_flag.append('arch=compute_80,code=sm_80') - if int(bare_metal_minor) >= 7: + if int(bare_metal_minor) >= 8: cc_flag.append('-gencode') cc_flag.append('arch=compute_90,code=sm_90') # Build path srcpath = pathlib.Path(__file__).parent.absolute() - buildpath = srcpath / 'build' + buildpath = srcpath / "build" _create_build_dir(buildpath) # Helper function to build the kernels. @@ -38,46 +39,25 @@ def _cpp_extention_load_helper(name, sources, extra_cuda_flags): name=name, sources=sources, build_directory=buildpath, - extra_cflags=['-O3',], - extra_cuda_cflags=['-O3', - '-gencode', 'arch=compute_70,code=sm_70', - '--use_fast_math'] + extra_cuda_flags + cc_flag, - verbose=(args.rank == 0) + extra_cflags=[ + "-O3", + ], + extra_cuda_cflags=[ + "-O3", + "-gencode", + "arch=compute_70,code=sm_70", + "--use_fast_math", + ] + + extra_cuda_flags + + cc_flag, + verbose=(args.rank == 0), ) - # ============== - # Fused softmax. - # ============== - - if args.masked_softmax_fusion: - extra_cuda_flags = ['-U__CUDA_NO_HALF_OPERATORS__', - '-U__CUDA_NO_HALF_CONVERSIONS__', - '--expt-relaxed-constexpr', - '--expt-extended-lambda'] - - # Upper triangular softmax. - sources=[srcpath / 'scaled_upper_triang_masked_softmax.cpp', - srcpath / 'scaled_upper_triang_masked_softmax_cuda.cu'] - scaled_upper_triang_masked_softmax_cuda = _cpp_extention_load_helper( - "scaled_upper_triang_masked_softmax_cuda", - sources, extra_cuda_flags) - - # Masked softmax. 
- sources=[srcpath / 'scaled_masked_softmax.cpp', - srcpath / 'scaled_masked_softmax_cuda.cu'] - scaled_masked_softmax_cuda = _cpp_extention_load_helper( - "scaled_masked_softmax_cuda", sources, extra_cuda_flags) - - # Softmax - sources=[srcpath / 'scaled_softmax.cpp', - srcpath / 'scaled_softmax_cuda.cu'] - scaled_softmax_cuda = _cpp_extention_load_helper( - "scaled_softmax_cuda", sources, extra_cuda_flags) - def _get_cuda_bare_metal_version(cuda_dir): - raw_output = subprocess.check_output([cuda_dir + "/bin/nvcc", "-V"], - universal_newlines=True) + raw_output = subprocess.check_output( + [cuda_dir + "/bin/nvcc", "-V"], universal_newlines=True + ) output = raw_output.split() release_idx = output.index("release") + 1 release = output[release_idx].split(".") diff --git a/megatron/fused_kernels/compat.h b/megatron/legacy/fused_kernels/compat.h similarity index 100% rename from megatron/fused_kernels/compat.h rename to megatron/legacy/fused_kernels/compat.h diff --git a/megatron/legacy/fused_kernels/tests/__init__.py b/megatron/legacy/fused_kernels/tests/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/megatron/fused_kernels/tests/test_fused_kernels.py b/megatron/legacy/fused_kernels/tests/test_fused_kernels.py similarity index 97% rename from megatron/fused_kernels/tests/test_fused_kernels.py rename to megatron/legacy/fused_kernels/tests/test_fused_kernels.py index 74024c5020..f5b2b78a3f 100644 --- a/megatron/fused_kernels/tests/test_fused_kernels.py +++ b/megatron/legacy/fused_kernels/tests/test_fused_kernels.py @@ -1,13 +1,14 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. import math import torch from torch.nn import LayerNorm -from megatron.model.enums import AttnMaskType -from megatron.model.fused_layer_norm import MixedFusedLayerNorm -from megatron.model.fused_softmax import FusedScaleMaskSoftmax -from megatron.model.utils import attention_mask_func -from megatron.fused_kernels import load +from megatron.legacy.model.enums import AttnMaskType +from megatron.legacy.model.fused_layer_norm import MixedFusedLayerNorm +from megatron.legacy.model.fused_softmax import FusedScaleMaskSoftmax +from megatron.legacy.model.utils import attention_mask_func +from megatron.legacy.fused_kernels import load def test_load_fused_kernels(): try: @@ -373,7 +374,7 @@ def test_allmasked_softmax_backward(): transformers.logging.FATAL, ) - except: + except ImportError: print("\n[Fail] Please install `transformers` package to test fused kernels\n") exit(-1) diff --git a/megatron/fused_kernels/type_shim.h b/megatron/legacy/fused_kernels/type_shim.h similarity index 100% rename from megatron/fused_kernels/type_shim.h rename to megatron/legacy/fused_kernels/type_shim.h diff --git a/megatron/indexer.py b/megatron/legacy/indexer.py similarity index 88% rename from megatron/indexer.py rename to megatron/legacy/indexer.py index 45f530a7d4..179e00e6cd 100644 --- a/megatron/indexer.py +++ b/megatron/legacy/indexer.py @@ -1,16 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
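The reformatted fused-kernels loader picks `-gencode` targets from the bare-metal CUDA version reported by `nvcc -V`. The sketch below parses a captured sample string instead of invoking `nvcc`, so it runs without CUDA installed; the single-character minor-version extraction mirrors the upstream helper but is an assumption beyond what this hunk shows.

# Sketch of the version parsing and -gencode selection in the fused-kernels loader.
sample_nvcc_output = (
    "nvcc: NVIDIA (R) Cuda compiler driver\n"
    "Cuda compilation tools, release 11.8, V11.8.89"
)

def parse_bare_metal_version(raw_output):
    output = raw_output.split()
    release_idx = output.index("release") + 1
    release = output[release_idx].split(".")
    bare_metal_major = release[0]
    bare_metal_minor = release[1][0]
    return raw_output, bare_metal_major, bare_metal_minor

_, major, minor = parse_bare_metal_version(sample_nvcc_output)
cc_flag = []
if int(major) >= 11:
    cc_flag += ["-gencode", "arch=compute_80,code=sm_80"]
    if int(minor) >= 8:
        cc_flag += ["-gencode", "arch=compute_90,code=sm_90"]
assert "arch=compute_90,code=sm_90" in cc_flag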
import sys import time import torch import torch.distributed as dist -from megatron import get_args, print_rank_0 +from megatron.training import get_args, print_rank_0 from megatron.core import mpu -from megatron.checkpointing import load_biencoder_checkpoint -from megatron.data.orqa_wiki_dataset import get_open_retrieval_wiki_dataset -from megatron.data.orqa_wiki_dataset import get_open_retrieval_batch -from megatron.data.biencoder_dataset_utils import get_one_epoch_dataloader -from megatron.data.realm_index import detach, OpenRetreivalDataStore -from megatron.model.biencoder_model import get_model_provider +from megatron.training.checkpointing import load_biencoder_checkpoint +from megatron.legacy.data.orqa_wiki_dataset import get_open_retrieval_wiki_dataset +from megatron.legacy.data.orqa_wiki_dataset import get_open_retrieval_batch +from megatron.legacy.data.biencoder_dataset_utils import get_one_epoch_dataloader +from megatron.legacy.data.realm_index import detach, OpenRetreivalDataStore +from megatron.legacy.model.biencoder_model import get_model_provider from megatron.training import get_model diff --git a/megatron/model/__init__.py b/megatron/legacy/model/__init__.py similarity index 68% rename from megatron/model/__init__.py rename to megatron/legacy/model/__init__.py index f5025bf25d..cb010e5fb6 100644 --- a/megatron/model/__init__.py +++ b/megatron/legacy/model/__init__.py @@ -1,8 +1,8 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. from .fused_layer_norm import MixedFusedLayerNorm as LayerNorm +from .rms_norm import RMSNorm -from .distributed import DistributedDataParallel from .bert_model import BertModel from .gpt_model import GPTModel from .t5_model import T5Model diff --git a/megatron/model/bert_model.py b/megatron/legacy/model/bert_model.py similarity index 78% rename from megatron/model/bert_model.py rename to megatron/legacy/model/bert_model.py index f6dd7ddc4e..eca22f0433 100644 --- a/megatron/model/bert_model.py +++ b/megatron/legacy/model/bert_model.py @@ -1,19 +1,19 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
"""BERT model.""" import torch -from megatron import get_args +from megatron.training import get_args from megatron.core import tensor_parallel -from megatron.model.enums import AttnMaskType -from megatron.model.language_model import parallel_lm_logits -from megatron.model.language_model import get_language_model -from megatron.model import LayerNorm -from megatron.model.utils import openai_gelu, erf_gelu -from megatron.model.utils import get_linear_layer -from megatron.model.utils import init_method_normal -from megatron.model.utils import scaled_init_method_normal +from megatron.legacy.model.enums import AttnMaskType +from megatron.legacy.model.language_model import parallel_lm_logits +from megatron.legacy.model.language_model import get_language_model +from megatron.legacy.model.utils import get_norm +from megatron.legacy.model.utils import openai_gelu, erf_gelu +from megatron.legacy.model.utils import get_linear_layer +from megatron.legacy.model.utils import init_method_normal +from megatron.legacy.model.utils import scaled_init_method_normal from .module import MegatronModule @@ -46,32 +46,25 @@ def bert_position_ids(token_ids): class BertLMHead(MegatronModule): """Masked LM head for Bert - Arguments: + Args: + config: TransformerConfig object mpu_vocab_size: model parallel size of vocabulary. - hidden_size: hidden size - init_method: init method for weight initialization - layernorm_epsilon: tolerance for layer norm divisions parallel_output: whether output logits being distributed or not. """ - def __init__(self, mpu_vocab_size, hidden_size, init_method, - layernorm_epsilon, parallel_output): - - super(BertLMHead, self).__init__() + def __init__(self, mpu_vocab_size, config, parallel_output): + super().__init__(config=config) args = get_args() - self.bias = torch.nn.Parameter(torch.zeros(mpu_vocab_size)) tensor_parallel.set_tensor_model_parallel_attributes(self.bias, True, 0, 1) self.parallel_output = parallel_output - self.dense = get_linear_layer(hidden_size, hidden_size, init_method) - setattr(self.dense.weight, 'sequence_parallel', args.sequence_parallel) - setattr(self.dense.bias, 'sequence_parallel', args.sequence_parallel) + self.dense = get_linear_layer(config.hidden_size, config.hidden_size, config.init_method) + setattr(self.dense.weight, 'sequence_parallel', config.sequence_parallel) + setattr(self.dense.bias, 'sequence_parallel', config.sequence_parallel) - self.layernorm = LayerNorm(hidden_size, - eps=layernorm_epsilon, - sequence_parallel=args.sequence_parallel) + self.norm = get_norm(config) self.gelu = torch.nn.functional.gelu if args.openai_gelu: self.gelu = openai_gelu @@ -81,13 +74,24 @@ def __init__(self, mpu_vocab_size, hidden_size, init_method, def forward(self, hidden_states, word_embeddings_weight): hidden_states = self.dense(hidden_states) hidden_states = self.gelu(hidden_states) - hidden_states = self.layernorm(hidden_states) + hidden_states = self.norm(hidden_states) output = parallel_lm_logits(hidden_states, word_embeddings_weight, self.parallel_output, bias=self.bias) return output + def load_state_dict(self, state_dict, strict=True): + """Customize load.""" + + # Handle renaming layernorm -> norm in component names + state_dict_ = {} + for key in state_dict.keys(): + newkey = key.replace("layernorm", "norm") + state_dict_[newkey] = state_dict[key] + + super().load_state_dict(state_dict_, strict) + def post_language_model_processing(lm_output, pooled_output, lm_head, binary_head, @@ -124,12 +128,13 @@ class BertModel(MegatronModule): """Bert Language 
model.""" def __init__(self, + config, num_tokentypes=2, add_binary_head=True, parallel_output=True, pre_process=True, post_process=True): - super(BertModel, self).__init__() + super().__init__(config=config) args = get_args() # TODO this option is not yet implemented in BERT @@ -145,33 +150,26 @@ def __init__(self, if self.return_embeddings: assert self.post_process and self.add_binary_head - init_method = init_method_normal(args.init_method_std) - scaled_init_method = scaled_init_method_normal(args.init_method_std, - args.num_layers) - self.language_model, self._language_model_key = get_language_model( + config=config, num_tokentypes=num_tokentypes, add_pooler=self.add_binary_head, encoder_attn_mask_type=AttnMaskType.padding, - init_method=init_method, - scaled_init_method=scaled_init_method, pre_process=self.pre_process, post_process=self.post_process) - self.initialize_word_embeddings(init_method_normal) + self.initialize_word_embeddings() if self.post_process: - self.lm_head = BertLMHead( - self.word_embeddings_weight().size(0), - args.hidden_size, init_method, args.layernorm_epsilon, parallel_output) + self.lm_head = BertLMHead(self.shared_embedding_or_output_weight().size(0), config, parallel_output) self._lm_head_key = 'lm_head' self.binary_head = None if self.add_binary_head: - self.binary_head = get_linear_layer(args.hidden_size, 2, - init_method) + self.binary_head = get_linear_layer(config.hidden_size, 2, + config.init_method) self._binary_head_key = 'binary_head' def set_input_tensor(self, input_tensor): - """See megatron.model.transformer.set_input_tensor()""" + """See megatron.legacy.model.transformer.set_input_tensor()""" self.language_model.set_input_tensor(input_tensor) def forward(self, bert_model_input, attention_mask, @@ -215,7 +213,7 @@ def forward(self, bert_model_input, attention_mask, return post_language_model_processing(lm_output, pooled_output, self.lm_head, self.binary_head, lm_labels, - self.word_embeddings_weight(), + self.shared_embedding_or_output_weight(), self.fp16_lm_cross_entropy) else: return lm_output diff --git a/megatron/model/biencoder_model.py b/megatron/legacy/model/biencoder_model.py similarity index 93% rename from megatron/model/biencoder_model.py rename to megatron/legacy/model/biencoder_model.py index c910879dc8..df787686b4 100644 --- a/megatron/model/biencoder_model.py +++ b/megatron/legacy/model/biencoder_model.py @@ -1,18 +1,19 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
import os import torch import sys -from megatron import get_args, print_rank_0, get_tokenizer +from megatron.training import get_args, print_rank_0, get_tokenizer from megatron.core import mpu -from megatron.checkpointing import fix_query_key_value_ordering -from megatron.checkpointing import get_checkpoint_tracker_filename -from megatron.checkpointing import get_checkpoint_name -from megatron.model.bert_model import bert_position_ids -from megatron.model.enums import AttnMaskType -from megatron.model.language_model import get_language_model -from megatron.model.utils import get_linear_layer -from megatron.model.utils import init_method_normal -from megatron.model.utils import scaled_init_method_normal +from megatron.training.checkpointing import fix_query_key_value_ordering +from megatron.training.checkpointing import get_checkpoint_tracker_filename +from megatron.training.checkpointing import get_checkpoint_name +from megatron.legacy.model.bert_model import bert_position_ids +from megatron.legacy.model.enums import AttnMaskType +from megatron.legacy.model.language_model import get_language_model +from megatron.legacy.model.utils import get_linear_layer +from megatron.legacy.model.utils import init_method_normal +from megatron.legacy.model.utils import scaled_init_method_normal from .module import MegatronModule def get_model_provider(only_query_model=False, only_context_model=False, @@ -104,7 +105,7 @@ def __init__(self, self._context_key = 'context_model' def set_input_tensor(self, input_tensor): - """See megatron.model.transformer.set_input_tensor()""" + """See megatron.legacy.model.transformer.set_input_tensor()""" # this is just a placeholder and will be needed when model # parallelism will be used # self.language_model.set_input_tensor(input_tensor) @@ -201,7 +202,7 @@ def init_state_dict_from_bert(self): try: state_dict = torch.load(checkpoint_name, map_location='cpu') except ModuleNotFoundError: - from megatron.fp16_deprecated import loss_scaler + from megatron.legacy.fp16_deprecated import loss_scaler # For backward compatibility. 
print_rank_0(' > deserializing using the old code structure ...') sys.modules['fp16.loss_scaler'] = sys.modules[ @@ -211,7 +212,7 @@ def init_state_dict_from_bert(self): state_dict = torch.load(checkpoint_name, map_location='cpu') sys.modules.pop('fp16.loss_scaler', None) sys.modules.pop('megatron.fp16.loss_scaler', None) - except BaseException: + except Exception: print_rank_0('could not load the BERT checkpoint') sys.exit() diff --git a/megatron/model/classification.py b/megatron/legacy/model/classification.py similarity index 79% rename from megatron/model/classification.py rename to megatron/legacy/model/classification.py index 54a452065a..c9fe165280 100644 --- a/megatron/model/classification.py +++ b/megatron/legacy/model/classification.py @@ -4,38 +4,36 @@ import torch -from megatron import get_args, print_rank_last -from megatron.model.enums import AttnMaskType -from megatron.model.bert_model import bert_extended_attention_mask, bert_position_ids -from megatron.model.language_model import get_language_model -from megatron.model.utils import get_linear_layer -from megatron.model.utils import init_method_normal -from megatron.model.utils import scaled_init_method_normal +from megatron.training import get_args, print_rank_last +from megatron.legacy.model.enums import AttnMaskType +from megatron.legacy.model.bert_model import bert_extended_attention_mask, bert_position_ids +from megatron.legacy.model.language_model import get_language_model +from megatron.legacy.model.utils import get_linear_layer +from megatron.legacy.model.utils import init_method_normal +from megatron.legacy.model.utils import scaled_init_method_normal from .module import MegatronModule class Classification(MegatronModule): def __init__(self, + config, num_classes, num_tokentypes=2, pre_process=True, post_process=True): - super(Classification, self).__init__(share_word_embeddings=False) + super().__init__(config=config, share_embeddings_and_output_weights=False) args = get_args() self.num_classes = num_classes self.pre_process = pre_process self.post_process = post_process - init_method = init_method_normal(args.init_method_std) self.language_model, self._language_model_key = get_language_model( + config=config, num_tokentypes=num_tokentypes, add_pooler=True, encoder_attn_mask_type=AttnMaskType.padding, - init_method=init_method, - scaled_init_method=scaled_init_method_normal(args.init_method_std, - args.num_layers), pre_process=self.pre_process, post_process=self.post_process) @@ -44,11 +42,11 @@ def __init__(self, self.classification_dropout = torch.nn.Dropout(args.hidden_dropout) self.classification_head = get_linear_layer(args.hidden_size, self.num_classes, - init_method) + config.init_method) self._classification_head_key = 'classification_head' def set_input_tensor(self, input_tensor): - """See megatron.model.transformer.set_input_tensor()""" + """See megatron.legacy.model.transformer.set_input_tensor()""" self.language_model.set_input_tensor(input_tensor) def forward(self, model_input, attention_mask, tokentype_ids=None): diff --git a/megatron/model/enums.py b/megatron/legacy/model/enums.py similarity index 100% rename from megatron/model/enums.py rename to megatron/legacy/model/enums.py diff --git a/megatron/model/fused_bias_gelu.py b/megatron/legacy/model/fused_bias_gelu.py similarity index 95% rename from megatron/model/fused_bias_gelu.py rename to megatron/legacy/model/fused_bias_gelu.py index 29222db024..e00e63148b 100644 --- a/megatron/model/fused_bias_gelu.py +++ 
b/megatron/legacy/model/fused_bias_gelu.py @@ -1,6 +1,7 @@ # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. import torch +from megatron.core.jit import jit_fuser ###### BIAS GELU FUSION/ NO AUTOGRAD ################ @@ -11,7 +12,7 @@ # actual gelu is: # x * 0.5 * (1.0 + torch.erf(x * 0.70710678)) -@torch.jit.script +@jit_fuser def bias_gelu(bias, y): x = bias + y return x * 0.5 * (1.0 + torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x))) @@ -19,7 +20,7 @@ def bias_gelu(bias, y): # gradient of tanh approximation of gelu # gradient of actual gelu is: # 0.5 * (1. + torch.erf(x * 0.70710678)) + 0.3989423 * x * torch.exp(-0.5 * x * x) -@torch.jit.script +@jit_fuser def bias_gelu_back(g, bias, y): x = bias + y tanh_out = torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x)) diff --git a/megatron/model/fused_layer_norm.py b/megatron/legacy/model/fused_layer_norm.py similarity index 81% rename from megatron/model/fused_layer_norm.py rename to megatron/legacy/model/fused_layer_norm.py index fd8591e4a3..5c35483874 100644 --- a/megatron/model/fused_layer_norm.py +++ b/megatron/legacy/model/fused_layer_norm.py @@ -4,6 +4,7 @@ https://github.com/NVIDIA/apex with some changes. """ +import inspect import numbers import torch from torch.nn.parameter import Parameter @@ -15,11 +16,13 @@ try: from apex.contrib.layer_norm.layer_norm import FastLayerNormFN HAVE_PERSIST_LAYER_NORM = True -except: +except ImportError: HAVE_PERSIST_LAYER_NORM = False -from apex.normalization.fused_layer_norm import FusedLayerNormAffineFunction - +try: + from apex.normalization.fused_layer_norm import fused_layer_norm_affine +except ImportError: + fused_layer_norm_affine = None global fused_layer_norm_cuda fused_layer_norm_cuda = None @@ -77,10 +80,14 @@ def forward(self, input): weight = self.weight + 1 if self.apply_layernorm_1p else self.weight if self.no_persist_layer_norm: - return FusedLayerNormAffineFunction.apply(input, weight, self.bias, self.normalized_shape, self.eps) + assert fused_layer_norm_affine is not None, \ + "fused_layer_norm_affine is not available, please install apex from https://github.com/NVIDIA/apex" + return fused_layer_norm_affine(input, weight, self.bias, self.normalized_shape, eps=self.eps) else: - output = FastLayerNormFN.apply(input, weight, self.bias, self.eps) - + if 'memory_efficient' in inspect.getfullargspec(FastLayerNormFN.forward).args: + output = FastLayerNormFN.apply(input, weight, self.bias, self.eps, False) + else: + output = FastLayerNormFN.apply(input, weight, self.bias, self.eps) # Apex's fast layer norm function outputs a 'view' tensor (i.e., has # a populated '_base' field). This will result in schedule.py's # deallocate_output_tensor() throwing an error, so a viewless tensor is diff --git a/megatron/legacy/model/fused_softmax.py b/megatron/legacy/model/fused_softmax.py new file mode 100644 index 0000000000..58f900bddd --- /dev/null +++ b/megatron/legacy/model/fused_softmax.py @@ -0,0 +1,234 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + + +import torch +import torch.nn as nn +from megatron.legacy.model.enums import AttnMaskType + + +class ScaledUpperTriangMaskedSoftmax(torch.autograd.Function): + """ + Fused operation which performs following three operations in sequence + 1. Scale the tensor. + 2. Apply upper triangular mask (typically used in gpt models). + 3. Perform softmax. 
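`bias_gelu` keeps the tanh approximation of GELU and only swaps `torch.jit.script` for the `jit_fuser` decorator. The sketch below checks that approximation against the exact erf form without any JIT, purely to show the math being fused; the tolerance is illustrative.

# Sketch comparing the fused tanh GELU approximation with the exact erf form.
import torch

def bias_gelu_ref(bias, y):
    x = bias + y
    return x * 0.5 * (1.0 + torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x)))

x = torch.randn(8)
exact = x * 0.5 * (1.0 + torch.erf(x * 0.70710678))
approx = bias_gelu_ref(torch.zeros(8), x)
assert torch.allclose(exact, approx, atol=1e-3)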
+ """ + + @staticmethod + def forward(ctx, inputs, scale): + try: + import scaled_upper_triang_masked_softmax_cuda + except (ImportError, ModuleNotFoundError): + print(f'Please install Apex to use fused_softmax') + + scale_t = torch.tensor([scale]) + softmax_results = scaled_upper_triang_masked_softmax_cuda.forward( + inputs, scale_t[0] + ) + + ctx.save_for_backward(softmax_results, scale_t) + return softmax_results + + @staticmethod + def backward(ctx, output_grads): + try: + import scaled_upper_triang_masked_softmax_cuda + except (ImportError, ModuleNotFoundError): + print(f'Please install Apex to use fused_softmax') + + softmax_results, scale_t = ctx.saved_tensors + input_grads = scaled_upper_triang_masked_softmax_cuda.backward( + output_grads, softmax_results, scale_t[0] + ) + + return input_grads, None + + +class ScaledMaskedSoftmax(torch.autograd.Function): + """ + Fused operation which performs following three operations in sequence + 1. Scale the tensor. + 2. Apply the mask. + 3. Perform softmax. + """ + + @staticmethod + def forward(ctx, inputs, mask, scale): + try: + import scaled_masked_softmax_cuda + except (ImportError, ModuleNotFoundError): + print(f'Please install Apex to use fused_softmax') + + scale_t = torch.tensor([scale]) + + softmax_results = scaled_masked_softmax_cuda.forward(inputs, mask, scale_t[0]) + ctx.save_for_backward(softmax_results, scale_t) + return softmax_results + + @staticmethod + def backward(ctx, output_grads): + try: + import scaled_masked_softmax_cuda + except (ImportError, ModuleNotFoundError): + print(f'Please install Apex to use fused_softmax') + + softmax_results, scale_t = ctx.saved_tensors + + input_grads = scaled_masked_softmax_cuda.backward( + output_grads, softmax_results, scale_t[0] + ) + return input_grads, None, None + + +class ScaledSoftmax(torch.autograd.Function): + """ + Fused operation which performs following two operations in sequence + 1. Scale the tensor. + 2. Perform softmax. + """ + + @staticmethod + def forward(ctx, inputs, scale): + try: + import scaled_softmax_cuda + except (ImportError, ModuleNotFoundError): + print(f'Please install Apex to use fused_softmax') + + scale_t = torch.tensor([scale]) + + softmax_results = scaled_softmax_cuda.forward( + inputs, scale_t[0] + ) + ctx.save_for_backward(softmax_results, scale_t) + return softmax_results + + @staticmethod + def backward(ctx, output_grads): + try: + import scaled_softmax_cudaa + except (ImportError, ModuleNotFoundError): + print(f'Please install Apex to use fused_softmax') + + softmax_results, scale_t = ctx.saved_tensors + + input_grads = scaled_softmax_cuda.backward( + output_grads, softmax_results, scale_t[0] + ) + return input_grads, None, None + + +class FusedScaleMaskSoftmax(nn.Module): + """ + fused operation: scaling + mask + softmax + + Args: + input_in_fp16: flag to indicate if input in fp16 data format. + input_in_bf16: flag to indicate if input in bf16 data format. + attn_mask_type: attention mask type (pad or causal) + scaled_masked_softmax_fusion: flag to indicate user want to use softmax fusion + mask_func: mask function to be applied. + softmax_in_fp32: if true, softmax in performed at fp32 precision. + scale: scaling factor used in input tensor scaling. 
+ """ + + def __init__( + self, + input_in_fp16, + input_in_bf16, + attn_mask_type, + scaled_masked_softmax_fusion, + mask_func, + softmax_in_fp32, + scale, + ): + super(FusedScaleMaskSoftmax, self).__init__() + self.input_in_fp16 = input_in_fp16 + self.input_in_bf16 = input_in_bf16 + assert not ( + self.input_in_fp16 and self.input_in_bf16 + ), "both fp16 and bf16 flags cannot be active at the same time." + self.input_in_float16 = self.input_in_fp16 or self.input_in_bf16 + self.attn_mask_type = attn_mask_type + self.scaled_masked_softmax_fusion = scaled_masked_softmax_fusion + self.mask_func = mask_func + self.softmax_in_fp32 = softmax_in_fp32 + self.scale = scale + + assert ( + self.scale is None or softmax_in_fp32 + ), "softmax should be in fp32 when scaled" + + def forward(self, input, mask): + # [b, np, sq, sk] + assert input.dim() == 4 + + if self.is_kernel_available(mask, *input.size()): + return self.forward_fused_softmax(input, mask) + else: + return self.forward_torch_softmax(input, mask) + + def is_kernel_available(self, mask, b, np, sq, sk): + attn_batches = b * np + + if ( + self.scaled_masked_softmax_fusion # user want to fuse + and self.input_in_float16 # input must be fp16 + and 16 < sk <= 16384 # sk must be 16 ~ 16384 + and sq % 4 == 0 # sq must be divisor of 4 + and sk % 4 == 0 # sk must be divisor of 4 + and attn_batches % 4 == 0 # np * b must be divisor of 4 + ): + if 0 <= sk <= 16384: + batch_per_block = self.get_batch_per_block(sq, sk, b, np) + + if self.attn_mask_type == AttnMaskType.causal: + if attn_batches % batch_per_block == 0: + return True + else: + if sq % batch_per_block == 0: + return True + return False + + def forward_fused_softmax(self, input, mask): + b, np, sq, sk = input.size() + scale = self.scale if self.scale is not None else 1.0 + + if self.attn_mask_type == AttnMaskType.causal: + assert sq == sk, "causal mask is only for self attention" + + # input is 3D tensor (attn_batches, sq, sk) + input = input.view(-1, sq, sk) + probs = ScaledUpperTriangMaskedSoftmax.apply(input, scale) + return probs.view(b, np, sq, sk) + else: + # input is 4D tensor (b, np, sq, sk) + if mask is not None: + return ScaledMaskedSoftmax.apply(input, mask, scale) + else: + return ScaledSoftmax.apply(input, scale) + + def forward_torch_softmax(self, input, mask): + if self.input_in_float16 and self.softmax_in_fp32: + input = input.float() + + if self.scale is not None: + input = input * self.scale + mask_output = self.mask_func(input, mask) if mask is not None else input + probs = torch.nn.Softmax(dim=-1)(mask_output) + + if self.input_in_float16 and self.softmax_in_fp32: + if self.input_in_fp16: + probs = probs.half() + else: + probs = probs.bfloat16() + + return probs + + @staticmethod + def get_batch_per_block(sq, sk, b, np): + try: + import scaled_masked_softmax_cuda + except (ImportError, ModuleNotFoundError): + print(f'Please install Apex to use fused_softmax') + + return scaled_masked_softmax_cuda.get_batch_per_block(sq, sk, b, np) diff --git a/megatron/model/gpt_model.py b/megatron/legacy/model/gpt_model.py similarity index 86% rename from megatron/model/gpt_model.py rename to megatron/legacy/model/gpt_model.py index a9be43722b..8e380199db 100644 --- a/megatron/model/gpt_model.py +++ b/megatron/legacy/model/gpt_model.py @@ -4,15 +4,13 @@ import torch -from megatron import get_args +from megatron.training import get_args from megatron.core import tensor_parallel from .module import MegatronModule from .enums import AttnMaskType from .language_model import 
parallel_lm_logits from .language_model import get_language_model -from .utils import init_method_normal -from .utils import scaled_init_method_normal def post_language_model_processing(lm_output, labels, logit_weights, @@ -46,12 +44,13 @@ class GPTModel(MegatronModule): """GPT-2 Language model.""" def __init__(self, + config, num_tokentypes=0, parallel_output=True, pre_process=True, post_process=True): args = get_args() - super(GPTModel, self).__init__(share_word_embeddings=not args.untie_embeddings_and_output_weights) + super().__init__(config=config, share_embeddings_and_output_weights=not args.untie_embeddings_and_output_weights) self.parallel_output = parallel_output self.pre_process = pre_process @@ -60,20 +59,18 @@ def __init__(self, self.untie_embeddings_and_output_weights = args.untie_embeddings_and_output_weights self.language_model, self._language_model_key = get_language_model( + config=config, num_tokentypes=num_tokentypes, add_pooler=False, encoder_attn_mask_type=AttnMaskType.causal, - init_method=init_method_normal(args.init_method_std), - scaled_init_method=scaled_init_method_normal(args.init_method_std, - args.num_layers), pre_process=self.pre_process, post_process=self.post_process) if not args.untie_embeddings_and_output_weights: - self.initialize_word_embeddings(init_method_normal) + self.initialize_word_embeddings() def set_input_tensor(self, input_tensor): - """See megatron.model.transformer.set_input_tensor()""" + """See megatron.legacy.model.transformer.set_input_tensor()""" self.language_model.set_input_tensor(input_tensor) def forward(self, input_ids, position_ids, attention_mask, @@ -94,7 +91,7 @@ def forward(self, input_ids, position_ids, attention_mask, if self.post_process: return post_language_model_processing( lm_output, labels, - self.language_model.output_layer.weight if self.untie_embeddings_and_output_weights else self.word_embeddings_weight(), + self.language_model.output_layer.weight if self.untie_embeddings_and_output_weights else self.shared_embedding_or_output_weight(), self.parallel_output, self.fp16_lm_cross_entropy) else: diff --git a/megatron/model/language_model.py b/megatron/legacy/model/language_model.py similarity index 66% rename from megatron/model/language_model.py rename to megatron/legacy/model/language_model.py index 61f2501bcb..ce893902a8 100644 --- a/megatron/model/language_model.py +++ b/megatron/legacy/model/language_model.py @@ -5,32 +5,28 @@ import torch import torch.nn.functional as F -from megatron import get_args from megatron.core import mpu, tensor_parallel from megatron.core.enums import ModelType +from megatron.core.models.common.embeddings.rotary_pos_embedding import RotaryEmbedding +from megatron.training import get_args from .enums import AttnMaskType, LayerType from .module import MegatronModule -from .rotary_pos_embedding import apply_rotary_pos_emb, RotaryEmbedding from .transformer import ParallelTransformer -from .utils import get_linear_layer -from .utils import init_method_normal, scaled_init_method_normal +from .utils import get_linear_layer, init_method_normal, scaled_init_method_normal -def parallel_lm_logits(input_, word_embeddings_weight, parallel_output, - bias=None): +def parallel_lm_logits(input_, word_embeddings_weight, parallel_output, bias=None): """LM logits using word embedding weights.""" args = get_args() # Parallel logits. 
- if args.async_tensor_model_parallel_allreduce or\ - args.sequence_parallel: + model_parallel = mpu.get_tensor_model_parallel_world_size() > 1 + if model_parallel or args.sequence_parallel: input_parallel = input_ - model_parallel = mpu.get_tensor_model_parallel_world_size() > 1 - async_grad_allreduce = args.async_tensor_model_parallel_allreduce and \ - model_parallel and not args.sequence_parallel + allreduce_dgrad = model_parallel and not args.sequence_parallel else: input_parallel = tensor_parallel.copy_to_tensor_model_parallel_region(input_) - async_grad_allreduce = False + allreduce_dgrad = False # Matrix multiply. logits_parallel = tensor_parallel.linear_with_grad_accumulation_and_async_allreduce( @@ -38,8 +34,10 @@ def parallel_lm_logits(input_, word_embeddings_weight, parallel_output, weight=word_embeddings_weight, bias=bias, gradient_accumulation_fusion=args.gradient_accumulation_fusion, - async_grad_allreduce=async_grad_allreduce, - sequence_parallel_enabled=args.sequence_parallel) + sequence_parallel=args.sequence_parallel, + grad_output_buffer=None, + allreduce_dgrad=allreduce_dgrad, + ) # Gather if needed. if parallel_output: @@ -48,26 +46,30 @@ def parallel_lm_logits(input_, word_embeddings_weight, parallel_output, return tensor_parallel.gather_from_tensor_model_parallel_region(logits_parallel) -def get_language_model(num_tokentypes, add_pooler, - encoder_attn_mask_type, init_method=None, - scaled_init_method=None, add_encoder=True, - add_decoder=False, - decoder_attn_mask_type=AttnMaskType.causal, - pre_process=True, post_process=True): +def get_language_model( + config, + num_tokentypes, + add_pooler, + encoder_attn_mask_type, + add_encoder=True, + add_decoder=False, + decoder_attn_mask_type=AttnMaskType.causal, + pre_process=True, + post_process=True, +): """Build language model and return along with the key to save.""" args = get_args() + if config.init_method is None: + config.init_method = init_method_normal(config.init_method_std) - if init_method is None: - init_method = init_method_normal(args.init_method_std) - - if scaled_init_method is None: - scaled_init_method = scaled_init_method_normal(args.init_method_std, - args.num_layers) + if config.output_layer_init_method is None: + config.output_layer_init_method = scaled_init_method_normal( + config.init_method_std, config.num_layers + ) # Language model. language_model = TransformerLanguageModel( - init_method, - scaled_init_method, + config, encoder_attn_mask_type, num_tokentypes=num_tokentypes, add_encoder=add_encoder, @@ -75,7 +77,7 @@ def get_language_model(num_tokentypes, add_pooler, decoder_attn_mask_type=decoder_attn_mask_type, add_pooler=add_pooler, pre_process=pre_process, - post_process=post_process + post_process=post_process, ) # key used for checkpoints. language_model_key = 'language_model' @@ -89,7 +91,7 @@ class Pooler(MegatronModule): Pool hidden states of a specific token (for example start of the sequence) and add a linear transformation followed by a tanh. - Arguments: + Args: hidden_size: hidden size init_method: weight initialization method for the linear layer. bias is set to zero. @@ -101,7 +103,6 @@ def __init__(self, hidden_size, init_method): self.dense = get_linear_layer(hidden_size, hidden_size, init_method) self.sequence_parallel = args.sequence_parallel - def forward(self, hidden_states, sequence_index=0): # hidden_states: [s, b, h] # sequence_index: index of the token to pool. 
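For orientation, an editorial sketch (not part of the diff): per tensor-parallel rank, the rewritten parallel_lm_logits above amounts to a matmul of the hidden states against that rank's vocabulary shard of the (possibly shared) embedding matrix, plus an optional bias; allreduce_dgrad only changes how the input gradient is reduced in the backward pass. The helper name below is hypothetical and uses plain PyTorch rather than megatron.core.tensor_parallel:

    import torch

    def lm_logits_reference(hidden, word_embeddings_weight, bias=None):
        # hidden: [s, b, h]; word_embeddings_weight: [vocab_shard, h]
        # Each tensor-parallel rank computes its vocabulary shard of the logits;
        # gather_from_tensor_model_parallel_region concatenates the shards when
        # parallel_output is False.
        logits = torch.matmul(hidden, word_embeddings_weight.t())
        if bias is not None:
            logits = logits + bias
        return logits  # [s, b, vocab_shard]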
@@ -110,8 +111,8 @@ def forward(self, hidden_states, sequence_index=0): # same pooler is run on all tensor parallel nodes if self.sequence_parallel: hidden_states = tensor_parallel.gather_from_sequence_parallel_region( - hidden_states, - tensor_parallel_output_grad=False) + hidden_states, tensor_parallel_output_grad=False + ) pooled = hidden_states[sequence_index, :, :] pooled = self.dense(pooled) @@ -122,7 +123,7 @@ def forward(self, hidden_states, sequence_index=0): class Embedding(MegatronModule): """Language model embeddings. - Arguments: + Args: hidden_size: hidden size vocab_size: vocabulary size max_sequence_length: maximum size of sequence. This @@ -133,36 +134,34 @@ class Embedding(MegatronModule): will ignore this embedding """ - def __init__(self, - hidden_size, - vocab_size, - max_sequence_length, - embedding_dropout_prob, - init_method, - num_tokentypes=0): + def __init__( + self, + hidden_size, + vocab_size, + max_sequence_length, + embedding_dropout_prob, + config, + num_tokentypes=0, + ): super(Embedding, self).__init__() self.hidden_size = hidden_size - self.init_method = init_method + self.init_method = config.init_method self.num_tokentypes = num_tokentypes args = get_args() # Word embeddings (parallel). + self.params_dtype = args.params_dtype self.word_embeddings = tensor_parallel.VocabParallelEmbedding( - vocab_size, self.hidden_size, - init_method=self.init_method, - params_dtype=args.params_dtype, - use_cpu_initialization=args.use_cpu_initialization, - perform_initialization=args.perform_initialization + vocab_size, self.hidden_size, config=config, init_method=config.init_method ) self._word_embeddings_key = 'word_embeddings' # Position embedding (serial). - self.add_position_embedding = args.add_position_embedding + self.add_position_embedding = args.position_embedding_type == 'learned_absolute' if self.add_position_embedding: - self.position_embeddings = torch.nn.Embedding( - max_sequence_length, self.hidden_size) + self.position_embeddings = torch.nn.Embedding(max_sequence_length, self.hidden_size) self._position_embeddings_key = 'position_embeddings' # Initialize the position embeddings. if args.perform_initialization: @@ -174,16 +173,16 @@ def __init__(self, # token types and add them as needed. self._tokentype_embeddings_key = 'tokentype_embeddings' if self.num_tokentypes > 0: - self.tokentype_embeddings = torch.nn.Embedding(self.num_tokentypes, - self.hidden_size) + self.tokentype_embeddings = torch.nn.Embedding(self.num_tokentypes, self.hidden_size) # Initialize the token-type embeddings. 
if args.perform_initialization: self.init_method(self.tokentype_embeddings.weight) else: self.tokentype_embeddings = None - self.fp32_residual_connection = args.fp32_residual_connection + self.fp32_residual_connection = args.fp32_residual_connection self.sequence_parallel = args.sequence_parallel + self.clone_scatter_output_in_embedding = args.clone_scatter_output_in_embedding # Embeddings dropout self.embedding_dropout = torch.nn.Dropout(embedding_dropout_prob) @@ -206,11 +205,9 @@ def add_tokentype_embeddings(self, num_tokentypes): if self.tokentype_embeddings is not None: raise Exception('tokentype embeddings is already initialized') if torch.distributed.get_rank() == 0: - print('adding embedding for {} tokentypes'.format(num_tokentypes), - flush=True) + print('adding embedding for {} tokentypes'.format(num_tokentypes), flush=True) self.num_tokentypes = num_tokentypes - self.tokentype_embeddings = torch.nn.Embedding(num_tokentypes, - self.hidden_size) + self.tokentype_embeddings = torch.nn.Embedding(num_tokentypes, self.hidden_size) # Initialize the token-type embeddings. args = get_args() self.init_method(self.tokentype_embeddings.weight) @@ -240,6 +237,11 @@ def forward(self, input_ids, position_ids, tokentype_ids=None): # Dropout. if self.sequence_parallel: embeddings = tensor_parallel.scatter_to_sequence_parallel_region(embeddings) + # `scatter_to_sequence_parallel_region` returns a view, which prevents + # the original tensor from being garbage collected. Clone to facilitate GC. + # Has a small runtime cost (~0.5%). + if self.clone_scatter_output_in_embedding: + embeddings = embeddings.clone() with tensor_parallel.get_cuda_rng_tracker().fork(): embeddings = self.embedding_dropout(embeddings) else: @@ -251,17 +253,17 @@ def state_dict_for_save_checkpoint(self, prefix='', keep_vars=False): """For easy load.""" state_dict_ = {} - state_dict_[self._word_embeddings_key] \ - = self.word_embeddings.state_dict(prefix=prefix, - keep_vars=keep_vars) + state_dict_[self._word_embeddings_key] = self.word_embeddings.state_dict( + prefix=prefix, keep_vars=keep_vars + ) if self.add_position_embedding: - state_dict_[self._position_embeddings_key] \ - = self.position_embeddings.state_dict(prefix=prefix, - keep_vars=keep_vars) + state_dict_[self._position_embeddings_key] = self.position_embeddings.state_dict( + prefix=prefix, keep_vars=keep_vars + ) if self.num_tokentypes > 0: - state_dict_[self._tokentype_embeddings_key] \ - = self.tokentype_embeddings.state_dict(prefix=prefix, - keep_vars=keep_vars) + state_dict_[self._tokentype_embeddings_key] = self.tokentype_embeddings.state_dict( + prefix=prefix, keep_vars=keep_vars + ) return state_dict_ @@ -276,8 +278,7 @@ def load_state_dict(self, state_dict, strict=True): state_dict_ = {} for key in state_dict.keys(): if 'word_embeddings' in key: - state_dict_[key.split('word_embeddings.')[1]] \ - = state_dict[key] + state_dict_[key.split('word_embeddings.')[1]] = state_dict[key] self.word_embeddings.load_state_dict(state_dict_, strict=strict) # Position embedding. @@ -289,8 +290,7 @@ def load_state_dict(self, state_dict, strict=True): state_dict_ = {} for key in state_dict.keys(): if 'position_embeddings' in key: - state_dict_[key.split('position_embeddings.')[1]] \ - = state_dict[key] + state_dict_[key.split('position_embeddings.')[1]] = state_dict[key] self.position_embeddings.load_state_dict(state_dict_, strict=strict) # Tokentype embedding. @@ -302,20 +302,21 @@ def load_state_dict(self, state_dict, strict=True): # for backward compatibility. 
for key in state_dict.keys(): if 'tokentype_embeddings' in key: - state_dict_[key.split('tokentype_embeddings.')[1]] \ - = state_dict[key] + state_dict_[key.split('tokentype_embeddings.')[1]] = state_dict[key] if len(state_dict_.keys()) > 0: - self.tokentype_embeddings.load_state_dict(state_dict_, - strict=strict) + self.tokentype_embeddings.load_state_dict(state_dict_, strict=strict) else: - print('***WARNING*** expected tokentype embeddings in the ' - 'checkpoint but could not find it', flush=True) + print( + '***WARNING*** expected tokentype embeddings in the ' + 'checkpoint but could not find it', + flush=True, + ) class TransformerLanguageModel(MegatronModule): """Transformer language model. - Arguments: + Args: transformer_hparams: transformer hyperparameters vocab_size: vocabulary size max_sequence_length: maximum size of sequence. This @@ -325,27 +326,31 @@ class TransformerLanguageModel(MegatronModule): will ignore this embedding """ - def __init__(self, - init_method, - output_layer_init_method, - encoder_attn_mask_type, - num_tokentypes=0, - add_encoder=True, - add_decoder=False, - decoder_attn_mask_type=AttnMaskType.causal, - add_pooler=False, - pre_process=True, - post_process=True): + def __init__( + self, + config, + encoder_attn_mask_type, + num_tokentypes=0, + add_encoder=True, + add_decoder=False, + decoder_attn_mask_type=AttnMaskType.causal, + add_pooler=False, + pre_process=True, + post_process=True, + ): args = get_args() - # TODO: passing share_word_embeddings=False will not work correctly for T5 and embeddings will not be synced. Fix later for T5. - if args.untie_embeddings_and_output_weights: assert not add_decoder - super(TransformerLanguageModel, self).__init__(share_word_embeddings=not args.untie_embeddings_and_output_weights) + # TODO: passing share_embeddings_and_output_weights=False will not work correctly for T5 and embeddings will not be synced. Fix later for T5. + if args.untie_embeddings_and_output_weights: + assert not add_decoder + super(TransformerLanguageModel, self).__init__( + share_embeddings_and_output_weights=not args.untie_embeddings_and_output_weights + ) self.pre_process = pre_process self.post_process = post_process - self.hidden_size = args.hidden_size + self.hidden_size = config.hidden_size self.num_tokentypes = num_tokentypes - self.init_method = init_method + self.init_method = config.init_method self.add_encoder = add_encoder self.encoder_attn_mask_type = encoder_attn_mask_type self.add_decoder = add_decoder @@ -357,38 +362,43 @@ def __init__(self, # Embeddings. 
if self.pre_process: - self.embedding = Embedding(self.hidden_size, - args.padded_vocab_size, - args.max_position_embeddings, - args.hidden_dropout, - self.init_method, - self.num_tokentypes) + self.embedding = Embedding( + self.hidden_size, + args.padded_vocab_size, + args.max_position_embeddings, + args.hidden_dropout, + config, + self.num_tokentypes, + ) self._embedding_key = 'embedding' # Rotary positional embeddings - self.use_rotary_position_embeddings = \ - args.use_rotary_position_embeddings - if args.use_rotary_position_embeddings: + self.use_rotary_position_embeddings = args.position_embedding_type == 'rope' + if self.use_rotary_position_embeddings: self.seq_length = args.seq_length - rotary_dim = args.hidden_size // args.num_attention_heads \ - if args.kv_channels is None else args.kv_channels - - if args.rotary_percent < 1.0: - rotary_dim = int(rotary_dim * args.rotary_percent) + rotary_dim = ( + args.hidden_size // args.num_attention_heads + if args.kv_channels is None + else args.kv_channels + ) # partial rotary embeddings, which is better than full rotary # Wang and Komatsuzaki et al # https://github.com/kingoflolz/mesh-transformer-jax/ - self.rotary_pos_emb = RotaryEmbedding(rotary_dim) + self.rotary_pos_emb = RotaryEmbedding( + kv_channels=rotary_dim, + rotary_percent=args.rotary_percent, + seq_len_interpolation_factor=args.rotary_seq_len_interpolation_factor, + ) # Encoder (usually set to True, False if part of an encoder-decoder # architecture and in encoder-only stage). if self.add_encoder: self.encoder = ParallelTransformer( - self.init_method, - output_layer_init_method, - model_type=args.model_type if not args.retro_add_retriever \ - else ModelType.retro_decoder, + config, + model_type=( + args.model_type if not args.retro_add_retriever else ModelType.retro_decoder + ), self_attn_mask_type=self.encoder_attn_mask_type, pre_process=self.pre_process, post_process=self.post_process, @@ -401,13 +411,13 @@ def __init__(self, # architecture and in decoder-only stage). if self.add_decoder: self.decoder = ParallelTransformer( - self.init_method, - output_layer_init_method, + config, model_type=args.model_type, layer_type=LayerType.decoder, self_attn_mask_type=self.decoder_attn_mask_type, pre_process=self.pre_process, - post_process=self.post_process) + post_process=self.post_process, + ) self._decoder_key = 'decoder' else: self.decoder = None @@ -422,14 +432,14 @@ def __init__(self, self.output_layer = tensor_parallel.ColumnParallelLinear( args.hidden_size, args.padded_vocab_size, - bias=False, # Setting bias to False always to keep it consistent with embedding tying that also does not have a bias. + config=config, init_method=self.init_method, - use_cpu_initialization=args.use_cpu_initialization, - perform_initialization=args.perform_initialization) + bias=False, + ) # Setting bias to False always to keep it consistent with embedding tying that also does not have a bias. 
self._output_layer_key = 'output_layer' def set_input_tensor(self, input_tensor): - """ See megatron.model.transformer.set_input_tensor()""" + """See megatron.legacy.model.transformer.set_input_tensor()""" # This is usually handled in schedules.py but some inference code still # gives us non-lists or None @@ -437,12 +447,14 @@ def set_input_tensor(self, input_tensor): input_tensor = [input_tensor] if self.add_encoder and self.add_decoder: - assert len(input_tensor) == 1, \ - 'input_tensor should only be length 1 for stage with both encoder and decoder' + assert ( + len(input_tensor) == 1 + ), 'input_tensor should only be length 1 for stage with both encoder and decoder' self.encoder.set_input_tensor(input_tensor[0]) elif self.add_encoder: - assert len(input_tensor) == 1, \ - 'input_tensor should only be length 1 for stage with only encoder' + assert ( + len(input_tensor) == 1 + ), 'input_tensor should only be length 1 for stage with only encoder' self.encoder.set_input_tensor(input_tensor[0]) elif self.add_decoder: if len(input_tensor) == 2: @@ -456,28 +468,38 @@ def set_input_tensor(self, input_tensor): else: raise Exception('Stage must have at least either encoder or decoder') - def forward(self, enc_input_ids, enc_position_ids, enc_attn_mask, - dec_input_ids=None, dec_position_ids=None, dec_attn_mask=None, - retriever_input_ids=None, - retriever_position_ids=None, - retriever_attn_mask=None, - enc_dec_attn_mask=None, tokentype_ids=None, - inference_params=None, - pooling_sequence_index=0, - enc_hidden_states=None, output_enc_hidden=False): + def forward( + self, + enc_input_ids, + enc_position_ids, + enc_attn_mask, + dec_input_ids=None, + dec_position_ids=None, + dec_attn_mask=None, + retriever_input_ids=None, + retriever_position_ids=None, + retriever_attn_mask=None, + enc_dec_attn_mask=None, + tokentype_ids=None, + inference_params=None, + pooling_sequence_index=0, + enc_hidden_states=None, + output_enc_hidden=False, + ): # Encoder embedding. if self.pre_process: - encoder_input = self.embedding(enc_input_ids, enc_position_ids, - tokentype_ids=tokentype_ids) + encoder_input = self.embedding( + enc_input_ids, enc_position_ids, tokentype_ids=tokentype_ids + ) else: encoder_input = None # Retriever embedding. 
if self.add_retriever and self.pre_process: - retriever_input = self.embedding(retriever_input_ids, - retriever_position_ids, - tokentype_ids=tokentype_ids) + retriever_input = self.embedding( + retriever_input_ids, retriever_position_ids, tokentype_ids=tokentype_ids + ) else: retriever_input = None @@ -485,8 +507,7 @@ def forward(self, enc_input_ids, enc_position_ids, enc_attn_mask, rotary_pos_emb = None if self.use_rotary_position_embeddings: if inference_params is not None: - rotary_pos_emb = \ - self.rotary_pos_emb(inference_params.max_sequence_len) + rotary_pos_emb = self.rotary_pos_emb(inference_params.max_sequence_length) else: rotary_pos_emb = self.rotary_pos_emb(self.seq_length) @@ -499,7 +520,8 @@ def forward(self, enc_input_ids, enc_position_ids, enc_attn_mask, retriever_input=retriever_input, retriever_attn_mask=retriever_attn_mask, inference_params=inference_params, - rotary_pos_emb=rotary_pos_emb) + rotary_pos_emb=rotary_pos_emb, + ) else: encoder_output = self.encoder_hidden_state else: @@ -507,8 +529,7 @@ def forward(self, enc_input_ids, enc_position_ids, enc_attn_mask, if self.post_process: if self.add_pooler: - pooled_output = self.pooler(encoder_output, - pooling_sequence_index) + pooled_output = self.pooler(encoder_output, pooling_sequence_index) # output_enc_hidden refers to when we just need the encoder's # output. For example, it is helpful to compute @@ -521,8 +542,7 @@ def forward(self, enc_input_ids, enc_position_ids, enc_attn_mask, # Decoder embedding. if self.pre_process: - decoder_input = self.embedding(dec_input_ids, - dec_position_ids) + decoder_input = self.embedding(dec_input_ids, dec_position_ids) else: decoder_input = None @@ -533,7 +553,8 @@ def forward(self, enc_input_ids, enc_position_ids, enc_attn_mask, encoder_output=encoder_output, enc_dec_attn_mask=enc_dec_attn_mask, inference_params=inference_params, - rotary_pos_emb=rotary_pos_emb) + rotary_pos_emb=rotary_pos_emb, + ) if self.add_pooler and self.post_process: return decoder_output, encoder_output, pooled_output @@ -545,26 +566,27 @@ def state_dict_for_save_checkpoint(self, prefix='', keep_vars=False): state_dict_ = {} if self.pre_process: - state_dict_[self._embedding_key] \ - = self.embedding.state_dict_for_save_checkpoint(prefix=prefix, - keep_vars=keep_vars) + state_dict_[self._embedding_key] = self.embedding.state_dict_for_save_checkpoint( + prefix=prefix, keep_vars=keep_vars + ) if self.add_encoder: - state_dict_[self._encoder_key] \ - = self.encoder.state_dict_for_save_checkpoint(prefix=prefix, - keep_vars=keep_vars) + state_dict_[self._encoder_key] = self.encoder.state_dict_for_save_checkpoint( + prefix=prefix, keep_vars=keep_vars + ) if self.post_process: if self.add_pooler: - state_dict_[self._pooler_key] \ - = self.pooler.state_dict_for_save_checkpoint(prefix=prefix, - keep_vars=keep_vars) + state_dict_[self._pooler_key] = self.pooler.state_dict_for_save_checkpoint( + prefix=prefix, keep_vars=keep_vars + ) if self.untie_embeddings_and_output_weights: - state_dict_[self._output_layer_key] \ - = self.output_layer.state_dict(prefix=prefix, keep_vars=keep_vars) + state_dict_[self._output_layer_key] = self.output_layer.state_dict( + prefix=prefix, keep_vars=keep_vars + ) if self.add_decoder: - state_dict_[self._decoder_key] \ - = self.decoder.state_dict_for_save_checkpoint(prefix=prefix, - keep_vars=keep_vars) + state_dict_[self._decoder_key] = self.decoder.state_dict_for_save_checkpoint( + prefix=prefix, keep_vars=keep_vars + ) return state_dict_ @@ -601,8 +623,9 @@ def 
load_state_dict(self, state_dict, strict=True): state_dict_self_attention = {} for key in state_dict_.keys(): if '.attention.' in key: - state_dict_self_attention[key.replace(".attention.", - ".self_attention.")] = state_dict_[key] + state_dict_self_attention[key.replace(".attention.", ".self_attention.")] = ( + state_dict_[key] + ) else: state_dict_self_attention[key] = state_dict_[key] state_dict_ = state_dict_self_attention @@ -612,18 +635,14 @@ def load_state_dict(self, state_dict, strict=True): # Pooler. if self.post_process: if self.add_pooler: - assert 'pooler' in state_dict, \ - 'could not find data for pooler in the checkpoint' - self.pooler.load_state_dict(state_dict[self._pooler_key], - strict=strict) + assert 'pooler' in state_dict, 'could not find data for pooler in the checkpoint' + self.pooler.load_state_dict(state_dict[self._pooler_key], strict=strict) if self.untie_embeddings_and_output_weights: - assert 'output_layer' in state_dict, \ - 'could not find data for output_layer in the checkpoint' - self.output_layer.load_state_dict(state_dict[self._output_layer_key], - strict=strict) + assert ( + 'output_layer' in state_dict + ), 'could not find data for output_layer in the checkpoint' + self.output_layer.load_state_dict(state_dict[self._output_layer_key], strict=strict) # Decoder. if self.add_decoder: - assert 'decoder' in state_dict, \ - 'could not find data for pooler in the checkpoint' - self.decoder.load_state_dict(state_dict[self._decoder_key], - strict=strict) + assert 'decoder' in state_dict, 'could not find data for pooler in the checkpoint' + self.decoder.load_state_dict(state_dict[self._decoder_key], strict=strict) diff --git a/megatron/model/module.py b/megatron/legacy/model/module.py similarity index 81% rename from megatron/model/module.py rename to megatron/legacy/model/module.py index d4ed76e4ad..c89700e336 100644 --- a/megatron/model/module.py +++ b/megatron/legacy/model/module.py @@ -6,7 +6,7 @@ from torch.autograd import Variable from torch.nn.parameter import Parameter -from megatron import get_args +from megatron.training import get_args from megatron.core import mpu, tensor_parallel @@ -25,10 +25,10 @@ class MegatronModule(torch.nn.Module): """Megatron specific extensions of torch Module with support for pipelining.""" - def __init__(self, share_word_embeddings=True): + def __init__(self, config=None, share_embeddings_and_output_weights=True): super(MegatronModule, self).__init__() - self.share_word_embeddings = share_word_embeddings - + self.config = config + self.share_embeddings_and_output_weights = share_embeddings_and_output_weights def state_dict_for_save_checkpoint(self, prefix='', keep_vars=False): """Use this function to override the state dict for @@ -36,28 +36,35 @@ def state_dict_for_save_checkpoint(self, prefix='', keep_vars=False): return self.state_dict(prefix=prefix, keep_vars=keep_vars) - def word_embeddings_weight(self): + def shared_embedding_or_output_weight(self): if self.pre_process: return self.language_model.embedding.word_embeddings.weight else: - if not self.share_word_embeddings: - raise Exception('word_embeddings_weight() called for last ' - 'stage, but share_word_embeddings is false') + if not self.share_embeddings_and_output_weights: + raise Exception('shared_embedding_or_output_weight() called for last ' + 'stage, but share_embeddings_and_output_weights is false') return self.word_embeddings.weight - def initialize_word_embeddings(self, init_method_normal): + def initialize_word_embeddings(self): args = get_args() - if not 
self.share_word_embeddings: + if not self.share_embeddings_and_output_weights: raise Exception('initialize_word_embeddings() was called but ' - 'share_word_embeddings is false') + 'share_embeddings_and_output_weights is false') # This function just initializes the word embeddings in the final stage # when we are using pipeline parallelism. Nothing to do if we aren't # using pipeline parallelism. if args.pipeline_model_parallel_size == 1: + # Zero out wgrad if sharing embeddings between two layers on same + # pipeline stage to make sure grad accumulation into main_grad is + # correct and does not include garbage values (e.g., from torch.empty). + self.shared_embedding_or_output_weight().zero_out_wgrad = True return + if mpu.is_pipeline_first_stage() and self.pre_process and not self.post_process: + self.shared_embedding_or_output_weight().shared_embedding = True + # Parameters are shared between the word embeddings layers, and the # heads at the end of the model. In a pipelined setup with more than # one stage, the initial embedding layer and the head are on different @@ -76,13 +83,11 @@ def initialize_word_embeddings(self, init_method_normal): # set word_embeddings weights to 0 here, then copy first # stage's weights using all_reduce below. self.word_embeddings = tensor_parallel.VocabParallelEmbedding( - args.padded_vocab_size, args.hidden_size, - init_method=init_method_normal(args.init_method_std), - params_dtype=args.params_dtype, - use_cpu_initialization=args.use_cpu_initialization, - perform_initialization=args.perform_initialization) + args.padded_vocab_size, self.config.hidden_size, + config=self.config, init_method=self.config.init_method) self.word_embeddings.weight.data.fill_(0) self.word_embeddings.weight.shared = True + self.word_embeddings.weight.shared_embedding = True # Zero out initial weights for decoder embedding. # NOTE: We don't currently support T5 with the interleaved schedule. @@ -103,7 +108,8 @@ def initialize_word_embeddings(self, init_method_normal): # Ensure that first and last stages have the same initial parameter # values. 
if mpu.is_rank_in_embedding_group(): - torch.distributed.all_reduce(self.word_embeddings_weight().data, + self.shared_embedding_or_output_weight().data = self.shared_embedding_or_output_weight().data.cuda() + torch.distributed.all_reduce(self.shared_embedding_or_output_weight().data, group=mpu.get_embedding_group()) # Ensure that encoder(first stage) and decoder(split stage) position diff --git a/megatron/model/multiple_choice.py b/megatron/legacy/model/multiple_choice.py similarity index 83% rename from megatron/model/multiple_choice.py rename to megatron/legacy/model/multiple_choice.py index 6af06240d4..bec0548c40 100644 --- a/megatron/model/multiple_choice.py +++ b/megatron/legacy/model/multiple_choice.py @@ -4,36 +4,34 @@ import torch -from megatron import get_args, print_rank_last -from megatron.model.enums import AttnMaskType -from megatron.model.bert_model import bert_extended_attention_mask, bert_position_ids -from megatron.model.language_model import get_language_model -from megatron.model.utils import get_linear_layer -from megatron.model.utils import init_method_normal -from megatron.model.utils import scaled_init_method_normal +from megatron.training import get_args, print_rank_last +from megatron.legacy.model.enums import AttnMaskType +from megatron.legacy.model.bert_model import bert_extended_attention_mask, bert_position_ids +from megatron.legacy.model.language_model import get_language_model +from megatron.legacy.model.utils import get_linear_layer +from megatron.legacy.model.utils import init_method_normal +from megatron.legacy.model.utils import scaled_init_method_normal from .module import MegatronModule class MultipleChoice(MegatronModule): def __init__(self, + config, num_tokentypes=2, pre_process=True, post_process=True): - super(MultipleChoice, self).__init__(share_word_embeddings=False) + super(MultipleChoice, self).__init__(share_embeddings_and_output_weights=False) args = get_args() - init_method = init_method_normal(args.init_method_std) self.pre_process = pre_process self.post_process = post_process self.language_model, self._language_model_key = get_language_model( + config=config, num_tokentypes=num_tokentypes, add_pooler=True, encoder_attn_mask_type=AttnMaskType.padding, - init_method=init_method, - scaled_init_method=scaled_init_method_normal(args.init_method_std, - args.num_layers), pre_process=self.pre_process, post_process=self.post_process) @@ -45,7 +43,7 @@ def __init__(self, self._multichoice_head_key = 'multichoice_head' def set_input_tensor(self, input_tensor): - """See megatron.model.transformer.set_input_tensor()""" + """See megatron.legacy.model.transformer.set_input_tensor()""" self.language_model.set_input_tensor(input_tensor) def forward(self, model_input, attention_mask, tokentype_ids=None): diff --git a/megatron/model/realm_model.py b/megatron/legacy/model/realm_model.py similarity index 92% rename from megatron/model/realm_model.py rename to megatron/legacy/model/realm_model.py index 654f2992f6..1999cdb07c 100644 --- a/megatron/model/realm_model.py +++ b/megatron/legacy/model/realm_model.py @@ -1,17 +1,18 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
import os import torch -from megatron import get_args, print_rank_0 -from megatron.checkpointing import get_checkpoint_tracker_filename, get_checkpoint_name -from megatron.model import BertModel +from megatron.training import get_args, print_rank_0 +from megatron.training.checkpointing import get_checkpoint_tracker_filename, get_checkpoint_name +from megatron.legacy.model import BertModel from .module import MegatronModule from megatron.core import mpu -from megatron.model.enums import AttnMaskType -from megatron.model.utils import get_linear_layer -from megatron.model.utils import init_method_normal -from megatron.model.language_model import get_language_model -from megatron.model.utils import scaled_init_method_normal -from megatron.model.bert_model import bert_extended_attention_mask, bert_position_ids +from megatron.legacy.model.enums import AttnMaskType +from megatron.legacy.model.utils import get_linear_layer +from megatron.legacy.model.utils import init_method_normal +from megatron.legacy.model.language_model import get_language_model +from megatron.legacy.model.utils import scaled_init_method_normal +from megatron.legacy.model.bert_model import bert_extended_attention_mask, bert_position_ids def general_ict_model_provider(only_query_model=False, only_block_model=False): @@ -131,7 +132,7 @@ def init_state_dict_from_bert(self): try: state_dict = torch.load(checkpoint_name, map_location='cpu') - except BaseException: + except Exception: raise ValueError("Could not load checkpoint") # load the LM state dict into each model diff --git a/megatron/legacy/model/rms_norm.py b/megatron/legacy/model/rms_norm.py new file mode 100644 index 0000000000..21ba00c600 --- /dev/null +++ b/megatron/legacy/model/rms_norm.py @@ -0,0 +1,32 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +import torch +from torch import nn + +class RMSNorm(torch.nn.Module): + + def __init__(self, + dim: int, + eps: float = 1e-6, + sequence_parallel: bool = False, + config: dict = None): + """RMS Normalization module + + Args: + dim (int): The width of input, i.e. hidden size + eps (float): epsilon to use for the norm, default to 1e-6 + sequence_parallel (bool): Set to true if sequence parallelism is being used, + this marks the weights as needing to be allreduced.
+ """ + super().__init__() + self.eps = eps + self.weight = nn.Parameter(torch.ones(dim)) + + setattr(self.weight, 'sequence_parallel', sequence_parallel) + + def _norm(self, x): + return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps) + + def forward(self, x): + output = self._norm(x.float()).type_as(x) + return output * self.weight diff --git a/megatron/model/t5_model.py b/megatron/legacy/model/t5_model.py similarity index 86% rename from megatron/model/t5_model.py rename to megatron/legacy/model/t5_model.py index 606c3e75d8..1662188334 100644 --- a/megatron/model/t5_model.py +++ b/megatron/legacy/model/t5_model.py @@ -4,16 +4,14 @@ import torch -from megatron import get_args +from megatron.training import get_args from megatron.core import tensor_parallel -from megatron.model.enums import AttnMaskType -from megatron.model.language_model import parallel_lm_logits, get_language_model -from megatron.model import LayerNorm -from megatron.model.utils import ( +from megatron.legacy.model.enums import AttnMaskType +from megatron.legacy.model.language_model import parallel_lm_logits, get_language_model +from megatron.legacy.model import LayerNorm +from megatron.legacy.model.utils import ( openai_gelu, - get_linear_layer, - init_method_normal, - scaled_init_method_normal + get_linear_layer ) from .module import MegatronModule @@ -41,19 +39,14 @@ def t5_position_ids(token_ids): class T5LMHead(MegatronModule): """Masked LM head for T5 - Arguments: + Args: mpu_vocab_size: model parallel size of vocabulary. - hidden_size: hidden size - init_method: init method for weight initialization - layernorm_epsilon: tolerance for layer norm divisions parallel_output: wether output logits being distributed or not. """ def __init__(self, mpu_vocab_size, parallel_output): super(T5LMHead, self).__init__() - args = get_args() - self.bias = torch.nn.Parameter(torch.zeros(mpu_vocab_size)) self.bias.model_parallel = True self.bias.partition_dim = 0 @@ -72,46 +65,52 @@ class T5Model(MegatronModule): """T5 Language model.""" def __init__(self, + config, num_tokentypes=0, parallel_output=True, pre_process=True, post_process=True, add_encoder=True, add_decoder=True): - super(T5Model, self).__init__() + super().__init__(config=config) args = get_args() self.fp16_lm_cross_entropy = args.fp16_lm_cross_entropy self.parallel_output = parallel_output - init_method = init_method_normal(args.init_method_std) - scaled_init_method = scaled_init_method_normal(args.init_method_std, - args.num_layers) self.pre_process = pre_process self.post_process = post_process self.add_encoder = add_encoder self.add_decoder = add_decoder self.language_model, self._language_model_key = get_language_model( + config=config, num_tokentypes=num_tokentypes, add_pooler=False, add_encoder=add_encoder, add_decoder=add_decoder, encoder_attn_mask_type=AttnMaskType.padding, - init_method=init_method, - scaled_init_method=scaled_init_method, pre_process=self.pre_process, post_process=self.post_process) - self.initialize_word_embeddings(init_method_normal) + self.initialize_word_embeddings() + + if self.pre_process: + self.position_embeddings = self.language_model.embedding.position_embeddings + else: + self.position_embeddings = None if self.post_process and self.add_decoder: self.lm_head = T5LMHead( - self.word_embeddings_weight().size(0), + self.shared_embedding_or_output_weight().size(0), parallel_output) self._lm_head_key = 'lm_head' + # Tells schedules.py that this model has a skip connection between the encoder's output and the decoder + # 
(and hence both the encoder and decoder's tensors are required for correct backprop). + self.xattn_needed = True + def set_input_tensor(self, input_tensor): - """See megatron.model.transformer.set_input_tensor()""" + """See megatron.legacy.model.transformer.set_input_tensor()""" self.language_model.set_input_tensor(input_tensor) def forward(self, encoder_input_ids, decoder_input_ids, encoder_attn_mask, @@ -139,7 +138,7 @@ def forward(self, encoder_input_ids, decoder_input_ids, encoder_attn_mask, decoder_output, encoder_output = lm_output # Output. [s, b, h] lm_logits = self.lm_head(decoder_output, - self.word_embeddings_weight()) + self.shared_embedding_or_output_weight()) if lm_labels is None: # [s b h] => [b s h] diff --git a/megatron/model/transformer.py b/megatron/legacy/model/transformer.py similarity index 68% rename from megatron/model/transformer.py rename to megatron/legacy/model/transformer.py index 4d744e7a25..dda550551a 100644 --- a/megatron/model/transformer.py +++ b/megatron/legacy/model/transformer.py @@ -1,23 +1,46 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. """Transformer.""" -from contextlib import nullcontext import math +import os +from contextlib import nullcontext +from typing import Optional + import numpy as np import torch import torch.nn.functional as F -from typing import Optional -from megatron import get_timers, get_args, get_retro_args, core, get_num_microbatches -from .module import MegatronModule +from megatron import core from megatron.core import mpu, tensor_parallel from megatron.core.enums import ModelType -from megatron.model import LayerNorm -from megatron.model.enums import AttnMaskType, LayerType, AttnType -from megatron.model.fused_softmax import FusedScaleMaskSoftmax -from megatron.model.fused_bias_gelu import bias_gelu_impl -from megatron.model.rotary_pos_embedding import apply_rotary_pos_emb -from megatron.model.utils import attention_mask_func, openai_gelu, erf_gelu +from megatron.legacy.model.enums import AttnMaskType, LayerType, AttnType +from megatron.legacy.model.fused_softmax import FusedScaleMaskSoftmax +from megatron.legacy.model.fused_bias_gelu import bias_gelu_impl +from megatron.core.models.common.embeddings import apply_rotary_pos_emb +from megatron.core.jit import jit_fuser +from megatron.core.num_microbatches_calculator import get_num_microbatches +from megatron.core.parallel_state import ( + get_tensor_and_expert_parallel_group, + get_tensor_model_parallel_group, +) +from megatron.core.tensor_parallel import ( + gather_from_sequence_parallel_region_to_moe, + get_cuda_rng_tracker, + get_data_parallel_rng_tracker_name, + reduce_scatter_to_sequence_parallel_region_from_moe, +) +from megatron.legacy.model.enums import AttnMaskType, AttnType, LayerType +from megatron.legacy.model.fused_bias_gelu import bias_gelu_impl +from megatron.legacy.model.fused_softmax import FusedScaleMaskSoftmax +from megatron.legacy.model.utils import ( + attention_mask_func, + erf_gelu, + get_norm, + openai_gelu, +) +from megatron.training import get_args, get_timers + +from .module import MegatronModule try: from einops import rearrange @@ -27,8 +50,12 @@ try: from flash_attn.flash_attn_interface import flash_attn_unpadded_func except ImportError: - flash_attn_unpadded_func = None - + try: + from flash_attn.flash_attn_interface import ( + flash_attn_varlen_func as flash_attn_unpadded_func, + ) + except ImportError: + flash_attn_unpadded_func = None """ We use the following 
notation throughout this file: h: hidden size @@ -67,18 +94,6 @@ def forward(self, hidden_state): output = hidden_state.div(keep_prob) * random_tensor return output -def _args_to_kwargs(): - args = get_args() - - common_kwargs = { - "params_dtype": args.params_dtype, - "use_cpu_initialization": args.use_cpu_initialization, - "perform_initialization": args.perform_initialization, - "gradient_accumulation_fusion": args.gradient_accumulation_fusion, - "sequence_parallel_enabled": args.sequence_parallel, - } - return common_kwargs - class ParallelMLP(MegatronModule): """MLP. @@ -87,22 +102,27 @@ class ParallelMLP(MegatronModule): state back into h hidden dimension. """ - def __init__(self, init_method, output_layer_init_method): + def __init__(self, config, is_expert=False): super(ParallelMLP, self).__init__() args = get_args() - self.add_bias = args.add_bias_linear + self.add_bias = config.add_bias_linear + + ffn_hidden_size = config.ffn_hidden_size + if config.gated_linear_unit: + ffn_hidden_size *= 2 # Project to 4h. If using swiglu double the output width, see https://arxiv.org/pdf/2002.05202.pdf self.dense_h_to_4h = tensor_parallel.ColumnParallelLinear( - args.hidden_size, - args.ffn_hidden_size * 2 if args.swiglu else args.ffn_hidden_size, + config.hidden_size, + ffn_hidden_size, + config=config, + init_method=config.init_method, bias=self.add_bias, gather_output=False, - init_method=init_method, skip_bias_add=True, - async_tensor_model_parallel_allreduce=args.async_tensor_model_parallel_allreduce, - **_args_to_kwargs()) + is_expert=is_expert, + ) self.bias_gelu_fusion = False self.activation_func = None @@ -127,13 +147,15 @@ def squared_relu(x): # Project back to h. self.dense_4h_to_h = tensor_parallel.RowParallelLinear( - args.ffn_hidden_size, - args.hidden_size, + config.ffn_hidden_size, + config.hidden_size, + config=config, + init_method=config.output_layer_init_method, bias=self.add_bias, - input_is_parallel=True, - init_method=output_layer_init_method, skip_bias_add=True, - **_args_to_kwargs()) + input_is_parallel=True, + is_expert=is_expert, + ) def forward(self, hidden_states): @@ -153,82 +175,169 @@ def forward(self, hidden_states): output, output_bias = self.dense_4h_to_h(intermediate_parallel) return output, output_bias +def sinkhorn(cost, tol=0.0001): + cost = torch.exp(cost) + d0 = torch.ones(cost.size(0), device=cost.device, dtype=cost.dtype) + d1 = torch.ones(cost.size(1), device=cost.device, dtype=cost.dtype) + + eps = 0.00000001 + error = 1e9 + d1_old = d1 + while error > tol: + d0 = (1/d0.size(0))*1/(torch.sum(d1*cost,1) + eps) + d1 = (1/d1.size(0))*1/(torch.sum(d0.unsqueeze(1)*cost,0)+eps) + error = torch.mean(torch.abs(d1_old-d1)) + d1_old = d1 + return d1*cost*d0.unsqueeze(1) + + +def get_router_linear_layer(config): + args = get_args() + router = torch.nn.Linear(args.hidden_size, args.num_experts, bias=False) + with get_cuda_rng_tracker().fork(get_data_parallel_rng_tracker_name()): + config.init_method(router.weight) + setattr(router.weight, 'sequence_parallel',config.sequence_parallel) + return router + + class SwitchMLP(MegatronModule): """ Routes input to one of N MLP "experts" """ - def __init__(self, init_method, output_layer_init_method): + def __init__(self, config): super(SwitchMLP, self).__init__() args = get_args() - self.router = torch.nn.Linear(args.hidden_size, args.num_experts) - self.experts = torch.nn.ModuleList() - for i in range(args.num_experts): - self.experts.append(ParallelMLP(init_method, output_layer_init_method)) + self.router = 
get_router_linear_layer(config) + self.expert_parallel_size = mpu.get_expert_model_parallel_world_size() + self.sequence_parallel = config.sequence_parallel + self.add_bias = config.add_bias_linear + + assert args.num_experts % self.expert_parallel_size == 0 + self.num_local_experts = args.num_experts // self.expert_parallel_size + local_expert_indices_offset = mpu.get_expert_model_parallel_rank() * self.num_local_experts + self.local_expert_indices = [local_expert_indices_offset + i for i in range(self.num_local_experts)] + + self.local_experts = torch.nn.ModuleList() + for i in range(self.num_local_experts): + self.local_experts.append(ParallelMLP(config, is_expert=True)) + + def gather_indices(self, local_indices): + """ Gather tensors and concatenate along the first dimension.""" + group = get_tensor_and_expert_parallel_group() + world_size = torch.distributed.get_world_size(group=group) + # Bypass the function if we are using only 1 GPU. + if world_size == 1: + return local_indices + + dim_size = list(local_indices.size()) + dim_size[0] = dim_size[0] * world_size + + # TODO pre allocate memory + output = torch.empty(dim_size, dtype=local_indices.dtype, + device=torch.cuda.current_device()) + torch.distributed._all_gather_base( + output, local_indices.contiguous(), group=group + ) + return output def forward(self, hidden_states): - # hidden_states: [s, b, h] + # hidden_states: [b, s, h] + args = get_args() s = hidden_states.size(0) b = hidden_states.size(1) h = hidden_states.size(2) - route = self.router(hidden_states) - route = torch.nn.functional.softmax(route, dim=2) - max_prob, max_ind = torch.max(route, dim=2) - max_prob = torch.unsqueeze(max_prob, 2) # [s b 1] + route = self.router(hidden_states).view(-1, args.num_experts) + + # TODO (rprenger) Right now we're just using the sinkhorn algorithm + # for load balancing. There should be an option to do no load balancing + # and the algorithm and parameters should be further tested + if self.training: + with torch.no_grad(): + sinkroute = sinkhorn(route.detach().to(dtype=torch.float32)) + _, max_ind = torch.max(sinkroute, dim=1) + route = torch.sigmoid(route) + max_prob = route[torch.arange(route.size(0)), max_ind] + else: + route = torch.sigmoid(route) + max_prob, max_ind = torch.max(route, dim=1) + + max_prob = torch.unsqueeze(max_prob, 1) + hidden_states = hidden_states.view(-1, hidden_states.size(2)) # TODO (rprenger) TODO this could be made easier to read # Converting [s, b, h] to [s*b, h].
# Each vector could be routed differently - hidden_states = hidden_states.view(-1, hidden_states.size(2)) # [s*b h] - max_prob = max_prob.view(-1, max_prob.size(2)) # [s*b 1] - max_ind = max_ind.view(-1) # [s*b] + if self.sequence_parallel or (self.expert_parallel_size > 1): + global_hidden_states = \ + gather_from_sequence_parallel_region_to_moe(hidden_states) + global_indices = self.gather_indices(max_ind) + else: + global_hidden_states = hidden_states + global_indices = max_ind - output_total = torch.empty_like(hidden_states) - output_bias_total = torch.empty_like(hidden_states) - #TODO (rprenger) This does each expert in serial, but it could be parallelized + output_total = torch.zeros_like(global_hidden_states) + if self.add_bias: + output_bias_total = torch.zeros_like(global_hidden_states) - for expert_num, expert in enumerate(self.experts): - local_indices = (max_ind == expert_num).nonzero() - hidden = hidden_states[local_indices,:] + for expert_num, expert in enumerate(self.local_experts): + local_expert_index = self.local_expert_indices[expert_num] + local_indices = (global_indices == local_expert_index).nonzero() + hidden = global_hidden_states[local_indices, :] output, output_bias = expert(hidden) - output_bias = output_bias.expand_as(output) - output_total[local_indices,:] = output - output_bias_total[local_indices,:] = output_bias + output_total[local_indices, :] = output + if self.add_bias: + output_bias = output_bias.expand_as(output) + output_bias_total[local_indices, :] = output_bias + + if self.sequence_parallel or (self.expert_parallel_size > 1): + output_total = \ + reduce_scatter_to_sequence_parallel_region_from_moe(output_total) + if self.add_bias: + output_bias_total = \ + reduce_scatter_to_sequence_parallel_region_from_moe(output_bias_total) + + # bias is duplicated across tensor parallelism ranks; + # reduce scatter reduces bias across tensor parallel_ranks + output_bias_total = \ + output_bias_total/mpu.get_tensor_model_parallel_world_size() output_total = output_total*max_prob - output_bias_total = output_bias_total*max_prob output_total = output_total.view(s, b, h) - output_bias_total = output_bias_total.view(s, b, h) + if self.add_bias: + output_bias_total = output_bias_total*max_prob + output_bias_total = output_bias_total.view(s, b, h) + else: + output_bias_total = None return output_total, output_bias_total class CoreAttention(MegatronModule): - def __init__(self, layer_number, + def __init__(self, layer_number, config, attn_mask_type=AttnMaskType.padding): super(CoreAttention, self).__init__() - args = get_args() - self.fp16 = args.fp16 - self.bf16 = args.bf16 + self.fp16 = config.fp16 + self.bf16 = config.bf16 - self.apply_query_key_layer_scaling = args.apply_query_key_layer_scaling - self.attention_softmax_in_fp32 = args.attention_softmax_in_fp32 + self.apply_query_key_layer_scaling = config.apply_query_key_layer_scaling + self.attention_softmax_in_fp32 = config.attention_softmax_in_fp32 if self.apply_query_key_layer_scaling: self.attention_softmax_in_fp32 = True self.layer_number = max(1, layer_number) self.attn_mask_type = attn_mask_type - self.sequence_parallel = args.sequence_parallel + self.sequence_parallel = config.sequence_parallel - projection_size = args.kv_channels * args.num_attention_heads + projection_size = config.kv_channels * config.num_attention_heads # Per attention head and per partition values. 
world_size = mpu.get_tensor_model_parallel_world_size() self.hidden_size_per_partition = core.utils.divide(projection_size, world_size) self.hidden_size_per_attention_head = core.utils.divide( - projection_size, args.num_attention_heads) + projection_size, config.num_attention_heads) self.num_attention_heads_per_partition = core.utils.divide( - args.num_attention_heads, world_size) + config.num_attention_heads, world_size) coeff = None self.norm_factor = math.sqrt(self.hidden_size_per_attention_head) @@ -239,7 +348,7 @@ def __init__(self, layer_number, self.scale_mask_softmax = FusedScaleMaskSoftmax( self.fp16, self.bf16, self.attn_mask_type, - args.masked_softmax_fusion, + config.masked_softmax_fusion, attention_mask_func, self.attention_softmax_in_fp32, coeff) @@ -247,7 +356,7 @@ def __init__(self, layer_number, # Dropout. Note that for a single iteration, this layer will generate # different outputs on different number of parallel partitions but # on average it should not be partition dependent. - self.attention_dropout = torch.nn.Dropout(args.attention_dropout) + self.attention_dropout = torch.nn.Dropout(config.attention_dropout) def forward(self, query_layer, key_layer, value_layer, attention_mask): @@ -263,8 +372,8 @@ def forward(self, query_layer, key_layer, key_layer.size(0)) # [sq, b, np, hn] -> [sq, b * np, hn] - query_layer = query_layer.view(output_size[2], - output_size[0] * output_size[1], -1) + query_layer = query_layer.reshape(output_size[2], + output_size[0] * output_size[1], -1) # [sk, b, np, hn] -> [sk, b * np, hn] key_layer = key_layer.view(output_size[3], output_size[0] * output_size[1], -1) @@ -381,17 +490,18 @@ def forward(self, q, k, v): is_causal = self.causal cu_seqlens_k = cu_seqlens_q + dropout_p = self.dropout_p else: # turn off FA causal mask after first inference autoregressive iteration # only on first autoregressive step q,k,v have same seqlen is_causal = seqlen_q == seqlen_k cu_seqlens_k = torch.arange(0, (batch_size + 1) * seqlen_k, step=seqlen_k, dtype=torch.int32, device=q.device) - self.dropout_p = 0 + dropout_p = 0 output = flash_attn_unpadded_func( q, k, v, cu_seqlens_q, cu_seqlens_k, seqlen_q, seqlen_k, - self.dropout_p, + dropout_p, softmax_scale=self.softmax_scale, causal=is_causal ) @@ -406,8 +516,7 @@ class ParallelAttention(MegatronModule): and returns output of the same size. 
""" - def __init__(self, init_method, - output_layer_init_method, layer_number, + def __init__(self, config, layer_number, attention_type=AttnType.self_attn, attn_mask_type=AttnMaskType.padding): super(ParallelAttention, self).__init__() @@ -415,8 +524,17 @@ def __init__(self, init_method, self.layer_number = max(1, layer_number) self.attention_type = attention_type self.attn_mask_type = attn_mask_type - self.params_dtype = args.params_dtype - self.sequence_parallel = args.sequence_parallel + self.params_dtype = config.params_dtype + self.sequence_parallel = config.sequence_parallel + self.config = config + self.group_query_attention = args.group_query_attention + self.num_query_groups = args.num_query_groups + + query_projection_size = config.kv_channels * config.num_attention_heads + if self.group_query_attention: + kv_projection_size = args.kv_channels * args.num_query_groups + else: + kv_projection_size = args.kv_channels * args.num_attention_heads self.use_flash_attn = args.use_flash_attn \ and attention_type == AttnType.self_attn \ @@ -432,64 +550,72 @@ def __init__(self, init_method, if rearrange is None: raise ImportError('einops is not installed, please install with pip install einops') - projection_size = args.kv_channels * args.num_attention_heads - # Per attention head and per partition values. world_size = mpu.get_tensor_model_parallel_world_size() self.hidden_size_per_attention_head = core.utils.divide( - projection_size, args.num_attention_heads) + query_projection_size, config.num_attention_heads) self.num_attention_heads_per_partition = core.utils.divide( - args.num_attention_heads, world_size) + config.num_attention_heads, world_size) + + if self.group_query_attention: + if args.num_query_groups % world_size != 0: + raise NotImplementedError('Currently the num_query_groups should be ' + 'a multiple of the tensor parallel size') + self.num_query_groups_per_partition = core.utils.divide( + args.num_query_groups, world_size) + else: + self.num_query_groups_per_partition = self.num_attention_heads_per_partition # Strided linear layer. 
if attention_type == AttnType.self_attn: self.query_key_value = tensor_parallel.ColumnParallelLinear( - args.hidden_size, - 3 * projection_size, - bias=args.add_bias_linear, - gather_output=False, - init_method=init_method, - async_tensor_model_parallel_allreduce=args.async_tensor_model_parallel_allreduce, - **_args_to_kwargs()) + config.hidden_size, + query_projection_size + 2 * kv_projection_size, + config=config, + init_method=config.init_method, + bias=args.add_bias_linear or args.add_qkv_bias, + gather_output=False) else: assert attention_type == AttnType.cross_attn - self.query = tensor_parallel.ColumnParallelLinear( - args.hidden_size, - projection_size, - bias=args.add_bias_linear, - gather_output=False, - init_method=init_method, - async_tensor_model_parallel_allreduce=args.async_tensor_model_parallel_allreduce, - **_args_to_kwargs()) + if self.group_query_attention: + raise NotImplementedError("Grouped query attention not implemented for cross-attention.") + assert query_projection_size == kv_projection_size + + self.query = tensor_parallel.ColumnParallelLinear( + config.hidden_size, + query_projection_size, + config=config, + init_method=config.init_method, + bias=config.add_bias_linear, + gather_output=False) self.key_value = tensor_parallel.ColumnParallelLinear( - args.hidden_size, - 2 * projection_size, - bias=args.add_bias_linear, - gather_output=False, - init_method=init_method, - async_tensor_model_parallel_allreduce=args.async_tensor_model_parallel_allreduce, - **_args_to_kwargs()) - - self.core_attention = CoreAttention(self.layer_number, + config.hidden_size, + 2 * kv_projection_size, + config=config, + init_method=config.init_method, + bias=config.add_bias_linear, + gather_output=False) + + self.core_attention = CoreAttention(self.layer_number, config, self.attn_mask_type) - self.checkpoint_core_attention = args.recompute_granularity == 'selective' + self.checkpoint_core_attention = config.recompute_granularity == 'selective' if self.use_flash_attn: self.core_attention_flash = FlashSelfAttention( - causal=True, attention_dropout=args.attention_dropout + causal=True, attention_dropout=config.attention_dropout ) # Output. 
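# The projections above follow the standard Megatron tensor-parallel pattern: a
# column-parallel QKV linear (output features sharded) feeding a row-parallel output
# linear (input features sharded), so one reduction restores the full hidden state.
# Single-process sketch of that split with hypothetical sizes, ignoring the attention
# computation that sits between the two linears.
import torch

hidden, tp = 16, 2
w_qkv = torch.randn(3 * hidden, hidden)    # full fused QKV weight
w_out = torch.randn(hidden, 3 * hidden)    # full output-projection weight (`dense` below)

x = torch.randn(4, hidden)
qkv_shards = w_qkv.chunk(tp, dim=0)        # column-parallel: shard output features
out_shards = w_out.chunk(tp, dim=1)        # row-parallel: shard input features to match

partials = [x @ q.t() @ o.t() for q, o in zip(qkv_shards, out_shards)]
full = sum(partials)                       # what the all-reduce across TP ranks produces
assert torch.allclose(full, x @ w_qkv.t() @ w_out.t(), rtol=1e-4, atol=1e-3)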
self.dense = tensor_parallel.RowParallelLinear( - projection_size, - args.hidden_size, + query_projection_size, + config.hidden_size, + config=config, + init_method=config.output_layer_init_method, bias=args.add_bias_linear, input_is_parallel=True, - init_method=output_layer_init_method, - skip_bias_add=True, - **_args_to_kwargs()) + skip_bias_add=True) def _checkpointed_attention_forward(self, query_layer, key_layer, value_layer, attention_mask, @@ -514,11 +640,11 @@ def custom_forward(*inputs): return hidden_states - def _allocate_memory(self, inference_max_sequence_len, batch_size): + def _allocate_memory(self, inference_max_sequence_len, batch_size, num_attention_heads): return torch.empty( inference_max_sequence_len, batch_size, - self.num_attention_heads_per_partition, + num_attention_heads, self.hidden_size_per_attention_head, dtype=self.params_dtype, device=torch.cuda.current_device()) @@ -534,12 +660,15 @@ def forward(self, hidden_states, attention_mask, is_first_step = False if inference_params: if self.layer_number not in inference_params.key_value_memory_dict: - inf_max_seq_len = inference_params.max_sequence_len + inf_max_seq_len = inference_params.max_sequence_length inf_max_batch_size = inference_params.max_batch_size inference_key_memory = self._allocate_memory( - inf_max_seq_len, inf_max_batch_size) + inf_max_seq_len, inf_max_batch_size, + self.num_query_groups_per_partition) inference_value_memory = self._allocate_memory( - inf_max_seq_len, inf_max_batch_size) + inf_max_seq_len, inf_max_batch_size, + self.num_query_groups_per_partition) + inference_params.key_value_memory_dict[self.layer_number] = ( inference_key_memory, inference_value_memory) is_first_step = True @@ -550,21 +679,38 @@ def forward(self, hidden_states, attention_mask, # ===================== # Query, Key, and Value # ===================== - if self.attention_type == AttnType.self_attn: - # Attention heads [sq, b, h] --> [sq, b, (np * 3 * hn)] + + # Attention heads [sq, b, h] --> [sq, b, ng * (np/ng + 2) * hn)] mixed_x_layer, _ = self.query_key_value(hidden_states) - # [sq, b, (np * 3 * hn)] --> [sq, b, np, 3 * hn] - new_tensor_shape = mixed_x_layer.size()[:-1] + \ - (self.num_attention_heads_per_partition, - 3 * self.hidden_size_per_attention_head) + # [sq, b, hp] --> [sq, b, ng, (np/ng + 2) * hn] + new_tensor_shape = mixed_x_layer.size()[:-1] + ( + self.num_query_groups_per_partition, + ( + (self.num_attention_heads_per_partition // self.num_query_groups_per_partition + 2) + * self.hidden_size_per_attention_head + ), + ) mixed_x_layer = mixed_x_layer.view(*new_tensor_shape) - # [sq, b, np, 3 * hn] --> 3 [sq, b, np, hn] + # [sq, b, ng, (np/ng + 2) * hn] --> [sq, b, ng, np/ng * hn], [sq, b, ng, hn], [sq, b, ng, hn] (query_layer, - key_layer, - value_layer) = tensor_parallel.split_tensor_along_last_dim(mixed_x_layer, 3) + key_layer, + value_layer) = torch.split( + mixed_x_layer, + [ + ( + self.num_attention_heads_per_partition // self.num_query_groups_per_partition + * self.hidden_size_per_attention_head + ), + self.hidden_size_per_attention_head, + self.hidden_size_per_attention_head + ], + dim=3) + + # [sq, b, ng, np/ng * hn] -> [sq, b, np, hn] - + query_layer = query_layer.view(query_layer.size(0), query_layer.size(1), -1, self.hidden_size_per_attention_head) else: # Attention heads [sk, b, h] --> [sk, b, (np * 2 * hn)] mixed_kv_layer, _ = self.key_value(encoder_output) @@ -572,19 +718,19 @@ def forward(self, hidden_states, attention_mask, # [sk, b, (np * 2 * hn)] --> [sk, b, np, 2 * hn] new_tensor_shape 
= mixed_kv_layer.size()[:-1] + \ (self.num_attention_heads_per_partition, - 2 * self.hidden_size_per_attention_head) + 2 * self.hidden_size_per_attention_head) mixed_kv_layer = mixed_kv_layer.view(*new_tensor_shape) # [sk, b, np, 2 * hn] --> 2 [sk, b, np, hn] (key_layer, - value_layer) = tensor_parallel.split_tensor_along_last_dim(mixed_kv_layer, 2) + value_layer) = tensor_parallel.split_tensor_along_last_dim(mixed_kv_layer, 2) # Attention head [sq, b, h] --> [sq, b, hp] query_layer, _ = self.query(hidden_states) # [sq, b, hp] --> [sq, b, np, hn] new_tensor_shape = query_layer.size()[:-1] + \ (self.num_attention_heads_per_partition, - self.hidden_size_per_attention_head) + self.hidden_size_per_attention_head) query_layer = query_layer.view(*new_tensor_shape) # ================================== @@ -636,16 +782,26 @@ def forward(self, hidden_states, attention_mask, k_pos_emb = k_pos_emb[:sequence_end, :, :, :] rotary_pos_emb = (q_pos_emb, k_pos_emb) - # ================================== # core attention computation # ================================== + # expand the key_layer and value_layer [sk, b, ng, hn] -> [sk, b, np, hn] + if self.num_attention_heads_per_partition // self.num_query_groups_per_partition > 1: + key_layer = key_layer.repeat_interleave( + self.num_attention_heads_per_partition // self.num_query_groups_per_partition, + dim = 2 + ) + value_layer = value_layer.repeat_interleave( + self.num_attention_heads_per_partition // self.num_query_groups_per_partition, + dim = 2 + ) + # apply relative positional encoding (rotary embedding) if rotary_pos_emb is not None: q_pos_emb, k_pos_emb = rotary_pos_emb - query_layer = apply_rotary_pos_emb(query_layer, q_pos_emb) - key_layer = apply_rotary_pos_emb(key_layer, k_pos_emb) + query_layer = apply_rotary_pos_emb(query_layer, q_pos_emb,self.config) + key_layer = apply_rotary_pos_emb(key_layer, k_pos_emb,self.config) # TODO, can apply positional embedding to value_layer so it has # absolute positional embedding. # otherwise, only relative positional embedding takes effect @@ -692,7 +848,7 @@ def _bias_dropout_add(x, bias, residual, prob): return _bias_dropout_add -@torch.jit.script +@jit_fuser def bias_dropout_add_fused_train(x: torch.Tensor, bias: Optional[torch.Tensor], residual: torch.Tensor, @@ -700,7 +856,7 @@ def bias_dropout_add_fused_train(x: torch.Tensor, return bias_dropout_add(x, bias, residual, prob, True) -@torch.jit.script +@jit_fuser def bias_dropout_add_fused_inference(x: torch.Tensor, bias: Optional[torch.Tensor], residual: torch.Tensor, @@ -715,49 +871,37 @@ class ParallelTransformerLayer(MegatronModule): output of the same size. """ - def __init__(self, init_method, output_layer_init_method, + def __init__(self, config, layer_number, layer_type=LayerType.encoder, self_attn_mask_type=AttnMaskType.padding, drop_path_rate=0.): - # retriever=None): args = get_args() super(ParallelTransformerLayer, self).__init__() self.layer_number = layer_number self.layer_type = layer_type - self.apply_residual_connection_post_layernorm \ - = args.apply_residual_connection_post_layernorm + self.apply_residual_connection_post_norm \ + = config.apply_residual_connection_post_layernorm - self.bf16 = args.bf16 - self.fp32_residual_connection = args.fp32_residual_connection + self.bf16 = config.bf16 + self.fp32_residual_connection = config.fp32_residual_connection - # Layernorm on the input data. 
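# Shape sketch of the grouped-query QKV split and key/value expansion performed in
# ParallelAttention.forward above, with hypothetical sizes (np=8 heads, ng=2 groups, hn=4).
import torch

sq, b, np_, ng, hn = 5, 2, 8, 2, 4
mixed_x_layer = torch.randn(sq, b, ng * (np_ // ng + 2) * hn)      # fused QKV output

# [sq, b, ng*(np/ng+2)*hn] -> [sq, b, ng, (np/ng+2)*hn]
mixed_x_layer = mixed_x_layer.view(sq, b, ng, (np_ // ng + 2) * hn)
query, key, value = torch.split(
    mixed_x_layer, [np_ // ng * hn, hn, hn], dim=3)

query = query.reshape(sq, b, np_, hn)              # [sq, b, np, hn]
# Expand K/V from ng groups to np heads so core attention sees matching head counts.
key = key.repeat_interleave(np_ // ng, dim=2)      # [sq, b, np, hn]
value = value.repeat_interleave(np_ // ng, dim=2)
assert query.shape == key.shape == value.shape == (sq, b, np_, hn)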
- self.input_layernorm = LayerNorm( - args.hidden_size, - eps=args.layernorm_epsilon, - no_persist_layer_norm=args.no_persist_layer_norm, - sequence_parallel=args.sequence_parallel, - apply_layernorm_1p=args.apply_layernorm_1p) + # Normalize the input data. + self.input_norm = get_norm(config) # Self attention. self.self_attention = ParallelAttention( - init_method, - output_layer_init_method, + config, layer_number, attention_type=AttnType.self_attn, attn_mask_type=self_attn_mask_type) - self.hidden_dropout = args.hidden_dropout - self.bias_dropout_fusion = args.bias_dropout_fusion + self.hidden_dropout = config.hidden_dropout + self.bias_dropout_fusion = config.bias_dropout_fusion self.drop_path = DropPath(drop_path_rate) if drop_path_rate > 0.0 else None - # Layernorm on the attention output - self.post_attention_layernorm = LayerNorm( - args.hidden_size, - eps=args.layernorm_epsilon, - no_persist_layer_norm=args.no_persist_layer_norm, - sequence_parallel=args.sequence_parallel, - apply_layernorm_1p=args.apply_layernorm_1p) + # Normalize the attention output + self.post_attention_norm = get_norm(config) # Cross attention. if self.layer_type in (LayerType.decoder, @@ -765,23 +909,17 @@ def __init__(self, init_method, output_layer_init_method, LayerType.retro_decoder_with_retriever, LayerType.retro_encoder): self.inter_attention = ParallelAttention( - init_method, - output_layer_init_method, + config, layer_number, attention_type=AttnType.cross_attn) - # Layernorm on the attention output. - self.post_inter_attention_layernorm = LayerNorm( - args.hidden_size, - eps=args.layernorm_epsilon, - no_persist_layer_norm=args.no_persist_layer_norm, - sequence_parallel=args.sequence_parallel, - apply_layernorm_1p=args.apply_layernorm_1p) + # Normalize the attention output. + self.post_inter_attention_norm = get_norm(config) # MLP if args.num_experts is not None: - self.mlp = SwitchMLP(init_method, output_layer_init_method) + self.mlp = SwitchMLP(config) else: - self.mlp = ParallelMLP(init_method, output_layer_init_method) + self.mlp = ParallelMLP(config) # Set bias+dropout+add fusion grad_enable execution handler. TORCH_MAJOR = int(torch.__version__.split('.')[0]) @@ -791,16 +929,15 @@ def __init__(self, init_method, output_layer_init_method, nullcontext if use_nvfuser else torch.enable_grad if args.retro_add_retriever: - retro_args = get_retro_args() self.retro_num_neighbors = args.retro_num_neighbors - self.retro_chunk_length = retro_args.retro_gpt_chunk_length - self.retro_retrieved_length = retro_args.retro_gpt_retrieved_length + self.retro_chunk_length = args.retro_chunk_length + self.retro_retrieved_length = \ + args.retro_num_retrieved_chunks * args.retro_chunk_length # Retriever (bi-directional transformer with cross attention) if layer_type == LayerType.retro_decoder_with_retriever: self.retriever = ParallelTransformer( - init_method, - output_layer_init_method, + config=config, model_type=ModelType.retro_encoder, self_attn_mask_type=AttnMaskType.padding, pre_process=True, @@ -813,43 +950,43 @@ def __init__(self, init_method, output_layer_init_method, def default_decoder_cross_attention(self, encoder_output, enc_dec_attn_mask, - layernorm_input, - layernorm_output, + norm_input, + norm_output, bias_dropout_add_func): '''Cross attention for a standard encoder-decoder model.''' # Attention. attention_output, attention_bias = \ - self.inter_attention(layernorm_output, + self.inter_attention(norm_output, enc_dec_attn_mask, encoder_output=encoder_output) # Residual connection. 
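# Reference (unfused) form of the RMSNorm that get_norm(config) can return when
# --normalization RMSNorm is selected; the actual Megatron RMSNorm class may handle
# sequence parallelism, epsilon and dtype casting differently.
import torch

class RMSNormRef(torch.nn.Module):
    def __init__(self, dim, eps=1e-5):
        super().__init__()
        self.eps = eps
        self.weight = torch.nn.Parameter(torch.ones(dim))

    def forward(self, x):
        # Scale by the inverse root-mean-square of the last dim; no mean subtraction, no bias.
        rms = torch.rsqrt(x.pow(2).mean(dim=-1, keepdim=True) + self.eps)
        return self.weight * (x * rms)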
- if self.apply_residual_connection_post_layernorm: - residual = layernorm_output + if self.apply_residual_connection_post_norm: + residual = norm_output else: - residual = layernorm_input + residual = norm_input if attention_bias is not None: attention_bias = attention_bias.expand_as(residual) # Bias-dropout-add. with self.bias_dropout_add_exec_handler(): - layernorm_input = bias_dropout_add_func( + norm_input = bias_dropout_add_func( attention_output, attention_bias, residual, self.hidden_dropout) - # Layer norm. - layernorm_output = self.post_inter_attention_layernorm(layernorm_input) + # Normalize. + norm_output = self.post_inter_attention_norm(norm_input) - return layernorm_input, layernorm_output + return norm_input, norm_output def retro_encoder_cross_attention(self, retriever_output, - layernorm_input, - layernorm_output, + norm_input, + norm_output, bias_dropout_add_func): """Cross attention for Retro encoder. @@ -862,20 +999,20 @@ def retro_encoder_cross_attention(self, r : Number of retrieved tokens (neighbors + continuation). """ - ns, bs, d = layernorm_output.shape # [r, bs * l * k, d] + ns, bs, d = norm_output.shape # [r, bs * l * k, d] # Divide sequence dimension into chunks. - chunked_outputs = layernorm_output.reshape(self.retro_retrieved_length, - -1, - self.retro_num_neighbors, - d) - chunked_outputs_before_layer_norm = \ - layernorm_input.reshape(self.retro_retrieved_length, -1, - self.retro_num_neighbors, d) # [r, bs*l, k, d] + chunked_outputs = norm_output.reshape(self.retro_retrieved_length, + -1, + self.retro_num_neighbors, + d) + chunked_outputs_before_norm = \ + norm_input.reshape(self.retro_retrieved_length, -1, + self.retro_num_neighbors, d) # [r, bs*l, k, d] # Per-chunk attention. - layernorm_inputs = [] - layernorm_outputs = [] + norm_inputs = [] + norm_outputs = [] for k in range(self.retro_num_neighbors): # Attention. @@ -887,41 +1024,38 @@ def retro_encoder_cross_attention(self, encoder_output=retriever_output) # K, V (hidden act) # Residual connection. - if self.apply_residual_connection_post_layernorm: + if self.apply_residual_connection_post_norm: residual = chunked_output else: - residual = chunked_outputs_before_layer_norm[:,:,k] + residual = chunked_outputs_before_norm[:,:,k] # Re-enable torch grad to enable fused optimization. with torch.enable_grad(): - layernorm_input = bias_dropout_add_func( + norm_input = bias_dropout_add_func( attention_output, None if attention_bias is None else attention_bias.expand_as(residual), residual, self.hidden_dropout) - layernorm_inputs.append(layernorm_input) + norm_inputs.append(norm_input) # Layer norm. - layernorm_output = \ - self.post_inter_attention_layernorm(layernorm_input) - layernorm_outputs.append(layernorm_output) + norm_output = self.post_inter_attention_norm(norm_input) + norm_outputs.append(norm_output) # Concatenate layer norms. 
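# Unfused reference form of the bias-dropout-add helper used for every residual branch
# above; the @jit_fuser variants compute the same thing with `training` fixed to True
# (train) or False (inference) so the fuser can specialize them.
import torch

def bias_dropout_add_ref(x, bias, residual, prob, training):
    if bias is not None:
        x = x + bias
    out = torch.nn.functional.dropout(x, p=prob, training=training)
    return residual + out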
- # layernorm_input : [r, k * bs * l, d] - # layernorm_output : [r, k * bs * l, d] - layernorm_input = \ - torch.stack(layernorm_inputs, dim=1).reshape(ns, bs, d) - layernorm_output = \ - torch.stack(layernorm_outputs, dim=1).reshape(ns, bs, d) + # norm_input : [r, k * bs * l, d] + # norm_output : [r, k * bs * l, d] + norm_input = torch.stack(norm_inputs, dim=1).reshape(ns, bs, d) + norm_output = torch.stack(norm_outputs, dim=1).reshape(ns, bs, d) - return layernorm_input, layernorm_output + return norm_input, norm_output def retro_decoder_cross_attention(self, retriever_input, retriever_output, retriever_attn_mask, - layernorm_input, - layernorm_output, + norm_input, + norm_output, inference_params, bias_dropout_add_func): """Cross attention for Retro decoder. @@ -936,16 +1070,15 @@ def retro_decoder_cross_attention(self, r : Number of retrieved tokens (neighbors + continuation). """ - ns, bs, d = layernorm_output.shape + ns, bs, d = norm_output.shape l = int(np.ceil(ns / self.retro_chunk_length)) # Retrieve neighbors. if self.layer_type == LayerType.retro_decoder_with_retriever: first_ns = ns % self.retro_chunk_length if first_ns > 0: - raise Exception("test this case.") first_chunk, rest_chunk = \ - layernorm_output[:first_ns], layernorm_output[first_ns:] + norm_output[:first_ns], norm_output[first_ns:] first_chunk = torch.nn.functional.pad( first_chunk, (0, 0, 0, 0, 0, self.retro_chunk_length - first_ns), @@ -954,7 +1087,7 @@ def retro_decoder_cross_attention(self, chunked_output = \ torch.cat((first_chunk, rest_chunk), dim=0) # [l * m, bs, d] else: - chunked_output = layernorm_output # [l * m, bs, d] + chunked_output = norm_output # [l * m, bs, d] chunked_output = chunked_output \ .reshape(l, self.retro_chunk_length, bs, d) \ .permute(1, 2, 0, 3) \ @@ -973,7 +1106,7 @@ def retro_decoder_cross_attention(self, # Chunks. pad = (ns - 1) % self.retro_chunk_length - attending_chunks = layernorm_output[pad:] + attending_chunks = norm_output[pad:] padded_chunks = torch.nn.functional.pad( attending_chunks, (0, 0, 0, 0, 0, self.retro_chunk_length - 1), @@ -991,32 +1124,34 @@ def retro_decoder_cross_attention(self, encoder_output=retriever_output) # Residual connection. - if self.apply_residual_connection_post_layernorm: - residual = layernorm_output + if self.apply_residual_connection_post_norm: + residual = norm_output else: - residual = layernorm_input + residual = norm_input # Re-enable torch grad to enable fused optimization. 
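# Shape walk-through of the Retro-decoder chunking above: ns decoder tokens are grouped
# into l chunks of m = retro_chunk_length tokens (padded so the split lines up), then the
# chunk dimension is folded into the batch so each chunk cross-attends to its neighbors.
import math
import torch

ns, bs, d, m = 10, 2, 8, 4                        # hypothetical sizes
l = int(math.ceil(ns / m))                        # 3 chunks
norm_output = torch.randn(ns, bs, d)

pad = (ns - 1) % m                                # 1: offset of the first attending token
attending_chunks = norm_output[pad:]              # [ns - pad, bs, d]
padded_chunks = torch.nn.functional.pad(
    attending_chunks, (0, 0, 0, 0, 0, m - 1), 'constant', 0)   # [l*m, bs, d]
padded_chunks = padded_chunks \
    .reshape(l, m, bs, d) \
    .permute(1, 2, 0, 3) \
    .reshape(m, bs * l, d)                        # ready for per-chunk cross-attention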
with torch.enable_grad(): - layernorm_input = bias_dropout_add_func( + norm_input = bias_dropout_add_func( attention_output, None if attention_bias is None else attention_bias.expand_as(attention_output), torch.zeros_like(attention_output), self.hidden_dropout) - layernorm_input = layernorm_input \ + norm_input = norm_input \ .reshape(self.retro_chunk_length, bs, l, d) \ .permute(2, 0, 1, 3) # [l, m, bs, d] - layernorm_input = layernorm_input.reshape(self.retro_chunk_length * l, bs, d) - layernorm_input = torch.nn.functional.pad( - layernorm_input, + norm_input = norm_input.reshape(self.retro_chunk_length * l, bs, d) + norm_input = torch.nn.functional.pad( + norm_input, (0, 0, 0, 0, pad, 0), 'constant', 0)[:ns] # [ns, b, d] - layernorm_input = layernorm_input + residual + # TODO: better redesign with inference param + args = get_args() + norm_input = args.retro_attention_gate * norm_input + residual # Layer norm post the decoder attention - layernorm_output = self.post_inter_attention_layernorm(layernorm_input) + norm_output = self.post_inter_attention_norm(norm_input) - return retriever_output, layernorm_input, layernorm_output + return retriever_output, norm_input, norm_output def forward(self, hidden_states, attention_mask, encoder_output=None, enc_dec_attn_mask=None, @@ -1025,22 +1160,32 @@ def forward(self, hidden_states, attention_mask, retriever_attn_mask=None, inference_params=None, rotary_pos_emb=None): + + # Update the params in case the retro param changes during inference + # TODO: better redesign with inference param + args = get_args() + if args.retro_add_retriever: + self.retro_num_neighbors = args.retro_num_neighbors + self.retro_chunk_length = args.retro_chunk_length + self.retro_retrieved_length = \ + args.retro_num_retrieved_chunks * args.retro_chunk_length + # hidden_states: [s, b, h] # Layer norm at the beginning of the transformer layer. - layernorm_output = self.input_layernorm(hidden_states) + norm_output = self.input_norm(hidden_states) # Self attention. attention_output, attention_bias = \ self.self_attention( - layernorm_output, + norm_output, attention_mask, inference_params=inference_params, rotary_pos_emb=rotary_pos_emb) # Residual connection. - if self.apply_residual_connection_post_layernorm: - residual = layernorm_output + if self.apply_residual_connection_post_norm: + residual = norm_output else: residual = hidden_states @@ -1060,7 +1205,7 @@ def forward(self, hidden_states, attention_mask, if attention_bias is not None: attention_bias = attention_bias.expand_as(residual) with self.bias_dropout_add_exec_handler(): - layernorm_input = bias_dropout_add_func( + norm_input = bias_dropout_add_func( attention_output, attention_bias, residual, @@ -1069,38 +1214,38 @@ def forward(self, hidden_states, attention_mask, out = torch.nn.functional.dropout(attention_output + attention_bias, p=self.hidden_dropout, training=self.training) - layernorm_input = residual + self.drop_path(out) + norm_input = residual + self.drop_path(out) # Layer norm post the self attention. - layernorm_output = self.post_attention_layernorm(layernorm_input) + norm_output = self.post_attention_norm(norm_input) # Cross attention. 
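# Condensed view of the (pre-norm) layer forward assembled above, omitting cross
# attention, drop path and the Retro branches; `norm1`, `attn`, `norm2`, `mlp` and
# `dropout_add` are stand-ins, not Megatron APIs. With
# --apply-residual-connection-post-layernorm the residuals would instead be the
# normalized tensors (norm_output) rather than the raw inputs.
def layer_forward(hidden_states, norm1, attn, norm2, mlp, dropout_add):
    norm_output = norm1(hidden_states)                             # input_norm
    attn_out, attn_bias = attn(norm_output)                        # self attention
    norm_input = dropout_add(attn_out, attn_bias, hidden_states)   # first residual
    norm_output = norm2(norm_input)                                # post_attention_norm
    mlp_out, mlp_bias = mlp(norm_output)
    return dropout_add(mlp_out, mlp_bias, norm_input)              # second residual

# Toy usage with identity stand-ins and a dropout-free add:
add = lambda x, bias, residual: residual + (x if bias is None else x + bias)
out = layer_forward(1.0, lambda x: x, lambda x: (x, None), lambda x: x, lambda x: (x, None), add)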
if self.layer_type == LayerType.encoder: pass elif self.layer_type == LayerType.decoder: - layernorm_input, layernorm_output = \ + norm_input, norm_output = \ self.default_decoder_cross_attention( encoder_output, enc_dec_attn_mask, - layernorm_input, - layernorm_output, + norm_input, + norm_output, bias_dropout_add_func) elif self.layer_type == LayerType.retro_encoder: - layernorm_input, layernorm_output = \ + norm_input, norm_output = \ self.retro_encoder_cross_attention( retriever_output, - layernorm_input, - layernorm_output, + norm_input, + norm_output, bias_dropout_add_func) elif self.layer_type in (LayerType.retro_decoder, LayerType.retro_decoder_with_retriever): - retriever_output, layernorm_input, layernorm_output = \ + retriever_output, norm_input, norm_output = \ self.retro_decoder_cross_attention( retriever_input, retriever_output, retriever_attn_mask, - layernorm_input, - layernorm_output, + norm_input, + norm_output, inference_params, bias_dropout_add_func) else: @@ -1108,13 +1253,13 @@ def forward(self, hidden_states, attention_mask, self.layer_type.name) # MLP. - mlp_output, mlp_bias = self.mlp(layernorm_output) + mlp_output, mlp_bias = self.mlp(norm_output) # Second residual connection. - if self.apply_residual_connection_post_layernorm: - residual = layernorm_output + if self.apply_residual_connection_post_norm: + residual = norm_output else: - residual = layernorm_input + residual = norm_input if self.drop_path is None: if mlp_bias is not None: @@ -1182,47 +1327,21 @@ def _get_num_layers(args, model_type, is_decoder=False): if model_type == ModelType.retro_encoder: num_layers = args.retro_encoder_layers elif mpu.get_pipeline_model_parallel_world_size() > 1: - if is_encoder_and_decoder_model: - assert args.pipeline_model_parallel_split_rank is not None - - # When a standalone embedding stage is used, a rank is taken from - # the encoder's ranks, to be used for the encoder's embedding - # layer. This way, the rank referenced by the 'split rank' remains - # the same whether or not a standalone embedding stage is used. - num_ranks_in_encoder = ( - args.pipeline_model_parallel_split_rank - 1 - if args.standalone_embedding_stage else - args.pipeline_model_parallel_split_rank - ) - num_ranks_in_decoder = args.transformer_pipeline_model_parallel_size - num_ranks_in_encoder - assert args.encoder_num_layers % num_ranks_in_encoder == 0, \ - 'encoder_num_layers (%d) must be divisible by number of ranks given to encoder (%d)' % (args.encoder_num_layers, num_ranks_in_encoder) - assert args.decoder_num_layers % num_ranks_in_decoder == 0, \ - 'decoder_num_layers (%d) must be divisible by number of ranks given to decoder (%d)' % (args.decoder_num_layers, num_ranks_in_decoder) - if mpu.is_pipeline_stage_before_split(): - num_layers = ( - 0 - if args.standalone_embedding_stage - and mpu.get_pipeline_model_parallel_rank() == 0 else - args.encoder_num_layers // num_ranks_in_encoder - ) - else: - num_layers = args.decoder_num_layers // num_ranks_in_decoder - else: - assert args.num_layers == args.encoder_num_layers - assert args.num_layers % args.transformer_pipeline_model_parallel_size == 0, \ - 'num_layers must be divisible by transformer_pipeline_model_parallel_size' - - # When a standalone embedding stage is used, all transformer layers - # are divided among pipeline rank >= 1, while on pipeline rank 0, - # ranks either contain the input embedding layer (virtual pp rank 0), - # or no layers at all (virtual pp rank >= 1). 
- num_layers = ( - 0 - if args.standalone_embedding_stage - and mpu.get_pipeline_model_parallel_rank() == 0 else - args.num_layers // args.transformer_pipeline_model_parallel_size - ) + assert not is_encoder_and_decoder_model, "This is no longer supported." + assert args.num_layers == args.encoder_num_layers + assert args.num_layers % args.transformer_pipeline_model_parallel_size == 0, \ + 'num_layers must be divisible by transformer_pipeline_model_parallel_size' + + # When a standalone embedding stage is used, all transformer layers + # are divided among pipeline rank >= 1, while on pipeline rank 0, + # ranks either contain the input embedding layer (virtual pp rank 0), + # or no layers at all (virtual pp rank >= 1). + num_layers = ( + 0 + if args.standalone_embedding_stage + and mpu.get_pipeline_model_parallel_rank() == 0 else + args.num_layers // args.transformer_pipeline_model_parallel_size + ) else: if not is_decoder: num_layers = args.encoder_num_layers @@ -1250,10 +1369,10 @@ def _get_layer_type(model_type, default_layer_type, retro_layer_numbers, class ParallelTransformer(MegatronModule): """Transformer class.""" - def __init__(self, init_method, output_layer_init_method, + def __init__(self, config, model_type, layer_type=LayerType.encoder, self_attn_mask_type=AttnMaskType.padding, - post_layer_norm=True, + post_norm=True, pre_process=True, post_process=True, drop_path_rate=0.0): @@ -1262,9 +1381,9 @@ def __init__(self, init_method, output_layer_init_method, self.layer_type = layer_type self.model_type = model_type - self.bf16 = args.bf16 - self.fp32_residual_connection = args.fp32_residual_connection - self.post_layer_norm = post_layer_norm + self.bf16 = config.bf16 + self.fp32_residual_connection = config.fp32_residual_connection + self.post_norm = post_norm self.pre_process = pre_process self.post_process = post_process self.input_tensor = None @@ -1273,37 +1392,45 @@ def __init__(self, init_method, output_layer_init_method, self.retro_add_retriever = args.retro_add_retriever # Store activation checkpoiting flag. - self.recompute_granularity = args.recompute_granularity - self.recompute_method = args.recompute_method - self.recompute_num_layers = args.recompute_num_layers + self.recompute_granularity = config.recompute_granularity + self.recompute_method = config.recompute_method + self.recompute_num_layers = config.recompute_num_layers self.distribute_saved_activations = \ - args.distribute_saved_activations and not args.sequence_parallel + config.distribute_saved_activations and not config.sequence_parallel - self.sequence_parallel = args.sequence_parallel + self.sequence_parallel = config.sequence_parallel # Transformer Engine Init. 
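# Worked example of the per-stage layer count computed above, under the standalone
# embedding stage option (hypothetical 24-layer model, 4 transformer pipeline stages):
# pipeline rank 0 holds only the embedding, the remaining ranks hold 6 layers each.
def layers_on_this_rank(num_layers, transformer_pp_size, pp_rank, standalone_embedding_stage):
    assert num_layers % transformer_pp_size == 0, \
        'num_layers must be divisible by transformer_pipeline_model_parallel_size'
    if standalone_embedding_stage and pp_rank == 0:
        return 0
    return num_layers // transformer_pp_size

assert layers_on_this_rank(24, 4, 0, True) == 0
assert layers_on_this_rank(24, 4, 1, True) == 6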
- self.transformer_engine_rope_available = False + self.transformer_engine_v_0_10 = False + self.transformer_engine_v_0_11 = False + self.transformer_engine_v_0_8 = False if self.transformer_impl == 'transformer_engine': global transformer_engine import transformer_engine - from importlib.metadata import version - from pkg_resources import packaging - te_version = packaging.version.Version(version("transformer-engine")) - if te_version >= packaging.version.Version("0.10.0"): - self.transformer_engine_rope_available = True + if core.utils.is_te_min_version("0.8.0"): + self.transformer_engine_v_0_8 = True + if core.utils.is_te_min_version("0.10.0"): + self.transformer_engine_v_0_10 = True + if core.utils.is_te_min_version("0.11.0"): + self.transformer_engine_v_0_11 = True - del version, packaging + assert not args.squared_relu, ("TransformerEngine does not support squared " + "relu activation.") - self.use_fp8 = args.fp8_e4m3 or args.fp8_hybrid + self.use_fp8 = args.fp8 is not None self.fp8_recipe = None self.fp8_group = None if self.use_fp8: - self.fp8_group = mpu.get_data_parallel_group() - if args.fp8_e4m3: + assert args.transformer_impl == 'transformer_engine', \ + 'transformer-engine required for fp8 training and inference' + self.fp8_group = mpu.get_amax_reduction_group(tp_only_amax_red=config.tp_only_amax_red) + if args.fp8 == "e4m3": fp8_format = transformer_engine.common.recipe.Format.E4M3 - elif args.fp8_hybrid: + elif args.fp8 == "hybrid": fp8_format = transformer_engine.common.recipe.Format.HYBRID + else: + raise ValueError("The DelayedScaling recipe only supports E4M3 and HYBRID formats.") self.fp8_recipe = transformer_engine.common.recipe.DelayedScaling( margin=args.fp8_margin, interval=args.fp8_interval, @@ -1315,7 +1442,7 @@ def __init__(self, init_method, output_layer_init_method, self.num_microbatches_in_previous_step = -1 self.microbatch_count = 0 - self.checkpoint_core_attention = args.recompute_granularity == 'selective' + self.checkpoint_core_attention = config.recompute_granularity == 'selective' # Number of layers. self.num_layers = _get_num_layers(args, model_type, @@ -1323,11 +1450,11 @@ def __init__(self, init_method, output_layer_init_method, self.drop_path_rates = [ rate.item() for rate in - torch.linspace(0, self.drop_path_rate, args.num_layers)] + torch.linspace(0, self.drop_path_rate, config.num_layers)] self.retro_layer_numbers = None if model_type == ModelType.retro_decoder: - retro_layer_start = 6 if args.num_layers <= 15 else 9 + retro_layer_start = 6 if config.num_layers <= 15 else 9 self.retro_layer_numbers = \ np.arange(retro_layer_start, args.num_layers + 1, 3).tolist() if model_type == ModelType.retro_encoder: @@ -1335,6 +1462,8 @@ def __init__(self, init_method, output_layer_init_method, # Transformer layers. if args.retro_add_retriever: + assert self.recompute_granularity != 'full', \ + "Full recompute not supported for Retro." assert args.transformer_impl == 'local', \ "Transformer engine does not support Retro layers." def build_layer(layer_number): @@ -1343,49 +1472,63 @@ def build_layer(layer_number): model_type, layer_type, self.retro_layer_numbers, layer_number) return ParallelTransformerLayer( - init_method, - output_layer_init_method, + config, layer_number, layer_type=current_layer_type, self_attn_mask_type=self_attn_mask_type, drop_path_rate=self.drop_path_rates[layer_number - 1]) else: + # This argument is only available from TE v0.10 onwards. 
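# Hedged sketch of how the DelayedScaling recipe built above is typically consumed:
# layers run inside Transformer Engine's fp8_autocast context with the recipe and the
# amax-reduction group. The exact call site is elsewhere in this file, `fp8_group` is a
# stand-in here, and the available arguments depend on the installed TE version.
import transformer_engine.pytorch as te
from transformer_engine.common import recipe

fp8_format = recipe.Format.HYBRID        # args.fp8 == "hybrid"; E4M3 is the other option
fp8_recipe = recipe.DelayedScaling(margin=0, interval=1, fp8_format=fp8_format)

# with te.fp8_autocast(enabled=True, fp8_recipe=fp8_recipe, fp8_group=fp8_group):
#     hidden_states = layer(hidden_states, attention_mask)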
+ extra_transformer_engine_kwargs = {} + if self.transformer_engine_v_0_8: + extra_transformer_engine_kwargs["bias"] = args.add_bias_linear + if self.transformer_engine_v_0_10: + extra_transformer_engine_kwargs["activation"] = "swiglu" if args.swiglu else "gelu" + if self.transformer_engine_v_0_11: + extra_transformer_engine_kwargs["normalization"] = args.normalization + assert config.attention_softmax_in_fp32, "TransformerEngine only supports softmax compute in FP32." + assert ( + (bool(int(os.getenv("NVTE_APPLY_QK_LAYER_SCALING", "0"))) and args.fp16) == config.apply_query_key_layer_scaling + ), ("Unsupported config for apply_query_key_layer_scaling in TransformerEngine. If --apply-query-key-layer-scaling is " + "provided, set env-var NVTE_APPLY_QK_LAYER_SCALING=1 and you must be using fp16.") return transformer_engine.pytorch.TransformerLayer( - args.hidden_size, - args.ffn_hidden_size, - args.num_attention_heads, - layernorm_epsilon=args.layernorm_epsilon, - hidden_dropout=args.hidden_dropout, - attention_dropout=args.attention_dropout, - init_method=init_method, - output_layer_init_method=output_layer_init_method, + config.hidden_size, + config.ffn_hidden_size, + config.num_attention_heads, + layernorm_epsilon=config.layernorm_epsilon, + hidden_dropout=config.hidden_dropout, + attention_dropout=config.attention_dropout, + init_method=config.init_method, + output_layer_init_method=config.output_layer_init_method, layer_number=layer_number, - kv_channels=args.kv_channels, + kv_channels=config.kv_channels, self_attn_mask_type=self_attn_mask_type.name, - tp_group=mpu.get_tensor_model_parallel_group(), - get_rng_state_tracker=tensor_parallel.get_cuda_rng_tracker, - fuse_wgrad_accumulation=args.gradient_accumulation_fusion, - apply_query_key_layer_scaling=args.apply_query_key_layer_scaling, - attention_softmax_in_fp32=args.attention_softmax_in_fp32, + tp_group=mpu.get_tensor_model_parallel_group() if mpu.is_initialized() else None, + tp_size=mpu.get_tensor_model_parallel_world_size(), + get_rng_state_tracker=get_cuda_rng_tracker + if get_cuda_rng_tracker().is_initialized() + else None, + fuse_wgrad_accumulation=config.gradient_accumulation_fusion, seq_length=args.seq_length, micro_batch_size=args.micro_batch_size, - sequence_parallel=args.sequence_parallel, - params_dtype=args.params_dtype, - apply_residual_connection_post_layernorm=args.apply_residual_connection_post_layernorm, + sequence_parallel=config.sequence_parallel, + params_dtype=config.params_dtype, + apply_residual_connection_post_layernorm=config.apply_residual_connection_post_layernorm, output_layernorm=False, layer_type="encoder", drop_path_rate=self.drop_path_rates[layer_number - 1], set_parallel_mode=True, - fuse_qkv_params=True) + fuse_qkv_params=True, + **extra_transformer_engine_kwargs) - if args.virtual_pipeline_model_parallel_size is not None: - assert args.num_layers % args.virtual_pipeline_model_parallel_size == 0, \ + if config.virtual_pipeline_model_parallel_size is not None: + assert config.num_layers % config.virtual_pipeline_model_parallel_size == 0, \ 'num_layers_per_stage must be divisible by ' \ 'virtual_pipeline_model_parallel_size' assert args.model_type != ModelType.encoder_and_decoder # Number of layers in each model chunk is the number of layers in the stage, # divided by the number of model chunks in a stage. 
- self.num_layers = self.num_layers // args.virtual_pipeline_model_parallel_size + self.num_layers = self.num_layers // config.virtual_pipeline_model_parallel_size # With 8 layers, 2 stages, and 4 model chunks, we want an assignment of # layers to stages like (each list is a model chunk): # Stage 0: [0] [2] [4] [6] @@ -1395,7 +1538,7 @@ def build_layer(layer_number): # Stage 0: [0, 1] [4, 5] # Stage 1: [2, 3] [6, 7] offset = mpu.get_virtual_pipeline_model_parallel_rank() * ( - args.num_layers // args.virtual_pipeline_model_parallel_size) + \ + config.num_layers // config.virtual_pipeline_model_parallel_size) + \ (mpu.get_pipeline_model_parallel_rank() * self.num_layers) else: # Each stage gets a contiguous set of layers. @@ -1436,14 +1579,9 @@ def build_layer(layer_number): args.retro_encoder_attention_dropout layer.hidden_dropout = args.retro_encoder_hidden_dropout - if self.post_process and self.post_layer_norm: + if self.post_process and self.post_norm: # Final layer norm before output. - self.final_layernorm = LayerNorm( - args.hidden_size, - eps=args.layernorm_epsilon, - no_persist_layer_norm=args.no_persist_layer_norm, - sequence_parallel=args.sequence_parallel, - apply_layernorm_1p=args.apply_layernorm_1p) + self.final_norm = get_norm(config) def _get_layer(self, layer_number): return self.layers[layer_number] @@ -1464,7 +1602,7 @@ def custom_forward(*args, **kwargs): te_forward_kwargs = {} if self.transformer_impl == 'transformer_engine': te_forward_kwargs['is_first_microbatch'] = is_first_microbatch - if self.transformer_engine_rope_available: + if self.transformer_engine_v_0_10: te_forward_kwargs['rotary_pos_emb'] = rotary_pos_emb if self.recompute_method == 'uniform': @@ -1474,7 +1612,7 @@ def custom_forward(*args, **kwargs): l = 0 while l < self.num_layers: if self.transformer_impl == 'transformer_engine': - hidden_states = transformer_engine.pytorch.distributed.checkpoint( + hidden_states = transformer_engine.pytorch.checkpoint( custom(l, l + self.recompute_num_layers), self.distribute_saved_activations, tensor_parallel.get_cuda_rng_tracker, @@ -1485,8 +1623,9 @@ def custom_forward(*args, **kwargs): hidden_states = tensor_parallel.checkpoint( custom(l, l + self.recompute_num_layers), self.distribute_saved_activations, - hidden_states, attention_mask, encoder_output, - enc_dec_attn_mask, rotary_pos_emb) + hidden_states, attention_mask, + encoder_output, enc_dec_attn_mask, + None, None, None, None, rotary_pos_emb) l += self.recompute_num_layers @@ -1497,7 +1636,7 @@ def custom_forward(*args, **kwargs): for l in range(self.num_layers): if l < self.recompute_num_layers: if self.transformer_impl == 'transformer_engine': - hidden_states = transformer_engine.pytorch.distributed.checkpoint( + hidden_states = transformer_engine.pytorch.checkpoint( custom(l, l + 1), self.distribute_saved_activations, tensor_parallel.get_cuda_rng_tracker, @@ -1508,8 +1647,9 @@ def custom_forward(*args, **kwargs): hidden_states = tensor_parallel.checkpoint( custom(l, l + 1), self.distribute_saved_activations, - hidden_states, attention_mask, encoder_output, - enc_dec_attn_mask, rotary_pos_emb) + hidden_states, attention_mask, + encoder_output, enc_dec_attn_mask, + None, None, None, None, rotary_pos_emb) else: if self.transformer_impl == 'transformer_engine': hidden_states = custom(l, l + 1)( @@ -1517,8 +1657,9 @@ def custom_forward(*args, **kwargs): enc_dec_attn_mask, **te_forward_kwargs) else: hidden_states = custom(l, l + 1)( - hidden_states, attention_mask, encoder_output, - enc_dec_attn_mask, 
rotary_pos_emb) + hidden_states, attention_mask, + encoder_output, enc_dec_attn_mask, + None, None, None, None, rotary_pos_emb) else: raise ValueError("Invalid activation recompute method.") @@ -1596,8 +1737,6 @@ def forward(self, hidden_states, attention_mask, # Forward pass. if self.recompute_granularity == 'full': - assert not self.retro_add_retriever, \ - "full recompute not supported for retro." hidden_states = self._checkpointed_forward(hidden_states, attention_mask, encoder_output, @@ -1614,7 +1753,7 @@ def forward(self, hidden_states, attention_mask, if self.transformer_impl == 'transformer_engine': forward_kwargs['is_first_microbatch'] = is_first_microbatch forward_kwargs['checkpoint_core_attention'] = self.checkpoint_core_attention - if self.transformer_engine_rope_available: + if self.transformer_engine_v_0_10: forward_kwargs['rotary_pos_emb'] = rotary_pos_emb else: forward_kwargs['rotary_pos_emb'] = rotary_pos_emb @@ -1643,7 +1782,22 @@ def forward(self, hidden_states, attention_mask, self.microbatch_count += 1 # Final layer norm. - if self.post_process and self.post_layer_norm: - hidden_states = self.final_layernorm(hidden_states) + if self.post_process and self.post_norm: + hidden_states = self.final_norm(hidden_states) return hidden_states + + def load_state_dict(self, state_dict, strict=True): + """Customize load.""" + + # Handle renaming layernorm -> norm in component names + state_dict_ = {} + for key in state_dict.keys(): + # Bypass TransformerEngine module parameters. + if "layernorm_qkv" in key or "layernorm_mlp" in key: + state_dict_[key] = state_dict[key] + continue + newkey = key.replace("layernorm", "norm") + state_dict_[newkey] = state_dict[key] + + super().load_state_dict(state_dict_, strict) diff --git a/megatron/model/utils.py b/megatron/legacy/model/utils.py similarity index 58% rename from megatron/model/utils.py rename to megatron/legacy/model/utils.py index cf3727c02b..5762000d5d 100644 --- a/megatron/model/utils.py +++ b/megatron/legacy/model/utils.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
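# Minimal illustration of the checkpoint-key migration done by the custom
# load_state_dict above: "layernorm" becomes "norm" in module names, except inside
# Transformer Engine's fused layernorm_qkv / layernorm_mlp parameters.
def migrate_norm_keys(state_dict):
    migrated = {}
    for key, value in state_dict.items():
        if "layernorm_qkv" in key or "layernorm_mlp" in key:
            migrated[key] = value                   # TE parameter names are kept as-is
        else:
            migrated[key.replace("layernorm", "norm")] = value
    return migrated

assert "input_norm.weight" in migrate_norm_keys({"input_layernorm.weight": None})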
"""Utilities for models.""" @@ -6,7 +6,9 @@ import torch -from megatron import get_args +from megatron.training import get_args +from megatron.legacy.model import LayerNorm, RMSNorm +from megatron.core.jit import jit_fuser def init_method_normal(sigma): """Init method based on N(0, sigma).""" @@ -40,15 +42,38 @@ def get_linear_layer(rows, columns, init_method): layer.bias.zero_() return layer -@torch.jit.script + +@jit_fuser def gelu_impl(x): """OpenAI's gelu implementation.""" return 0.5 * x * (1.0 + torch.tanh(0.7978845608028654 * x * + (1.0 + 0.044715 * x * x))) def openai_gelu(x): return gelu_impl(x) + #This is actually Python equivalent of torch.nn.functional.gelu(), also with type hints for ONNX exporter -@torch.jit.script +@jit_fuser def erf_gelu(x): return x * 0.5 * (torch.erf(x / 1.41421).to(dtype=x.dtype)+torch.ones_like(x).to(dtype=x.dtype)) + + +def get_norm(config): + args = get_args() + if args.normalization == "LayerNorm": + return LayerNorm( + config.hidden_size, + eps=config.layernorm_epsilon, + no_persist_layer_norm=not config.persist_layer_norm, + sequence_parallel=config.sequence_parallel, + apply_layernorm_1p=args.apply_layernorm_1p) + elif args.normalization == "RMSNorm": + if args.apply_layernorm_1p: + raise NotImplementedError('RMSNorm does not currently support the layernorm_1p formulation.') + + return RMSNorm(dim=config.hidden_size, + eps=config.layernorm_epsilon, + sequence_parallel=config.sequence_parallel) + else: + raise Exception(f"unsupported norm type '{args.normalization}'.") diff --git a/megatron/model/vision/classification.py b/megatron/legacy/model/vision/classification.py similarity index 77% rename from megatron/model/vision/classification.py rename to megatron/legacy/model/vision/classification.py index fd5d58435d..f9419c71de 100644 --- a/megatron/model/vision/classification.py +++ b/megatron/legacy/model/vision/classification.py @@ -4,19 +4,20 @@ import torch from torch.nn.init import trunc_normal_ -from megatron import get_args -from megatron.model.utils import get_linear_layer -from megatron.model.vision.vit_backbone import VitBackbone, VitMlpHead -from megatron.model.vision.mit_backbone import mit_b3_avg -from megatron.model.module import MegatronModule +from megatron.training import get_args +from megatron.legacy.model.utils import get_linear_layer +from megatron.legacy.model.vision.vit_backbone import VitBackbone, VitMlpHead +from megatron.legacy.model.vision.mit_backbone import mit_b3_avg +from megatron.legacy.model.module import MegatronModule class VitClassificationModel(MegatronModule): """Vision Transformer Model.""" - def __init__(self, num_classes, finetune=False, + def __init__(self, config, num_classes, finetune=False, pre_process=True, post_process=True): super(VitClassificationModel, self).__init__() args = get_args() + self.config = config self.hidden_size = args.hidden_size self.num_classes = num_classes @@ -24,14 +25,15 @@ def __init__(self, num_classes, finetune=False, self.pre_process = pre_process self.post_process = post_process self.backbone = VitBackbone( + config=config, pre_process=self.pre_process, post_process=self.post_process, single_token_output=True ) - + if self.post_process: if not self.finetune: - self.head = VitMlpHead(self.hidden_size, self.num_classes) + self.head = VitMlpHead(config, self.hidden_size, self.num_classes) else: self.head = get_linear_layer( self.hidden_size, @@ -40,7 +42,7 @@ def __init__(self, num_classes, finetune=False, ) def set_input_tensor(self, input_tensor): - """See 
megatron.model.transformer.set_input_tensor()""" + """See megatron.legacy.model.transformer.set_input_tensor()""" self.backbone.set_input_tensor(input_tensor) def forward(self, input): @@ -74,7 +76,7 @@ def _init_weights(self, m): torch.nn.init.constant_(m.bias, 0) def set_input_tensor(self, input_tensor): - """See megatron.model.transformer.set_input_tensor()""" + """See megatron.legacy.model.transformer.set_input_tensor()""" pass def forward(self, input): diff --git a/megatron/model/vision/dino.py b/megatron/legacy/model/vision/dino.py similarity index 91% rename from megatron/model/vision/dino.py rename to megatron/legacy/model/vision/dino.py index 651271a6fc..20ca2100f6 100644 --- a/megatron/model/vision/dino.py +++ b/megatron/legacy/model/vision/dino.py @@ -12,12 +12,12 @@ import numpy as np import torch.nn.functional as F from torch.nn.init import trunc_normal_ -from megatron import get_args, print_rank_0 -from megatron.model.utils import get_linear_layer -from megatron.model.vision.vit_backbone import VitBackbone -from megatron.model.module import MegatronModule -from megatron.model.vision.mit_backbone import mit_b5_avg -from megatron.model.vision.esvit_swin_backbone import get_swin +from megatron.training import get_args, print_rank_0 +from megatron.legacy.model.utils import get_linear_layer +from megatron.legacy.model.vision.vit_backbone import VitBackbone +from megatron.legacy.model.module import MegatronModule +from megatron.legacy.model.vision.mit_backbone import mit_b5_avg +from megatron.legacy.model.vision.esvit_swin_backbone import get_swin class DINOLoss(torch.nn.Module): @@ -173,11 +173,12 @@ def cosine_scheduler(base_value, final_value, epochs, niter_per_ep, return schedule -def get_student_backbone_and_num_features(pre_process=True, post_process=True): +def get_student_backbone_and_num_features(config, pre_process=True, post_process=True): args = get_args() if args.vision_backbone_type == 'vit': - student = VitBackbone(pre_process=pre_process, + student = VitBackbone(config, + pre_process=pre_process, post_process=post_process, drop_path_rate=0.1, single_token_output=True) @@ -191,14 +192,15 @@ def get_student_backbone_and_num_features(pre_process=True, post_process=True): else: raise Exception('{} vision backbone is not supported.'.format( args.vision_backbone_type)) - + return student, num_features -def get_teacher_backbone_and_num_features(pre_process=True, post_process=True): +def get_teacher_backbone_and_num_features(config, pre_process=True, post_process=True): args = get_args() if args.vision_backbone_type == 'vit': - teacher = VitBackbone(pre_process=pre_process, + teacher = VitBackbone(config, + pre_process=pre_process, post_process=post_process, single_token_output=True) num_features = args.hidden_size @@ -215,9 +217,10 @@ def get_teacher_backbone_and_num_features(pre_process=True, post_process=True): class DINOPretrainModel(MegatronModule): - def __init__(self, pre_process=True, post_process=True): + def __init__(self, config, pre_process=True, post_process=True): super(DINOPretrainModel, self).__init__() args = get_args() + self.config = config self.out_dim = 65536 self.dino_loss = DINOLoss( @@ -234,7 +237,7 @@ def __init__(self, pre_process=True, post_process=True): self.momentum_teacher = 0.996 student_backbone, num_features = \ - get_student_backbone_and_num_features(pre_process, post_process) + get_student_backbone_and_num_features(config, pre_process, post_process) self.student = MultiCropWrapper( student_backbone, @@ -249,7 +252,7 @@ def __init__(self, 
pre_process=True, post_process=True): ) teacher_backbone, num_features = \ - get_teacher_backbone_and_num_features(pre_process, post_process) + get_teacher_backbone_and_num_features(config, pre_process, post_process) self.teacher = MultiCropWrapper( teacher_backbone, DINOHead(num_features, self.out_dim) diff --git a/megatron/model/vision/esvit_swin_backbone.py b/megatron/legacy/model/vision/esvit_swin_backbone.py similarity index 99% rename from megatron/model/vision/esvit_swin_backbone.py rename to megatron/legacy/model/vision/esvit_swin_backbone.py index 70aee3db42..87932040cb 100644 --- a/megatron/model/vision/esvit_swin_backbone.py +++ b/megatron/legacy/model/vision/esvit_swin_backbone.py @@ -15,9 +15,9 @@ from functools import partial import torch.distributed as dist from torch.nn.init import trunc_normal_ -from megatron.model.transformer import DropPath -from megatron import get_args -from megatron.model import LayerNorm +from megatron.legacy.model.transformer import DropPath +from megatron.training import get_args +from megatron.legacy.model import LayerNorm import numpy as np from math import sqrt diff --git a/megatron/model/vision/inpainting.py b/megatron/legacy/model/vision/inpainting.py similarity index 87% rename from megatron/model/vision/inpainting.py rename to megatron/legacy/model/vision/inpainting.py index 96a33de5d3..f71f5e3209 100644 --- a/megatron/model/vision/inpainting.py +++ b/megatron/legacy/model/vision/inpainting.py @@ -1,31 +1,33 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. # # This source code is licensed under the BSD license found in the # LICENSE file in the root directory of this source tree. -i + import math import apex import einops import torch import torch.nn.functional as F -from megatron import get_args, print_rank_0 -from megatron.model.utils import get_linear_layer -from megatron.model.vision.vit_backbone import VitBackbone -from megatron.model.module import MegatronModule -from megatron.model.vision.mit_backbone import mit_b3 -from megatron.model.vision.utils import resize_ +from megatron.training import get_args, print_rank_0 +from megatron.legacy.model.utils import get_linear_layer +from megatron.legacy.model.vision.vit_backbone import VitBackbone +from megatron.legacy.model.module import MegatronModule +from megatron.legacy.model.vision.mit_backbone import mit_b3 +from megatron.legacy.model.vision.utils import resize class VitInpaintingModel(MegatronModule): - def __init__(self, pre_process=True, post_process=True): + def __init__(self, config, pre_process=True, post_process=True): super(VitInpaintingModel, self).__init__() args = get_args() + self.config = config self.pre_process = pre_process self.post_process = post_process - self.hidden_size = args.hidden_size + self.hidden_size = config.hidden_size self.backbone = VitBackbone( + config=config, pre_process=self.pre_process, post_process=self.post_process, class_token=False, @@ -107,11 +109,11 @@ def __init__(self, pre_process=True, post_process=True): self.conv_fuse = torch.nn.Conv2d(self.embedding_dim*4, self.embedding_dim, 1, 1, bias=False) self.norm = apex.parallel.SyncBatchNorm(self.embedding_dim) self.dropout = torch.nn.Dropout2d(0.1) - + self.linear_pred = torch.nn.Conv2d(self.embedding_dim, self.flatten_dim, kernel_size=1) - + def set_input_tensor(self, input_tensor): - """See megatron.model.transformer.set_input_tensor()""" + """See megatron.legacy.model.transformer.set_input_tensor()""" pass def 
forward(self, input): @@ -120,7 +122,7 @@ def forward(self, input): n, _, h, w = c4.shape _c4 = self.linear_c4(c4).permute(0, 2, 1).reshape(n, -1, c4.shape[2], c4.shape[3]) _c4 = resize(_c4, size=c1.size()[2:], mode='bilinear', align_corners=False) - + _c3 = self.linear_c3(c3).permute(0, 2, 1).reshape(n, -1, c3.shape[2], c3.shape[3]) _c3 = resize(_c3, size=c1.size()[2:], mode='bilinear', align_corners=False) @@ -131,7 +133,7 @@ def forward(self, input): _c = torch.cat([_c4, _c3, _c2, _c1], dim=1) _c = self.conv_fuse(_c) - + x = self.norm(_c) x = F.relu(x, inplace=True) x = self.dropout(x) diff --git a/megatron/model/vision/knn_monitor.py b/megatron/legacy/model/vision/knn_monitor.py similarity index 94% rename from megatron/model/vision/knn_monitor.py rename to megatron/legacy/model/vision/knn_monitor.py index a7d79854eb..54e726854d 100644 --- a/megatron/model/vision/knn_monitor.py +++ b/megatron/legacy/model/vision/knn_monitor.py @@ -1,9 +1,10 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. import torch.nn.functional as F import torch -from megatron import print_rank_0, get_args +from megatron.training import print_rank_0, get_args from megatron.core import mpu -from megatron.data.vit_dataset import ClassificationTransform -from megatron.data.image_folder import ImageFolder +from megatron.legacy.data.vit_dataset import ClassificationTransform +from megatron.legacy.data.image_folder import ImageFolder _FEATURE_BANK = None diff --git a/megatron/model/vision/mit_backbone.py b/megatron/legacy/model/vision/mit_backbone.py similarity index 97% rename from megatron/model/vision/mit_backbone.py rename to megatron/legacy/model/vision/mit_backbone.py index c67ca2c62b..3ca2303c30 100644 --- a/megatron/model/vision/mit_backbone.py +++ b/megatron/legacy/model/vision/mit_backbone.py @@ -1,18 +1,13 @@ -# --------------------------------------------------------------- -# Copyright (c) 2021, NVIDIA Corporation. All rights reserved. -# -# This work is licensed under the NVIDIA Source Code License -# found in the LICENSE file in the root directory of this -# source tree. -# --------------------------------------------------------------- +# Copyright (c) 2023, NVIDIA Corporation. All rights reserved. + import math import torch import torch.nn as nn import torch.nn.functional as F from functools import partial from torch.nn.init import trunc_normal_ -from megatron.model.transformer import DropPath -from megatron.model import LayerNorm +from megatron.legacy.model.transformer import DropPath +from megatron.legacy.model import LayerNorm class Mlp(nn.Module): diff --git a/megatron/model/vision/swin_backbone.py b/megatron/legacy/model/vision/swin_backbone.py similarity index 99% rename from megatron/model/vision/swin_backbone.py rename to megatron/legacy/model/vision/swin_backbone.py index 9a622c7070..231802c8f2 100644 --- a/megatron/model/vision/swin_backbone.py +++ b/megatron/legacy/model/vision/swin_backbone.py @@ -12,7 +12,7 @@ from timm.models.layers import DropPath, to_2tuple, trunc_normal_ from math import sqrt -from megatron import get_args +from megatron.training import get_args from functools import partial diff --git a/megatron/model/vision/utils.py b/megatron/legacy/model/vision/utils.py similarity index 94% rename from megatron/model/vision/utils.py rename to megatron/legacy/model/vision/utils.py index b4068912c8..6d29a877f1 100644 --- a/megatron/model/vision/utils.py +++ b/megatron/legacy/model/vision/utils.py @@ -1,3 +1,4 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
All rights reserved. import warnings import torch import torch.nn.functional as F diff --git a/megatron/model/vision/vit_backbone.py b/megatron/legacy/model/vision/vit_backbone.py similarity index 90% rename from megatron/model/vision/vit_backbone.py rename to megatron/legacy/model/vision/vit_backbone.py index fc0b5304db..b46f6f74d7 100644 --- a/megatron/model/vision/vit_backbone.py +++ b/megatron/legacy/model/vision/vit_backbone.py @@ -7,14 +7,14 @@ import torch import apex import torch.nn.functional as F -from megatron import get_args -from megatron.model.transformer import ParallelTransformer -from megatron.model.utils import ( +from megatron.training import get_args +from megatron.legacy.model.transformer import ParallelTransformer +from megatron.legacy.model.utils import ( get_linear_layer, init_method_normal, scaled_init_method_normal, ) -from megatron.model.module import MegatronModule +from megatron.legacy.model.module import MegatronModule CLASS_TOKEN_LENGTH = 8 @@ -24,14 +24,15 @@ class VitMlpHead(MegatronModule): Pool hidden states of a specific token (for example start of the sequence) and add a linear transformation followed by a tanh. - Arguments: + Args: hidden_size: hidden size init_method: weight initialization method for the linear layer. bias is set to zero. """ - def __init__(self, hidden_size, num_classes): + def __init__(self, config, hidden_size, num_classes): super(VitMlpHead, self).__init__() + self.config = config self.dense_in = torch.nn.Linear(hidden_size, hidden_size) self.relu = torch.nn.ReLU() self.dense_out = torch.nn.Linear(hidden_size, num_classes) @@ -130,24 +131,18 @@ class VitBackbone(MegatronModule): """Vision Transformer Model.""" def __init__(self, + config, pre_process=True, post_process=True, class_token=True, single_token_output=False, post_layer_norm=True, drop_path_rate=0.0): - super(VitBackbone, self).__init__(share_word_embeddings=False) + super(VitBackbone, self).__init__(share_embeddings_and_output_weights=False) args = get_args() + self.config = config self.fp16_lm_cross_entropy = args.fp16_lm_cross_entropy - if args.init_method_xavier_uniform: - self.init_method = torch.nn.init.xavier_uniform_ - self.scaled_init_method = torch.nn.init.xavier_uniform_ - else: - self.init_method = init_method_normal(args.init_method_std) - self.scaled_init_method = scaled_init_method_normal( - args.init_method_std, args.num_layers - ) self.pre_process = pre_process self.post_process = post_process @@ -179,7 +174,7 @@ def __init__(self, ) torch.nn.init.zeros_(self.cls_token) self.position_ids = torch.arange(self.seq_length).expand(1, -1).cuda() - + # Linear encoder self.linear_encoder = torch.nn.Linear( self.flatten_dim, self.hidden_size @@ -202,8 +197,8 @@ def __init__(self, # Transformer self.transformer = ParallelTransformer( - self.init_method, - self.scaled_init_method, + config, + model_type=args.model_type, pre_process=self.pre_process, post_process=self.post_process, post_layer_norm=self.post_layer_norm, @@ -211,7 +206,7 @@ def __init__(self, ) def set_input_tensor(self, input_tensor): - """See megatron.model.transformer.set_input_tensor()""" + """See megatron.legacy.model.transformer.set_input_tensor()""" self.transformer.set_input_tensor(input_tensor) def forward(self, input): diff --git a/megatron/legacy/mpu/tests/__init__.py b/megatron/legacy/mpu/tests/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/megatron/mpu/tests/commons.py b/megatron/legacy/mpu/tests/commons.py similarity index 100% rename from 
megatron/mpu/tests/commons.py rename to megatron/legacy/mpu/tests/commons.py diff --git a/megatron/mpu/tests/test_cross_entropy.py b/megatron/legacy/mpu/tests/test_cross_entropy.py similarity index 100% rename from megatron/mpu/tests/test_cross_entropy.py rename to megatron/legacy/mpu/tests/test_cross_entropy.py diff --git a/megatron/mpu/tests/test_data.py b/megatron/legacy/mpu/tests/test_data.py similarity index 100% rename from megatron/mpu/tests/test_data.py rename to megatron/legacy/mpu/tests/test_data.py diff --git a/megatron/mpu/tests/test_initialize.py b/megatron/legacy/mpu/tests/test_initialize.py similarity index 100% rename from megatron/mpu/tests/test_initialize.py rename to megatron/legacy/mpu/tests/test_initialize.py diff --git a/megatron/mpu/tests/test_layers.py b/megatron/legacy/mpu/tests/test_layers.py similarity index 100% rename from megatron/mpu/tests/test_layers.py rename to megatron/legacy/mpu/tests/test_layers.py diff --git a/megatron/mpu/tests/test_random.py b/megatron/legacy/mpu/tests/test_random.py similarity index 97% rename from megatron/mpu/tests/test_random.py rename to megatron/legacy/mpu/tests/test_random.py index 8ee6942cf0..26092772cf 100644 --- a/megatron/mpu/tests/test_random.py +++ b/megatron/legacy/mpu/tests/test_random.py @@ -20,7 +20,7 @@ def test_set_cuda_rng_state(tensor_model_parallel_size): size = 123 seed = 1234 torch.cuda.manual_seed(1234) - tensor = torch.cuda.FloatTensor(size) + tensor = torch.tensor(size, dtype=torch.float, device='cuda') # Get the state rng_state = torch.cuda.get_rng_state() @@ -82,7 +82,7 @@ def test_cuda_rng_tracker(tensor_model_parallel_size): seed_1 = 1234 seed_2 = 4321 size = [12, 21] - tensor = torch.cuda.FloatTensor(size) + tensor = torch.tensor(size, dtype=torch.float, device='cuda') # Set to seed_1 and generate two tensors. torch.cuda.manual_seed(seed_1) diff --git a/megatron/memory.py b/megatron/memory.py deleted file mode 100644 index a5fef75baa..0000000000 --- a/megatron/memory.py +++ /dev/null @@ -1,132 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. - - -import torch - - -# A dictionary of all the memory buffers allocated. -_MEM_BUFFS = dict() - - -def allocate_mem_buff(name, numel, dtype, track_usage): - """Allocate a memory buffer.""" - assert name not in _MEM_BUFFS, \ - 'memory buffer {} already allocated.'.format(name) - _MEM_BUFFS[name] = MemoryBuffer(name, numel, dtype, track_usage) - return _MEM_BUFFS[name] - - -def get_mem_buff(name): - """Get the memory buffer.""" - return _MEM_BUFFS[name] - - -class MemoryBuffer: - """Contiguous memory buffer. - Allocate a contiguous memory of type `dtype` and size `numel`. It is - used to reduce memory fragmentation. - - Usage: After the allocation, the `_start` index is set tot the first - index of the memory. A memory chunk starting from `_start` index - can be `allocated` for an input tensor, with the elements of the - tensor being coppied. The buffer can be reused by resetting the - `_start` index. 
- - """ - def __init__(self, name, numel, dtype, track_usage): - if torch.distributed.get_rank() == 0: - element_size = torch.tensor([], dtype=dtype).element_size() - print('> building the {} memory buffer with {} num elements ' - 'and {} dtype ({:.1f} MB)...'.format( - name, numel, dtype, numel*element_size/1024/1024), - flush=True) - self.name = name - self.numel = numel - self.dtype = dtype - self.data = torch.empty(self.numel, - dtype=self.dtype, - device=torch.cuda.current_device(), - requires_grad=False) - - # Index tracking the start of the free memory. - self._start = 0 - - # Values used for tracking usage. - self.track_usage = track_usage - if self.track_usage: - self.in_use_value = 0.0 - self.total_value = 0.0 - - - def reset(self): - """Reset the buffer start index to the beginning of the buffer.""" - self._start = 0 - - - def is_in_use(self): - """Whether the current buffer hold on to any memory.""" - return self._start > 0 - - - def numel_in_use(self): - """Return number of elements in use.""" - return self._start - - - def add(self, tensor): - """Allocate a chunk of memory from the buffer to tensor and copy - the values.""" - assert tensor.dtype == self.dtype, \ - 'Input tensor type {} different from buffer type {}'.format( - tensor.dtype, self.dtype) - # Number of elements of the input tensor. - tensor_numel = torch.numel(tensor) - new_start = self._start + tensor_numel - assert new_start <= self.numel, \ - 'Not enough memory left in the buffer ({} > {})'.format( - tensor_numel, self.numel - self._start) - # New tensor is a view into the memory. - new_tensor = self.data[self._start:new_start] - self._start = new_start - new_tensor = new_tensor.view(tensor.shape) - new_tensor.copy_(tensor) - # Return a pointer to the new tensor. - return new_tensor - - - def get_data(self): - """Return the data currently in use.""" - if self.track_usage: - self.in_use_value += float(self._start) - self.total_value += float(self.numel) - return self.data[:self._start] - - - def print_average_usage(self): - """Print memory usage average over time. We would like this value - to be as high as possible.""" - assert self.track_usage, 'You need to enable track usage.' - if torch.distributed.get_rank() == 0: - print(' > usage of {} memory buffer: {:.2f} %'.format( - self.name, self.in_use_value * 100.0 / self.total_value), - flush=True) - - - -class RingMemBuffer: - """A ring of memory buffers.""" - - def __init__(self, name, num_buffers, numel, dtype, track_usage): - self.num_buffers = num_buffers - self.buffers = [ - allocate_mem_buff(name+' {}'.format(i), numel, dtype, track_usage) - for i in range(num_buffers)] - self._index = -1 - - - def get_next_buffer(self): - self._index += 1 - self._index = self._index % self.num_buffers - buff = self.buffers[self._index] - assert not buff.is_in_use(), 'buffer is already in use.' - return buff diff --git a/megatron/microbatches.py b/megatron/microbatches.py deleted file mode 100644 index 6449d7479c..0000000000 --- a/megatron/microbatches.py +++ /dev/null @@ -1,144 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. - -"""Megatron number of micro-batches calculators.""" - -from abc import ABC -from abc import abstractmethod - - -def build_num_microbatches_calculator(args): - - # Constant num micro-batches. 
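The contiguous-buffer pattern in the deleted megatron/memory.py reduces to handing out views of one preallocated 1-D tensor and advancing a start index. A minimal sketch of that idea, with hypothetical names and not part of the patch:

import torch

def carve_view(buffer_data, start, tensor):
    # Sketch of the deleted MemoryBuffer.add(): take a chunk of the
    # preallocated 1-D buffer, reshape it to the tensor's shape, and copy.
    end = start + tensor.numel()
    assert end <= buffer_data.numel(), 'not enough memory left in the buffer'
    view = buffer_data[start:end].view(tensor.shape)
    view.copy_(tensor)
    return view, end  # caller keeps `end` as the new start index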
- if args.rampup_batch_size is None: - num_microbatches_calculator = ConstantNumMicroBatches( - args.global_batch_size, args.micro_batch_size, - args.data_parallel_size) - if args.rank == 0: - print('setting number of micro-batches to constant {}'.format( - num_microbatches_calculator.get()), flush=True) - - else: - assert len(args.rampup_batch_size) == 3, 'expected the following ' \ - 'format: --rampup-batch-size ' \ - ' ' - start_batch_size = int(args.rampup_batch_size[0]) - batch_size_increment = int(args.rampup_batch_size[1]) - ramup_samples = int(args.rampup_batch_size[2]) - if args.rank == 0: - print('will use batch size rampup starting from global batch ' - 'size {} to global batch size {} with batch size increments ' - '{} over {} samples.'.format(start_batch_size, - args.global_batch_size, - batch_size_increment, - ramup_samples), flush=True) - num_microbatches_calculator = RampupBatchsizeNumMicroBatches( - start_batch_size, batch_size_increment, ramup_samples, - args.global_batch_size, args.micro_batch_size, - args.data_parallel_size) - - return num_microbatches_calculator - - -class NumMicroBatchesCalculator(ABC): - - def __init__(self): - self.num_micro_batches = None - self.current_global_batch_size = None - - def get(self): - return self.num_micro_batches - - def get_current_global_batch_size(self): - return self.current_global_batch_size - - @abstractmethod - def update(self, consumed_samples, consistency_check): - pass - - -class ConstantNumMicroBatches(NumMicroBatchesCalculator): - - def __init__(self, global_batch_size, micro_batch_size, data_parallel_size): - micro_batch_times_data_parallel = micro_batch_size * \ - data_parallel_size - assert global_batch_size % micro_batch_times_data_parallel == 0, \ - 'global batch size ({}) is not divisible by micro batch size ({})' \ - ' times data parallel size ({})'.format(global_batch_size, - micro_batch_size, - data_parallel_size) - self.num_micro_batches = global_batch_size // \ - micro_batch_times_data_parallel - assert self.num_micro_batches >= 1 - self.current_global_batch_size = global_batch_size - - def update(self, consumed_samples, consistency_check): - pass - - -class RampupBatchsizeNumMicroBatches(NumMicroBatchesCalculator): - - def __init__(self, start_batch_size, batch_size_increment, ramup_samples, - global_batch_size, micro_batch_size, data_parallel_size): - """Batch size ramp up. - Over - steps = (global-batch-size - start-batch-size) / batch_size_increment - increment batch size from start-batch-size to global-batch-size using - rampup-samples / steps - samples. - Arguments: - start_batch_size: global batch size to start with - batch_size_increment: global batch size increments - ramup_samples: number of samples to use ramp up global - batch size from `start_batch_size` to `global_batch_size` - global_batch_size: global batch size post rampup - micro_batch_size: micro batch size - data_parallel_size: data parallel size. 
- """ - - self.micro_batch_size = micro_batch_size - self.data_parallel_size = data_parallel_size - self.micro_batch_times_data_parallel_size = self.micro_batch_size * \ - self.data_parallel_size - assert self.micro_batch_times_data_parallel_size > 0 - - assert start_batch_size > 0 - self.start_batch_size = start_batch_size - - assert global_batch_size > 0 - self.global_batch_size = global_batch_size - diff_batch_size = self.global_batch_size - self.start_batch_size - assert diff_batch_size >= 0 - assert batch_size_increment > 0 - self.batch_size_increment = batch_size_increment - assert diff_batch_size % batch_size_increment == 0, 'expected ' \ - 'global batch size interval ({}) to be divisible by global batch ' \ - 'size increment ({})'.format(diff_batch_size, batch_size_increment) - - num_increments = diff_batch_size // self.batch_size_increment - self.ramup_samples = ramup_samples - assert self.ramup_samples >= 0 - self.rampup_samples_per_increment = self.ramup_samples / num_increments - - # Initialize number of microbatches. - self.update(0, False) - - - def update(self, consumed_samples, consistency_check): - - if consumed_samples > self.ramup_samples: - self.current_global_batch_size = self.global_batch_size - else: - steps = int(consumed_samples / self.rampup_samples_per_increment) - self.current_global_batch_size = self.start_batch_size + \ - steps * self.batch_size_increment - assert self.current_global_batch_size <= self.global_batch_size - - if consistency_check: - assert self.current_global_batch_size % \ - self.micro_batch_times_data_parallel_size == 0, 'current global ' \ - 'batch size ({}) is not divisible by micro-batch-size ({}) times' \ - 'data parallel size ({})'.format(self.current_global_batch_size, - self.micro_batch_size, - self.data_parallel_size) - self.num_micro_batches = self.current_global_batch_size // \ - self.micro_batch_times_data_parallel_size diff --git a/megatron/model/distributed.py b/megatron/model/distributed.py deleted file mode 100644 index e28ca15e2f..0000000000 --- a/megatron/model/distributed.py +++ /dev/null @@ -1,232 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. - -from abc import ABC -from abc import abstractmethod -import math - -import torch -from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors - -from megatron import get_args -from megatron.core import mpu -from .module import MegatronModule - - -class MemoryBuffer: - - def __init__(self, numel, numel_padded, dtype): - self.numel = numel - self.numel_padded = numel_padded - self.dtype = dtype - self.data = torch.zeros(self.numel_padded, - dtype=self.dtype, - device=torch.cuda.current_device(), - requires_grad=False) - - def zero(self): - """Reset the buffer to zero.""" - self.data.zero_() - - - def get(self, shape, start_index): - """Return a tensor with the input `shape` as a view into the - 1-D data starting at `start_index`.""" - end_index = start_index + shape.numel() - assert end_index <= self.numel, \ - 'requested tensor is out of the buffer range.' - buffer_tensor = self.data[start_index:end_index] - buffer_tensor = buffer_tensor.view(shape) - return buffer_tensor - - - -class DistributedDataParallelBase(MegatronModule, ABC): - """Abstract class for DDP.""" - - def __init__(self, module): - super(DistributedDataParallelBase, self).__init__() - # Keep a pointer to the model. 
- self.module = module - - - @abstractmethod - def allreduce_gradients(self): - pass - - - def forward(self, *inputs, **kwargs): - return self.module(*inputs, **kwargs) - - - def state_dict(self, prefix='', keep_vars=False): - return self.module.state_dict(prefix=prefix, keep_vars=keep_vars) - - - def state_dict_for_save_checkpoint(self, prefix='', keep_vars=False): - return self.module.state_dict_for_save_checkpoint(prefix=prefix, - keep_vars=keep_vars) - - - def load_state_dict(self, state_dict, strict=True): - self.module.load_state_dict(state_dict, strict=strict) - - - -class DistributedDataParallel(DistributedDataParallelBase): - """DDP with contiguous buffers options to storre and accumulate gradients. - This class: - - has the potential to reduce memory fragmentation. - - provides the option to do the gradient accumulation - in a type other than the params type (for example fp32) - - Arguments: - module: input model. - accumulate_allreduce_grads_in_fp32: if true do the gradient accumulation - and the gradient all-reduce all in in float32. If this option is - true, we require `use_contiguous_buffers` to be true too. - use_contiguous_buffers: if true, use a contiguous buffer to store the - gradients. - """ - - def __init__(self, module, - accumulate_allreduce_grads_in_fp32, - use_contiguous_buffers): - - super(DistributedDataParallel, self).__init__(module) - - self.accumulate_allreduce_grads_in_fp32 \ - = accumulate_allreduce_grads_in_fp32 - self.use_contiguous_buffers = use_contiguous_buffers - # If we are using fp32-accumulate-allreduce explicitly - # this means we need main grads in a continous buffer. - if self.accumulate_allreduce_grads_in_fp32: - assert self.use_contiguous_buffers - - # =================================== - # Rest of this part applies only to - # the case we use continuous buffers. - # =================================== - self._grad_buffers = None - self._grad_buffer_param_index_map = None - if self.use_contiguous_buffers: - self._grad_buffers = {} - self._grad_buffer_param_index_map = {} - data_parallel_world_size = mpu.get_data_parallel_world_size() - - # Simple function to define buffer type. - def _get_buffer_type(param): - return torch.float if \ - self.accumulate_allreduce_grads_in_fp32 else param.dtype - - # First calculate total number of elements per type. - type_num_elements = {} - for param in self.module.parameters(): - if param.requires_grad: - dtype = _get_buffer_type(param) - type_num_elements[dtype] = type_num_elements.get(dtype, 0) \ - + param.data.nelement() - - # Allocate the buffer. - for dtype, num_elements in type_num_elements.items(): - - # If using distributed optimizer, pad memory buffer to be - # multiple of data_parallel_world_size. (This padding is done - # due to a constraint with the reduce_scatter op, which requires - # all tensors have equal size. See: optimizer.py.) - num_elements_padded = data_parallel_world_size * \ - int(math.ceil(num_elements / data_parallel_world_size)) - - # Allocate grad buffer. - self._grad_buffers[dtype] = MemoryBuffer(num_elements, - num_elements_padded, - dtype) - - # Assume the back prop order is reverse the params order, - # store the start index for the gradients. 
- for param in self.module.parameters(): - if param.requires_grad: - dtype = _get_buffer_type(param) - type_num_elements[dtype] -= param.data.nelement() - param.main_grad = self._grad_buffers[dtype].get( - param.data.shape, type_num_elements[dtype]) - if dtype not in self._grad_buffer_param_index_map: - self._grad_buffer_param_index_map[dtype] = {} - self._grad_buffer_param_index_map[dtype][param] = ( - type_num_elements[dtype], - type_num_elements[dtype] + param.data.nelement(), - ) - - # Backward hook. - # Accumalation function for the gradients. We need - # to store them so they don't go out of scope. - self.grad_accs = [] - # Loop over all the parameters in the model. - for param in self.module.parameters(): - if param.requires_grad: - # Expand so we get access to grad_fn. - param_tmp = param.expand_as(param) - # Get the gradient accumulator functtion. - grad_acc = param_tmp.grad_fn.next_functions[0][0] - grad_acc.register_hook(self._make_param_hook(param)) - self.grad_accs.append(grad_acc) - - - def _make_param_hook(self, param): - """Create the all-reduce hook for backprop.""" - # Hook used for back-prop. - def param_hook(*unused): - # Add the gradient to the buffer. - if param.grad is not None: - # The gradient function of linear layers is fused with GEMMs - param.main_grad.add_(param.grad.data) - # Now we can deallocate grad memory. - param.grad = None - return param_hook - - - def zero_grad_buffer(self): - """Set the grad buffer data to zero. Needs to be called at the - begining of each iteration.""" - assert self._grad_buffers is not None, 'buffers are not initialized.' - for _, buffer_ in self._grad_buffers.items(): - buffer_.zero() - - - def broadcast_params(self): - for param in self.module.parameters(): - torch.distributed.broadcast(param.data, - src=mpu.get_data_parallel_src_rank(), - group=mpu.get_data_parallel_group()) - - - def allreduce_gradients(self): - """Reduce gradients across data parallel ranks.""" - # If we have buffers, simply reduce the data in the buffer. - if self._grad_buffers is not None: - for _, buffer_ in self._grad_buffers.items(): - buffer_.data /= mpu.get_data_parallel_world_size() - torch.distributed.all_reduce( - buffer_.data, group=mpu.get_data_parallel_group()) - else: - # Otherwise, bucketize and all-reduce - buckets = {} - # Pack the buckets. - for param in self.module.parameters(): - if param.requires_grad and param.grad is not None: - tp = param.data.type() - if tp not in buckets: - buckets[tp] = [] - buckets[tp].append(param) - param.main_grad = param.grad - - # For each bucket, all-reduce and copy all-reduced grads. 
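The backward-hook trick used by the deleted DistributedDataParallel amounts to accumulating each parameter's autograd gradient into its preallocated main_grad view and then releasing .grad. A minimal sketch (hypothetical helper name):

def make_main_grad_hook(param):
    # Sketch of the deleted _make_param_hook(): add the fresh gradient into
    # the contiguous-buffer view, then free the autograd-allocated gradient.
    def hook(*unused):
        if param.grad is not None:
            param.main_grad.add_(param.grad.data)
            param.grad = None
    return hook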
- for tp in buckets: - bucket = buckets[tp] - grads = [param.grad.data for param in bucket] - coalesced = _flatten_dense_tensors(grads) - coalesced /= mpu.get_data_parallel_world_size() - torch.distributed.all_reduce( - coalesced, group=mpu.get_data_parallel_group()) - for buf, synced in zip(grads, _unflatten_dense_tensors( - coalesced, grads)): - buf.copy_(synced) diff --git a/megatron/model/rotary_pos_embedding.py b/megatron/model/rotary_pos_embedding.py deleted file mode 100644 index 80c74d62d4..0000000000 --- a/megatron/model/rotary_pos_embedding.py +++ /dev/null @@ -1,56 +0,0 @@ -# coding=utf-8 - -# The following code has been taken from https://github.com/NVIDIA/NeMo/blob/ \ -# 782b4e1652aaa43c8be390d9db0dc89544afa080/nemo/collections/nlp/modules/ \ -# common/megatron/rotary_pos_embedding.py - -import importlib.util -import torch - -from torch import einsum, nn - -__all__ = ['RotaryEmbedding', 'apply_rotary_pos_emb'] - -class RotaryEmbedding(nn.Module): - def __init__(self, dim): - super().__init__() - inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2).float() / dim)) - self.register_buffer('inv_freq', inv_freq) - if importlib.util.find_spec('einops') is None: - raise RuntimeError("einops is required for Rotary Embedding") - - def forward(self, max_seq_len, offset=0): - seq = torch.arange(max_seq_len, device=self.inv_freq.device) + offset - freqs = einsum('i , j -> i j', seq.type_as(self.inv_freq), self.inv_freq) - # first part even vector components, second part odd vector components, - # 2 * dim in dimension size - emb = torch.cat((freqs, freqs), dim=-1) - # emb [seq_length, .., dim] - from einops import rearrange - return rearrange(emb, 'n d -> n 1 1 d') - - -def _rotate_half(x): - """ - change sign so the last dimension becomes [-odd, +even] - """ - from einops import rearrange - x = rearrange(x, '... (j d) -> ... j d', j=2) - x1, x2 = x.unbind(dim=-2) - return torch.cat((-x2, x1), dim=-1) - - -def apply_rotary_pos_emb(t, freqs): - """ - input tensor t is of shape [seq_length, ..., dim] - rotary positional embeding tensor freqs is of shape [seq_length, ..., dim] - check https://kexue.fm/archives/8265 for detailed formulas - """ - rot_dim = freqs.shape[-1] - # ideally t_pass is empty so rotary pos embedding is applied to all tensor t - t, t_pass = t[..., :rot_dim], t[..., rot_dim:] - - # first part is cosine component - # second part is sine component, need to change signs with _rotate_half method - t = (t * freqs.cos()) + (_rotate_half(t) * freqs.sin()) - return torch.cat((t, t_pass), dim=-1) diff --git a/megatron/optimizer/__init__.py b/megatron/optimizer/__init__.py deleted file mode 100644 index 484e9b322e..0000000000 --- a/megatron/optimizer/__init__.py +++ /dev/null @@ -1,144 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. - -from apex.optimizers import FusedAdam as Adam -from apex.optimizers import FusedSGD as SGD - -from megatron import get_args - -from .distrib_optimizer import DistributedOptimizer -from .grad_scaler import ConstantGradScaler, DynamicGradScaler -from .optimizer import Float16OptimizerWithFloat16Params, FP32Optimizer - - -def get_param_groups(modules, - no_weight_decay_cond, - scale_lr_cond, - lr_mult): - """creates param groups based on weight decay condition (regularized vs non regularized) - and learning rate scale condition (args.lr vs lr_mult * args.lr) - scale_lr_cond is used during finetuning where head of the network requires a scaled - version of the base learning rate. 
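The rotary-embedding application in the deleted rotary_pos_embedding.py boils down to the usual cos/sin rotation of paired dimensions. An einops-free sketch, assuming the rotary dimension covers the whole last axis:

import torch

def rotate_half(x):
    # Last dim [x1, x2] -> [-x2, x1], matching the deleted _rotate_half().
    x1, x2 = x.chunk(2, dim=-1)
    return torch.cat((-x2, x1), dim=-1)

def apply_rope(t, freqs):
    # t and freqs are broadcastable [seq_length, ..., dim] tensors;
    # see https://kexue.fm/archives/8265 for the derivation.
    return t * freqs.cos() + rotate_half(t) * freqs.sin()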
- """ - wd_no_scale_lr = [] - wd_scale_lr = [] - no_wd_no_scale_lr = [] - no_wd_scale_lr = [] - for module in modules: - for name, param in module.named_parameters(): - if not param.requires_grad: - continue - - if no_weight_decay_cond is not None: - no_wd = no_weight_decay_cond(name, param) - else: - # do not regularize biases nor Norm parameters - no_wd = name.endswith(".bias") or len(param.shape) == 1 - - if scale_lr_cond is not None: - scale_lr = scale_lr_cond(name, param) - else: - scale_lr = False - - if not no_wd and not scale_lr: - wd_no_scale_lr.append(param) - elif not no_wd and scale_lr: - wd_scale_lr.append(param) - elif no_wd and not scale_lr: - no_wd_no_scale_lr.append(param) - else: - no_wd_scale_lr.append(param) - - param_groups = [] - if len(wd_no_scale_lr): - param_groups.append({'params': wd_no_scale_lr, 'wd_mult': 1.0, 'lr_mult': 1.0}) - if len(wd_scale_lr): - param_groups.append({'params': wd_scale_lr, 'wd_mult': 1.0, 'lr_mult': lr_mult}) - if len(no_wd_no_scale_lr): - param_groups.append({'params': no_wd_no_scale_lr, 'wd_mult': 0.0, 'lr_mult': 1.0}) - if len(no_wd_scale_lr): - param_groups.append({'params': no_wd_scale_lr, 'wd_mult': 0.0, 'lr_mult': lr_mult}) - - return param_groups - -def get_megatron_optimizer(model, - no_weight_decay_cond=None, - scale_lr_cond=None, - lr_mult=1.0): - args = get_args() - - # Base optimizer. - param_groups = get_param_groups(model, - no_weight_decay_cond, - scale_lr_cond, - lr_mult) - - if args.optimizer == 'adam': - optimizer = Adam(param_groups, - lr=args.lr, - weight_decay=args.weight_decay, - betas=(args.adam_beta1, args.adam_beta2), - eps=args.adam_eps) - elif args.optimizer == 'sgd': - optimizer = SGD(param_groups, - lr=args.lr, - weight_decay=args.weight_decay, - momentum=args.sgd_momentum) - else: - raise Exception('{} optimizer is not supported.'.format( - args.optimizer)) - - # Determine whether the params have main-grad field. - params_have_main_grad = False - if args.DDP_impl == 'local': - params_have_main_grad = True - - # Mixed precision optimizer. - # - Note: both the Float16Optimizer and the DistributedOptimizer inherit - # from the MixedPrecisionOptimizer, which manages any optimizer where - # the model params and main params are distinct. - if args.fp16 or args.bf16 or args.use_distributed_optimizer: - - # Grad scaler: - # if loss-scale is provided, instantiate the constant scaler. - # if we are using fp16 and loss-scale is not present, use a - # dynamic scaler. - # otherwise we are running in bf16 with no loss-scale so - # leave it as None. - grad_scaler = None - - # Constant loss scale. - if args.loss_scale: - grad_scaler = ConstantGradScaler(args.loss_scale) - - # Dynamic loss scale. - else: - if args.fp16: - grad_scaler = DynamicGradScaler( - initial_scale=args.initial_loss_scale, - min_scale=args.min_loss_scale, - growth_factor=2.0, - backoff_factor=0.5, - growth_interval=args.loss_scale_window, - hysteresis=args.hysteresis) - - # Megatron optimizer. - opt_ty = DistributedOptimizer \ - if args.use_distributed_optimizer else \ - Float16OptimizerWithFloat16Params - return opt_ty(optimizer, - args.clip_grad, - args.log_num_zeros_in_grad, - params_have_main_grad, - args.use_contiguous_buffers_in_local_ddp, - args.fp16, - args.bf16, - args.params_dtype, - grad_scaler, - model) - - # FP32. 
- return FP32Optimizer(optimizer, args.clip_grad, - args.log_num_zeros_in_grad, - params_have_main_grad, - args.use_contiguous_buffers_in_local_ddp, - model) diff --git a/megatron/optimizer/clip_grads.py b/megatron/optimizer/clip_grads.py deleted file mode 100644 index aa1080eb0b..0000000000 --- a/megatron/optimizer/clip_grads.py +++ /dev/null @@ -1,135 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. - -"""Gradient clipping.""" - -import torch -from torch import inf - -from apex.multi_tensor_apply import multi_tensor_applier -import amp_C - -from megatron.model.module import param_is_not_shared -from megatron.core.tensor_parallel import param_is_not_tensor_parallel_duplicate - - -def clip_grad_norm_fp32(parameters, grads_for_norm, - max_norm, norm_type=2, - model_parallel_group=None): - """Clips gradient norm of an iterable of parameters whose gradients - are in fp32. - - This is adapted from torch.nn.utils.clip_grad.clip_grad_norm_ and - added functionality to handle model parallel parameters. Note that - the gradients are modified in place. - - Arguments: - parameters (Iterable[Tensor] or Tensor): an iterable of Tensors or a - single Tensor that will have gradients normalized - grads_for_norm (Iterable[Tensor]): an iterable of Tensors or a single - Tensor that will be used for calculating the grad norm. - max_norm (float or int): max norm of the gradients - norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for - infinity norm. - model_parallel_group (group): given the nature of the distributed - optimizer, this is passed as an argument. - - Returns: - Total norm of the parameters (viewed as a single vector). - """ - - if isinstance(parameters, torch.Tensor): - parameters = [parameters] - if isinstance(grads_for_norm, torch.Tensor): - grads_for_norm = [grads_for_norm] - - # Grads. - grads = [] - for param in parameters: - if param.grad is not None: - assert param.grad.type() == 'torch.cuda.FloatTensor' - grads.append(param.grad.detach()) - - # Norm parameters. - max_norm = float(max_norm) - norm_type = float(norm_type) - total_norm = 0.0 - - # Calculate norm. - if norm_type == inf: - total_norm = max(grad.abs().max() for grad in grads_for_norm) - total_norm_cuda = torch.cuda.FloatTensor([float(total_norm)]) - # Take max across all model-parallel GPUs. - torch.distributed.all_reduce(total_norm_cuda, - op=torch.distributed.ReduceOp.MAX, - group=model_parallel_group) - total_norm = total_norm_cuda[0].item() - - else: - if norm_type == 2.0: - dummy_overflow_buf = torch.cuda.IntTensor([0]) - # Use apex's multi-tensor applier for efficiency reasons. - # Multi-tensor applier takes a function and a list of list - # and performs the operation on that list all in one kernel. - if grads_for_norm: - grad_norm, _ = multi_tensor_applier( - amp_C.multi_tensor_l2norm, - dummy_overflow_buf, - [grads_for_norm], - False # no per-parameter norm - ) - else: - grad_norm = torch.cuda.FloatTensor([0]) - # Since we will be summing across data parallel groups, - # we need the pow(norm-type). - total_norm = grad_norm ** norm_type - - else: - for grad in grads_for_norm: - grad_norm = torch.norm(grad, norm_type) - total_norm += grad_norm ** norm_type - - # Sum across all model-parallel GPUs. - torch.distributed.all_reduce(total_norm, - op=torch.distributed.ReduceOp.SUM, - group=model_parallel_group) - total_norm = total_norm.item() ** (1.0 / norm_type) - - # Scale. 
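For a finite norm_type p, the deleted clip_grad_norm_fp32 computes the global norm by summing per-rank sums of |g|^p, all-reducing across the model-parallel group, and taking the p-th root. A simplified sketch of that reduction (the apex multi-tensor fast path is omitted):

import torch
import torch.distributed as dist

def global_grad_norm(grads_for_norm, norm_type=2.0, group=None):
    # Sketch only: per-rank sum of |g|^p, summed across model-parallel
    # ranks, then the p-th root of the total.
    total = torch.zeros(1, device='cuda')
    for g in grads_for_norm:
        total += torch.norm(g, norm_type) ** norm_type
    dist.all_reduce(total, op=dist.ReduceOp.SUM, group=group)
    return total.item() ** (1.0 / norm_type)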
- clip_coeff = max_norm / (total_norm + 1.0e-6) - if clip_coeff < 1.0: - dummy_overflow_buf = torch.cuda.IntTensor([0]) - multi_tensor_applier(amp_C.multi_tensor_scale, - dummy_overflow_buf, - [grads, grads], - clip_coeff) - - return total_norm - - -def count_zeros_fp32(parameters, model_parallel_group): - - if isinstance(parameters, torch.Tensor): - parameters = [parameters] - - # Filter parameters based on: - # - grad should not be none - # - parameter should not be shared - # - should not be a replica due to tensor model parallelism - total_num_zeros = torch.cuda.FloatTensor([0.0]) - for param in parameters: - grad_not_none = param.grad is not None - is_not_shared = param_is_not_shared(param) - is_not_tp_duplicate = param_is_not_tensor_parallel_duplicate(param) - if grad_not_none and is_not_shared and is_not_tp_duplicate: - grad = param.grad.detach() - num_zeros = grad.numel() - torch.count_nonzero(grad) - total_num_zeros = num_zeros + total_num_zeros - - # Sum across all model-parallel GPUs. - torch.distributed.all_reduce(total_num_zeros, - op=torch.distributed.ReduceOp.SUM, - group=model_parallel_group) - - total_num_zeros = total_num_zeros.item() - - return total_num_zeros diff --git a/megatron/optimizer/distrib_optimizer.py b/megatron/optimizer/distrib_optimizer.py deleted file mode 100644 index 96786394ae..0000000000 --- a/megatron/optimizer/distrib_optimizer.py +++ /dev/null @@ -1,1024 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. - -"""Megatron distributed optimizer.""" - - -from apex.optimizers import FusedAdam as Adam -import math -import torch - -from megatron import get_args -from megatron import get_timers -from megatron import print_rank_0 -from megatron.core import mpu, tensor_parallel -from megatron.model.module import param_is_not_shared - -from .optimizer import MixedPrecisionOptimizer, _zero_grad_group_helper - - -class Range: - """ - A range represents a start and end points for indexing a shard - from a full tensor. - """ - def __init__(self, start, end): - self.start = start - self.end = end - self.size = end - start - def normalize(self, start = 0): - return Range(start, start + self.size) - def __str__(self): - return "%d,%d [%d]" % (self.start, self.end, self.size) - def __len__(self): - return self.end - self.start - - -class DistributedOptimizer(MixedPrecisionOptimizer): - """Distributed optimizer, for all data types (fp16, bf16, and fp32). - - Arguments: - optimizer: base optimizer such as Adam or SGD - clip_grad: clip gradeints with this global L2 norm. Note - that clipping is ignored if clip_grad == 0 - log_num_zeros_in_grad: return number of zeros in the gradients. - params_have_main_grad: flag indicating if parameters have - a `main_grad` field. If this is set, we are assuming - that the model parameters are store in the `main_grad` - field instead of the typical `grad` field. This happens - for the DDP cases where there is a continuous buffer - holding the gradients. For example for bfloat16, we want - to do gradient accumulation and all-reduces in float32 - and as a result we store those gradients in the main_grad. - Note that main grad is not necessarily in float32. - use_contiguous_buffers_in_local_ddp: if true, the local DDP model - is using a contiguous buffer to hold the model grads. - fp16: if true, the model is running in fp16. - bf16: if true, the model is running in bfloat16. - grad_scaler: used for scaling gradients. Note that this can be - None. This case happens when `bf16 = True` and we don't - use any loss scale. 
Note that for `bf16 = True`, we can have - a constnat gradient scaler. Also for `bf16 = False`, we - always require a grad scaler. - models: list of models (i.e., the virtual pipelining models). This - is used by the distributed optimizer for mapping parameters. - """ - - @classmethod - def build_model_gbuf_param_range_map(cls, model, dtype, gbuf_world_range): - """ - Build mapping from param reference to grad buffer shard ranges. - - This method builds a mapping from parameter references to grad - buffer shard ranges, specific to each data-parallel (DP) rank's - set of 'owned' parameters. Each grad buffer (padded to be an even - multiple of DP-world-size) is conceptually divided into DP-world-size - contiguous regions, where each DP rank 'owns' a contiguous regions. - Ownership in this sense means DP rank is responsible for reducing - the relevant subset of grads, and updating the relevant subset of - params. - - This conceptual partitioning of the grad buffer does NOT respect - parameter boundaries, and as such it is assumed that each created - range references a shard (or subset) of the full parameter. It is - easiest to think of each DP rank as operating (i.e., reducing, - gathering) purely on views into the grad buffer, for all model-to- - main & main-to-model operations. - - This method creates three ranges: - - The param's range within the entire grad buffer (i.e., world index). - - The param's range within the DP rank's local view of the grad buffer. - - The param's range within itself (i.e., its shard). - """ - - # Param range map. - param_world_index_map = model._grad_buffer_param_index_map[dtype] - param_range_map = {} - for param, param_world_indexes in param_world_index_map.items(): - - # Param range. - param_world_start, param_world_end = param_world_indexes - param_local_start = max( - 0, - param_world_start - gbuf_world_range.start) - param_local_end = min( - gbuf_world_range.size, - param_world_end - gbuf_world_range.start) - - # Add param, if within local gbuf range. - if param_local_end > param_local_start: - param_local_range = Range(param_local_start, param_local_end) - param_world_range = param_local_range.normalize( - param_local_start + gbuf_world_range.start) - sub_param_start = max(0, gbuf_world_range.start-param_world_start) - sub_param_range = param_local_range.normalize(sub_param_start) - param_range_map[param] = { - "gbuf_world" : param_world_range, - "gbuf_local" : param_local_range, - "param" : sub_param_range, - } - - return param_range_map - - - @classmethod - def build_model_gbuf_range(cls, model, dtype): - """ - Build mapping between params and their grad buffers. - - This method does the initial setup for the method above. This setup - includes determining the shard ranges into the DDP's grad buffer for - each data-parallel (DP) rank. Each DP rank keeps range info for - all other DP ranks, for the purpose of creating args for - reduce-scatter and all-gather. - """ - - data_parallel_rank = mpu.get_data_parallel_rank() - data_parallel_world_size = mpu.get_data_parallel_world_size() - - # Grad buffer range. - grad_buffer = model._grad_buffers[dtype] - gbuf_size = grad_buffer.numel - max_gbuf_range_size = int(math.ceil(gbuf_size / data_parallel_world_size)) - - # All world ranges. 
(i.e., across all data parallel ranks) - gbuf_world_all_ranges = [] - for r in range(data_parallel_world_size): - gbuf_world_start = r * max_gbuf_range_size - gbuf_world_end = min(gbuf_size, gbuf_world_start+max_gbuf_range_size) - gbuf_world_range = Range(gbuf_world_start, gbuf_world_end) - gbuf_world_all_ranges.append(gbuf_world_range) - - # Local DP's ranges. - gbuf_world_range = gbuf_world_all_ranges[data_parallel_rank] - gbuf_local_range = gbuf_world_range.normalize() - - # Get each param's ranges. - param_range_map = cls.build_model_gbuf_param_range_map(model, - dtype, - gbuf_world_range) - - # Group into dict. - data = { - "local" : gbuf_local_range, - "world" : gbuf_world_range, - "world_all" : gbuf_world_all_ranges, - "param_map" : param_range_map, - "max_range_size" : max_gbuf_range_size, - } - - return data - - - @classmethod - def build_model_gbuf_range_map(cls, model): - """ - Create param-to-grad-buffer mappings, for grad buffer data types - within a specific virtual model. - """ - return { - dtype : cls.build_model_gbuf_range(model, dtype) - for dtype in model._grad_buffers - } - - - @classmethod - def build_model_param_gbuf_map(cls, model_gbuf_ranges): - """ - Create a reverse of the model_gbuf_ranges, for referencing in - opposite direction. - """ - param_gbuf_map = {} - for model_index, model_gbuf_range_map in enumerate(model_gbuf_ranges): - for dtype, gbuf_range_map in model_gbuf_range_map.items(): - for param, param_range_map in gbuf_range_map["param_map"].items(): - param_gbuf_map[param] = (model_index, dtype) - return param_gbuf_map - - - @classmethod - def build_optimizer_group_ranges(cls, param_groups, model_gbuf_ranges): - """ - Create optimizer groups. - - Given the set of parameter shard ranges that are owned by the current - data-parallel (DP) rank, gather the set of parameters that will be - used (in the method below) to create the current DP's optimizer - groups. - """ - - num_groups = len(param_groups) - - # Param group map. - # World param group map. - # - Store a mapping of for all parameters - # across all DP ranks. This is necessary because it is our first - # cross reference between the DDP mappings and the optimizer group - # parameters. This mapping only for use in the next step of building - # the local mapping over this DP rank's parameters. - world_param_group_map = {} - for group_index, group in enumerate(param_groups): - for param in group["params"]: - assert param.requires_grad - world_param_group_map[param] = group_index - - # Optimizer group ranges & param-group mapping. - # - Build a mapping from groups to their contained parameters, and also - # from parameters to their containing group index and order within - # the group. The group index and order are particularly important for - # saving and loading checkpoints. - local_param_group_map = {} - group_ranges = [ {"params": []} for _ in param_groups ] - for model_gbuf_range_map in model_gbuf_ranges: - for dtype, gbuf_range_map in model_gbuf_range_map.items(): - for param in gbuf_range_map["param_map"]: - group_index = world_param_group_map[param] - group_range = group_ranges[group_index] - group_range["params"].append(param) - local_param_group_map[param] = \ - (group_index, len(group_range["params"]) - 1) - - # Squeeze zero-size group ranges. 
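The per-rank ownership described in build_model_gbuf_range() amounts to slicing the padded grad buffer into world-size contiguous pieces. A sketch with hypothetical names:

import math

def gbuf_shard_ranges(gbuf_numel, data_parallel_world_size):
    # Each DP rank owns one contiguous slice of at most
    # ceil(numel / world_size) elements of the grad buffer.
    max_size = int(math.ceil(gbuf_numel / data_parallel_world_size))
    return [(r * max_size, min(gbuf_numel, (r + 1) * max_size))
            for r in range(data_parallel_world_size)]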
- for group_index, group_range in enumerate(group_ranges): - group_range["orig_group"] = param_groups[group_index] - group_range["orig_group_idx"] = param_groups[group_index] - - return local_param_group_map, group_ranges - - - @classmethod - def build_model_and_main_param_groups(cls, - model_gbuf_ranges, - param_gbuf_map, - opt_group_ranges): - """ - Create main parameter groups needed for the optimizer step. - - These groups encompass both: 1) groups used by this class, for - reducing/gather, and 2) groups used by the inner optimizer for the - parameter update. Given that the conceptual grad buffer partitioning - (created in earlier method) doesn't respect parameter boundaries, - the optimizer operates on shards of the model parameters, rather than - the full parameters. - """ - - # Parameter groups: - # model_float16_groups: original float16 parameters - # model_fp32_groups: original fp32 parameters - # shard_float16_groups: shards of original float16 parameters - # shard_fp32_groups: shards of original fp32 parameters - # shard_fp32_from_float16_groups: fp32 copy of float16 parameters - model_float16_groups = [] - model_fp32_groups = [] - shard_float16_groups = [] - shard_fp32_groups = [] - shard_fp32_from_float16_groups = [] - - # Allocate (or slice) each group's param shard. - for group_index, group_range in enumerate(opt_group_ranges): - - # Params of this group. - model_float16_params_this_group = [] - model_fp32_params_this_group = [] - shard_float16_params_this_group = [] - shard_fp32_params_this_group = [] - shard_fp32_from_float16_params_this_group = [] - model_float16_groups.append(model_float16_params_this_group) - model_fp32_groups.append(model_fp32_params_this_group) - shard_float16_groups.append(shard_float16_params_this_group) - shard_fp32_groups.append(shard_fp32_params_this_group) - shard_fp32_from_float16_groups.append( - shard_fp32_from_float16_params_this_group) - - for model_param in group_range["params"]: - - assert model_param.requires_grad - - model_index, dtype = param_gbuf_map[model_param] - gbuf_range = model_gbuf_ranges[model_index][dtype] - param_range = gbuf_range["param_map"][model_param]["param"] - - # fp16, bf16 params. - if model_param.type() in ['torch.cuda.HalfTensor', - 'torch.cuda.BFloat16Tensor']: - - # Clone model -> main. - shard_model_param = model_param.detach().view(-1) \ - [param_range.start:param_range.end] - shard_main_param = shard_model_param.clone().float() - tensor_parallel.copy_tensor_model_parallel_attributes( - shard_model_param, model_param) - tensor_parallel.copy_tensor_model_parallel_attributes( - shard_main_param, model_param) - if hasattr(model_param, 'shared'): - shard_model_param.shared = model_param.shared - shard_main_param.shared = model_param.shared - - # Add to group. - model_float16_params_this_group.append(model_param) - shard_float16_params_this_group.append(shard_model_param) - shard_fp32_from_float16_params_this_group.append(shard_main_param) - - # fp32 params. 
- elif model_param.type() == 'torch.cuda.FloatTensor': - shard_model_param = model_param.view(-1) \ - [param_range.start:param_range.end] - model_fp32_params_this_group.append(model_param) - shard_fp32_params_this_group.append(shard_model_param) - tensor_parallel.copy_tensor_model_parallel_attributes( - shard_model_param, model_param) - if hasattr(model_param, 'shared'): - shard_model_param.shared = model_param.shared - - else: - raise TypeError('Wrapped parameters must be one of ' - 'torch.cuda.FloatTensor, ' - 'torch.cuda.HalfTensor, or ' - 'torch.cuda.BFloat16Tensor. ' - 'Received {}'.format(param.type())) - - # Update optimizer's params. - group_range["orig_group"]["params"] = [ - *shard_fp32_params_this_group, - *shard_fp32_from_float16_params_this_group, - ] - - return ( - model_float16_groups, - model_fp32_groups, - shard_float16_groups, - shard_fp32_groups, - shard_fp32_from_float16_groups, - ) - - - def __init__(self, optimizer, clip_grad, log_num_zeros_in_grad, - params_have_main_grad, use_contiguous_buffers_in_local_ddp, - fp16, bf16, params_dtype, grad_scaler, models): - """ - See top of class definition for argument descriptions. - - The steps in this method create the core mapping between DDP grad - buffers, parameters, and parameter shard ranges, that is needed for - converting between model param indexes and main parameter shard - indexes. This method also updates the optimizer parameter groups - with the newly created shards. - """ - - super().__init__( - optimizer, clip_grad, log_num_zeros_in_grad, - params_have_main_grad, use_contiguous_buffers_in_local_ddp, - fp16, bf16, params_dtype, grad_scaler, models) - - # Verify that contiguous buffers are being used. - # - Note: this should already be checked in arguments.py. - assert use_contiguous_buffers_in_local_ddp - assert isinstance(optimizer, Adam), \ - "Only Adam currently supported, due to checkpointing requirements." - - # Model grad buffer ranges. - self.model_gbuf_ranges = [] - for model_index, model in enumerate(self.models): - self.model_gbuf_ranges.append(self.build_model_gbuf_range_map(model)) - self.model_param_gbuf_map = \ - self.build_model_param_gbuf_map(self.model_gbuf_ranges) - - # Optimizer ranges. - self.model_param_group_index_map, self.opt_group_ranges = \ - self.build_optimizer_group_ranges(self.optimizer.param_groups, - self.model_gbuf_ranges) - - # Allocate main param shards. - ( - self.model_float16_groups, - self.model_fp32_groups, - self.shard_float16_groups, - self.shard_fp32_groups, - self.shard_fp32_from_float16_groups, - ) = self.build_model_and_main_param_groups(self.model_gbuf_ranges, - self.model_param_gbuf_map, - self.opt_group_ranges) - - # Initialize param buffers. - # - These are views on the DDP model's grad buffers, that share - # storage & have their own dtype. This is safe because the param - # dtype size is always <= grad dtype size. - self.param_buffers = [] - for model_index, model in enumerate(self.models): - current_param_buffers = {} - for dtype, grad_buffer in model._grad_buffers.items(): - - # Handle older/newer method for getting untyped storage. - try: - storage = grad_buffer.data.storage()._untyped() - except: - storage = grad_buffer.data.storage().untyped() - - # Typed param buffer. - param_buffer = torch.tensor( - storage, - dtype = params_dtype, - device = grad_buffer.data.device) - param_buffer = param_buffer[:grad_buffer.numel_padded] - current_param_buffers[dtype] = param_buffer - self.param_buffers.append(current_param_buffers) - - # Update optimizer groups. 
- # - Also, leverage state_dict() and load_state_dict() to - # recast preexisting per-param state tensors. - self.optimizer.param_groups = \ - [ g["orig_group"] for g in self.opt_group_ranges ] - self.optimizer.load_state_dict(self.optimizer.state_dict()) - - - def get_model_param_range_map(self, param): - """ - Given a model param, get the index sub-range of the param that this - data-parallel rank owns. - """ - model_index, dtype = self.model_param_gbuf_map[param] - gbuf_range_map = self.model_gbuf_ranges[model_index][dtype] - param_range_map = gbuf_range_map["param_map"][param] - return param_range_map - - - def get_model_parallel_group(self): - """ - With the distributed optimizer, the model parallel group is the - entire world. - """ - return None - - - def state_dict(self): - """ - The state dict contains all non-DP-rank-dependent (i.e., non-parameter- - related) optimizer variables. The returned state dict can be stored in - the standard model/RNG checkpoint file. The parameter and dependent - optimizer state (e.g., exp_avg, exp_avg_sq) are stored in a separate - checkpoint file by calling 'save_parameter_state()'. - """ - - state_dict = {} - - # Optimizer state (do not store parameter state here). - state_dict['optimizer'] = { - k : v - for k, v in self.optimizer.state_dict().items() - if k != "state" - } - for param_group in state_dict["optimizer"]["param_groups"]: - del param_group["params"] - - # Grad scaler state. - if self.grad_scaler: - state_dict['grad_scaler'] = self.grad_scaler.state_dict() - - return state_dict - - - def load_state_dict(self, state_dict): - """Load the state dict. - - As detailed in state_dict(), the state dict contains all non- - parameter-related variables. This method is notably longer than - state_dict(), because the Torch optimizers state has yet to be - allocated at this point, and so we must do a cross referencing between - the optimizers state (and the ordering it expects for parameter state) - and this DP rank's shards. The optimizer at this point does not contain - any tensor dimension information, so we must get these dimensions from - the DP shards mapped during DistributedOptimizer.__init__(). - - The tensor parameter state is loaded via load_parameter_state(), and - so this method also must populate the loaded state dict with dummy - tensor data (i.e., via torch.empty() below). This will be overwritten - during load_parameter_state(). - - ** Note: Torch optimizer's state structure. ** - The Torch optimizer stores its state in two levels. The top level is a - list of groups, where each group contains a list of integer indexes - (corresponding to parameters) that index into a master parameter list - that is shared by all groups. As such, three values are necessary for - maintaining this ordering: - - - group_index : The group to which a parameter belongs. - - group_order : The index of a parameter within its group. - - state_order : The index of a parameter within the shared parameter - list. - """ - - # Get the Torch optimizer's state dict. - # - This 'inner' optimizer at this point is unallocated, and only - # contains an integer odering of parameters within each group, and - # the ordering of parameters within its flattened parameter state - # list. 
- inner_state_dict = self.optimizer.state_dict() - state_dict_param_groups = [{ - **group, - "params" : list(inner_state_dict["param_groups"][idx]["params"]), - } for idx, group in enumerate(state_dict["optimizer"]["param_groups"])] - - # Allocate 'dummy' data for optimizer state (i.e., torch.empty() below) - # - Real data is overwritten during load_parameter_state(). - state_dict_state = [] - for gbuf_range_maps in self.model_gbuf_ranges: - for gbuf_range_map in gbuf_range_maps.values(): - for model_param, param_range_map in \ - gbuf_range_map["param_map"].items(): - - # Get parameter ordering information (see method docstring - # for details). - group_index, group_order = \ - self.model_param_group_index_map[model_param] - state_order = inner_state_dict["param_groups"] \ - [group_index]["params"][group_order] - - # Allocate dummy tensors. - numel = len(param_range_map["gbuf_world"]) - init_shard = lambda : torch.empty( - (numel,), - dtype=torch.float32, - device=torch.cuda.current_device()) - - state_dict_state.append((state_order, { - "exp_avg" : init_shard(), - "exp_avg_sq" : init_shard(), - })) - - # Sort by state order (see method docstring for details). - state_dict_state.sort(key = lambda s : s[0]) - state_dict_state = {s[0]:s[1] for s in state_dict_state} - - # Optimizer. - self.optimizer.load_state_dict({ - "state" : state_dict_state, - "param_groups" : state_dict_param_groups, - }) - - # Grad scaler. - if 'grad_scaler' not in state_dict: - if self.fp16: - print_rank_0('***WARNING*** found an old checkpoint, will not ' - 'load grad scaler ...') - else: - if self.grad_scaler: - self.grad_scaler.load_state_dict(state_dict['grad_scaler']) - else: - print_rank_0('***WARNING*** fould the grad scaler in the ' - 'checkpoint but it is None in the class. ' - 'Skipping loading grad scaler ...') - - - def save_parameter_state(self, filename): - """Save parameter state (i.e., parameter & optimizer tensors). - - This method performs three steps: - - For each DP rank, copy param & optimizer shards to contiguous CPU - buffers. (e.g., one buffer each for main_param, exp_avg, and - exp_avg_sq). - - Gather contiguous buffers on DP rank 0 and concatenate to world - buffers. - - Save world buffers to disk (i.e., distrib_opt.pt). - """ - - # Data parallelism variables. - data_parallel_world_size = mpu.get_data_parallel_world_size() - data_parallel_rank = mpu.get_data_parallel_rank() - data_parallel_group_gloo = mpu.get_data_parallel_group_gloo() - data_parallel_global_ranks = list(mpu._DATA_PARALLEL_GLOBAL_RANKS) - - # Collect param states. - state = {} - for model_idx, gbuf_range_maps in enumerate(self.model_gbuf_ranges): - - # Iterate grad buffers (by data type). - dtype_state = {} - assert len(gbuf_range_maps) == 1, "single dtype supported, for now." - for dtype, gbuf_range_map in gbuf_range_maps.items(): - - # Compute local DP contiguous shard's size. - model = self.models[model_idx] - gbuf_world_numel = model._grad_buffers[dtype].numel_padded - gbuf_local_numel = int(gbuf_world_numel/data_parallel_world_size) - local_shards = {key:torch.empty((gbuf_local_numel,), - dtype=torch.float32, - device="cpu") - for key in ("param", "exp_avg", "exp_avg_sq")} - - # Build contiguous DP rank shards (for param + optim states). - for model_param, param_range_map in \ - gbuf_range_map["param_map"].items(): - - # Main param & optimizer states. 
- group_index, group_order = \ - self.model_param_group_index_map[model_param] - main_param = self.optimizer.param_groups \ - [group_index]["params"][group_order] - optim_state = self.optimizer.state[main_param] - - tensors = { - "param" : main_param, - **optim_state, - } - - # Copy states into contiguous shard. - gbuf_local_start = param_range_map["gbuf_local"].start - gbuf_local_end = param_range_map["gbuf_local"].end - for key in local_shards: - local_shards[key][gbuf_local_start:gbuf_local_end] \ - .data.copy_(tensors[key].detach().cpu()) - - # Gather contiguous shards on DP rank 0. - world_tensors = {} - for key, send_tensor in local_shards.items(): - - # Gather tensor list. - if data_parallel_rank == 0: - recv_tensors = [torch.empty((gbuf_local_numel,), - dtype=torch.float32, - device="cpu") - for _ in range(data_parallel_world_size)] - else: - recv_tensors = None - - # Gather. - torch.distributed.gather( - send_tensor, - recv_tensors, - data_parallel_global_ranks[0], - data_parallel_group_gloo, - ) - - # Concatenate. - if data_parallel_rank == 0: - world_tensors[key] = torch.cat(recv_tensors) - - # Collect world state. - dtype_state[dtype] = world_tensors - state[model_idx] = dtype_state - - # Save param state. - if data_parallel_rank == 0: - torch.save(state, filename) - - - def load_parameter_state(self, filename): - """Load parameter state (i.e., parameter & optimizer tensors). - - This method performs the reverse of save_parameter_state(): - - Load world buffers from disk (i.e., distrib_opt.pt). - - Scatter contiguous buffers from DP rank 0 to each DP rank (each DP - rank receives its relevant subset of the world buffers). - - For each DP rank, copy param & optimizer shards from contiguous CPU - buffers. (e.g., one buffer each for main_param, exp_avg, and - exp_avg_sq). - """ - - # Data parallelism variables. - data_parallel_world_size = mpu.get_data_parallel_world_size() - data_parallel_rank = mpu.get_data_parallel_rank() - data_parallel_group_gloo = mpu.get_data_parallel_group_gloo() - data_parallel_global_ranks = list(mpu._DATA_PARALLEL_GLOBAL_RANKS) - - # Load on DP rank 0. - if data_parallel_rank == 0: - loaded_state = torch.load(filename) - - # Scatter tensors to all DP ranks. - for model_idx, gbuf_range_maps in enumerate(self.model_gbuf_ranges): - for dtype, gbuf_range_map in gbuf_range_maps.items(): - - # Compute local DP contiguous shard's size. - model = self.models[model_idx] - gbuf_world_numel = model._grad_buffers[dtype].numel_padded - gbuf_local_numel = int(gbuf_world_numel/data_parallel_world_size) - - # Contiguous local shards (received from DP rank 0). - local_shards = {key:torch.empty((gbuf_local_numel,), - dtype=torch.float32, - device="cpu") - for key in ("param", "exp_avg", "exp_avg_sq")} - - # Scatter local shards from DP rank 0. - for key, recv_tensor in local_shards.items(): - - # Scatter tensor list. - if data_parallel_rank == 0: - world_tensor = loaded_state[model_idx][dtype][key] - gbuf_start_idxs = \ - list(range(0, gbuf_world_numel, gbuf_local_numel)) - send_tensors = [world_tensor[i:(i+gbuf_local_numel)] - for i in gbuf_start_idxs] - else: - send_tensors = None - - # Scatter. - torch.distributed.scatter( - recv_tensor, - send_tensors, - data_parallel_global_ranks[0], - data_parallel_group_gloo, - ) - - # Copy local contiguous shards to param/optim shards. - for model_param, param_range_map in \ - gbuf_range_map["param_map"].items(): - - # Main param & optimizer states. 
- group_index, group_order = \ - self.model_param_group_index_map[model_param] - main_param = self.optimizer.param_groups \ - [group_index]["params"][group_order] - optim_state = self.optimizer.state[main_param] - - tensors = { - "param" : main_param, - **optim_state, - } - - # Copy states into contiguous shard. - gbuf_local_start = param_range_map["gbuf_local"].start - gbuf_local_end = param_range_map["gbuf_local"].end - for key in local_shards: - tensors[key].data.copy_( - local_shards[key][gbuf_local_start:gbuf_local_end]) - - - def zero_grad(self, set_to_none=True): - """ - Zero grads. - - We only need to zero the model related parameters, i.e., - model_float16_groups & model_fp32_groups. We additionally zero - the remaining groups as a memory optimization to reduce - fragmentation; in the case of set_to_none==True, the space - used by this field can be safely deallocated at this point. - """ - for groups in ( - self.model_float16_groups, - self.model_fp32_groups, - self.shard_float16_groups, # grad empty/unused here? - self.shard_fp32_groups, # throws grad-access warning - self.shard_fp32_from_float16_groups): - for group in groups: - _zero_grad_group_helper(group, set_to_none) - - - @staticmethod - def get_model_buffer_dp_views(model_buffers): - """ - Get shard views of each of the DDP's param/grad buffers. - - In this nested list, the top level is grouped by the virtual model - index and the buffer's data type. The sub-level is a list of - shards of that buffer, where each shard in the list represents - a contiguous view of the buffer, that is owned by a data-parallel - rank. The shard boundary does not respect parameter boundaries, and - so the elements of some parameters are split across data parallel - ranks. - - Additionally, return references to the entire buffers, for use - in _reduce_scatter_base and _all_gather_base. - """ - - data_parallel_world_size = mpu.get_data_parallel_world_size() - - # Buffer views. - view_items = [] - for model_index, buffers in enumerate(model_buffers): - for dtype, buf in buffers.items(): - - assert buf.numel() % data_parallel_world_size == 0 - shard_size = int(buf.numel() / data_parallel_world_size) - buf_views = [buf[(r*shard_size):((r+1)*shard_size)] - for r in range(data_parallel_world_size)] - view_items.append((model_index, dtype, buf, buf_views)) - - return view_items - - - def get_model_grad_buffer_dp_views(self): - return self.get_model_buffer_dp_views([ - {dtype : mem_buffer.data} - for model in self.models - for dtype, mem_buffer in model._grad_buffers.items()]) - - - def get_model_param_buffer_dp_views(self): - return self.get_model_buffer_dp_views(self.param_buffers) - - - def reduce_model_grads(self, args, timers): - """ - Reduce-scatter model grads. - - The DDP's grad buffer is used for the reduce-scatter, and thus no - tensors are dynamically allocated. - - Note: this is a different order of reduction, versus the non- - distributed optimizer, which reduces: 1) layernorm grads, 2) all - grads, 3) embedding grads. - """ - - # All-reduce layer-norm grads (for sequence parallelism). - timers('layernorm-grads-all-reduce', log_level=1).start( - barrier=args.barrier_with_L1_time) - self.allreduce_layernorm_grads(args) - timers('layernorm-grads-all-reduce').stop() - - # All-reduce embedding grads. - timers('embedding-grads-all-reduce', log_level=1).start( - barrier=args.barrier_with_L1_time) - self.allreduce_embedding_grads(args) - timers('embedding-grads-all-reduce').stop() - - # Reduce-scatter setup. 
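The buffer views built by get_model_buffer_dp_views() are plain slices, so they alias the underlying storage rather than copying it. A small sketch with assumed sizes:

import torch

data_parallel_world_size = 4
buf = torch.zeros(16)                          # numel is divisible by the DP world size
shard_size = buf.numel() // data_parallel_world_size
buf_views = [buf[r * shard_size:(r + 1) * shard_size]
             for r in range(data_parallel_world_size)]

# The views share storage with the buffer, so a collective that writes into a view
# (e.g. a reduce-scatter output) updates the buffer in place.
buf_views[2].fill_(1.0)
assert buf[8:12].eq(1.0).all()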
- timers('grads-reduce-scatter', log_level=1).start( - barrier=args.barrier_with_L1_time) - data_parallel_rank = mpu.get_data_parallel_rank() - data_parallel_world_size = mpu.get_data_parallel_world_size() - data_parallel_group = mpu.get_data_parallel_group() - - # Scale grad buffers by '1 / data_parallel_world_size'. - for model in self.models: - for dtype, gbuf in model._grad_buffers.items(): - gbuf.data /= data_parallel_world_size - - # Reduce-scatter all grads. - gbuf_view_items = self.get_model_grad_buffer_dp_views() - for index, (model_index, dtype, gbuf, gbuf_views) \ - in enumerate(gbuf_view_items): - - torch.distributed._reduce_scatter_base( - gbuf_views[data_parallel_rank], - gbuf, - group = data_parallel_group, - ) - - timers('grads-reduce-scatter').stop() - - - def gather_model_params(self, args, timers): - """ - All-gather updated model params. - - The DDP's param buffer is used for the all-gather, and thus no - tensors are dynamically allocated. After the all-gather, the params - can be copied from the param buffer to the param. - """ - - timers('params-all-gather', log_level=1).start( - barrier=args.barrier_with_L1_time) - - data_parallel_rank = mpu.get_data_parallel_rank() - data_parallel_group = mpu.get_data_parallel_group() - - # All-gather updated main params. - # - All param buffer views are guaranteed to have the same num elements - # across all data parallel ranks, due to grad buffer padding that is - # done in distributed.py, and extended to the param buffers. Thus, - # all sub-views will have consistent start/end indexes across data - # parallel ranks. - pbuf_view_items = self.get_model_param_buffer_dp_views() - for index, (model_index, dtype, pbuf, pbuf_views) \ - in enumerate(pbuf_view_items): - - torch.distributed._all_gather_base( - pbuf, - pbuf_views[data_parallel_rank], - group = data_parallel_group, - ) - - # Copy from param buffer to each param. - for model_id, model in enumerate(self.models): - for dtype, param_map in model._grad_buffer_param_index_map.items(): - for param, (buf_start, buf_end) in param_map.items(): - param_buf = self.param_buffers[model_id][dtype] - param_buf_shard = param_buf[buf_start:buf_end] - param.view(-1).detach().copy_(param_buf_shard) - - timers('params-all-gather').stop() - - - def _collect_main_grad_data_for_unscaling(self): - """ - Note: this should be equivalent to the float-16 optimizer's method, - but writtent differently, so the two should be combined. - """ - return [ - param.grad.data - for group in self.optimizer.param_groups - for param in group["params"] - ] - - - def _get_model_and_main_params_data_float16(self): - """ - Get aligned list of model and main params. - """ - model_data = [] - main_data = [] - for model_group, main_group in zip(self.shard_float16_groups, - self.shard_fp32_from_float16_groups): - for model_param, main_param in zip(model_group, main_group): - model_data.append(model_param.data) - main_data.append(main_param.data) - return model_data, main_data - - - def _copy_model_grads_to_main_grads(self): - """ - Copy model grads to main grads. - - Since this step follows a reduce-scatter through the DDP's grad - buffer, this method is responsible for copying the updated grads - from the grad buffer to the main shard's grad field. - """ - - # Utility method for copying group grads. 
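The overall communication pattern is: reduce-scatter the scaled grad buffer so each rank owns the summed grads for its shard, step the optimizer on that shard, then all-gather the updated param buffer. A single-process simulation with two pretend ranks (illustrative values only, no process group or collectives are used here):

import torch

world_size = 2
# Grad buffers on two pretend DP ranks, already scaled by 1 / world_size.
gbuf_rank0 = torch.tensor([1., 2., 3., 4.]) / world_size
gbuf_rank1 = torch.tensor([5., 6., 7., 8.]) / world_size

# Reduce-scatter: each rank receives only the sum over its own shard.
summed = gbuf_rank0 + gbuf_rank1
shard_rank0, shard_rank1 = summed[:2], summed[2:]

# After the per-shard update, the all-gather reassembles the full buffer, giving every
# rank the same result a conventional all-reduce average would have produced.
gathered = torch.cat([shard_rank0, shard_rank1])
assert torch.equal(gathered, summed)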
- def copy_group_grads(model_groups, shard_main_groups): - for model_group, shard_main_group in zip(model_groups, - shard_main_groups): - for model_param, shard_main_param in zip(model_group, - shard_main_group): - - param_range_map = self.get_model_param_range_map(model_param) - param_range = param_range_map["param"] - assert param_range.size == shard_main_param.nelement() - - model_grad = model_param.main_grad - shard_model_grad = model_grad.view(-1) \ - [param_range.start:param_range.end] - shard_main_param.grad = shard_model_grad.float() - - # Copy model groups to shard groups. - copy_group_grads(self.model_float16_groups, - self.shard_fp32_from_float16_groups) - copy_group_grads(self.model_fp32_groups, - self.shard_fp32_groups) - - - def _copy_main_params_to_model_params(self): - """ - Copy main params to model params. - - Since this step is followed by an all-gather through the DDP's grad - buffer, this method is responsible for copying the updated params - from the main shards into the correct position in the grad buffer. - """ - - # Utility method for copying group params. - def copy_group_params(shard_main_groups, model_groups): - for shard_main_group, model_group in zip(shard_main_groups, - model_groups): - for shard_main_param, model_param in zip(shard_main_group, - model_group): - - param_range_map = self.get_model_param_range_map(model_param) - world_range = param_range_map["gbuf_world"] - - assert world_range.size == shard_main_param.nelement() - - model_id, dtype = self.model_param_gbuf_map[model_param] - model_param_buffer = self.param_buffers[model_id][dtype] - - shard_model_param = model_param_buffer.view(-1) \ - [world_range.start:world_range.end] - - shard_model_param.data.copy_(shard_main_param) - - # Copy shard groups to model groups. - copy_group_params(self.shard_fp32_from_float16_groups, - self.model_float16_groups) - copy_group_params(self.shard_fp32_groups, - self.model_fp32_groups) - - - def _copy_model_params_to_main_params(self): - """ - Copy model params to main params. - - During finetuning, this method is used to reload the main params from - the model params. This copy does not make use of the grad buffer as - an intermediary. - """ - - # Utility method for copying group params. - def copy_group_params(model_groups, shard_main_groups): - for model_group, shard_main_group in zip(model_groups, - shard_main_groups): - for model_param, shard_main_param in zip(model_group, - shard_main_group): - - param_range_map = self.get_model_param_range_map(model_param) - param_range = param_range_map["param"] - assert param_range.size == shard_main_param.nelement() - - shard_model_param = model_param.view(-1) \ - [param_range.start:param_range.end] - shard_main_param.data.copy_(shard_model_param) - - # Copy model groups to shard groups. - copy_group_params(self.model_float16_groups, - self.shard_fp32_from_float16_groups) - copy_group_params(self.model_fp32_groups, - self.shard_fp32_groups) diff --git a/megatron/optimizer/optimizer.py b/megatron/optimizer/optimizer.py deleted file mode 100644 index cc89c95ca2..0000000000 --- a/megatron/optimizer/optimizer.py +++ /dev/null @@ -1,774 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
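The range-map copies above amount to writing a main fp32 shard into its world-range slice of the flat param buffer before the all-gather. A toy version with hypothetical offsets:

import torch

param_buffer = torch.zeros(10)                 # flat per-dtype param buffer
world_start, world_end = 3, 7                  # "gbuf_world" range owned by this rank
shard_main_param = torch.tensor([1., 2., 3., 4.])

# Slice out the world range as a view and copy the updated shard into it.
shard_model_param = param_buffer.view(-1)[world_start:world_end]
shard_model_param.data.copy_(shard_main_param)
assert torch.equal(param_buffer[3:7], shard_main_param)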
- -"""Megatron optimizer.""" - -from abc import ABC -from abc import abstractmethod -from apex.multi_tensor_apply import multi_tensor_applier -import amp_C -import torch -from torch.nn.parallel.distributed import DistributedDataParallel as torchDDP -from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors - -from megatron import get_timers -from megatron import print_rank_0 -from megatron.core import mpu, tensor_parallel -from megatron.model import DistributedDataParallel as LocalDDP -from megatron.model import Float16Module -from megatron.model.module import param_is_not_shared -from megatron.utils import unwrap_model - -from .clip_grads import clip_grad_norm_fp32, count_zeros_fp32 - - -def _zero_grad_group_helper(group, set_to_none): - """Zero out the gradient for a group of parameters. - Note: copied from torch.optim.optimizer.""" - for param in group: - if param.grad is not None: - if set_to_none: - param.grad = None - else: - if param.grad.grad_fn is not None: - param.grad.detach_() - else: - param.grad.requires_grad_(False) - param.grad.zero_() - - -def _multi_tensor_copy_this_to_that(this, that, overflow_buf=None): - """Use multi-tensor-applier to copy values from one list to another. - We don't have a blfoat16 implementation so for now if the overflow_buf - is not provided, we default back to simple loop copy to be compatible - with bfloat16.""" - if overflow_buf: - overflow_buf.fill_(0) - # Scaling with factor `1.0` is equivalent to copy. - multi_tensor_applier(amp_C.multi_tensor_scale, - overflow_buf, - [this, that], - 1.0) - else: - for this_, that_ in zip(this, that): - that_.copy_(this_) - - - -class MegatronOptimizer(ABC): - - - def __init__(self, optimizer, clip_grad, - log_num_zeros_in_grad, - params_have_main_grad, - use_contiguous_buffers_in_local_ddp, - models): - - """Input optimizer is the base optimizer for example Adam.""" - self.optimizer = optimizer - assert self.optimizer, 'no optimizer is provided.' - # Set gradient clipping and logging params. - self.clip_grad = clip_grad - self.log_num_zeros_in_grad = log_num_zeros_in_grad - self.params_have_main_grad = params_have_main_grad - self.use_contiguous_buffers_in_local_ddp = use_contiguous_buffers_in_local_ddp - - # 'models' are retained for access to the contiguous grad buffers. 
- # (see distributed optimizer) - self.models = models - - if self.use_contiguous_buffers_in_local_ddp: - assert self.params_have_main_grad, \ - "use of contiguous buffer requires that params have main grad" - - - def get_parameters(self): - params = [] - for param_group in self.optimizer.param_groups: - for param in param_group['params']: - params.append(param) - return params - - - def get_main_grads_for_grad_norm(self): - - # Filter parameters based on: - # - grad should not be none - # - parameter should not be shared - # - should not be a replica due to tensor model parallelism - params = self.get_parameters() - grads_for_norm = [] - for param in params: - grad = param.grad - grad_not_none = grad is not None - is_not_shared = param_is_not_shared(param) - is_not_tp_duplicate = tensor_parallel.param_is_not_tensor_parallel_duplicate(param) - if grad_not_none and is_not_shared and is_not_tp_duplicate: - grads_for_norm.append(grad) - - return grads_for_norm - - - def get_model_parallel_group(self): - """Default returned here, but the distributed optimizer overrides this.""" - return mpu.get_model_parallel_group() - - - def clip_grad_norm(self, clip_grad): - params = self.get_parameters() - grads_for_norm = self.get_main_grads_for_grad_norm() - return clip_grad_norm_fp32( - params, grads_for_norm, clip_grad, - model_parallel_group=self.get_model_parallel_group()) - - - def count_zeros(self): - params = self.get_parameters() - return count_zeros_fp32(params, - model_parallel_group=self.get_model_parallel_group()) - - - @abstractmethod - def zero_grad(self, set_to_none=True): - pass - - - @abstractmethod - def get_loss_scale(self): - """The output should be a cuda tensor of size 1.""" - pass - - - def scale_loss(self, loss): - """Simple scaling.""" - return self.get_loss_scale() * loss - - - @abstractmethod - def reload_model_params(self): - """Refreshes any internal state from the current model parameters. - Call whenever the parameters are changed outside of the optimizer. - For example, when we load a model from a checkpoint without loading - the optimizer, the model parameters are updated but for fp16 optimizer - with main parameters, the main parameters need to also be updated.""" - pass - - - @abstractmethod - def state_dict(self): - pass - - - @abstractmethod - def load_state_dict(self, state_dict): - pass - - - # Promote state so it can be retrieved or set via - # "optimizer_instance.state" - def _get_state(self): - return self.optimizer.state - - def _set_state(self, value): - self.optimizer.state = value - - state = property(_get_state, _set_state) - - - # Promote param_groups so it can be retrieved or set via - # "optimizer_instance.param_groups" - # (for example, to adjust the learning rate) - def _get_param_groups(self): - return self.optimizer.param_groups - - def _set_param_groups(self, value): - self.optimizer.param_groups = value - - param_groups = property(_get_param_groups, _set_param_groups) - - - @abstractmethod - def step(self, args, timers): - pass - - - def gather_model_params(self, args, timers): - """ - For the case of a non-distributed-optimizer, there is nothing to - do here. - """ - pass - - - def allreduce_word_embedding_grads(self, args): - """ - All-reduce word embedding grads. - - Reduce grads across first and last stages to ensure that word_embeddings - parameters stay in sync. This should only run for models that support - pipelined model parallelism (BERT and GPT-2). 
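clip_grad_norm() feeds the filtered grads into clip_grad_norm_fp32, which computes a global L2 norm and rescales. The sketch below shows that norm/clip step on a single process with made-up grads; the real implementation also reduces the norm across model-parallel ranks and excludes shared and tensor-parallel-duplicate params.

import torch

params = [torch.zeros(3, requires_grad=True) for _ in range(2)]
params[0].grad = torch.tensor([3., 0., 0.])
params[1].grad = torch.tensor([0., 4., 0.])

# Global L2 norm over all grads that survive the filtering.
grads_for_norm = [p.grad for p in params if p.grad is not None]
total_norm = torch.norm(torch.stack([g.norm(2) for g in grads_for_norm]), 2)   # 5.0

# Scale grads down only when the norm exceeds the clip threshold.
clip_grad = 1.0
clip_coeff = clip_grad / (total_norm + 1.0e-6)
if clip_coeff < 1.0:
    for g in grads_for_norm:
        g.mul_(clip_coeff)
assert abs(torch.norm(torch.stack([g.norm(2) for g in grads_for_norm]), 2).item() - 1.0) < 1e-4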
- """ - - if mpu.is_rank_in_embedding_group(ignore_virtual=True) and \ - mpu.get_pipeline_model_parallel_world_size() > 1: - if mpu.is_pipeline_first_stage(ignore_virtual=True): - unwrapped_model = self.models[0] - elif mpu.is_pipeline_last_stage(ignore_virtual=True): - unwrapped_model = self.models[-1] - else: # We do not support the interleaved schedule for T5 yet. - unwrapped_model = self.models[0] - unwrapped_model = unwrap_model( - unwrapped_model, (torchDDP, LocalDDP, Float16Module)) - - if unwrapped_model.share_word_embeddings: - word_embeddings_weight = unwrapped_model.word_embeddings_weight() - if args.DDP_impl == 'local': - grad = word_embeddings_weight.main_grad - else: - grad = word_embeddings_weight.grad - torch.distributed.all_reduce(grad, group=mpu.get_embedding_group()) - - - def allreduce_position_embedding_grads(self, args): - """ - All-reduce position_embeddings grad across first (encoder) and - split (decoder) stages to ensure that position embeddings parameters - stay in sync. This should only run for T5 models with pipeline - parallelism. - """ - if mpu.is_rank_in_position_embedding_group() and \ - mpu.get_pipeline_model_parallel_world_size() > 1 and \ - args.pipeline_model_parallel_split_rank is not None: - unwrapped_model = self.models[0] - unwrapped_model = unwrap_model( - unwrapped_model, (torchDDP, LocalDDP, Float16Module)) - assert args.DDP_impl == 'local', \ - 'T5 model is only supported with local DDP mode' - grad = unwrapped_model.language_model.embedding.position_embeddings.weight.main_grad - torch.distributed.all_reduce(grad, group=mpu.get_position_embedding_group()) - - - def allreduce_embedding_grads(self, args): - """All-reduce both word and position embeddings.""" - self.allreduce_word_embedding_grads(args) - self.allreduce_position_embedding_grads(args) - - - def allreduce_layernorm_grads(self, args): - """All-reduce layernorm grads (for sequence parallelism).""" - - # All-reduce layernorm parameters across model parallel nodes - # when sequence parallelism is used - if mpu.get_tensor_model_parallel_world_size() > 1 and \ - args.sequence_parallel: - grads = [] - for model_module in self.models: - unwrapped_model = unwrap_model( - model_module, (torchDDP, LocalDDP, Float16Module)) - for param in unwrapped_model.parameters(): - if getattr(param, 'sequence_parallel', False): - grad = param.main_grad if args.DDP_impl == 'local' else param.grad - grads.append(grad.data) - coalesced = _flatten_dense_tensors(grads) - torch.distributed.all_reduce( - coalesced, group=mpu.get_tensor_model_parallel_group()) - for buf, synced in zip(grads, _unflatten_dense_tensors( - coalesced, grads)): - buf.copy_(synced) - - - def reduce_model_grads(self, args, timers): - """All-reduce all grads, and all-reduce embeddings.""" - - # All-reduce layer-norm grads (for sequence parallelism). - timers('layernorm-grads-all-reduce', log_level=1).start( - barrier=args.barrier_with_L1_time) - self.allreduce_layernorm_grads(args) - timers('layernorm-grads-all-reduce').stop() - - # All-reduce if needed. - if args.DDP_impl == 'local': - timers('grads-all-reduce', log_level=1).start( - barrier=args.barrier_with_L1_time) - for model in self.models: - model.allreduce_gradients() - timers('grads-all-reduce').stop() - - # All-reduce embedding grads. 
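The layernorm-grad path coalesces the marked grads into one flat tensor, all-reduces it over the tensor-model-parallel group, and copies the synced values back. The sketch below keeps that flatten/unflatten round trip but substitutes an in-place scale for the all-reduce, since no process group is initialized in this toy setting:

import torch
from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors

grads = [torch.ones(2, 2), torch.ones(3)]
coalesced = _flatten_dense_tensors(grads)
coalesced.mul_(2.0)                            # stand-in for torch.distributed.all_reduce
for buf, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)):
    buf.copy_(synced)
assert all(g.eq(2.0).all() for g in grads)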
- timers('embedding-grads-all-reduce', log_level=1).start( - barrier=args.barrier_with_L1_time) - self.allreduce_embedding_grads(args) - timers('embedding-grads-all-reduce').stop() - - -class MixedPrecisionOptimizer(MegatronOptimizer): - """Base class for both the float-16 and the distributed optimizer. - - Arguments: - optimizer: base optimizer such as Adam or SGD - clip_grad: clip gradeints with this global L2 norm. Note - that clipping is ignored if clip_grad == 0 - log_num_zeros_in_grad: return number of zeros in the gradients. - params_have_main_grad: flag indicating if parameters have - a `main_grad` field. If this is set, we are assuming - that the model parameters are store in the `main_grad` - field instead of the typical `grad` field. This happens - for the DDP cases where there is a continuous buffer - holding the gradients. For example for bfloat16, we want - to do gradient accumulation and all-reduces in float32 - and as a result we store those gradients in the main_grad. - Note that main grad is not necessarily in float32. - use_contiguous_buffers_in_local_ddp: if true, the local DDP model - is using a contiguous buffer to hold the model grads. - fp16: if true, the model is running in fp16. - bf16: if true, the model is running in bfloat16. - params_dtype: used by distributed optimizer. - grad_scaler: used for scaling gradients. Note that this can be - None. This case happens when `bf16 = True` and we don't - use any loss scale. Note that for `bf16 = True`, we can have - a constnat gradient scaler. Also for `bf16 = False`, we - always require a grad scaler. - models: list of models (i.e., the virtual pipelining models). This - is used by the distributed optimizer for mapping parameters. - """ - - def __init__(self, optimizer, clip_grad, log_num_zeros_in_grad, - params_have_main_grad, use_contiguous_buffers_in_local_ddp, - fp16, bf16, params_dtype, grad_scaler, - models): - - super().__init__( - optimizer, clip_grad, log_num_zeros_in_grad, - params_have_main_grad, use_contiguous_buffers_in_local_ddp, - models) - - self.fp16 = fp16 - self.bf16 = bf16 - self.params_dtype = params_dtype - self.grad_scaler = grad_scaler - - # None grad scaler is only supported for bf16. - if self.grad_scaler is None: - assert not self.fp16, 'fp16 expects a grad scaler.' - - # Tensor used to determine if a nan/if has happend. - # Any non-zero value indicates inf/nan. - # Note that we keep this for the cases that grad scaler is none. - # We still record nan/inf if we have a bfloat16 with a grad scaler. - if self.grad_scaler: - self.found_inf = torch.cuda.FloatTensor([0.0]) - - # Dummy tensor needed for apex multi-apply tensor. - # For bfloat, we don't have multi-tensor apply and for now - # we set it to none so the multi-tensor apply gets ignored. - if bf16: - self._dummy_overflow_buf = None - else: - self._dummy_overflow_buf = torch.cuda.IntTensor([0]) - - # In case grad scaler is not passed, define the unity scale. - if self.grad_scaler is None: - self._scale_one = torch.cuda.FloatTensor([1.0]) - - - def get_loss_scale(self): - if self.grad_scaler is None: - return self._scale_one - return self.grad_scaler.scale - - - def reload_model_params(self): - self._copy_model_params_to_main_params() - - - def _unscale_main_grads_and_check_for_nan(self): - - # Collect main grads. - main_grads = self._collect_main_grad_data_for_unscaling() - - # Reset found inf. 
- self.found_inf.fill_(0.0) - - # Unscale and set found inf/nan - torch._amp_foreach_non_finite_check_and_unscale_( - main_grads, self.found_inf, self.grad_scaler.inv_scale) - - # Update across all model parallel instances. - torch.distributed.all_reduce(self.found_inf, - op=torch.distributed.ReduceOp.MAX, - group=self.get_model_parallel_group()) - - # Check for nan. - found_inf_flag = (self.found_inf.item() > 0) - - return found_inf_flag - - - @torch.no_grad() - def step(self, args, timers): - - # Copy gradients from model params to main params. - timers('optimizer-copy-to-main-grad', log_level=1).start( - barrier=args.barrier_with_L1_time) - self._copy_model_grads_to_main_grads() - timers('optimizer-copy-to-main-grad').stop() - - # Do unscale, check for inf, and update grad scaler only for - # the case that grad scaler is provided. - if self.grad_scaler: - - # Unscale and check for inf/nan. - timers('optimizer-unscale-and-check-inf', log_level=1).start( - barrier=args.barrier_with_L1_time) - found_inf_flag = self._unscale_main_grads_and_check_for_nan() - timers('optimizer-unscale-and-check-inf').stop() - - # We are done with scaling gradients - # so we can update the loss scale. - self.grad_scaler.update(found_inf_flag) - - # If we found inf/nan, skip the update. - if found_inf_flag: - return False, None, None - - # Clip the main gradients. - timers('optimizer-clip-main-grad', log_level=1).start( - barrier=args.barrier_with_L1_time) - grad_norm = None - if self.clip_grad > 0.0: - grad_norm = self.clip_grad_norm(self.clip_grad) - timers('optimizer-clip-main-grad').stop() - - # Count the zeros in the grads. - timers('optimizer-count-zeros', log_level=1).start( - barrier=args.barrier_with_L1_time) - num_zeros_in_grad = self.count_zeros() if \ - self.log_num_zeros_in_grad else None - timers('optimizer-count-zeros').stop() - - # Step the optimizer. - timers('optimizer-inner-step', log_level=1).start( - barrier=args.barrier_with_L1_time) - self.optimizer.step() - timers('optimizer-inner-step').stop() - - # Update params from main params. - timers('optimizer-copy-main-to-model-params', log_level=1).start( - barrier=args.barrier_with_L1_time) - self._copy_main_params_to_model_params() - timers('optimizer-copy-main-to-model-params').stop() - - # Successful update. - return True, grad_norm, num_zeros_in_grad - - -class Float16OptimizerWithFloat16Params(MixedPrecisionOptimizer): - """Float16 optimizer for fp16 and bf16 data types. - - Arguments: - optimizer: base optimizer such as Adam or SGD - clip_grad: clip gradeints with this global L2 norm. Note - that clipping is ignored if clip_grad == 0 - log_num_zeros_in_grad: return number of zeros in the gradients. - params_have_main_grad: flag indicating if parameters have - a `main_grad` field. If this is set, we are assuming - that the model parameters are store in the `main_grad` - field instead of the typical `grad` field. This happens - for the DDP cases where there is a continuous buffer - holding the gradients. For example for bfloat16, we want - to do gradient accumulation and all-reduces in float32 - and as a result we store those gradients in the main_grad. - Note that main grad is not necessarily in float32. - use_contiguous_buffers_in_local_ddp: if true, the local DDP model - is using a contiguous buffer to hold the model grads. - fp16: if true, the model is running in fp16. - bf16: if true, the model is running in bfloat16. - grad_scaler: used for scaling gradients. Note that this can be - None. 
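When a grad scaler is present, step() first unscales the main grads and checks for inf/nan, then lets the scaler update its loss scale and skips the parameter update on overflow. The update rule itself is roughly the usual dynamic scheme sketched below; the constants are illustrative and are not Megatron's DynamicGradScaler defaults.

import torch

scale = torch.tensor(2.0 ** 16)
backoff_factor, growth_factor, growth_interval = 0.5, 2.0, 1000
good_steps = 0

def update(found_inf_flag):
    """Shrink the scale (and skip the step) on overflow; grow it after a run of clean steps."""
    global scale, good_steps
    if found_inf_flag:
        scale = scale * backoff_factor
        good_steps = 0
    else:
        good_steps += 1
        if good_steps % growth_interval == 0:
            scale = scale * growth_factor

update(found_inf_flag=True)
assert scale.item() == 2.0 ** 15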
This case happens when `bf16 = True` and we don't - use any loss scale. Note that for `bf16 = True`, we can have - a constnat gradient scaler. Also for `bf16 = False`, we - always require a grad scaler. - models: list of models (i.e., the virtual pipelining models). This - is used by the distributed optimizer for mapping parameters. - """ - - def __init__(self, optimizer, clip_grad, log_num_zeros_in_grad, - params_have_main_grad, use_contiguous_buffers_in_local_ddp, - fp16, bf16, params_dtype, grad_scaler, models): - - super().__init__( - optimizer, clip_grad, log_num_zeros_in_grad, - params_have_main_grad, use_contiguous_buffers_in_local_ddp, - fp16, bf16, params_dtype, grad_scaler, models) - - # ====================== - # main parameter stuff - # ====================== - - # Three groups of parameters: - # float16_groups: original float16 parameters - # fp32_from_float16_groups: fp32 copy of float16 parameters - # fp32_from_fp32_groups: original fp32 parameters - self.float16_groups = [] - self.fp32_from_float16_groups = [] - self.fp32_from_fp32_groups = [] - - # For all the groups in the original optimizer: - for param_group in self.optimizer.param_groups: - float16_params_this_group = [] - fp32_params_this_group = [] - fp32_from_float16_params_this_group = [] - # For all the parameters in this group: - for i, param in enumerate(param_group['params']): - if param.requires_grad: - - # float16 params: - if param.type() in ['torch.cuda.HalfTensor', - 'torch.cuda.BFloat16Tensor']: - float16_params_this_group.append(param) - # Create a copy - main_param = param.detach().clone().float() - # Copy tensor model parallel attributes. - tensor_parallel.copy_tensor_model_parallel_attributes(main_param, - param) - if hasattr(param, 'shared'): - main_param.shared = param.shared - # Replace the optimizer params with the new fp32 copy. - param_group['params'][i] = main_param - - fp32_from_float16_params_this_group.append(main_param) - # Reset existing state dict key to the new main param. - if param in self.optimizer.state: - self.optimizer.state[main_param] \ - = self.optimizer.state.pop(param) - # fp32 params. - elif param.type() == 'torch.cuda.FloatTensor': - fp32_params_this_group.append(param) - param_group['params'][i] = param - - else: - raise TypeError('Wrapped parameters must be one of ' - 'torch.cuda.FloatTensor, ' - 'torch.cuda.HalfTensor, or ' - 'torch.cuda.BFloat16Tensor. ' - 'Received {}'.format(param.type())) - - self.float16_groups.append(float16_params_this_group) - self.fp32_from_float16_groups.append( - fp32_from_float16_params_this_group) - self.fp32_from_fp32_groups.append(fp32_params_this_group) - - - def zero_grad(self, set_to_none=True): - """We only need to zero the model related parameters, i.e., - float16_groups & fp32_from_fp32_groups. We additionally zero - fp32_from_float16_groups as a memory optimization to reduce - fragmentation; in the case of set_to_none==True, the space - used by this field can be safely deallocated at this point.""" - for group in self.float16_groups: - _zero_grad_group_helper(group, set_to_none) - for group in self.fp32_from_float16_groups: - _zero_grad_group_helper(group, set_to_none) - for group in self.fp32_from_fp32_groups: - _zero_grad_group_helper(group, set_to_none) - - - def _collect_main_grad_data_for_unscaling(self): - - main_grads = [] - - # fp32 params from float16 ones. 
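The constructor above swaps an fp32 master copy into the wrapped optimizer's param group for every fp16/bf16 parameter, so the inner optimizer only ever steps fp32 tensors. A stripped-down sketch of that swap, using CPU tensors and a single group purely for illustration:

import torch

param = torch.randn(4, dtype=torch.float16, requires_grad=True)   # "model" fp16 param
optimizer = torch.optim.Adam([param], lr=1e-3)

group = optimizer.param_groups[0]
main_param = param.detach().clone().float()        # fp32 master copy
group['params'][0] = main_param                    # the inner optimizer now steps the copy

main_param.grad = torch.randn(4)                   # grads are accumulated/copied in fp32
optimizer.step()
param.data.copy_(main_param.data.to(param.dtype))  # copy main params back to model params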
- for main_group in self.fp32_from_float16_groups: - for main_param in main_group: - if main_param.grad is not None: - main_grads.append(main_param.grad.data) - - # Append fp32 parameters. - for main_group in self.fp32_from_fp32_groups: - for main_param in main_group: - if main_param.grad is not None: - main_grads.append(main_param.grad.data) - - return main_grads - - - def _get_model_and_main_params_data_float16(self): - model_data = [] - main_data = [] - for model_group, main_group in zip(self.float16_groups, - self.fp32_from_float16_groups): - for model_param, main_param in zip(model_group, main_group): - model_data.append(model_param.data) - main_data.append(main_param.data) - return model_data, main_data - - - def _copy_model_grads_to_main_grads(self): - # This only needs to be done for the float16 group. - for model_group, main_group in zip(self.float16_groups, - self.fp32_from_float16_groups): - for model_param, main_param in zip(model_group, main_group): - if self.params_have_main_grad and hasattr(model_param, 'main_grad'): - main_param.grad = model_param.main_grad.float() - else: - if model_param.grad is not None: - main_param.grad = model_param.grad.float() - - # Safe to deallocate model's grad/main_grad after copying. - # (If using contiguous buffers, main_grad's memory should - # persist and therefore should not be deallocated.) - model_param.grad = None - if self.params_have_main_grad and \ - not self.use_contiguous_buffers_in_local_ddp: - model_param.main_grad = None - - # For fp32 grads, we need to reset the grads to main grad. - if self.params_have_main_grad: - for model_group in self.fp32_from_fp32_groups: - for model_param in model_group: - model_param.grad = model_param.main_grad - - # Safe to de-reference model's main_grad after copying. - # (If using contiguous buffers, main_grad's memory should - # persist and therefore should not be deallocated.) - if not self.use_contiguous_buffers_in_local_ddp: - model_param.main_grad = None - - - def _copy_main_params_to_model_params(self): - # Only needed for the float16 params. - model_data, main_data = self._get_model_and_main_params_data_float16() - _multi_tensor_copy_this_to_that(this=main_data, that=model_data, - overflow_buf=self._dummy_overflow_buf) - - - def _copy_model_params_to_main_params(self): - # Only needed for the float16 params. - model_data, main_data = self._get_model_and_main_params_data_float16() - _multi_tensor_copy_this_to_that(this=model_data, that=main_data, - overflow_buf=self._dummy_overflow_buf) - - - def state_dict(self): - state_dict = {} - state_dict['optimizer'] = self.optimizer.state_dict() - if self.grad_scaler: - state_dict['grad_scaler'] = self.grad_scaler.state_dict() - state_dict['fp32_from_fp16_params'] = self.fp32_from_float16_groups - return state_dict - - - def load_state_dict(self, state_dict): - # Optimizer. - optimizer_key = 'optimizer' - if optimizer_key not in state_dict: - optimizer_key = 'optimizer_state_dict' - print_rank_0('***WARNING*** loading optimizer from ' - 'an old checkpoint ...') - self.optimizer.load_state_dict(state_dict[optimizer_key]) - - # Grad scaler. - if 'grad_scaler' not in state_dict: - if self.fp16: - print_rank_0('***WARNING*** found an old checkpoint, will not ' - 'load grad scaler ...') - else: - if self.grad_scaler: - self.grad_scaler.load_state_dict(state_dict['grad_scaler']) - else: - print_rank_0('***WARNING*** fould the grad scaler in the ' - 'checkpoint but it is None in the class. 
' - 'Skipping loading grad scaler ...') - - # Copy data for the main params. - fp32_from_float16_params_key = 'fp32_from_fp16_params' - if fp32_from_float16_params_key not in state_dict: - fp32_from_float16_params_key = 'fp32_from_fp16' - for current_group, saved_group in zip( - self.fp32_from_float16_groups, - state_dict[fp32_from_float16_params_key]): - for current_param, saved_param in zip(current_group, saved_group): - current_param.data.copy_(saved_param.data) - - -class FP32Optimizer(MegatronOptimizer): - - def __init__(self, optimizer, clip_grad, - log_num_zeros_in_grad, - params_have_main_grad, - use_contiguous_buffers_in_local_ddp, - models): - - super(FP32Optimizer, self).__init__( - optimizer, clip_grad, log_num_zeros_in_grad, - params_have_main_grad, use_contiguous_buffers_in_local_ddp, - models) - - self._scale = torch.cuda.FloatTensor([1.0]) - - - def zero_grad(self, set_to_none=True): - """Copied from torch.optim.optimizer""" - for group in self.optimizer.param_groups: - _zero_grad_group_helper(group['params'], set_to_none) - - - def get_loss_scale(self): - """FP32 optimizer does not do any scaling.""" - return self._scale - - - @torch.no_grad() - def step(self, args, timers): - """Clip gradients (if needed) and step the base optimizer. - Always return successful since there is no overflow.""" - - # Copy main_grads to grads. - timers('optimizer-copy-to-main-grad', log_level=1).start( - barrier=args.barrier_with_L1_time) - if self.params_have_main_grad: - for param_group in self.optimizer.param_groups: - for param in param_group['params']: - param.grad = param.main_grad - - # Safe to de-reference model's main_grad after copying. - # (If using contiguous buffers, main_grad's memory should - # persist and therefore should not be deallocated.) - if not self.use_contiguous_buffers_in_local_ddp: - param.main_grad = None - timers('optimizer-copy-to-main-grad').stop() - - # Clip gradients. - timers('optimizer-clip-main-grad', log_level=1).start( - barrier=args.barrier_with_L1_time) - grad_norm = None - if self.clip_grad > 0.0: - grad_norm = self.clip_grad_norm(self.clip_grad) - timers('optimizer-clip-main-grad').stop() - - # count the zeros in the grads - timers('optimizer-count-zeros', log_level=1).start( - barrier=args.barrier_with_L1_time) - num_zeros_in_grad = self.count_zeros() if \ - self.log_num_zeros_in_grad else None - timers('optimizer-count-zeros').stop() - - # Update parameters. - timers('optimizer-inner-step', log_level=1).start( - barrier=args.barrier_with_L1_time) - self.optimizer.step() - timers('optimizer-inner-step').stop() - - # No overflow for FP32 optimizer. - return True, grad_norm, num_zeros_in_grad - - - def reload_model_params(self): - pass - - - def state_dict(self): - return self.optimizer.state_dict() - - - def load_state_dict(self, state_dict): - self.optimizer.load_state_dict(state_dict) diff --git a/megatron/optimizer_param_scheduler.py b/megatron/optimizer_param_scheduler.py deleted file mode 100644 index 60b5930e3a..0000000000 --- a/megatron/optimizer_param_scheduler.py +++ /dev/null @@ -1,227 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
- -"""Learning rate decay and weight decay incr functions.""" - -import math - -from megatron import print_rank_0 - -class OptimizerParamScheduler(object): - """Anneals learning rate and weight decay""" - - def __init__(self, optimizer, max_lr, min_lr, - lr_warmup_steps, lr_decay_steps, lr_decay_style, - start_wd, end_wd, wd_incr_steps, wd_incr_style, - use_checkpoint_opt_param_scheduler=True, - override_opt_param_scheduler=False): - - # Class values. - self.optimizer = optimizer - - self.max_lr = float(max_lr) - self.min_lr = min_lr - assert self.min_lr >= 0.0 - assert self.max_lr >= self.min_lr - - self.lr_warmup_steps = lr_warmup_steps - self.num_steps = 0 - self.lr_decay_steps = lr_decay_steps - assert self.lr_decay_steps > 0 - assert self.lr_warmup_steps < self.lr_decay_steps - - self.lr_decay_style = lr_decay_style - - self.start_wd = start_wd - self.end_wd = end_wd - assert self.start_wd >= 0.0 - assert self.end_wd >= self.start_wd - self.wd_incr_steps = wd_incr_steps - self.wd_incr_style = wd_incr_style - - self.override_opt_param_scheduler = override_opt_param_scheduler - self.use_checkpoint_opt_param_scheduler = use_checkpoint_opt_param_scheduler - if self.override_opt_param_scheduler: - assert not self.use_checkpoint_opt_param_scheduler, 'both override and '\ - 'use-checkpoint are set.' - - # Set the learning rate - self.step(0) - print_rank_0('> learning rate decay style: {}'.format(self.lr_decay_style)) - - - def get_wd(self): - """ Weight decay incr functions""" - if self.num_steps > self.wd_incr_steps: - return self.end_wd - - if self.wd_incr_style == 'constant': - assert self.start_wd == self.end_wd - return self.end_wd - - incr_ratio = float(self.num_steps) / float(self.wd_incr_steps) - assert incr_ratio >= 0.0 - assert incr_ratio <= 1.0 - delta_wd = self.end_wd - self.start_wd - - if self.wd_incr_style == 'linear': - coeff = incr_ratio - elif self.wd_incr_style == 'cosine': - coeff = 0.5 * (math.cos(math.pi * (1 - incr_ratio)) + 1.0) - else: - raise Exception('{} weight decay increment style is not supported.'.format( - self.wd_incr_style)) - - return self.start_wd + coeff * delta_wd - - - def get_lr(self): - """Learning rate decay functions from: - https://openreview.net/pdf?id=BJYwwY9ll pg. 4""" - - # Use linear warmup for the initial part. - if self.lr_warmup_steps > 0 and self.num_steps <= self.lr_warmup_steps: - return self.max_lr * float(self.num_steps) / \ - float(self.lr_warmup_steps) - - # If the learning rate is constant, just return the initial value. - if self.lr_decay_style == 'constant': - return self.max_lr - - # For any steps larger than `self.lr_decay_steps`, use `self.min_lr`. - if self.num_steps > self.lr_decay_steps: - return self.min_lr - - # If we are done with the warmup period, use the decay style. 
- if self.lr_decay_style == 'inverse-square-root': - warmup_steps = max(self.lr_warmup_steps, 1) - num_steps = max(self.num_steps, 1) - lr = self.max_lr * warmup_steps ** 0.5 / (num_steps ** 0.5) - return max(self.min_lr, lr) - - num_steps_ = self.num_steps - self.lr_warmup_steps - decay_steps_ = self.lr_decay_steps - self.lr_warmup_steps - decay_ratio = float(num_steps_) / float(decay_steps_) - assert decay_ratio >= 0.0 - assert decay_ratio <= 1.0 - delta_lr = self.max_lr - self.min_lr - - if self.lr_decay_style == 'linear': - coeff = (1.0 - decay_ratio) - elif self.lr_decay_style == 'cosine': - coeff = 0.5 * (math.cos(math.pi * decay_ratio) + 1.0) - else: - raise Exception('{} decay style is not supported.'.format( - self.lr_decay_style)) - - return self.min_lr + coeff * delta_lr - - - def step(self, increment): - """Set lr for all parameters groups.""" - self.num_steps += increment - new_lr = self.get_lr() - new_wd = self.get_wd() - for group in self.optimizer.param_groups: - group['lr'] = new_lr * group.get('lr_mult', 1.0) - group['weight_decay'] = new_wd * group.get('wd_mult', 1.0) - - - def state_dict(self): - state_dict = { - 'max_lr': self.max_lr, - 'lr_warmup_steps': self.lr_warmup_steps, - 'num_steps': self.num_steps, - 'lr_decay_style': self.lr_decay_style, - 'lr_decay_steps': self.lr_decay_steps, - 'min_lr': self.min_lr, - 'start_wd': self.start_wd, - 'end_wd': self.end_wd, - 'wd_incr_style': self.wd_incr_style, - 'wd_incr_steps': self.wd_incr_steps - } - return state_dict - - - def _check_and_set(self, cls_value, sd_value, name): - """Auxiliary function for checking the values in the checkpoint and - setting them.""" - if self.override_opt_param_scheduler: - print_rank_0(' > overriding {} value to {}'.format(name, cls_value)) - return cls_value - - if not self.use_checkpoint_opt_param_scheduler: - assert cls_value == sd_value, \ - f'OptimizerParamScheduler: class input value {cls_value} and checkpoint' \ - f'value {sd_value} for {name} do not match' - print_rank_0(' > using checkpoint value {} for {}'.format(sd_value, - name)) - return sd_value - - - def load_state_dict(self, sd): - - if 'start_lr' in sd: - max_lr_ = sd['start_lr'] - else: - max_lr_ = sd['max_lr'] - self.max_lr = self._check_and_set(self.max_lr, max_lr_, - 'learning rate') - - self.min_lr = self._check_and_set(self.min_lr, sd['min_lr'], - 'minimum learning rate') - - if 'warmup_iter' in sd: - lr_warmup_steps_ = sd['warmup_iter'] - elif 'warmup_steps' in sd: - lr_warmup_steps_ = sd['warmup_steps'] - else: - lr_warmup_steps_ = sd['lr_warmup_steps'] - self.lr_warmup_steps = self._check_and_set(self.lr_warmup_steps, - lr_warmup_steps_, - 'warmup iterations') - - if 'end_iter' in sd: - lr_decay_steps_ = sd['end_iter'] - elif 'decay_steps' in sd: - lr_decay_steps_ = sd['decay_steps'] - else: - lr_decay_steps_ = sd['lr_decay_steps'] - self.lr_decay_steps = self._check_and_set(self.lr_decay_steps, lr_decay_steps_, - 'total number of iterations') - - if 'decay_style' in sd: - lr_decay_style_ = sd['decay_style'] - else: - lr_decay_style_ = sd['lr_decay_style'] - self.lr_decay_style = self._check_and_set(self.lr_decay_style, - lr_decay_style_, - 'learning rate decay style') - - if 'num_iters' in sd: - num_steps = sd['num_iters'] - else: - num_steps = sd['num_steps'] - self.step(increment=num_steps) - - - if 'start_wd' in sd: - self.start_wd = self._check_and_set(self.start_wd, - sd['start_wd'], - "start weight decay") - self.end_wd = self._check_and_set(self.end_wd, - sd['end_wd'], - "end weight decay") - 
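The warmup plus cosine-decay schedule above reduces to a small closed-form function of the step count. A minimal re-derivation with example constants (not defaults taken from the code):

import math

max_lr, min_lr = 3.0e-4, 3.0e-5
lr_warmup_steps, lr_decay_steps = 100, 1000

def lr_at(step):
    # Linear warmup, then cosine decay from max_lr to min_lr, then flat at min_lr.
    if step <= lr_warmup_steps:
        return max_lr * step / lr_warmup_steps
    if step > lr_decay_steps:
        return min_lr
    decay_ratio = (step - lr_warmup_steps) / (lr_decay_steps - lr_warmup_steps)
    coeff = 0.5 * (math.cos(math.pi * decay_ratio) + 1.0)
    return min_lr + coeff * (max_lr - min_lr)

assert abs(lr_at(lr_warmup_steps) - max_lr) < 1e-12   # warmup ends at max_lr
assert abs(lr_at(lr_decay_steps) - min_lr) < 1e-12    # decay ends at min_lr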
self.wd_incr_steps = self._check_and_set(self.wd_incr_steps, - sd['wd_incr_steps'], - "total number of weight decay iterations") - self.wd_incr_style = self._check_and_set(self.wd_incr_style, - sd['wd_incr_style'], - "weight decay incr style") - - - - - - - - diff --git a/megatron/text_generation/forward_step.py b/megatron/text_generation/forward_step.py deleted file mode 100644 index feb087cbb6..0000000000 --- a/megatron/text_generation/forward_step.py +++ /dev/null @@ -1,205 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. - -"""Forward step utilities.""" - -from collections.abc import Iterable - -import torch - -from megatron import get_args -from megatron.core import mpu -from .communication import ( - send_to_next_pipeline_rank, - recv_from_prev_pipeline_rank_) - - - -class InferenceParams: - """Inference parameters that are passed to the main model in order - to efficienly calculate and store the context during inference.""" - - def __init__(self, max_batch_size, max_sequence_len): - """Note that offsets are set to zero and we always set the - flag to allocate memory. After the first call, make sure to - set this flag to False.""" - self.max_sequence_len = max_sequence_len - self.max_batch_size = max_batch_size - self.sequence_len_offset = 0 - self.batch_size_offset = 0 - self.key_value_memory_dict = {} - - def swap_key_value_dict(self, batch_idx): - "swap between batches" - if len(self.key_value_memory_dict) == 0: - raise ValueError("should not swap when dict in empty") - - for layer_number in self.key_value_memory_dict.keys(): - inference_key_memory, inference_value_memory = self.key_value_memory_dict[layer_number] - assert len(batch_idx) == inference_key_memory.shape[1] ## make sure batch size is the same - new_inference_key_memory = inference_key_memory[:, batch_idx] - new_inference_value_memory = inference_value_memory[:, batch_idx] - self.key_value_memory_dict[layer_number] = ( - new_inference_key_memory, new_inference_value_memory) - -class ForwardStep: - """Forward step function with all the communications. - We use a class here to hide the inference parameters - from the outside caller.""" - - def __init__(self, model, max_batch_size, max_sequence_len): - """Set values so we don't need to do it multiple times.""" - # Make sure model is in eval mode. - assert not isinstance(model, Iterable), \ - 'interleaving schedule is not supported for inference' - model.eval() - self.model = model - # Initialize inference parameters. - self.inference_params = InferenceParams(max_batch_size, - max_sequence_len) - # Pipelining arguments. - args = get_args() - self.pipeline_size_larger_than_one = ( - args.pipeline_model_parallel_size > 1) - # Threshold of pipelining. - self.pipelining_batch_x_seqlen = \ - args.inference_batch_times_seqlen_threshold - - - def __call__(self, tokens, position_ids, attention_mask): - """Invocation of the forward methods. Note that self.inference_params - is being modified by the forward step.""" - # Pipelining case. 
- if self.pipeline_size_larger_than_one: - current_batch_x_seqlen = tokens.size(0) * tokens.size(1) - if current_batch_x_seqlen >= self.pipelining_batch_x_seqlen: - micro_batch_size = \ - max(1, self.pipelining_batch_x_seqlen // tokens.size(1)) - return _with_pipelining_forward_step(self.model, - tokens, - position_ids, - attention_mask, - self.inference_params, - micro_batch_size) - - return _no_pipelining_forward_step(self.model, - tokens, - position_ids, - attention_mask, - self.inference_params) - - - -def _get_recv_buffer_dtype(args): - """Receive happens between the layers.""" - if args.fp32_residual_connection: - return torch.float - return args.params_dtype - - - -def _allocate_recv_buffer(batch_size, sequence_length): - """Receive happens between the layers with size [s, b, h].""" - if mpu.is_pipeline_first_stage(): - return None - args = get_args() - recv_size = (sequence_length, batch_size, args.hidden_size) - return torch.empty(recv_size, - dtype=_get_recv_buffer_dtype(args), - device=torch.cuda.current_device()) - - - -def _forward_step_helper(model, tokens, position_ids, attention_mask, - inference_params, recv_buffer=None): - """Single forward step. Update the allocate memory flag so - only the first time the memory is allocated.""" - batch_size = tokens.size(0) - sequence_length = tokens.size(1) - if recv_buffer is None: - recv_buffer = _allocate_recv_buffer(batch_size, sequence_length) - - # Receive from previous stage. - recv_from_prev_pipeline_rank_(recv_buffer) - - # Forward pass through the model. - model.set_input_tensor(recv_buffer) - output_tensor = model(tokens, position_ids, attention_mask, - inference_params=inference_params) - - # Send output to the next stage. - send_to_next_pipeline_rank(output_tensor) - - return output_tensor - - - -def _no_pipelining_forward_step(model, tokens, position_ids, attention_mask, - inference_params, recv_buffer=None): - """If recv_buffer is none, we will allocate one on the fly.""" - # Run a simple forward pass. - output_tensor = _forward_step_helper(model, tokens, position_ids, - attention_mask, inference_params, - recv_buffer=recv_buffer) - # Update the sequence length offset. - inference_params.sequence_len_offset += tokens.size(1) - - logits = None - if mpu.is_pipeline_last_stage(): - logits = output_tensor - - return logits - - - -def _with_pipelining_forward_step(model, tokens, position_ids, attention_mask, - inference_params, micro_batch_size): - """No interleaving is supported.""" - sequence_length = tokens.size(1) - batch_size = tokens.size(0) - - # Divide the batch dimension into micro batches. - num_micro_batches, last_chunk = divmod(batch_size, - micro_batch_size) - if last_chunk > 0: - num_micro_batches += 1 - - # Preallocate memory for output logits. - logits = None - if mpu.is_pipeline_last_stage(): - args = get_args() - logits = torch.empty( - (batch_size, sequence_length, args.padded_vocab_size), - dtype=torch.float32, device=torch.cuda.current_device()) - - # Preallocate recv buffer. - recv_buffer = _allocate_recv_buffer(micro_batch_size, sequence_length) - - for micro_batch_index in range(num_micro_batches): - # Slice among the batch dimenion. - start = micro_batch_index * micro_batch_size - end = min(start + micro_batch_size, batch_size) - this_micro_batch_size = end - start - tokens2use = tokens[start:end, ...] - position_ids2use = position_ids[start:end, ...] - - # Run a simple forward pass. 
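The pipelined path splits the batch dimension into fixed-size micro-batches and runs the last, possibly smaller, chunk with a freshly sized recv buffer. The slicing arithmetic in isolation, with example sizes:

batch_size, micro_batch_size = 10, 4
num_micro_batches, last_chunk = divmod(batch_size, micro_batch_size)
if last_chunk > 0:
    num_micro_batches += 1

slices = []
for micro_batch_index in range(num_micro_batches):
    start = micro_batch_index * micro_batch_size
    end = min(start + micro_batch_size, batch_size)
    slices.append((start, end))

assert slices == [(0, 4), (4, 8), (8, 10)]     # the final micro-batch holds the remainder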
- if this_micro_batch_size != micro_batch_size: - recv_buffer = None - output = _forward_step_helper(model, tokens2use, position_ids2use, - attention_mask, inference_params, - recv_buffer=recv_buffer) - - # Adjust the batch size offset to account for the micro-batch. - inference_params.batch_size_offset += this_micro_batch_size - - # Copy logits. - if mpu.is_pipeline_last_stage(): - logits[start:end, ...] = output - - # Once we are done with all the micro-batches, we can - # adjust the sequence length offset. - inference_params.sequence_len_offset += sequence_length - # and reset the batch size offset - inference_params.batch_size_offset = 0 - - return logits diff --git a/megatron/tokenizer/tokenizer.py b/megatron/tokenizer/tokenizer.py deleted file mode 100644 index 79dab75a04..0000000000 --- a/megatron/tokenizer/tokenizer.py +++ /dev/null @@ -1,536 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. - -"""Megatron tokenizers.""" - -from abc import ABC -from abc import abstractmethod - -from .bert_tokenization import FullTokenizer as FullBertTokenizer -from .gpt2_tokenization import GPT2Tokenizer - - -def build_tokenizer(args): - """Initialize tokenizer.""" - if args.rank == 0: - print('> building {} tokenizer ...'.format(args.tokenizer_type), - flush=True) - - # Select and instantiate the tokenizer. - if args.tokenizer_type == 'BertWordPieceLowerCase': - assert args.vocab_file is not None - tokenizer = _BertWordPieceTokenizer(vocab_file=args.vocab_file, - lower_case=True, - vocab_extra_ids=args.vocab_extra_ids) - elif args.tokenizer_type == 'BertWordPieceCase': - assert args.vocab_file is not None - tokenizer = _BertWordPieceTokenizer(vocab_file=args.vocab_file, - lower_case=False, - vocab_extra_ids=args.vocab_extra_ids) - elif args.tokenizer_type == 'GPT2BPETokenizer': - assert args.vocab_file is not None - assert args.merge_file is not None - tokenizer = _GPT2BPETokenizer(args.vocab_file, args.merge_file) - elif args.tokenizer_type == 'SentencePieceTokenizer': - assert args.tokenizer_model is not None - tokenizer = _SentencePieceTokenizer(args.tokenizer_model, vocab_extra_ids=args.vocab_extra_ids) - elif args.tokenizer_type == 'GPTSentencePieceTokenizer': - assert args.tokenizer_model is not None - tokenizer = _GPTSentencePieceTokenizer(args.tokenizer_model) - elif args.tokenizer_type == 'NullTokenizer': - assert args.vocab_size is not None - tokenizer = _NullTokenizer(args.vocab_size) - else: - raise NotImplementedError('{} tokenizer is not ' - 'implemented.'.format(args.tokenizer_type)) - - # Add vocab size. 
- args.padded_vocab_size = _vocab_size_with_padding(tokenizer.vocab_size, - args) - - return tokenizer - - -def _vocab_size_with_padding(orig_vocab_size, args): - """Pad vocab size so it is divisible by model parallel size and - still having GPU friendly size.""" - - after = orig_vocab_size - multiple = args.make_vocab_size_divisible_by * \ - args.tensor_model_parallel_size - while (after % multiple) != 0: - after += 1 - if args.rank == 0: - print(' > padded vocab (size: {}) with {} dummy tokens ' - '(new size: {})'.format( - orig_vocab_size, after - orig_vocab_size, after), flush=True) - return after - - -class AbstractTokenizer(ABC): - """Abstract class for tokenizer.""" - - def __init__(self, name): - self.name = name - super().__init__() - - @property - @abstractmethod - def vocab_size(self): - pass - - @property - @abstractmethod - def vocab(self): - """Dictionary from vocab text token to id token.""" - pass - - @property - @abstractmethod - def inv_vocab(self): - """Dictionary from vocab id token to text token.""" - pass - - @abstractmethod - def tokenize(self, text): - pass - - def detokenize(self, token_ids): - raise NotImplementedError('detokenizer is not implemented for {} ' - 'tokenizer'.format(self.name)) - - @property - def cls(self): - raise NotImplementedError('CLS is not provided for {} ' - 'tokenizer'.format(self.name)) - - @property - def sep(self): - raise NotImplementedError('SEP is not provided for {} ' - 'tokenizer'.format(self.name)) - - @property - def pad(self): - raise NotImplementedError('PAD is not provided for {} ' - 'tokenizer'.format(self.name)) - - @property - def eod(self): - raise NotImplementedError('EOD is not provided for {} ' - 'tokenizer'.format(self.name)) - - @property - def mask(self): - raise NotImplementedError('MASK is not provided for {} ' - 'tokenizer'.format(self.name)) - - -class _BertWordPieceTokenizer(AbstractTokenizer): - """Original BERT wordpiece tokenizer.""" - - def __init__(self, vocab_file, lower_case=True, vocab_extra_ids=0): - if lower_case: - name = 'BERT Lower Case' - else: - name = 'BERT Upper Case' - super().__init__(name) - self.tokenizer = FullBertTokenizer(vocab_file, do_lower_case=lower_case) - self.cls_id = self.tokenizer.vocab['[CLS]'] - self.sep_id = self.tokenizer.vocab['[SEP]'] - self.pad_id = self.tokenizer.vocab['[PAD]'] - self.mask_id = self.tokenizer.vocab['[MASK]'] - self._additional_special_tokens = [] - - # (dsachan) Add BOS and EOS tokens - SPECIAL_TOKENS = {'eos_token': '[EOS]', - 'bos_token': '[BOS]'} - self._bos_token = '[BOS]' - self.add_token(self._bos_token) - self._bos_token_id = self.vocab.get(self._bos_token) - - self._eos_token = '[EOS]' - self.add_token(self._eos_token) - self._eos_token_id = self.vocab.get(self._eos_token) - - # (dsachan) Add additional special tokens - # These can be used as sentinel tokens in T5 model inputs - additional_special_tokens = [] - additional_special_tokens.extend( - ["".format(i) for i in range(vocab_extra_ids)]) - self.add_additional_special_tokens(additional_special_tokens) - - def add_token(self, token): - if token not in self.vocab: - self.inv_vocab[self.vocab_size] = token - # self.vocab_size comes from len(vocab) - # and it will increase as we add elements - self.vocab[token] = self.vocab_size - - def add_additional_special_tokens(self, tokens_list): - setattr(self, "additional_special_tokens", tokens_list) - for value in tokens_list: - self.add_token(value) - - @property - def vocab_size(self): - return self.tokenizer.vocab_size() - - @property - def 
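_vocab_size_with_padding() rounds the raw vocab size up to the next multiple of make_vocab_size_divisible_by * tensor_model_parallel_size so the embedding table splits evenly across tensor-parallel ranks. For example, GPT-2's 50257-token vocab with a divisor of 128 and 8-way tensor parallelism pads to 51200:

def vocab_size_with_padding(orig_vocab_size, make_vocab_size_divisible_by,
                            tensor_model_parallel_size):
    # Round up to the next multiple of the combined divisor.
    multiple = make_vocab_size_divisible_by * tensor_model_parallel_size
    after = orig_vocab_size
    while after % multiple != 0:
        after += 1
    return after

assert vocab_size_with_padding(50257, 128, 8) == 51200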
vocab(self): - return self.tokenizer.vocab - - @property - def inv_vocab(self): - return self.tokenizer.inv_vocab - - def tokenize(self, text): - text_tokens = self.tokenizer.tokenize(text) - return self.tokenizer.convert_tokens_to_ids(text_tokens) - - def decode(self, ids): - tokens = self.tokenizer.convert_ids_to_tokens(ids) - return self.tokenizer.convert_tokens_to_string(tokens) - - def decode_token_ids(self, token_ids): - tokens = self.tokenizer.convert_ids_to_tokens(token_ids) - exclude_list = ['[PAD]', '[CLS]'] - non_pads = [t for t in tokens if t not in exclude_list] - - result = "" - for s in non_pads: - if s.startswith("##"): - result += s[2:] - else: - result += " " + s - - return result - - @property - def cls(self): - return self.cls_id - - @property - def sep(self): - return self.sep_id - - @property - def pad(self): - return self.pad_id - - @property - def mask(self): - return self.mask_id - - @property - def bos_token(self): - """ Beginning of sentence token id """ - return self._bos_token - - @property - def eos_token(self): - """ End of sentence token id """ - return self._eos_token - - @property - def additional_special_tokens(self): - """ All the additional special tokens you may want to use (list of strings).""" - return self._additional_special_tokens - - @property - def bos_token_id(self): - """ Id of the beginning of sentence token in the vocabulary.""" - return self._bos_token_id - - @property - def eos_token_id(self): - """ Id of the end of sentence token in the vocabulary.""" - return self._eos_token_id - - @property - def additional_special_tokens_ids(self): - """ Ids of all the additional special tokens in the vocabulary (list of integers).""" - return [self.vocab.get(token) for token in self._additional_special_tokens] - - @additional_special_tokens.setter - def additional_special_tokens(self, value): - self._additional_special_tokens = value - - -class _GPT2BPETokenizer(AbstractTokenizer): - """Original GPT2 BPE tokenizer.""" - - def __init__(self, vocab_file, merge_file): - name = 'GPT2 BPE' - super().__init__(name) - - self.tokenizer = GPT2Tokenizer(vocab_file, merge_file, errors='replace', - special_tokens=[], max_len=None) - self.eod_id = self.tokenizer.encoder['<|endoftext|>'] - - @property - def vocab_size(self): - return len(self.tokenizer.encoder) - - @property - def vocab(self): - return self.tokenizer.encoder - - @property - def inv_vocab(self): - return self.tokenizer.decoder - - def tokenize(self, text): - return self.tokenizer.encode(text) - - def detokenize(self, token_ids): - return self.tokenizer.decode(token_ids) - - @property - def eod(self): - return self.eod_id - - -class _SentencePieceTokenizer(AbstractTokenizer): - """SentencePieceTokenizer-Megatron wrapper""" - - def __init__(self, model_file, vocab_extra_ids=0): - name = 'SentencePieceTokenizer' - super().__init__(name) - - import sentencepiece - self.tokenizer = sentencepiece.SentencePieceProcessor(model_file=model_file) - self._initalize(vocab_extra_ids) - - def _populate_vocab(self): - self._vocab = {} - self._inv_vocab = {} - - for i in range(len(self.tokenizer)): - t = self.tokenizer.id_to_piece(i) - self._inv_vocab[i] = t - self._vocab[t] = i - - def _initalize(self, vocab_extra_ids): - self._populate_vocab() - self._special_tokens = {} - self._inv_special_tokens = {} - - self._t5_tokens = [] - - def _add_special_token(t): - if t not in self._vocab: - next_id = len(self._vocab) - self._vocab[t] = next_id - self._inv_vocab[next_id] = t - self._special_tokens[t] = self._vocab[t] - 
self._inv_special_tokens[self._vocab[t]] = t - - _add_special_token('<CLS>') - self._cls_id = self._vocab['<CLS>'] - _add_special_token('<SEP>') - self._sep_id = self._vocab['<SEP>'] - _add_special_token('<EOD>') - self._eod_id = self._vocab['<EOD>'] - _add_special_token('<MASK>') - self._mask_id = self._vocab['<MASK>'] - - pad_id = self.tokenizer.pad_id() - try: - pad_token = self.tokenizer.id_to_piece(pad_id) - except IndexError: - pad_token = '<PAD>' - _add_special_token(pad_token) - self._pad_id = self._vocab[pad_token] - - bos_id = self.tokenizer.bos_id() - try: - bos_token = self.tokenizer.id_to_piece(bos_id) - except IndexError: - bos_token = '<BOS>' - _add_special_token(bos_token) - self._bos_id = self._vocab[bos_token] - - eos_id = self.tokenizer.eos_id() - try: - eos_token = self.tokenizer.id_to_piece(eos_id) - except IndexError: - eos_token = '<EOS>' - _add_special_token(eos_token) - self._eos_id = self._vocab[eos_token] - - for i in range(vocab_extra_ids): - t = "<extra_id_{}>".format(i) - _add_special_token(t) - self._t5_tokens += [t] - - @property - def vocab_size(self): - return len(self._vocab) - - @property - def vocab(self): - return self._vocab - - @property - def inv_vocab(self): - return self._inv_vocab - - @property - def decoder(self): - return self._inv_vocab - - @property - def encoder(self): - return self._vocab - - # From: - # https://github.com/NVIDIA/NeMo/blob/c8fa217e811d60d11d014827c7f3845ff6c99ae7/nemo/collections/common/tokenizers/sentencepiece_tokenizer.py#L89 - def tokenize(self, text): - ids = [] - idx = 0 - - while 1: - indices = {} - for token in self._special_tokens: - try: - indices[token] = text[idx:].index(token) - except ValueError: - continue - if len(indices) == 0: - break - - next_token = min(indices, key=indices.get) - next_idx = idx + indices[next_token] - - ids.extend(self.tokenizer.encode_as_ids(text[idx:next_idx])) - ids.append(self._special_tokens[next_token]) - idx = next_idx + len(next_token) - - ids.extend(self.tokenizer.encode_as_ids(text[idx:])) - return ids - - # From: - # https://github.com/NVIDIA/NeMo/blob/c8fa217e811d60d11d014827c7f3845ff6c99ae7/nemo/collections/common/tokenizers/sentencepiece_tokenizer.py#L125 - def detokenize(self, ids): - text = "" - last_i = 0 - - for i, id in enumerate(ids): - if id in self._inv_special_tokens: - text += self.tokenizer.decode_ids(ids[last_i:i]) + " " - text += self._inv_special_tokens[id] + " " - last_i = i + 1 - - text += self.tokenizer.decode_ids(ids[last_i:]) - return text - - @property - def cls(self): - return self._cls_id - - @property - def sep(self): - return self._sep_id - - @property - def pad(self): - return self._pad_id - - @property - def bos_token_id(self): - return self._bos_id - - @property - def bos(self): - return self._bos_id - - @property - def eod(self): - return self._eod_id - - @property - def eos_token_id(self): - return self._eos_id - - @property - def eos(self): - return self._eos_id - - @property - def mask(self): - return self._mask_id - - @property - def additional_special_tokens_ids(self): - return [self.vocab[k] for k in self._t5_tokens] - -class _GPTSentencePieceTokenizer(_SentencePieceTokenizer): - """SentencePieceTokenizer-Megatron wrapper""" - - def __init__(self, model_file,): - super().__init__(model_file, vocab_extra_ids=0) - - def _initalize(self, vocab_extra_ids): - self._populate_vocab() - - self._pad_id = self.tokenizer.pad_id() - self._bos_id = self.tokenizer.bos_id() - self._eos_id = self.tokenizer.eos_id() - - def tokenize(self, text): - return self.tokenizer.encode_as_ids(text) - - def detokenize(self, ids): -
return self.tokenizer.decode_ids(ids) - - @property - def cls(self): - return -1 - - @property - def sep(self): - return -1 - - @property - def mask(self): - return -1 - - @property - def eod(self): - return self._eos_id - - @property - def additional_special_tokens_ids(self): - return None - -class _NullTokenizer: - def __init__(self, vocab_size): - vocab_size = int(vocab_size) - self._eos_id = vocab_size - self.vocab_size = vocab_size+1 - - def tokenize(self, text): - return [int(x) for x in text.split(' ')] - - def detokenize(self, ids): - text = [str(x) for x in ids] - return ' '.join(text) - - @property - def cls(self): - return -1 - - @property - def sep(self): - return -1 - - @property - def mask(self): - return -1 - - @property - def eod(self): - return self._eos_id - - @property - def additional_special_tokens_ids(self): - return None diff --git a/megatron/training.py b/megatron/training.py deleted file mode 100644 index 14bca152f0..0000000000 --- a/megatron/training.py +++ /dev/null @@ -1,991 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. - -"""Pretrain utilities.""" - -from datetime import datetime -import math -import sys -import time -# The earliest we can measure the start time. -_TRAIN_START_TIME = time.time() -import torch -from torch.nn.parallel.distributed import DistributedDataParallel as torchDDP - -from megatron import get_args -from megatron import get_signal_handler -from megatron import get_timers -from megatron import get_tensorboard_writer -from megatron import get_current_global_batch_size -from megatron import get_num_microbatches -from megatron import is_last_rank -from megatron import update_num_microbatches -from megatron.core import mpu, tensor_parallel -from megatron import print_rank_0 -from megatron import print_rank_last -from megatron.checkpointing import load_checkpoint -from megatron.checkpointing import save_checkpoint -from megatron.model import Float16Module -from megatron.model import GPTModel -from megatron.core.enums import ModelType -from megatron.optimizer import get_megatron_optimizer -from megatron.initialize import initialize_megatron -from megatron.initialize import write_args_to_tensorboard -from megatron.initialize import set_jit_fusion_options -from megatron.optimizer_param_scheduler import OptimizerParamScheduler -from megatron.model import DistributedDataParallel as LocalDDP -from megatron.utils import check_adlr_autoresume_termination -from megatron.utils import unwrap_model -from megatron.data.data_samplers import build_pretraining_data_loader -from megatron.utils import calc_params_l2_norm -from megatron.core.pipeline_parallel import get_forward_backward_func -from megatron.utils import report_memory -from megatron.model.vision.knn_monitor import compute_feature_bank - - -def print_datetime(string): - """Note that this call will sync across all ranks.""" - torch.distributed.barrier() - time_str = datetime.now().strftime('%Y-%m-%d %H:%M:%S') - print_rank_0('[' + string + '] datetime: {} '.format(time_str)) - - -def pretrain(train_valid_test_dataset_provider, - model_provider, - model_type, - forward_step_func, - process_non_loss_data_func=None, - extra_args_provider=None, - args_defaults={}): - """Main training program. - - This function will run the followings in the order provided: - 1) initialize Megatron. - 2) setup model, optimizer and lr schedule using the model_provider. - 3) call train_val_test_data_provider to get train/val/test datasets. - 4) train the modle using the forward_step_func. 
- - Arguments: - train_valid_test_dataset_provider: a function that takes the size of - train/valid/test dataset and returns `train, valid, test` datasets. - model_provider: a function that returns a vanilla version of the - model. By vanilla we mean a simple model on cpu with no fp16 or ddp. - model_type: an enum that specifies the type of model being trained. - forward_step_func: a function that takes a `data iterator` and `model`, - and returns a `loss` scalar with a dictionary with key:values being - the info we would like to monitor during training, for example - `lm-loss: value`. We also require that this function add - `batch generator` to the timers class. - process_non_loss_data_func: a function to post process outputs of the - network. It can be used for dumping output tensors (e.g images) to - tensorboard. It takes `collected data`(list of tensors), - `current iteration index` and `tensorboard writer` as arguments. - extra_args_provider: a function that takes a parser and adds arguments - to it. It is used for programs to add their own arguments. - args_defaults: a dictionary from argument-name to argument-value. It - to set already parse arguments. - """ - - # Initalize and get arguments, timers, and Tensorboard writer. - initialize_megatron(extra_args_provider=extra_args_provider, - args_defaults=args_defaults) - # Set pytorch JIT layer fusion options and warmup JIT functions. - set_jit_fusion_options() - - # Adjust the startup time so it reflects the largest value. - # This will be closer to what scheduler will see (outside of - # image ... launches. - global _TRAIN_START_TIME - start_time_tensor = torch.cuda.DoubleTensor([_TRAIN_START_TIME]) - torch.distributed.all_reduce(start_time_tensor, - op=torch.distributed.ReduceOp.MIN) - _TRAIN_START_TIME = start_time_tensor.item() - print_rank_0('time to initialize megatron (seconds): {:.3f}'.format( - time.time() - _TRAIN_START_TIME)) - print_datetime('after megatron is initialized') - - args = get_args() - timers = get_timers() - - # Model, optimizer, and learning rate. - timers('model-and-optimizer-setup', log_level=0).start(barrier=True) - model, optimizer, opt_param_scheduler = setup_model_and_optimizer( - model_provider, model_type) - timers('model-and-optimizer-setup').stop() - print_datetime('after model, optimizer, and learning rate ' - 'scheduler are built') - - # Data stuff. - timers('train/valid/test-data-iterators-setup', log_level=0).start( - barrier=True) - if args.virtual_pipeline_model_parallel_size is not None: - all_data_iterators = [ - build_train_valid_test_data_iterators( - train_valid_test_dataset_provider) - for _ in range(len(model)) - ] - train_data_iterator = [data_iterators[0] - for data_iterators in all_data_iterators] - valid_data_iterator = [data_iterators[1] - for data_iterators in all_data_iterators] - test_data_iterator = [data_iterators[2] - for data_iterators in all_data_iterators] - else: - train_data_iterator, valid_data_iterator, test_data_iterator \ - = build_train_valid_test_data_iterators( - train_valid_test_dataset_provider) - timers('train/valid/test-data-iterators-setup').stop() - print_datetime('after dataloaders are built') - - # Print setup timing. 
- print_rank_0('done with setup ...') - timers.log(['model-and-optimizer-setup', - 'train/valid/test-data-iterators-setup'], barrier=True) - print_rank_0('training ...') - - iteration = 0 - - if args.dataloader_type == 'cyclic' and args.retro_add_retriever: - args.train_iters = args.retro_cyclic_train_iters - print_rank_0("retro cyclic train iters : %d" % args.train_iters) - - if args.do_train and args.train_iters > 0: - iteration = train(forward_step_func, - model, optimizer, opt_param_scheduler, - train_data_iterator, valid_data_iterator, - process_non_loss_data_func) - print_datetime('after training is done') - - if args.do_valid: - prefix = 'the end of training for val data' - evaluate_and_print_results(prefix, forward_step_func, - valid_data_iterator, model, - iteration, process_non_loss_data_func, - False) - - if args.save and iteration != 0: - save_checkpoint(iteration, model, optimizer, opt_param_scheduler) - - if args.do_test: - # Run on test data. - prefix = 'the end of training for test data' - evaluate_and_print_results(prefix, forward_step_func, - test_data_iterator, model, - 0, process_non_loss_data_func, - True) - -def update_train_iters(args): - - # For iteration-based training, we don't need to do anything - if args.train_iters: - return - - # Constant batch size with sample-based training. - if args.rampup_batch_size is None: - args.train_iters = args.train_samples // args.global_batch_size - - else: - # Sample based training with rampup batch size. - iterations = 0 - consumed_samples = 0 - # Rampup phase. - while consumed_samples <= int(args.rampup_batch_size[2]): - update_num_microbatches(consumed_samples, consistency_check=False) - consumed_samples += get_current_global_batch_size() - iterations += 1 - # Reset - update_num_microbatches(0, consistency_check=False) - # Constant phase - # Note that we throw away any partial last batch. - iterations += (args.train_samples - consumed_samples) // \ - args.global_batch_size - args.train_iters = iterations - - print_rank_0('setting training iterations to {}'.format(args.train_iters)) - - -def get_model(model_provider_func, model_type=ModelType.encoder_or_decoder, wrap_with_ddp=True): - """Build the model.""" - args = get_args() - args.model_type = model_type - - # Build model. - if mpu.get_pipeline_model_parallel_world_size() > 1 and \ - args.virtual_pipeline_model_parallel_size is not None: - assert model_type != ModelType.encoder_and_decoder, \ - "Interleaved schedule not supported for model with both encoder and decoder" - model = [] - for i in range(args.virtual_pipeline_model_parallel_size): - mpu.set_virtual_pipeline_model_parallel_rank(i) - # Set pre_process and post_process only after virtual rank is set. 
- pre_process = mpu.is_pipeline_first_stage() - post_process = mpu.is_pipeline_last_stage() - this_model = model_provider_func( - pre_process=pre_process, - post_process=post_process - ) - this_model.model_type = model_type - model.append(this_model) - else: - pre_process = mpu.is_pipeline_first_stage() - post_process = mpu.is_pipeline_last_stage() - add_encoder = True - add_decoder = True - if model_type == ModelType.encoder_and_decoder: - if mpu.get_pipeline_model_parallel_world_size() > 1: - assert args.pipeline_model_parallel_split_rank is not None, \ - "Split rank needs to be specified for model with both encoder and decoder" - rank = mpu.get_pipeline_model_parallel_rank() - split_rank = args.pipeline_model_parallel_split_rank - world_size = mpu.get_pipeline_model_parallel_world_size() - pre_process = rank == 0 or rank == split_rank - post_process = (rank == (split_rank - 1)) or ( - rank == (world_size - 1)) - add_encoder = mpu.is_pipeline_stage_before_split() - add_decoder = mpu.is_pipeline_stage_after_split() - model = model_provider_func( - pre_process=pre_process, - post_process=post_process, - add_encoder=add_encoder, - add_decoder=add_decoder) - else: - model = model_provider_func( - pre_process=pre_process, - post_process=post_process - ) - model.model_type = model_type - - if not isinstance(model, list): - model = [model] - - # Disallow training and inference with Transformer Engine - # for non-GPT models - args.allow_transformer_engine = all([type(m) == GPTModel for m in model]) - assert args.allow_transformer_engine or args.transformer_impl == 'local', \ - 'Transformer Engine is only approved for GPT models' - - # Set tensor model parallel attributes if not set. - # Only parameters that are already tensor model parallel have these - # attributes set for them. We should make sure the default attributes - # are set for all params so the optimizer can use them. - for model_module in model: - for param in model_module.parameters(): - tensor_parallel.set_defaults_if_not_set_tensor_model_parallel_attributes(param) - - # Print number of parameters. - if mpu.get_data_parallel_rank() == 0: - print(' > number of parameters on (tensor, pipeline) ' - 'model parallel rank ({}, {}): {}'.format( - mpu.get_tensor_model_parallel_rank(), - mpu.get_pipeline_model_parallel_rank(), - sum([sum([p.nelement() for p in model_module.parameters()]) - for model_module in model])), flush=True) - - # GPU allocation. - for model_module in model: - model_module.cuda(torch.cuda.current_device()) - - # Fp16 conversion. - if args.fp16 or args.bf16: - model = [Float16Module(model_module, args) for model_module in model] - - if wrap_with_ddp: - if args.DDP_impl == 'torch': - i = torch.cuda.current_device() - model = [torchDDP(model_module, device_ids=[i], output_device=i, - process_group=mpu.get_data_parallel_group()) - for model_module in model] - - elif args.DDP_impl == 'local': - model = [LocalDDP(model_module, - args.accumulate_allreduce_grads_in_fp32, - args.use_contiguous_buffers_in_local_ddp) - for model_module in model] - # broad cast params from data parallel src rank to other data parallel ranks - if args.data_parallel_random_init: - for model_module in model: - model_module.broadcast_params() - else: - raise NotImplementedError('Unknown DDP implementation specified: ' - '{}. Exiting.'.format(args.DDP_impl)) - - return model - - -def get_optimizer_param_scheduler(optimizer): - """Build the learning rate scheduler.""" - args = get_args() - - # Iteration-based training. 
- if args.train_iters: - if args.lr_decay_iters is None: - args.lr_decay_iters = args.train_iters - lr_decay_steps = args.lr_decay_iters * args.global_batch_size - wd_incr_steps = args.train_iters * args.global_batch_size - if args.lr_warmup_fraction is not None: - lr_warmup_steps = args.lr_warmup_fraction * lr_decay_steps - else: - lr_warmup_steps = args.lr_warmup_iters * args.global_batch_size - # Sample-based training. - elif args.train_samples: - # We need to set training iters for later use. Technically - # we need to adjust the training samples too (due to last - # batch being incomplete) but we leave it as is for now. - update_train_iters(args) - if args.lr_decay_samples is None: - args.lr_decay_samples = args.train_samples - lr_decay_steps = args.lr_decay_samples - wd_incr_steps = args.train_samples - if args.lr_warmup_fraction is not None: - lr_warmup_steps = args.lr_warmup_fraction * lr_decay_steps - else: - lr_warmup_steps = args.lr_warmup_samples - else: - raise Exception( - 'either train-iters or train-samples should be provided.') - - opt_param_scheduler = OptimizerParamScheduler( - optimizer, - max_lr=args.lr, - min_lr=args.min_lr, - lr_warmup_steps=lr_warmup_steps, - lr_decay_steps=lr_decay_steps, - lr_decay_style=args.lr_decay_style, - start_wd=args.start_weight_decay, - end_wd=args.end_weight_decay, - wd_incr_steps=wd_incr_steps, - wd_incr_style=args.weight_decay_incr_style, - use_checkpoint_opt_param_scheduler=args.use_checkpoint_opt_param_scheduler, - override_opt_param_scheduler=args.override_opt_param_scheduler) - - return opt_param_scheduler - - -def setup_model_and_optimizer(model_provider_func, - model_type, - no_wd_decay_cond=None, - scale_lr_cond=None, - lr_mult=1.0): - """Setup model and optimizer.""" - args = get_args() - - model = get_model(model_provider_func, model_type) - unwrapped_model = unwrap_model(model, - (torchDDP, LocalDDP, Float16Module)) - - optimizer = get_megatron_optimizer(model, no_wd_decay_cond, - scale_lr_cond, lr_mult) - opt_param_scheduler = get_optimizer_param_scheduler(optimizer) - - if args.load is not None: - timers = get_timers() - timers('load-checkpoint', log_level=0).start(barrier=True) - args.iteration = load_checkpoint(model, optimizer, opt_param_scheduler) - timers('load-checkpoint').stop(barrier=True) - timers.log(['load-checkpoint']) - else: - args.iteration = 0 - - # We only support local DDP with multiple micro-batches. - if len(model) > 1 or mpu.get_pipeline_model_parallel_world_size() > 1: - assert args.DDP_impl == 'local' - - # get model without FP16 and/or TorchDDP wrappers - if args.iteration == 0 and len(unwrapped_model) == 1 \ - and hasattr(unwrapped_model[0], 'init_state_dict_from_bert'): - print_rank_0("Initializing ICT from pretrained BERT model") - unwrapped_model[0].init_state_dict_from_bert() - if args.fp16: - optimizer.reload_model_params() - - return model, optimizer, opt_param_scheduler - - - -def train_step(forward_step_func, data_iterator, - model, optimizer, opt_param_scheduler): - """Single training step.""" - args = get_args() - timers = get_timers() - - # Set grad to zero. - if args.DDP_impl == 'local' and args.use_contiguous_buffers_in_local_ddp: - for partition in model: - partition.zero_grad_buffer() - optimizer.zero_grad() - - # Forward pass. 
- timers('forward-backward', log_level=1).start( - barrier=args.barrier_with_L1_time) - forward_backward_func = get_forward_backward_func() - fwd_bwd_timers = timers if args.timing_log_level > 1 else None - losses_reduced = forward_backward_func( - forward_step_func=forward_step_func, - data_iterator=data_iterator, - model=model, - num_microbatches=get_num_microbatches(), - dtype=args.params_dtype, - tensor_shape=(args.seq_length, args.micro_batch_size, args.hidden_size), - grad_scaler=optimizer.scale_loss, - sequence_parallel=args.sequence_parallel, - forward_only=False, - timers=fwd_bwd_timers) - timers('forward-backward').stop() - - # Empty unused memory. - if args.empty_unused_memory_level >= 1: - torch.cuda.empty_cache() - - # Reduce gradients. - optimizer.reduce_model_grads(args, timers) - - # Vision gradients. - if args.vision_pretraining and args.vision_pretraining_type == "dino": - unwrapped_model = unwrap_model(model[0], - (torchDDP, LocalDDP, Float16Module)) - unwrapped_model.cancel_gradients_last_layer(args.curr_iteration) - - # Update parameters. - timers('optimizer', log_level=1).start(barrier=args.barrier_with_L1_time) - update_successful, grad_norm, num_zeros_in_grad = optimizer.step(args, timers) - timers('optimizer').stop() - - # Gather params. - if update_successful: - optimizer.gather_model_params(args, timers) - - # Vision momentum. - if args.vision_pretraining and args.vision_pretraining_type == "dino": - unwrapped_model = unwrap_model(model[0], - (torchDDP, LocalDDP, Float16Module)) - unwrapped_model.update_momentum(args.curr_iteration) - - # Update learning rate. - if update_successful: - increment = get_num_microbatches() * \ - args.micro_batch_size * \ - args.data_parallel_size - opt_param_scheduler.step(increment=increment) - skipped_iter = 0 - else: - skipped_iter = 1 - - # Empty unused memory. - if args.empty_unused_memory_level >= 2: - torch.cuda.empty_cache() - - if mpu.is_pipeline_last_stage(ignore_virtual=True): - # Average loss across microbatches. - loss_reduced = {} - for key in losses_reduced[0]: - losses_reduced_for_key = [x[key] for x in losses_reduced] - loss_reduced[key] = sum(losses_reduced_for_key) / len(losses_reduced_for_key) - return loss_reduced, skipped_iter, grad_norm, num_zeros_in_grad - return {}, skipped_iter, grad_norm, num_zeros_in_grad - - -def training_log(loss_dict, total_loss_dict, learning_rate, iteration, - loss_scale, report_memory_flag, skipped_iter, - grad_norm, params_norm, num_zeros_in_grad): - """Log training information such as losses, timing, ....""" - args = get_args() - timers = get_timers() - writer = get_tensorboard_writer() - - # Advanced, skipped, and Nan iterations. - advanced_iters_key = 'advanced iterations' - skipped_iters_key = 'skipped iterations' - nan_iters_key = 'nan iterations' - # Advanced iterations. - if not skipped_iter: - total_loss_dict[advanced_iters_key] = total_loss_dict.get( - advanced_iters_key, 0) + 1 - else: - if advanced_iters_key not in total_loss_dict: - total_loss_dict[advanced_iters_key] = 0 - # Skipped iterations. 
- total_loss_dict[skipped_iters_key] = total_loss_dict.get( - skipped_iters_key, 0) + skipped_iter - # Update losses and set nan iterations - got_nan = False - for key in loss_dict: - if not skipped_iter: - total_loss_dict[key] = total_loss_dict.get( - key, torch.cuda.FloatTensor([0.0])) + loss_dict[key] - else: - value = loss_dict[key].float().sum().item() - is_nan = value == float('inf') or \ - value == -float('inf') or \ - value != value - got_nan = got_nan or is_nan - total_loss_dict[nan_iters_key] = total_loss_dict.get( - nan_iters_key, 0) + int(got_nan) - - # Logging. - timers_to_log = [ - 'forward-backward', - 'forward-compute', - 'backward-compute', - 'batch-generator', - 'forward-recv', - 'forward-send', - 'backward-recv', - 'backward-send', - 'forward-send-forward-recv', - 'forward-send-backward-recv', - 'backward-send-forward-recv', - 'backward-send-backward-recv', - 'forward-backward-send-forward-backward-recv', - 'layernorm-grads-all-reduce', - 'embedding-grads-all-reduce', - 'grads-all-reduce', - 'grads-reduce-scatter', - 'params-all-gather', - 'optimizer-copy-to-main-grad', - 'optimizer-unscale-and-check-inf', - 'optimizer-clip-main-grad', - 'optimizer-count-zeros', - 'optimizer-inner-step', - 'optimizer-copy-main-to-model-params', - 'optimizer'] - - # Calculate batch size. - batch_size = args.micro_batch_size * args.data_parallel_size * \ - get_num_microbatches() - - total_iterations = total_loss_dict[advanced_iters_key] + \ - total_loss_dict[skipped_iters_key] - - # Tensorboard values. - # Timer requires all the ranks to call. - if args.log_timers_to_tensorboard and \ - (iteration % args.tensorboard_log_interval == 0): - timers.write(timers_to_log, writer, iteration, - normalizer=total_iterations) - if writer and (iteration % args.tensorboard_log_interval == 0): - if args.log_learning_rate_to_tensorboard: - writer.add_scalar('learning-rate', learning_rate, iteration) - writer.add_scalar('learning-rate vs samples', learning_rate, - args.consumed_train_samples) - if args.log_batch_size_to_tensorboard: - writer.add_scalar('batch-size', batch_size, iteration) - writer.add_scalar('batch-size vs samples', batch_size, - args.consumed_train_samples) - for key in loss_dict: - writer.add_scalar(key , loss_dict[key], iteration) - writer.add_scalar(key + ' vs samples', loss_dict[key], - args.consumed_train_samples) - if args.log_loss_scale_to_tensorboard: - writer.add_scalar('loss-scale', loss_scale, iteration) - writer.add_scalar('loss-scale vs samples', loss_scale, - args.consumed_train_samples) - if args.log_world_size_to_tensorboard: - writer.add_scalar('world-size', args.world_size, iteration) - writer.add_scalar('world-size vs samples', args.world_size, - args.consumed_train_samples) - if grad_norm is not None: - writer.add_scalar('grad-norm', grad_norm, iteration) - writer.add_scalar('grad-norm vs samples', grad_norm, - args.consumed_train_samples) - if num_zeros_in_grad is not None: - writer.add_scalar('num-zeros', num_zeros_in_grad, iteration) - writer.add_scalar('num-zeros vs samples', num_zeros_in_grad, - args.consumed_train_samples) - if params_norm is not None: - writer.add_scalar('params-norm', params_norm, iteration) - writer.add_scalar('params-norm vs samples', params_norm, - args.consumed_train_samples) - if args.log_memory_to_tensorboard: - mem_stats = torch.cuda.memory_stats() - writer.add_scalar( - "mem-reserved-bytes", - mem_stats["reserved_bytes.all.current"], - iteration, - ) - writer.add_scalar( - "mem-allocated-bytes", - 
mem_stats["allocated_bytes.all.current"], - iteration, - ) - writer.add_scalar( - "mem-allocated-count", - mem_stats["allocation.all.current"], - iteration, - ) - - if iteration % args.log_interval == 0: - elapsed_time = timers('interval-time').elapsed(barrier=True) - elapsed_time_per_iteration = elapsed_time / total_iterations - if writer: - if args.log_timers_to_tensorboard: - writer.add_scalar('iteration-time', - elapsed_time_per_iteration, iteration) - log_string = ' iteration {:8d}/{:8d} |'.format( - iteration, args.train_iters) - log_string += ' consumed samples: {:12d} |'.format( - args.consumed_train_samples) - log_string += ' elapsed time per iteration (ms): {:.1f} |'.format( - elapsed_time_per_iteration * 1000.0) - log_string += ' learning rate: {:.3E} |'.format(learning_rate) - log_string += ' global batch size: {:5d} |'.format(batch_size) - for key in total_loss_dict: - if key not in [advanced_iters_key, skipped_iters_key, - nan_iters_key]: - avg = total_loss_dict[key].item() / \ - float(max(1, total_loss_dict[advanced_iters_key])) - if avg > 0.0: - log_string += ' {}: {:.6E} |'.format(key, avg) - total_loss_dict[key] = torch.cuda.FloatTensor([0.0]) - log_string += ' loss scale: {:.1f} |'.format(loss_scale) - if grad_norm is not None: - log_string += ' grad norm: {:.3f} |'.format(grad_norm) - if num_zeros_in_grad is not None: - log_string += ' num zeros: {:.1f} |'.format(num_zeros_in_grad) - if params_norm is not None: - log_string += ' params norm: {:.3f} |'.format(params_norm) - log_string += ' number of skipped iterations: {:3d} |'.format( - total_loss_dict[skipped_iters_key]) - log_string += ' number of nan iterations: {:3d} |'.format( - total_loss_dict[nan_iters_key]) - total_loss_dict[advanced_iters_key] = 0 - total_loss_dict[skipped_iters_key] = 0 - total_loss_dict[nan_iters_key] = 0 - print_rank_last(log_string) - if report_memory_flag and learning_rate > 0.: - # Report memory after optimizer state has been initialized. - report_memory('(after {} iterations)'.format(iteration)) - report_memory_flag = False - timers.log(timers_to_log, normalizer=args.log_interval) - - return report_memory_flag - - -def save_checkpoint_and_time(iteration, model, optimizer, opt_param_scheduler): - timers = get_timers() - # Extra barrier is added to make sure - # all ranks report the max time. - timers('save-checkpoint', log_level=0).start(barrier=True) - save_checkpoint(iteration, model, optimizer, opt_param_scheduler) - timers('save-checkpoint').stop(barrier=True) - timers.log(['save-checkpoint']) - - -def train(forward_step_func, model, optimizer, opt_param_scheduler, - train_data_iterator, valid_data_iterator, - process_non_loss_data_func): - """Train the model function.""" - args = get_args() - timers = get_timers() - - # Write args to tensorboard - write_args_to_tensorboard() - - # Turn on training mode which enables dropout. - for model_module in model: - model_module.train() - - # Tracking loss. - total_loss_dict = {} - - # Iterations. 
- iteration = args.iteration - - timers('interval-time', log_level=0).start(barrier=True) - print_datetime('before the start of training step') - report_memory_flag = True - while iteration < args.train_iters: - update_num_microbatches(args.consumed_train_samples) - args.curr_iteration = iteration - loss_dict, skipped_iter, grad_norm, num_zeros_in_grad = \ - train_step(forward_step_func, - train_data_iterator, - model, - optimizer, - opt_param_scheduler) - iteration += 1 - args.consumed_train_samples += mpu.get_data_parallel_world_size() * \ - args.micro_batch_size * \ - get_num_microbatches() - - # Logging. - loss_scale = optimizer.get_loss_scale().item() - params_norm = None - if args.log_params_norm: - params_norm = calc_params_l2_norm(model) - report_memory_flag = training_log(loss_dict, total_loss_dict, - optimizer.param_groups[0]['lr'], - iteration, loss_scale, - report_memory_flag, skipped_iter, - grad_norm, params_norm, num_zeros_in_grad) - - # Autoresume - if args.adlr_autoresume and \ - (iteration % args.adlr_autoresume_interval == 0): - check_adlr_autoresume_termination(iteration, model, optimizer, - opt_param_scheduler) - - # Evaluation - if args.eval_interval and iteration % args.eval_interval == 0 and \ - args.do_valid: - prefix = 'iteration {}'.format(iteration) - evaluate_and_print_results(prefix, forward_step_func, - valid_data_iterator, model, - iteration, process_non_loss_data_func, - False) - - # Checkpointing - saved_checkpoint = False - if args.exit_signal_handler: - signal_handler = get_signal_handler() - if any(signal_handler.signals_received()): - save_checkpoint_and_time(iteration, model, optimizer, - opt_param_scheduler) - print_datetime('exiting program after receiving SIGTERM.') - sys.exit() - - if args.save and args.save_interval and \ - iteration % args.save_interval == 0: - save_checkpoint_and_time(iteration, model, optimizer, - opt_param_scheduler) - saved_checkpoint = True - - # Exiting based on duration - if args.exit_duration_in_mins: - train_time = (time.time() - _TRAIN_START_TIME) / 60.0 - done_cuda = torch.cuda.IntTensor( - [train_time > args.exit_duration_in_mins]) - torch.distributed.all_reduce( - done_cuda, op=torch.distributed.ReduceOp.MAX) - done = done_cuda.item() - if done: - if not saved_checkpoint: - save_checkpoint_and_time(iteration, model, optimizer, - opt_param_scheduler) - print_datetime('exiting program after {} minutes'.format(train_time)) - sys.exit() - - # Exiting based on iterations - if args.exit_interval and iteration % args.exit_interval == 0: - if args.save and not saved_checkpoint: - save_checkpoint_and_time(iteration, model, optimizer, - opt_param_scheduler) - torch.distributed.barrier() - print_datetime('exiting program at iteration {}'.format(iteration)) - sys.exit() - - - return iteration - - -def evaluate(forward_step_func, - data_iterator, - model, - process_non_loss_data_func, - verbose=False): - """Evaluation.""" - args = get_args() - - if args.vision_pretraining and args.vision_pretraining_type == "dino": - compute_feature_bank(model) - - # Turn on evaluation mode which disables dropout. 
- for model_module in model: - model_module.eval() - - total_loss_dict = {} - - with torch.no_grad(): - iteration = 0 - while iteration < args.eval_iters: - iteration += 1 - if verbose and iteration % args.log_interval == 0: - print_rank_0('Evaluating iter {}/{}'.format(iteration, - args.eval_iters)) - - forward_backward_func = get_forward_backward_func() - loss_dicts = forward_backward_func( - forward_step_func=forward_step_func, - data_iterator=data_iterator, - model=model, - num_microbatches=get_num_microbatches(), - dtype=args.params_dtype, - tensor_shape=(args.seq_length, args.micro_batch_size, args.hidden_size), - sequence_parallel=args.sequence_parallel, - forward_only=True, - timers=None) - - # Empty unused memory - if args.empty_unused_memory_level >= 1: - torch.cuda.empty_cache() - - if mpu.is_pipeline_last_stage(ignore_virtual=True): - # Reduce across processes. - for loss_dict in loss_dicts: - for key in loss_dict: - total_loss_dict[key] = total_loss_dict.get( - key, torch.cuda.FloatTensor([0.0])) + loss_dict[key] - - args.consumed_valid_samples += mpu.get_data_parallel_world_size() \ - * args.micro_batch_size \ - * get_num_microbatches() - collected_non_loss_data = None - if process_non_loss_data_func is not None and is_last_rank(): - collected_non_loss_data = forward_backward_func( - forward_step_func, data_iterator, model, optimizer=None, - timers=None, forward_only=True, collect_non_loss_data=True) - - # Move model back to the train mode. - for model_module in model: - model_module.train() - - for key in total_loss_dict: - total_loss_dict[key] /= args.eval_iters * get_num_microbatches() - - return total_loss_dict, collected_non_loss_data - -def evaluate_and_print_results(prefix, forward_step_func, - data_iterator, model, - iteration, process_non_loss_data_func, - verbose=False): - """Helper function to evaluate and dump results on screen.""" - args = get_args() - writer = get_tensorboard_writer() - - total_loss_dict, collected_non_loss_data = evaluate( - forward_step_func, data_iterator, model, - process_non_loss_data_func, verbose) - string = ' validation loss at {} | '.format(prefix) - for key in total_loss_dict: - string += '{} value: {:.6E} | '.format(key, total_loss_dict[key].item()) - ppl = math.exp(min(20, total_loss_dict[key].item())) - string += '{} PPL: {:.6E} | '.format(key, ppl) - if writer: - writer.add_scalar('{} validation'.format(key), - total_loss_dict[key].item(), - iteration) - writer.add_scalar('{} validation vs samples'.format(key), - total_loss_dict[key].item(), - args.consumed_train_samples) - if args.log_validation_ppl_to_tensorboard: - writer.add_scalar('{} validation ppl'.format(key), ppl, - iteration) - writer.add_scalar('{} validation ppl vs samples'.format(key), - ppl, args.consumed_train_samples) - - if process_non_loss_data_func is not None and writer and is_last_rank(): - process_non_loss_data_func(collected_non_loss_data, iteration, writer) - - length = len(string) + 1 - print_rank_last('-' * length) - print_rank_last(string) - print_rank_last('-' * length) - - -def cyclic_iter(iter): - while True: - for x in iter: - yield x - - -def build_train_valid_test_data_loaders( - build_train_valid_test_datasets_provider): - """XXX""" - args = get_args() - - (train_dataloader, valid_dataloader, test_dataloader) = (None, None, None) - - print_rank_0('> building train, validation, and test datasets ...') - - # Backward compatibility, assume fixed batch size. 
- if args.iteration > 0 and args.consumed_train_samples == 0: - assert args.train_samples is None, \ - 'only backward compatiblity support for iteration-based training' - args.consumed_train_samples = args.iteration * args.global_batch_size - if args.iteration > 0 and args.consumed_valid_samples == 0: - if args.train_samples is None: - args.consumed_valid_samples = (args.iteration // args.eval_interval) * \ - args.eval_iters * args.global_batch_size - - # Data loader only on rank 0 of each model parallel group. - if mpu.get_tensor_model_parallel_rank() == 0: - - # Number of train/valid/test samples. - if args.train_samples: - train_samples = args.train_samples - else: - train_samples = args.train_iters * args.global_batch_size - eval_iters = (args.train_iters // args.eval_interval + 1) * \ - args.eval_iters - test_iters = args.eval_iters - train_val_test_num_samples = [train_samples, - eval_iters * args.global_batch_size, - test_iters * args.global_batch_size] - print_rank_0(' > datasets target sizes (minimum size):') - print_rank_0(' train: {}'.format(train_val_test_num_samples[0])) - print_rank_0(' validation: {}'.format(train_val_test_num_samples[1])) - print_rank_0(' test: {}'.format(train_val_test_num_samples[2])) - - # Build the datasets. - train_ds, valid_ds, test_ds = build_train_valid_test_datasets_provider( - train_val_test_num_samples) - - # Build dataloders. - train_dataloader = build_pretraining_data_loader( - train_ds, args.consumed_train_samples) - valid_dataloader = build_pretraining_data_loader( - valid_ds, args.consumed_valid_samples) - test_dataloader = build_pretraining_data_loader(test_ds, 0) - - # Flags to know if we need to do training/validation/testing. - do_train = train_dataloader is not None and args.train_iters > 0 - do_valid = valid_dataloader is not None and args.eval_iters > 0 - do_test = test_dataloader is not None and args.eval_iters > 0 - # Need to broadcast num_tokens and num_type_tokens. - flags = torch.cuda.LongTensor( - [int(do_train), int(do_valid), int(do_test)]) - else: - flags = torch.cuda.LongTensor([0, 0, 0]) - - # Broadcast num tokens. - torch.distributed.broadcast(flags, - mpu.get_tensor_model_parallel_src_rank(), - group=mpu.get_tensor_model_parallel_group()) - args.do_train = flags[0].item() - args.do_valid = flags[1].item() - args.do_test = flags[2].item() - - return train_dataloader, valid_dataloader, test_dataloader - - -def build_train_valid_test_data_iterators( - build_train_valid_test_datasets_provider): - - args = get_args() - - # Build loaders. - train_dataloader, valid_dataloader, test_dataloader = \ - build_train_valid_test_data_loaders( - build_train_valid_test_datasets_provider) - - # Build iterators. 
- dl_type = args.dataloader_type - assert dl_type in ['single', 'cyclic'] - - if train_dataloader is not None: - train_data_iterator = iter(train_dataloader) if dl_type == 'single' \ - else iter(cyclic_iter(train_dataloader)) - else: - train_data_iterator = None - - if valid_dataloader is not None: - valid_data_iterator = iter(valid_dataloader) if dl_type == 'single' \ - else iter(cyclic_iter(valid_dataloader)) - else: - valid_data_iterator = None - - if test_dataloader is not None: - test_data_iterator = iter(test_dataloader) if dl_type == 'single' \ - else iter(cyclic_iter(test_dataloader)) - else: - test_data_iterator = None - - return train_data_iterator, valid_data_iterator, test_data_iterator diff --git a/megatron/__init__.py b/megatron/training/__init__.py similarity index 58% rename from megatron/__init__.py rename to megatron/training/__init__.py index aa99c0665a..46cf5b5c9b 100644 --- a/megatron/__init__.py +++ b/megatron/training/__init__.py @@ -1,17 +1,17 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. import torch -from .global_vars import get_args, get_retro_args -from .global_vars import get_current_global_batch_size -from .global_vars import get_num_microbatches +from .global_vars import get_args from .global_vars import get_signal_handler -from .global_vars import update_num_microbatches from .global_vars import get_tokenizer from .global_vars import get_tensorboard_writer +from .global_vars import get_wandb_writer +from .global_vars import get_one_logger from .global_vars import get_adlr_autoresume from .global_vars import get_timers from .initialize import initialize_megatron +from .training import pretrain, get_model, get_train_valid_test_num_samples from .utils import (print_rank_0, is_last_rank, diff --git a/megatron/training/activations.py b/megatron/training/activations.py new file mode 100644 index 0000000000..c6ce9f1de1 --- /dev/null +++ b/megatron/training/activations.py @@ -0,0 +1,22 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +import torch +import torch.nn.functional as F + +try: + jit_fuser = torch.compile +except AttributeError: + jit_fuser = torch.jit.script + + +@jit_fuser +def squared_relu(x: torch.Tensor) -> torch.Tensor: + return torch.pow(F.relu(x), 2) + + +@jit_fuser +def quick_gelu(x: torch.Tensor) -> torch.Tensor: + return x * torch.sigmoid(1.702 * x) + +@jit_fuser +def fast_gelu(x: torch.Tensor) -> torch.Tensor: + return 0.5 * x * (1.0 + torch.tanh(x * 0.7978845608 * (1.0 + 0.044715 * x * x))) diff --git a/megatron/training/arguments.py b/megatron/training/arguments.py new file mode 100644 index 0000000000..e3d876a5f2 --- /dev/null +++ b/megatron/training/arguments.py @@ -0,0 +1,1966 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+ +"""Megatron arguments.""" + +import argparse +import dataclasses +import json +import logging +import os +import torch +import types + +import torch.nn.functional as F + +from megatron.core.dist_checkpointing.validation import StrictHandling +from megatron.core.models.retro.utils import ( + get_config_path as get_retro_config_path, + get_gpt_data_dir as get_retro_data_dir, +) +from megatron.core.transformer import TransformerConfig, MLATransformerConfig +from megatron.training.activations import squared_relu +from megatron.training.utils import update_use_dist_ckpt + + +def parse_args(extra_args_provider=None, ignore_unknown_args=False): + """Parse all arguments.""" + parser = argparse.ArgumentParser(description='Megatron-LM Arguments', + allow_abbrev=False) + + # Standard arguments. + parser = _add_network_size_args(parser) + parser = _add_regularization_args(parser) + parser = _add_training_args(parser) + parser = _add_initialization_args(parser) + parser = _add_learning_rate_args(parser) + parser = _add_checkpointing_args(parser) + parser = _add_mixed_precision_args(parser) + parser = _add_distributed_args(parser) + parser = _add_validation_args(parser) + parser = _add_data_args(parser) + parser = _add_autoresume_args(parser) + parser = _add_biencoder_args(parser) + parser = _add_vision_args(parser) + parser = _add_moe_args(parser) + parser = _add_mla_args(parser) + parser = _add_logging_args(parser) + parser = _add_straggler_detector_args(parser) + parser = _add_inference_args(parser) + parser = _add_transformer_engine_args(parser) + parser = _add_retro_args(parser) + parser = _add_experimental_args(parser) + parser = _add_one_logger_args(parser) + parser = _add_ft_package_args(parser) + parser = _add_config_logger_args(parser) + + # Custom arguments. + if extra_args_provider is not None: + parser = extra_args_provider(parser) + + # Parse. + if ignore_unknown_args: + args, _ = parser.parse_known_args() + else: + args = parser.parse_args() + + # Experimental yaml + if args.yaml_cfg is not None: + from .yaml_arguments import load_yaml + assert args.yaml_cfg and not args.use_legacy_models, \ + "Yaml config is not supported with legacy models." + args = load_yaml(args.yaml_cfg) + + + # Args from environment + args.rank = int(os.getenv('RANK', '0')) + args.world_size = int(os.getenv("WORLD_SIZE", '1')) + + return args + + +def load_retro_config(retro_project_dir): + '''Load Retro's config.json.''' + + # Retro config path. + retro_config_path = get_retro_config_path(retro_project_dir) + assert os.path.exists(retro_config_path), \ + "Retro project dir missing config.json." + + # Load retro config. + with open(retro_config_path) as f: + retro_config = types.SimpleNamespace(**json.load(f)) + + return retro_config + + +def load_retro_args(args): + """Load predefined args from Retro config (if applicable). + + When using Retro (or GPT for comparison purposes), data arguments are + overridden by the saved config.json within the Retro project directory. This + is to ensure that the data used for pretraining is consistent with the data + that was preprocessed using the Retro preprocessing pipeline (see + `tools/retro/preprocess_data.py`). + """ + + # Return if no project directory is specified. + if args.retro_project_dir is None: + return + + # Load retro config. + retro_config = load_retro_config(args.retro_project_dir) + + # Retro data path is relative to project dir (via hard or soft links). 
+ data_dir = get_retro_data_dir(args.retro_project_dir) + data_path = list(retro_config.retro_gpt_data_path) + if len(data_path) % 2 == 0: + for i in range(len(data_path) - 1, -1, -2): + data_path[i] = os.path.join(data_dir, data_path[i]) + else: + assert len(data_path) == 1 + data_path[0] = os.path.join(data_dir, data_path[0]) + + # Update args. + args.data_cache_path = retro_config.retro_gpt_data_cache_path + args.data_path = data_path if args.data_path is None else args.data_path + args.eval_interval = retro_config.retro_gpt_eval_interval + args.eval_iters = retro_config.retro_gpt_eval_iters + args.global_batch_size = retro_config.retro_gpt_global_batch_size + args.max_position_embeddings = retro_config.retro_gpt_seq_length + args.merge_file = os.path.join( + args.retro_project_dir, + retro_config.retro_gpt_merge_file, + ) if retro_config.retro_gpt_merge_file is not None else None + args.seed = retro_config.retro_gpt_seed + args.seq_length = retro_config.retro_gpt_seq_length + args.tokenizer_model = os.path.join( + args.retro_project_dir, + retro_config.retro_gpt_tokenizer_model, + ) if retro_config.retro_gpt_tokenizer_model is not None else None + args.tokenizer_type = retro_config.retro_gpt_tokenizer_type + args.train_samples = retro_config.retro_gpt_train_samples + args.vocab_file = os.path.join( + args.retro_project_dir, + retro_config.retro_gpt_vocab_file, + ) if retro_config.retro_gpt_vocab_file is not None else None + + # Retro-specific args. + args.retro_block_size = retro_config.retro_block_size + args.retro_chunk_length = retro_config.retro_gpt_chunk_length + args.retro_neighbor_dirs = retro_config.retro_neighbor_dirs + args.retro_split_preprocessing = retro_config.retro_gpt_split + args.retro_bert_tokenizer_type = retro_config.retro_bert_tokenizer_type + args.retro_bert_vocab_file = retro_config.retro_bert_vocab_file + + +def validate_args(args, defaults={}): + + # Temporary + assert args.non_persistent_ckpt_type in ['global', None], \ + 'Currently only global checkpoints are supported' + + # Load saved args from Retro (if applicable). + load_retro_args(args) + + # Set args.use_dist_ckpt from args.ckpt_format. + update_use_dist_ckpt(args) + + if args.encoder_tensor_model_parallel_size > 0: + assert args.encoder_pipeline_model_parallel_size > 0, "encoder_pipeline_model_parallel_size must be defined." + assert args.num_attention_heads % args.encoder_tensor_model_parallel_size == 0 + assert args.encoder_tensor_model_parallel_size <= args.tensor_model_parallel_size, "We do not support encoders with more TP than the decoder." + + if args.encoder_pipeline_model_parallel_size > 0 and args.encoder_tensor_model_parallel_size == 0: + args.encoder_tensor_model_parallel_size = args.tensor_model_parallel_size + + encoder_model_size = args.encoder_tensor_model_parallel_size * args.encoder_pipeline_model_parallel_size * args.context_parallel_size + decoder_model_size = args.tensor_model_parallel_size * args.pipeline_model_parallel_size * args.context_parallel_size + total_model_size = encoder_model_size + decoder_model_size + + # Total model size. + assert args.world_size % total_model_size == 0, ( + f"world size ({args.world_size}) is not divisible by total_model_size ({encoder_model_size=} + {decoder_model_size=})" + ) + + # Pipeline model parallel size. 
+ args.transformer_pipeline_model_parallel_size = ( + args.pipeline_model_parallel_size - 1 + if args.standalone_embedding_stage else + args.pipeline_model_parallel_size + ) + + args.data_parallel_size = args.world_size // total_model_size + + # Checks. + if args.rank == 0: + print('using world size: {}, data-parallel size: {}, ' + 'context-parallel size: {}, ' + 'tensor-model-parallel size: {}, ' + 'encoder-tensor-model-parallel size: {}, ' + 'pipeline-model-parallel size: {}, ' + 'encoder-pipeline-model-parallel size: {}'.format( + args.world_size, args.data_parallel_size, + args.context_parallel_size, + args.tensor_model_parallel_size, + args.encoder_tensor_model_parallel_size, + args.pipeline_model_parallel_size, + args.encoder_pipeline_model_parallel_size), flush=True) + + # backwards compatibility. + if args.pipeline_model_parallel_split_rank is not None: + args.encoder_pipeline_model_parallel_size = args.pipeline_model_parallel_split_rank + args.pipeline_model_parallel_size -= args.encoder_pipeline_model_parallel_size + assert args.pipeline_model_parallel_size > 0 + + if args.tp_comm_overlap: + assert args.sequence_parallel == True, 'Tensor parallel communication/GEMM overlap can happen only when sequence parallelism is enabled' + + # Deprecated arguments + assert args.batch_size is None, '--batch-size argument is no longer ' \ + 'valid, use --micro-batch-size instead' + del args.batch_size + assert args.warmup is None, '--warmup argument is no longer valid, use ' \ + '--lr-warmup-fraction instead' + del args.warmup + assert args.model_parallel_size is None, '--model-parallel-size is no ' \ + 'longer valid, use --tensor-model-parallel-size instead' + del args.model_parallel_size + + if args.checkpoint_activations: + if args.rank == 0: + print('--checkpoint-activations is no longer valid, use --recompute-activations, ' + 'or, for more control, --recompute-granularity and --recompute-method.') + exit() + del args.checkpoint_activations + + if args.recompute_activations: + args.recompute_granularity = 'selective' + del args.recompute_activations + + # Set input defaults. + for key in defaults: + # For default to be valid, it should not be provided in the + # arguments that are passed to the program. We check this by + # ensuring the arg is set to None. + if getattr(args, key, None) is not None: + if args.rank == 0: + print('WARNING: overriding default arguments for {key}:{v} \ + with {key}:{v2}'.format(key=key, v=defaults[key], + v2=getattr(args, key)), + flush=True) + else: + setattr(args, key, defaults[key]) + + if args.data_path is not None and args.split is None: + legacy_default_split_value = '969, 30, 1' + if args.rank == 0: + print('WARNING: Please specify --split when using --data-path. Using legacy default value ' + f'of "{legacy_default_split_value}"') + args.split = legacy_default_split_value + + # Batch size. 
+ assert args.micro_batch_size is not None + assert args.micro_batch_size > 0 + if args.global_batch_size is None: + args.global_batch_size = args.micro_batch_size * args.data_parallel_size + if args.rank == 0: + print('setting global batch size to {}'.format( + args.global_batch_size), flush=True) + assert args.global_batch_size > 0 + if args.num_layers_per_virtual_pipeline_stage is not None: + if args.overlap_p2p_comm: + assert args.pipeline_model_parallel_size > 1, \ + 'when interleaved schedule is used, pipeline-model-parallel size '\ + 'should be greater than 1' + else: + assert args.pipeline_model_parallel_size > 2, \ + 'when interleaved schedule is used and p2p communication overlap is disabled, '\ + 'pipeline-model-parallel size should be greater than 2 to avoid having multiple '\ + 'p2p sends and recvs between same 2 ranks per communication batch' + assert args.num_layers % args.transformer_pipeline_model_parallel_size == 0, \ + 'number of layers should be divisible by the pipeline parallel size' + num_layers_per_pipeline_stage = args.num_layers // args.transformer_pipeline_model_parallel_size + assert num_layers_per_pipeline_stage % args.num_layers_per_virtual_pipeline_stage == 0, \ + 'number of layers per pipeline stage must be divisible number of layers per virtual pipeline stage' + args.virtual_pipeline_model_parallel_size = num_layers_per_pipeline_stage // \ + args.num_layers_per_virtual_pipeline_stage + else: + args.virtual_pipeline_model_parallel_size = None + # Overlap P2P communication is disabled if not using the interleaved schedule. + args.overlap_p2p_comm = False + args.align_param_gather = False + # Only print warning if PP size > 1. + if args.rank == 0 and args.pipeline_model_parallel_size > 1: + print('WARNING: Setting args.overlap_p2p_comm and args.align_param_gather to False ' + 'since non-interleaved schedule does not support overlapping p2p communication ' + 'and aligned param AG') + + if args.overlap_param_gather: + assert args.use_distributed_optimizer, \ + '--overlap-param-gather only supported with distributed optimizer' + assert args.overlap_grad_reduce, \ + 'Must use --overlap-param-gather with --overlap-grad-reduce' + assert not args.use_legacy_models, \ + '--overlap-param-gather only supported with MCore models' + + if args.overlap_param_gather_with_optimizer_step: + assert args.use_distributed_optimizer, \ + '--overlap-param-gather-with-optimizer-step only supported with distributed optimizer' + assert args.overlap_param_gather, \ + 'Must use --overlap-param-gather-with-optimizer-step with --overlap-param-gather' + assert args.virtual_pipeline_model_parallel_size is not None, \ + '--overlap-param-gather-with-optimizer-step only supported with interleaved pipeline parallelism' + assert not args.use_dist_ckpt, \ + '--overlap-param-gather-with-optimizer-step not supported with distributed checkpointing yet' + + if args.fp8_param_gather: + assert args.use_distributed_optimizer, \ + '--fp8-param-gather only supported with distributed optimizer' + + # Parameters dtype. + args.params_dtype = torch.float + if args.fp16: + assert not args.bf16 + args.params_dtype = torch.half + # Turn off checking for NaNs in loss and grads if using dynamic loss scaling, + # where NaNs in grads / loss are signal to the loss scaler. 
+ if not args.loss_scale: + args.check_for_nan_in_loss_and_grad = False + if args.rank == 0: + print('WARNING: Setting args.check_for_nan_in_loss_and_grad to False since ' + 'dynamic loss scaling is being used') + if args.bf16: + assert not args.fp16 + args.params_dtype = torch.bfloat16 + # bfloat16 requires gradient accumulation and all-reduce to + # be done in fp32. + if not args.accumulate_allreduce_grads_in_fp32: + args.accumulate_allreduce_grads_in_fp32 = True + if args.rank == 0: + print('accumulate and all-reduce gradients in fp32 for ' + 'bfloat16 data type.', flush=True) + + if args.rank == 0: + print('using {} for parameters ...'.format(args.params_dtype), + flush=True) + + if args.dataloader_type is None: + args.dataloader_type = 'single' + + # data + assert args.num_dataset_builder_threads > 0 + + # Consumed tokens. + args.consumed_train_samples = 0 + args.skipped_train_samples = 0 + args.consumed_valid_samples = 0 + + # Support for variable sequence lengths across batches/microbatches. + # set it if the dataloader supports generation of variable sequence lengths + # across batches/microbatches. Due to additional communication overhead + # during pipeline parallelism, it should not be set if sequence length + # is constant during training. + args.variable_seq_lengths = False + + # Iteration-based training. + if args.train_iters: + # If we use iteration-based training, make sure the + # sample-based options are off. + assert args.train_samples is None, \ + 'expected iteration-based training' + assert args.lr_decay_samples is None, \ + 'expected iteration-based learning rate decay' + assert args.lr_warmup_samples == 0, \ + 'expected iteration-based learning rate warmup' + assert args.rampup_batch_size is None, \ + 'expected no batch-size rampup for iteration-based training' + if args.lr_warmup_fraction is not None: + assert args.lr_warmup_iters == 0, \ + 'can only specify one of lr-warmup-fraction and lr-warmup-iters' + + # Sample-based training. + if args.train_samples: + # If we use sample-based training, make sure the + # iteration-based options are off. + assert args.train_iters is None, \ + 'expected sample-based training' + assert args.lr_decay_iters is None, \ + 'expected sample-based learning rate decay' + assert args.lr_warmup_iters == 0, \ + 'expected sample-based learnig rate warmup' + if args.lr_warmup_fraction is not None: + assert args.lr_warmup_samples == 0, \ + 'can only specify one of lr-warmup-fraction ' \ + 'and lr-warmup-samples' + + if args.num_layers is not None: + assert args.encoder_num_layers is None, \ + 'cannot have both num-layers and encoder-num-layers specified' + args.encoder_num_layers = args.num_layers + else: + assert args.encoder_num_layers is not None, \ + 'either num-layers or encoder-num-layers should be specified' + args.num_layers = args.encoder_num_layers + + # Check required arguments. + required_args = ['num_layers', 'hidden_size', 'num_attention_heads', + 'max_position_embeddings'] + for req_arg in required_args: + _check_arg_is_not_none(args, req_arg) + + # Checks. + if args.ffn_hidden_size is None: + if args.swiglu: + # reduce the dimnesion for MLP since projections happens on + # two linear layers. 
this keeps the number of paramters in + # the same ballpark as the counterpart with 4*h size + # we keep it a multiple of 64, which means the actual tensor size + # will be a multiple of 64 / tp_size + args.ffn_hidden_size = int((4 * args.hidden_size * 2 / 3) / 64) * 64 + else: + args.ffn_hidden_size = 4 * args.hidden_size + + if args.kv_channels is None: + assert args.hidden_size % args.num_attention_heads == 0 + args.kv_channels = args.hidden_size // args.num_attention_heads + + if args.seq_length is not None and args.context_parallel_size > 1: + assert args.seq_length % (args.context_parallel_size * 2) == 0, \ + 'seq-length should be a multiple of 2 * context-parallel-size ' \ + 'if context-parallel-size > 1.' + + if args.seq_length is not None: + assert args.encoder_seq_length is None + args.encoder_seq_length = args.seq_length + else: + assert args.encoder_seq_length is not None + args.seq_length = args.encoder_seq_length + + if args.seq_length is not None: + assert args.max_position_embeddings >= args.seq_length + if args.decoder_seq_length is not None: + assert args.max_position_embeddings >= args.decoder_seq_length + if args.lr is not None: + assert args.min_lr <= args.lr + if args.save is not None: + assert args.save_interval is not None + # Mixed precision checks. + if args.fp16_lm_cross_entropy: + assert args.fp16, 'lm cross entropy in fp16 only support in fp16 mode.' + if args.fp32_residual_connection: + assert args.fp16 or args.bf16, \ + 'residual connection in fp32 only supported when using fp16 or bf16.' + + if args.moe_grouped_gemm: + assert args.bf16, 'Currently GroupedGEMM for MoE only supports bf16 dtype.' + dc = torch.cuda.get_device_capability() + assert dc[0] >= 8, "Unsupported compute capability for GroupedGEMM kernels." + + if args.weight_decay_incr_style == 'constant': + assert args.start_weight_decay is None + assert args.end_weight_decay is None + args.start_weight_decay = args.weight_decay + args.end_weight_decay = args.weight_decay + else: + assert args.start_weight_decay is not None + assert args.end_weight_decay is not None + + TORCH_MAJOR = int(torch.__version__.split('.')[0]) + TORCH_MINOR = int(torch.__version__.split('.')[1]) + # Persistent fused layer norm. + if TORCH_MAJOR < 1 or (TORCH_MAJOR == 1 and TORCH_MINOR < 11): + args.no_persist_layer_norm = True + if args.rank == 0: + print('Persistent fused layer norm kernel is supported from ' + 'pytorch v1.11 (nvidia pytorch container paired with v1.11). ' + 'Defaulting to no_persist_layer_norm=True') + + # Activation recomputing. + if args.distribute_saved_activations: + assert args.tensor_model_parallel_size > 1, 'can distribute ' \ + 'recomputed activations only across tensor model ' \ + 'parallel groups' + assert args.recompute_granularity == 'full', \ + 'distributed recompute activations is only '\ + 'application to full recompute granularity' + assert args.recompute_method is not None, \ + 'for distributed recompute activations to work you '\ + 'need to use a recompute method ' + assert (TORCH_MAJOR, TORCH_MINOR) >= (1, 10), \ + 'distributed recompute activations are supported for pytorch ' \ + 'v1.10 and above (Nvidia Pytorch container >= 21.07). Current ' \ + 'pytorch version is v%s.%s.' 
% (TORCH_MAJOR, TORCH_MINOR) + + if args.recompute_granularity == 'selective': + assert args.recompute_method is None, \ + 'recompute method is not yet supported for ' \ + 'selective recomputing granularity' + + # disable sequence parallelism when tp=1 + # to avoid change in numerics when + # sequence_parallelism is enabled. + if args.tensor_model_parallel_size == 1: + args.sequence_parallel = False + + # disable async_tensor_model_parallel_allreduce when + # model parallel memory optimization is enabled + if args.sequence_parallel: + args.async_tensor_model_parallel_allreduce = False + + if os.environ.get('CUDA_DEVICE_MAX_CONNECTIONS') != "1": + if args.sequence_parallel: + raise RuntimeError( + "Using sequence parallelism requires setting the environment variable " + "CUDA_DEVICE_MAX_CONNECTIONS to 1") + if args.async_tensor_model_parallel_allreduce: + raise RuntimeError( + "Using async gradient all reduce requires setting the environment " + "variable CUDA_DEVICE_MAX_CONNECTIONS to 1") + + # Disable bias gelu fusion if we are disabling bias altogether + if not args.add_bias_linear: + args.bias_gelu_fusion = False + + # Retro checks. + if args.retro_add_retriever: + + # Train samples should be auto-loaded. + assert args.train_samples is not None, \ + "args.train_samples should be auto-loaded from the retro config." + + # Sequence parallelism unsupported. + assert not args.sequence_parallel, \ + "retro currently does not support sequence parallelism." + + # Pipeline parallelism unsupported. + assert args.pipeline_model_parallel_size == 1, \ + "retro currently does not support pipeline parallelism." + + if args.decoupled_lr is not None or args.decoupled_min_lr is not None: + assert not args.use_legacy_models, \ + '--decoupled-lr and --decoupled-min-lr is not supported in legacy models.' + + # Legacy RoPE arguments + if args.use_rotary_position_embeddings: + args.position_embedding_type = 'rope' + if args.rotary_interleaved and args.apply_rope_fusion: + raise RuntimeError('--rotary-interleaved does not work with rope_fusion.') + if args.rotary_interleaved and args.use_legacy_models: + raise RuntimeError('--rotary-interleaved is not supported in legacy models.') + + # Would just need to add 'NoPE' as a position_embedding_type to support this, but for now + # don't allow it to keep things simple + if not args.add_position_embedding and args.position_embedding_type != 'rope': + raise RuntimeError('--no-position-embedding is deprecated, use --position-embedding-type') + + # MoE Spec check + if args.num_experts == 0: + args.num_experts = None + if args.num_experts is not None: + assert args.spec is None, "Model Spec must be None when using MoEs" + + # Context parallel + if args.context_parallel_size > 1: + assert not args.use_legacy_models, "Context parallelism is not supported in legacy models." + + # Expert parallelism check + if args.expert_model_parallel_size > 1: + assert args.num_experts is not None, "num_experts must be non None to use expert model parallelism" + assert args.num_experts % args.expert_model_parallel_size == 0, \ + "Number of experts should be a multiple of expert model parallel_size." + assert not args.fp16, \ + "Expert parallelism is not supported with fp16 training." 
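+
+ # [Editor's note: illustrative sketch only, not part of the original change.]
+ # The expert-parallel check above amounts to requiring that experts split evenly
+ # across expert-parallel ranks. With hypothetical sizes:
+ #     num_experts = 8, expert_model_parallel_size = 4  ->  8 % 4 == 0, so the assert
+ #     passes and each expert-parallel rank owns 8 // 4 = 2 experts; with num_experts = 6
+ #     and the same EP size, the divisibility assert above would fail.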
+ + # Distributed checkpointing checks + if args.use_dist_ckpt and args.use_legacy_models: + raise RuntimeError('--use-dist-ckpt is not supported in legacy models.') + + # Data blend checks + assert args.mock_data + \ + bool(args.data_path) + \ + any([args.train_data_path, args.valid_data_path, args.test_data_path]) \ + <= 1, "A single data source must be provided in training mode, else None" + + if args.use_tp_pp_dp_mapping: + assert args.context_parallel_size * args.expert_model_parallel_size <= 1, \ + "context_parallel and expert_model_parallel can't be used with tp-pp-dp mapping." + + # Deterministic mode + if args.deterministic_mode: + assert not args.use_flash_attn, "Flash attention can not be used in deterministic mode." + assert not args.cross_entropy_loss_fusion, "Cross Entropy Fusion is currently not deterministic." + + all_reduce_choices = ["Tree", "Ring", "CollnetDirect", "CollnetChain", "^NVLS"] + assert os.getenv("NCCL_ALGO", -1) != -1 and os.getenv("NCCL_ALGO") in all_reduce_choices, \ + f"NCCL_ALGO must be one of {all_reduce_choices}." + + torch.use_deterministic_algorithms(True) + + # Update the printed args to reflect that `apply_query_key_layer_scaling` also controls `attention_softmax_in_fp32` + if args.apply_query_key_layer_scaling: + args.attention_softmax_in_fp32 = True + + # Checkpointing + if args.ckpt_fully_parallel_save_deprecated and args.rank == 0: + print('--ckpt-fully-parallel-save flag is deprecated and has no effect.' + ' Use --no-ckpt-fully-parallel-save to disable parallel save.') + if ( + args.use_dist_ckpt + and not args.ckpt_fully_parallel_save + and args.use_distributed_optimizer + and args.rank == 0 + ): + print('Warning: With non-parallel ckpt save and DistributedOptimizer,' + ' it will be impossible to resume training with different parallelism.' + ' Consider removing flag --no-ckpt-fully-parallel-save.') + if args.use_dist_ckpt_deprecated and args.rank == 0: + print('--use-dist-ckpt is deprecated and has no effect.' + ' Use --ckpt-format to select the checkpoint format.') + if args.dist_ckpt_format_deprecated and args.rank == 0: + print('--dist-ckpt-format is deprecated and has no effect.' + ' Use --ckpt-format to select the checkpoint format.') + + # MoE upcycling check + if args.moe_use_upcycling: + assert args.save is not None, "When using upcycling, the --save option must be specified." + if not args.no_load_optim: + args.no_load_optim = True + print('Warning: disabling --no-load-optim for upcycling.') + if not args.no_load_rng: + args.no_load_rng = True + print('Warning: disabling --no-load-rng for upcycling.') + + # Print arguments. + _print_args("arguments", args) + + return args + + +def _print_args(title, args): + """Print arguments.""" + if args.rank == 0: + print(f'------------------------ {title} ------------------------', + flush=True) + str_list = [] + for arg in vars(args): + dots = '.' * (48 - len(arg)) + str_list.append(' {} {} {}'.format(arg, dots, getattr(args, arg))) + for arg in sorted(str_list, key=lambda x: x.lower()): + print(arg, flush=True) + print(f'-------------------- end of {title} ---------------------', + flush=True) + + +def _check_arg_is_not_none(args, arg): + assert getattr(args, arg) is not None, '{} argument is None'.format(arg) + + +def core_transformer_config_from_args(args, config_class=None): + + # Config class. 
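+ # [Editor's note: illustrative sketch only, not part of the original change.]
+ # This helper selects the config class and then copies every dataclass field that has a
+ # matching attribute on args; the field-copy loop further down is roughly equivalent to
+ # the (hypothetical) comprehension:
+ #     kw_args = {f.name: getattr(args, f.name)
+ #                for f in dataclasses.fields(config_class) if hasattr(args, f.name)}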
+ config_class = config_class or TransformerConfig + + if args.multi_latent_attention: + config_class = MLATransformerConfig + + # Translate args to core transformer configuration + kw_args = {} + for f in dataclasses.fields(config_class): + if hasattr(args, f.name): + kw_args[f.name] = getattr(args, f.name) + kw_args['persist_layer_norm'] = not args.no_persist_layer_norm + kw_args['layernorm_zero_centered_gamma'] = args.apply_layernorm_1p + kw_args['layernorm_epsilon'] = args.norm_epsilon + kw_args['deallocate_pipeline_outputs'] = True + kw_args['pipeline_dtype'] = args.params_dtype + kw_args['batch_p2p_comm'] = not args.overlap_p2p_comm + kw_args['num_moe_experts'] = args.num_experts + kw_args['rotary_interleaved'] = args.rotary_interleaved + kw_args['first_pipeline_num_layers']= args.decoder_first_pipeline_num_layers + kw_args['last_pipeline_num_layers']= args.decoder_last_pipeline_num_layers + if args.swiglu: + kw_args['activation_func'] = F.silu + kw_args['gated_linear_unit'] = True + kw_args['bias_activation_fusion'] = args.bias_swiglu_fusion + else: + kw_args['bias_activation_fusion'] = args.bias_gelu_fusion + if args.squared_relu: + assert not args.swiglu + kw_args['activation_func'] = squared_relu + if args.init_method_xavier_uniform: + kw_args['init_method'] = torch.nn.init.xavier_uniform_ + kw_args['scaled_init_method'] = torch.nn.init.xavier_uniform_ + if args.group_query_attention: + kw_args['num_query_groups'] = args.num_query_groups + else: + kw_args['num_query_groups'] = None + kw_args['config_logger_dir'] = args.config_logger_dir + + # Return config. + return config_class(**kw_args) + + +def _add_transformer_engine_args(parser): + group = parser.add_argument_group(title='Transformer-Engine') + + group.add_argument('--fp8-format', default=None, + choices=['e4m3', 'hybrid'], + help='Which fp8 format scheme to use for FP8 tensors in the forward and backward pass', + dest='fp8') + group.add_argument('--fp8-margin', type=int, default=0, + help='Scaling margin for fp8', + dest='fp8_margin') + group.add_argument('--fp8-interval', type=int, default=1, + help='DEPRECATED. This flag is ignored. 
Scaling update interval for fp8', + dest='fp8_interval') + group.add_argument('--fp8-amax-history-len', type=int, default=1, + help='Number of steps for which amax history is recorded per tensor', + dest='fp8_amax_history_len') + group.add_argument('--fp8-amax-compute-algo', default='most_recent', + choices=['most_recent', 'max'], + help='Algorithm for computing amax from history', + dest='fp8_amax_compute_algo') + group.add_argument('--no-fp8-wgrad', action='store_false', + help='Execute wgrad in higher precision even for FP8 runs', + dest='fp8_wgrad') + group.add_argument('--transformer-impl', default='transformer_engine', + choices=['local', 'transformer_engine'], + help='Which Transformer implementation to use.') + group.add_argument('--fp8-param-gather', action='store_true', + help='Keep the compute param in fp8 (do not use any other intermediate ' + 'dtype) and perform the param all-gather in fp8.') + + return parser + +def _add_inference_args(parser): + group = parser.add_argument_group(title='inference') + + group.add_argument('--inference-batch-times-seqlen-threshold', + type=int, default=512, + help='During inference, if batch-size times ' + 'sequence-length is smaller than this threshold ' + 'then we will not use pipelining, otherwise we will.') + group.add_argument('--max-tokens-to-oom', + type=int, default=12000, + help='Maximum number of tokens during inference' + 'tokens here is # in prompt + # to generate' + 'Allows us to throw an error before OOM crashes server') + group.add_argument('--output-bert-embeddings', action='store_true', + help='Output Bert embeddings (via mean pooling) from ' + 'model, rather than its binary head output or entire ' + 'hidden batch.') + group.add_argument('--bert-embedder-type', default="megatron", + choices=["megatron", "huggingface"], + help='Select either Megatron or Huggingface as the ' + 'Bert embedder.') + + return parser + + +def _add_retro_args(parser): + group = parser.add_argument_group(title='retro') + + group.add_argument('--retro-project-dir', default=None, + help='Retro project directory, which contains the ' + 'preprocessed data for pretraining. 
This directory ' + 'is built during preprocessing (see ' + 'tools/retro/README.md), and contains subdirectories ' + 'for the chunk database and pretraining neighbors.') + group.add_argument('--retro-add-retriever', + action='store_true', default=False, + help='Add a retriever to the transformer, for use in ' + 'pretraining a Retro model.') + group.add_argument('--retro-cyclic-train-iters', type=int, default=None, + help='Set number of training iterations for cyclic ' + 'Retro training.') + group.add_argument('--retro-encoder-layers', type=int, default=2, + help='Number of layers to use for the retrieval ' + 'encoder.') + group.add_argument('--retro-encoder-hidden-dropout', + type=float, default=0.1, help='Hidden dropout for ' + 'retrieval encoder.') + group.add_argument('--retro-encoder-attention-dropout', + type=float, default=0.1, help='Attention dropout for ' + 'retrieval encoder.') + group.add_argument("--retro-num-neighbors", type=int, default=2, + help='Number of neighbors to retrieve during ' + 'pretraining.') + group.add_argument("--retro-num-retrieved-chunks", type=int, default=2, + help='Number of chunks to retrieve from the retrieval ' + 'database.') + group.add_argument("--retro-attention-gate", type=float, default=1, + help="Gated cross attention.") + group.add_argument("--retro-no-verify-neighbor-count", action="store_false", + dest="retro_verify_neighbor_count", + help="Skip verifying that len(GPT dataset) == len(saved " + "neighbors).") + + # Enforce argument naming convention. + for action in group._group_actions: + prefix = action.dest.split("_")[0] + assert prefix == "retro", \ + "Retro args must be prefixed with '--retro-*', for consistent " \ + "styling. Please fix '%s'." % ", ".join(action.option_strings) + + return parser + + +def _add_network_size_args(parser): + group = parser.add_argument_group(title='network size') + + group.add_argument('--num-layers', type=int, default=None, + help='Number of transformer layers.') + group.add_argument('--encoder-num-layers', type=int, default=None, + help='Number of encoder transformer layers.') + group.add_argument('--decoder-num-layers', type=int, default=None, + help='Number of decoder transformer layers.') + group.add_argument('--hidden-size', type=int, default=None, + help='Tansformer hidden size.') + group.add_argument('--ffn-hidden-size', type=int, default=None, + help='Transformer Feed-Forward Network hidden size. ' + 'This is set to 4*hidden-size if not provided') + group.add_argument('--num-attention-heads', type=int, default=None, + help='Number of transformer attention heads.') + group.add_argument('--kv-channels', type=int, default=None, + help='Projection weights dimension in multi-head ' + 'attention. This is set to ' + ' args.hidden_size // args.num_attention_heads ' + 'if not provided.') + group.add_argument('--group-query-attention', action='store_true', + help='Use group-query attention.') + group.add_argument('--num-query-groups', type=int, default=1) + + group.add_argument('--max-position-embeddings', type=int, default=None, + help='Maximum number of position embeddings to use. ' + 'This is the size of position embedding.') + group.add_argument('--position-embedding-type', type=str, default='learned_absolute', + choices=['learned_absolute', 'rope', 'none'], + help='Position embedding type.') + group.add_argument('--use-rotary-position-embeddings', action='store_true', + help='Use rotary positional embeddings or not. 
'
+ 'Deprecated: use --position-embedding-type')
+ group.add_argument('--rotary-base', type=int, default=10000,
+ help='Base to use for rotary positional embeddings, default 10000')
+ group.add_argument('--rotary-percent', type=float, default=1.0,
+ help='Percent of rotary dimension to use, default 100%%')
+ group.add_argument('--rotary-interleaved', action='store_true',
+ help='Use interleaved rotary embedding.')
+ group.add_argument('--rotary-seq-len-interpolation-factor', type=int, default=None,
+ help='Sequence length interpolation factor for rotary embeddings.')
+ group.add_argument('--use-rope-scaling', action='store_true',
+ help='Apply rope scaling as used in llama3.1')
+ group.add_argument('--no-position-embedding',
+ action='store_false',
+ help='Disable position embedding. Deprecated: use --position-embedding-type',
+ dest='add_position_embedding')
+ group.add_argument('--make-vocab-size-divisible-by', type=int, default=128,
+ help='Pad the vocab size to be divisible by this value. '
+ 'This is added for computational efficiency reasons.')
+ group.add_argument('--normalization', default='LayerNorm',
+ choices=['LayerNorm', 'RMSNorm'],
+ help='Which normalization technique to use.')
+ group.add_argument('--norm-epsilon', type=float, default=1e-5,
+ help='Epsilon for layer norm and RMS norm.')
+ group.add_argument('--apply-layernorm-1p', action='store_true',
+ help='Adjust LayerNorm weights such that they are centered '
+ 'around zero. This improves numerical stability.')
+ group.add_argument('--apply-residual-connection-post-layernorm',
+ action='store_true',
+ help='If set, use original BERT residual connection '
+ 'ordering.')
+ group.add_argument('--openai-gelu', action='store_true',
+ help='Use OpenAI\'s GeLU implementation. This option '
+ 'should not be used unless for backward compatibility '
+ 'reasons.')
+ group.add_argument('--squared-relu', action='store_true',
+ help='Use squared relu activation instead of default gelu')
+ group.add_argument('--swiglu', action='store_true',
+ help='Use gated linear units and SiLU activation instead of default gelu')
+ group.add_argument('--onnx-safe', type=bool, required=False,
+ help='Use workarounds for known problems with '
+ 'Torch ONNX exporter')
+ group.add_argument('--bert-no-binary-head', action='store_false',
+ help='Disable BERT binary head.',
+ dest='bert_binary_head')
+ group.add_argument('--untie-embeddings-and-output-weights', action='store_true',
+ help='Untie embeddings and output weights.')
+ group.add_argument('--multi-latent-attention', action='store_true',
+ help='Use multi-latent attention for model.')
+ return parser
+
+
+def _add_straggler_detector_args(parser):
+ group = parser.add_argument_group(title='straggler')
+ group.add_argument('--log-straggler', action='store_true',
+ help='If set, tracks and logs straggler per GPU.')
+ group.add_argument('--disable-straggler-on-startup', action='store_true',
+ help='If set, StragglerDetector is disabled on startup.')
+ group.add_argument('--straggler-ctrlr-port', type=int, default=65535,
+ help='Port number to toggle StragglerDetector on/off at runtime')
+ group.add_argument('--straggler-minmax-count', type=int, default=1,
+ help='Number of ranks to report with high/low estimated throughput')
+ return parser
+
+
+def _add_one_logger_args(parser):
+ group = parser.add_argument_group(title='one logger')
+ group.add_argument('--no-one-logger', action='store_false',
+ help='If set, disable using one_logger to track E2E metrics. '
+ 'Note that one_logger is an internal tool and
not ' + 'available externally. For installation, please go to ' + 'https://confluence.nvidia.com/display/MLWFO/Package+Repositories' + 'for more details', + dest='enable_one_logger') + group.add_argument('--one-logger-project', type=str, default='megatron-lm', + help='The one-logger project name. Will ignore if ' + '--no-one-logger is set') + group.add_argument('--one-logger-run-name', type=str, default=None, + help='The one-logger run name displayed. Will ignore if ' + '--no-one-logger is set') + group.add_argument('--one-logger-async', action='store_true', + help='If set, forces one_logger to use async mode.') + group.add_argument('--app-tag-run-name', type=str, default=None, + help='Jobs belonging to same training run, suppose to ' + 'have the same name. It will be used to track progress of ' + 'a training done over multiple different jobs') + group.add_argument('--app-tag-run-version', type=str, default='0.0.0', + help='The version of the training of which current job is ' + 'part of. It will be used to track the changes in the ' + 'application side which might change the performance ' + 'baseline') + return parser + + +def _add_ft_package_args(parser): + group = parser.add_argument_group(title='ft_package') + group.add_argument('--enable-ft-package', action='store_true', + help='If set, Fault Tolerance package is enabled. ' + 'Note: This feature is for Nvidia internal use only.') + return parser + + +def _add_config_logger_args(parser): + group = parser.add_argument_group(title='config logger') + group.add_argument('--config-logger-dir', type=str, default='', + help='If set, will dump all configs to --config-logger-dir', + dest='config_logger_dir') + return parser + + +def _add_logging_args(parser): + group = parser.add_argument_group(title='logging') + + group.add_argument('--log-params-norm', action='store_true', + help='If set, calculate and log parameters norm.') + group.add_argument('--log-num-zeros-in-grad', action='store_true', + help='If set, calculate and log the number of zeros in gradient.') + group.add_argument('--log-throughput', action='store_true', + help='If set, calculate and log throughput per GPU.') + group.add_argument('--log-progress', action='store_true', + help='If set, log progress (in terms of number of processed tokens and ' + 'number of floating-point operations) to progress.txt file in checkpoint ' + 'directory.') + group.add_argument('--timing-log-level', type=int, + default=0, choices=range(0,3), + help='Granularity level to measure and report timing. ' + ' 0: report only iteration time and make sure timing ' + ' does not introduce extra overhead.' + ' 1: report timing for operations that are executed ' + ' very limited times (basically once) during ' + ' each iteration (such as gradient all-reduce) ' + ' 2: report timing for operations that migh be ' + ' executed numerous times during each iteration. ' + 'Note that setting the level to 1 or 2 might ' + 'cause increase in iteration time.') + group.add_argument('--no-barrier-with-level-1-timing', action='store_false', + help='If not set, use barrier with level 1 time ' + 'measurements. Note that this is up to the user ' + 'to make sure calling barrier with their timers ' + 'will not result in hangs. 
This can happen if for ' + 'example the user adds a level 1 timer that is not ' + 'called by all ranks.', + dest='barrier_with_L1_time') + group.add_argument('--timing-log-option', type=str, default='minmax', + choices=['max', 'minmax', 'all'], + help='Options for logging timing:' + ' max: report the max timing across all ranks' + ' minmax: report min and max timings across all ranks' + ' all: report timings of all ranks.') + group.add_argument('--tensorboard-log-interval', type=int, default=1, + help='Report to tensorboard interval.') + group.add_argument('--tensorboard-queue-size', type=int, default=1000, + help='Size of the tensorboard queue for pending events ' + 'and summaries before one of the ‘add’ calls forces a ' + 'flush to disk.') + group.add_argument('--log-timers-to-tensorboard', action='store_true', + help='If set, write timers to tensorboard.') + group.add_argument('--no-log-loss-scale-to-tensorboard', + action='store_false', + help='Disable loss-scale logging to tensorboard.', + dest='log_loss_scale_to_tensorboard') + group.add_argument('--log-validation-ppl-to-tensorboard', + action='store_true', + help='If set, write validation perplexity to ' + 'tensorboard.') + group.add_argument('--log-memory-to-tensorboard', + action='store_true', + help='Enable memory logging to tensorboard.') + group.add_argument('--log-world-size-to-tensorboard', + action='store_true', + help='Enable world size logging to tensorboard.') + group.add_argument('--wandb-project', type=str, default='', + help='The wandb project name. Ignore wandb by default.') + group.add_argument('--wandb-exp-name', type=str, default='', + help='The wandb experiment name.') + group.add_argument('--wandb-save-dir', type=str, default='', + help='Path to save the wandb results locally.') + group.add_argument('--logging-level', type=int, default=None, + help='Set default logging level') + return parser + + +def _add_regularization_args(parser): + group = parser.add_argument_group(title='regularization') + + group.add_argument('--attention-dropout', type=float, default=0.1, + help='Post attention dropout probability.') + group.add_argument('--hidden-dropout', type=float, default=0.1, + help='Dropout probability for hidden state transformer.') + group.add_argument('--weight-decay', type=float, default=0.01, + help='Weight decay coefficient for L2 regularization.') + group.add_argument('--start-weight-decay', type=float, + help='Initial weight decay coefficient for L2 regularization.') + group.add_argument('--end-weight-decay', type=float, + help='End of run weight decay coefficient for L2 regularization.') + group.add_argument('--weight-decay-incr-style', type=str, default='constant', + choices=['constant', 'linear', 'cosine'], + help='Weight decay increment function.') + group.add_argument('--clip-grad', type=float, default=1.0, + help='Gradient clipping based on global L2 norm.') + group.add_argument('--adam-beta1', type=float, default=0.9, + help='First coefficient for computing running averages ' + 'of gradient and its square') + group.add_argument('--adam-beta2', type=float, default=0.999, + help='Second coefficient for computing running averages ' + 'of gradient and its square') + group.add_argument('--adam-eps', type=float, default=1e-08, + help='Term added to the denominator to improve' + 'numerical stability') + group.add_argument('--sgd-momentum', type=float, default=0.9, + help='Momentum factor for sgd') + return parser + + +def _add_training_args(parser): + group = parser.add_argument_group(title='training') + + 
group.add_argument('--micro-batch-size', type=int, default=None,
+ help='Batch size per model instance (local batch size). '
+ 'Global batch size is local batch size times data '
+ 'parallel size times number of micro batches.')
+ group.add_argument('--batch-size', type=int, default=None,
+ help='Old batch size parameter, do not use. '
+ 'Use --micro-batch-size instead')
+ group.add_argument('--global-batch-size', type=int, default=None,
+ help='Training batch size. If set, it should be a '
+ 'multiple of micro-batch-size times data-parallel-size. '
+ 'If this value is None, then '
+ 'use micro-batch-size * data-parallel-size as the '
+ 'global batch size. This choice will result in 1 for '
+ 'number of micro-batches.')
+ group.add_argument('--rampup-batch-size', nargs='*', default=None,
+ help='Batch size ramp up with the following values:'
+ ' --rampup-batch-size <start batch size> '
+ ' <batch size increment> '
+ ' <ramp-up samples> '
+ 'For example:'
+ ' --rampup-batch-size 16 8 300000 \ '
+ ' --global-batch-size 1024 '
+ 'will start with global batch size 16 and over '
+ ' (1024 - 16) / 8 = 126 intervals will increase '
+ 'the batch size linearly to 1024. In each interval '
+ 'we will use approximately 300000 / 126 = 2380 samples.')
+ group.add_argument('--decrease-batch-size-if-needed', action='store_true', default=False,
+ help='If set, decrease batch size if microbatch_size * dp_size '
+ 'does not divide batch_size. Useful for KSO (Keep Soldiering On) '
+ 'to continue making progress if number of healthy GPUs (and '
+ 'corresponding dp_size) does not support current batch_size. '
+ 'Old batch_size will be restored if training is re-started with '
+ 'dp_size that divides batch_size // microbatch_size.')
+ group.add_argument('--recompute-activations', action='store_true',
+ help='recompute activation to allow for training '
+ 'with larger models, sequences, and batch sizes.')
+ group.add_argument('--recompute-granularity', type=str, default=None,
+ choices=['full', 'selective'],
+ help='Checkpoint activations to allow for training '
+ 'with larger models, sequences, and batch sizes.
' + 'It is supported at two granularities 1) full: ' + 'whole transformer layer is recomputed, ' + '2) selective: core attention part of the transformer ' + 'layer is recomputed.') + group.add_argument('--no-check-for-nan-in-loss-and-grad', action='store_false', + help='Check for NaNs in loss and grad', + dest='check_for_nan_in_loss_and_grad') + group.add_argument('--distribute-saved-activations', + action='store_true', + help='If set, distribute recomputed activations ' + 'across model parallel group.') + group.add_argument('--recompute-method', type=str, default=None, + choices=['uniform', 'block'], + help='1) uniform: uniformly divide the total number of ' + 'Transformer layers and recompute the input activation of ' + 'each divided chunk at specified granularity, ' + '2) recompute the input activations of only a set number of ' + 'individual Transformer layers per pipeline stage and do the ' + 'rest without any recomputing at specified granularity' + 'default) do not apply activations recompute to any layers') + group.add_argument('--recompute-num-layers', type=int, default=None, + help='1) uniform: the number of Transformer layers in each ' + 'uniformly divided recompute unit, ' + '2) block: the number of individual Transformer layers ' + 'to recompute within each pipeline stage.') + group.add_argument('--no-clone-scatter-output-in-embedding', action='store_false', + help='If not set, clone the output of the scatter in embedding layer to GC original tensor.', + dest='clone_scatter_output_in_embedding') + group.add_argument('--profile', action='store_true', + help='Enable nsys profiling. When using this option, nsys ' + 'options should be specified in commandline. An example ' + 'nsys commandline is `nsys profile -s none -t nvtx,cuda ' + '-o --force-overwrite true ' + '--capture-range=cudaProfilerApi ' + '--capture-range-end=stop`.') + group.add_argument('--profile-step-start', type=int, default=10, + help='Global step to start profiling.') + group.add_argument('--profile-step-end', type=int, default=12, + help='Global step to stop profiling.') + group.add_argument('--use-pytorch-profiler', action='store_true', + help='Use the built-in pytorch profiler. 
' + 'Useful if you wish to view profiles in tensorboard.', + dest='use_pytorch_profiler') + group.add_argument('--profile-ranks', nargs='+', type=int, default=[0], + help='Global ranks to profile.') + group.add_argument('--tp-comm-overlap', action='store_true', help='Enables the ' + ' overlap of Tensor parallel communication and GEMM kernels.') + group.add_argument('--tp-comm-overlap-cfg', type=str, default=None, + help='Config file when tp_comm_overlap is enabled.') + group.add_argument('--disable-tp-comm-overlap-ag', action='store_false', + help=('Disables the All-Gather overlap with GEMM by ' + 'pipelining the GEMM and All-Gather.'), + dest='tp_comm_overlap_ag') + group.add_argument('--disable-tp-comm-overlap-rs', action='store_false', + help=('Disables the Reduce-Scatter overlap with GEMM by ' + 'pipelining the GEMM and Reduce-Scatter.'), + dest='tp_comm_overlap_rs') + group.add_argument('--tp-comm-overlap-rs-dgrad', action='store_true', + help = 'Enables the Reduce-Scatter overlap with dgrad GEMM.', + dest='tp_comm_overlap_rs_dgrad') + group.add_argument('--disable-tp-comm-bulk-dgrad', action='store_false', + help='Disables the All-Gather overlap with bprop activation gradient GEMM.', + dest='tp_comm_bulk_dgrad') + group.add_argument('--disable-tp-comm-bulk-wgrad', action='store_false', + help='Disables the Reduce-Scatter overlap with bprop weight gradient GEMM.', + dest='tp_comm_bulk_wgrad') + group.add_argument('--tp-comm-bootstrap-backend', default='nccl', type=str, + choices=['nccl', 'mpi', 'gloo'], + help='Set the bootstrapping backend of Tensor parallel communications.') + group.add_argument('--use-cpu-initialization', action='store_true', + default=None, + help='If set, initialize weights on the CPU. This eliminates init differences based on tensor parallelism.') + group.add_argument('--empty-unused-memory-level', default=0, type=int, + choices=[0, 1, 2], + help='Call torch.cuda.empty_cache() each iteration ' + '(training and eval), to reduce fragmentation.' + '0=off, 1=moderate, 2=aggressive.') + group.add_argument('--deterministic-mode', action='store_true', + help='Choose code that has deterministic execution. This usually ' + 'means slower execution, but is good for debugging and testing.') + group.add_argument('--check-weight-hash-across-dp-replicas-interval', type=int, default=None, + help='Interval to check weight hashes are same across DP replicas. If not specified, weight hashes not checked.') + group.add_argument('--calculate-per-token-loss', action='store_true', + help=('Scale cross entropy loss by the number of non-padded tokens in the ' + 'global batch, versus the default behavior of assuming all tokens are non-padded.')) + group.add_argument('--train-sync-interval', type=int, default=None, + help='Training CPU-GPU synchronization interval, to ensure that CPU is not running too far ahead of GPU.') + + # deprecated + group.add_argument('--checkpoint-activations', action='store_true', + help='Checkpoint activation to allow for training ' + 'with larger models, sequences, and batch sizes.') + group.add_argument('--train-iters', type=int, default=None, + help='Total number of iterations to train over all ' + 'training runs. Note that either train-iters or ' + 'train-samples should be provided.') + group.add_argument('--train-samples', type=int, default=None, + help='Total number of samples to train over all ' + 'training runs. 
Note that either train-iters or ' + 'train-samples should be provided.') + group.add_argument('--log-interval', type=int, default=100, + help='Report loss and timing interval.') + group.add_argument('--exit-interval', type=int, default=None, + help='Exit the program after the iteration is divisible ' + 'by this value.') + group.add_argument('--exit-duration-in-mins', type=int, default=None, + help='Exit the program after this many minutes.') + group.add_argument('--exit-signal-handler', action='store_true', + help='Dynamically save the checkpoint and shutdown the ' + 'training if SIGTERM is received') + group.add_argument('--tensorboard-dir', type=str, default=None, + help='Write TensorBoard logs to this directory.') + group.add_argument('--no-masked-softmax-fusion', + action='store_false', + help='Disable fusion of query_key_value scaling, ' + 'masking, and softmax.', + dest='masked_softmax_fusion') + group.add_argument('--no-bias-gelu-fusion', action='store_false', + help='Disable bias and gelu fusion.', + dest='bias_gelu_fusion') + group.add_argument('--no-bias-swiglu-fusion', action='store_false', + help='Disable bias and swiglu fusion, the fusion is ' + 'available only when using megatron-core.', + dest='bias_swiglu_fusion') + group.add_argument('--no-bias-dropout-fusion', action='store_false', + help='Disable bias and dropout fusion.', + dest='bias_dropout_fusion') + group.add_argument('--no-rope-fusion', action='store_false', + help='Disable rope fusion, the fusion is available ' + 'only when using megatron-core.', + dest='apply_rope_fusion') + group.add_argument('--cross-entropy-loss-fusion', action='store_true', + help='Enabled fusion of cross entropy loss calculation.', + dest='cross_entropy_loss_fusion') + group.add_argument('--use-flash-attn', action='store_true', + help='use FlashAttention implementation of attention. ' + 'https://arxiv.org/abs/2205.14135') + group.add_argument('--disable-bias-linear', action='store_false', + help='Disable bias in the linear layers', + dest='add_bias_linear') + group.add_argument('--add-qkv-bias', action='store_true', + help='Enable bias only in the QKV linear layers', + dest='add_qkv_bias') + group.add_argument('--optimizer', type=str, default='adam', + choices=['adam', 'sgd'], + help='Optimizer function') + group.add_argument('--dataloader-type', type=str, default=None, + choices=['single', 'cyclic', 'external'], + help='Single pass vs multiple pass data loader') + group.add_argument('--no-async-tensor-model-parallel-allreduce', + action='store_false', + help='DEPRECATED. This flag is ignored.', + dest='async_tensor_model_parallel_allreduce') + group.add_argument('--no-persist-layer-norm', action='store_true', + help='Disable using persistent fused layer norm kernel. ' + 'This kernel supports only a set of hidden sizes. Please ' + 'check persist_ln_hidden_sizes if your hidden ' + 'size is supported.') + group.add_argument('--sequence-parallel', action='store_true', + help='Enable sequence parallel optimization.') + group.add_argument('--no-gradient-accumulation-fusion', + action='store_false', + help='Disable fusing gradient accumulation to weight ' + 'gradient computation of linear layers', + dest='gradient_accumulation_fusion') + group.add_argument('--use-mcore-models', action='store_true', + dest='deprecated_use_mcore_models', + help='DEPRECATED. Use the implementation from megatron core.' 
+ 'Now ignored and mcore models are the default, use ' + '--use-legacy-models to not use core models.') + group.add_argument('--use-legacy-models', action='store_true', + help='Use the legacy Megatron models, not Megatron-Core models.') + group.add_argument('--manual-gc', action='store_true', + help='Disable the threshold-based default garbage ' + 'collector and trigger the garbage collection manually. ' + 'Manual garbage collection helps to align the timing of ' + 'the collection across ranks which mitigates the impact ' + 'of CPU-associated jitters. When the manual gc is enabled, ' + 'garbage collection is performed only at the start and the ' + 'end of the validation routine by default.') + group.add_argument('--manual-gc-interval', type=int, default=0, + help='Training step interval to trigger manual garbage ' + 'collection. When the value is set to 0, garbage ' + 'collection is not triggered between training steps.') + group.add_argument('--no-manual-gc-eval', action='store_false', + help='When using manual garbage collection, disable ' + 'garbage collection at the start and the end of each ' + 'evaluation run.', dest='manual_gc_eval') + group.add_argument('--disable-tp-comm-split-ag', action='store_false', + help='Disables the All-Gather overlap with fprop GEMM.', + dest='tp_comm_split_ag') + group.add_argument('--disable-tp-comm-split-rs', action='store_false', + help='Disables the Reduce-Scatter overlap with fprop GEMM.', + dest='tp_comm_split_rs') + + return parser + + +def _add_initialization_args(parser): + group = parser.add_argument_group(title='initialization') + + group.add_argument('--seed', type=int, default=1234, + help='Random seed used for python, numpy, ' + 'pytorch, and cuda.') + group.add_argument('--data-parallel-random-init', action='store_true', + help='Enable random initialization of params ' + 'across data parallel ranks') + group.add_argument('--init-method-std', type=float, default=0.02, + help='Standard deviation of the zero mean normal ' + 'distribution used for weight initialization.') + group.add_argument('--init-method-xavier-uniform', action='store_true', + help='Enable Xavier uniform parameter initialization') + + return parser + + +def _add_learning_rate_args(parser): + group = parser.add_argument_group(title='learning rate') + + group.add_argument('--lr', type=float, default=None, + help='Initial learning rate. 
Depending on decay style ' + 'and initial warmup, the learning rate at each ' + 'iteration would be different.') + group.add_argument('--lr-decay-style', type=str, default='linear', + choices=['constant', 'linear', 'cosine', 'inverse-square-root', 'WSD'], + help='Learning rate decay function.') + group.add_argument('--lr-wsd-decay-style', type=str, default='exponential', + choices=['exponential', 'linear', 'cosine'], + help='Decay style for the annealing phase of WSD'), + group.add_argument('--lr-decay-iters', type=int, default=None, + help='number of iterations to decay learning rate over,' + ' If None defaults to `--train-iters`') + group.add_argument('--lr-decay-samples', type=int, default=None, + help='number of samples to decay learning rate over,' + ' If None defaults to `--train-samples`') + group.add_argument('--lr-wsd-decay-samples', type=int, default=None, + help='number of samples for the annealing phase in the wsd schedule') + group.add_argument('--lr-wsd-decay-iters', type=int, default=None, + help='number of iterations for the annealing phase in the wsd schedule') + group.add_argument('--lr-warmup-fraction', type=float, default=None, + help='fraction of lr-warmup-(iters/samples) to use ' + 'for warmup (as a float)') + group.add_argument('--lr-warmup-iters', type=int, default=0, + help='number of iterations to linearly warmup ' + 'learning rate over.') + group.add_argument('--lr-warmup-samples', type=int, default=0, + help='number of samples to linearly warmup ' + 'learning rate over.') + group.add_argument('--lr-warmup-init', type=float, default=0.0, + help='Initial value for learning rate warmup. The ' + 'scheduler starts warmup from this value.') + group.add_argument('--warmup', type=int, default=None, + help='Old lr warmup argument, do not use. Use one of the' + '--lr-warmup-* arguments above') + group.add_argument('--min-lr', type=float, default=0.0, + help='Minimum value for learning rate. The scheduler' + 'clip values below this threshold.') + group.add_argument('--override-opt_param-scheduler', action='store_true', + help='Reset the values of the scheduler (learning rate,' + 'warmup iterations, minimum learning rate, maximum ' + 'number of iterations, and decay style from input ' + 'arguments and ignore values from checkpoints. Note' + 'that all the above values will be reset.') + group.add_argument('--use-checkpoint-opt_param-scheduler', action='store_true', + help='Use checkpoint to set the values of the scheduler ' + '(learning rate, warmup iterations, minimum learning ' + 'rate, maximum number of iterations, and decay style ' + 'from checkpoint and ignore input arguments.') + group.add_argument('--decoupled-lr', type=float, default=None, + help='Separate learning rate for the input and output layer') + group.add_argument('--decoupled-min-lr', type=float, default=None, + help='Minimum value for learning rate for the input and output layer. 
The scheduler' + 'clip values below this threshold') + + return parser + + +def _add_checkpointing_args(parser): + group = parser.add_argument_group(title='checkpointing') + + group.add_argument('--save', type=str, default=None, + help='Output directory to save checkpoints to.') + group.add_argument('--save-interval', '--persistent-save-interval', type=int, default=None, + help='Number of iterations between persistent checkpoint saves.') + group.add_argument('--no-save-optim', action='store_true', default=None, + help='Do not save current optimizer.') + group.add_argument('--no-save-rng', action='store_true', default=None, + help='Do not save current rng state.') + group.add_argument('--load', type=str, default=None, + help='Directory containing a model checkpoint.') + group.add_argument('--no-load-optim', action='store_true', default=None, + help='Do not load optimizer when loading checkpoint.') + group.add_argument('--no-load-rng', action='store_true', default=None, + help='Do not load rng state when loading checkpoint.') + group.add_argument('--non-persistent-save-interval', type=int, default=None, + help='Number of iterations between non-persistent saves.') + group.add_argument('--non-persistent-ckpt-type', type=str, default=None, + choices=['global', 'local', 'in_memory', None], + help='Type of non-persistent model checkpoints. ' + '"global" - Saved as a standard checkpoint (e.g., on Lustre) with old checkpoints being removed. ' + '"local" - [TBD] Each rank saves a portion of the checkpoint locally (e.g., on SSD/ramdisk). ' + '"in_memory" - [TBD] A special kind of local checkpoint that avoids serialization. ' + 'None - No non-persistent checkpointing (default option).') + group.add_argument('--non-persistent-global-ckpt-dir', type=str, default=None, + help='Directory containing global non-persistent model checkpoints.') + group.add_argument('--non-persistent-local-ckpt-dir', type=str, default=None, + help='Directory containing local non-persistent model checkpoints.') + group.add_argument('--non-persistent-local-ckpt-algo', type=str, default='fully_parallel', + choices=['fully_parallel', 'atomic'], + help='Algorithm for local non-persistent checkpointing.') + group.add_argument('--finetune', action='store_true', + help='Load model for finetuning. Do not load optimizer ' + 'or rng state from checkpoint and set iteration to 0. 
' + 'Assumed when loading a release checkpoint.') + group.add_argument('--pretrained-checkpoint', type=str, default=None, + help='Directory containing a pretrained model checkpoint for finetuning.') + group.add_argument('--ckpt-step', type=int, default=None, + help='Checkpoint step to load model from.') + group.add_argument('--no-initialization', action='store_false', + help='Do not perform initialization when building model, ' + 'can reduce startup time when definitely loading from a ' + 'checkpoint', + dest='perform_initialization') + group.add_argument('--use-checkpoint-args', action='store_true', + help='Override any command line arguments with arguments ' + 'from the checkpoint') + group.add_argument('--exit-on-missing-checkpoint', action='store_true', + help="If '--load' is set, but checkpoint is not found " + "(e.g., path typo), then exit instead of random " + "initialization.") + group.add_argument('--use-dist-ckpt', action='store_true', + dest='use_dist_ckpt_deprecated', + help='Deprecated: see --ckpt-format.') + group.add_argument('--auto-detect-ckpt-format', action='store_true', + help='Determine if the checkpoint format is in legacy or distributed format.' + ' If False, expects distributed checkpoint iff args.ckpt_format != "torch".' + ' Might slow down loading a bit (double rank0 ckpt load).') + group.add_argument('--dist-ckpt-format', + dest='dist_ckpt_format_deprecated', + help='Deprecated: see --ckpt-format.') + group.add_argument('--ckpt-format', default='torch_dist', + choices=['torch', 'torch_dist', 'zarr'], + help='Checkpoint format to use.') + group.add_argument('--ckpt-convert-format', default=None, + choices=['torch', 'torch_dist', 'zarr'], + help='Checkpoint format for conversion.') + group.add_argument('--ckpt-convert-save', default=None, + help='Save directory for converted checkpoint.') + group.add_argument('--ckpt-convert-update-legacy-dist-opt-format', action='store_true', + help='When loading a checkpoint, update the legacy format ' + 'for the distributed optimizer, which previously used a ' + 'merged param/grad buffer and a different bucket mapping. ' + 'The legacy format was deprecated on Feb 13, 2024.') + group.add_argument('--ckpt-fully-parallel-save', action='store_true', + dest='ckpt_fully_parallel_save_deprecated', + help='Deprecated: see --no-ckpt-fully-parallel-save.') + group.add_argument('--no-ckpt-fully-parallel-save', action='store_false', + dest='ckpt_fully_parallel_save', + help='Disable applying full save parallelization across DP for' + ' distributed checkpoints. Depending on ckpt format' + ' might decrease the number of files in the checkpoint.' + ' Makes DistributedOptimizer checkpoint non-reshardable.') + group.add_argument('--async-save', action='store_true', default=None, + help='Apply async checkpointing save. Currently works only with' + '`torch_dist` distributed checkpoint format.') + group.add_argument('--ckpt-fully-parallel-load', action='store_true', + help='Apply full load parallelization across DP for' + ' distributed checkpoints.') + group.add_argument('--ckpt-assume-constant-structure', action='store_true', + help='If the model and optimizer state dict structure is' + 'constant throughout a *single training job*, it allows for' + 'different checkpointing performance optimizations.') + group.add_argument('--dist-ckpt-strictness', type=str, default='assume_ok_unexpected', + choices=[e.value for e in StrictHandling], + help='Determine handling of key mismatch during checkpoint load.' 
+ ' Check StrictHandling docs for flags meaning.' + ' NOTE: This flag controls only distributed checkpoint' + ' load from storage, not loading state dict into the model.') + return parser + + +def _add_mixed_precision_args(parser): + group = parser.add_argument_group(title='mixed precision') + + group.add_argument('--fp16', action='store_true', + help='Run model in fp16 mode.') + group.add_argument('--bf16', action='store_true', + help='Run model in bfloat16 mode.') + group.add_argument('--loss-scale', type=float, default=None, + help='Static loss scaling, positive power of 2 ' + 'values can improve fp16 convergence. If None, dynamic' + 'loss scaling is used.') + group.add_argument('--initial-loss-scale', type=float, default=2**32, + help='Initial loss-scale for dynamic loss scaling.') + group.add_argument('--min-loss-scale', type=float, default=1.0, + help='Minimum loss scale for dynamic loss scaling.') + group.add_argument('--loss-scale-window', type=float, default=1000, + help='Window over which to raise/lower dynamic scale.') + group.add_argument('--hysteresis', type=int, default=2, + help='hysteresis for dynamic loss scaling') + group.add_argument('--fp32-residual-connection', action='store_true', + help='Move residual connections to fp32.') + group.add_argument('--apply-query-key-layer-scaling', action='store_true', + help='Scale Q * K^T by 1 / layer-number. ' + 'Useful for fp16 training. Also sets `attention_softmax_in_fp32` to True.') + group.add_argument('--attention-softmax-in-fp32', action='store_true', + help='Run attention masking and softmax in fp32.') + group.add_argument('--accumulate-allreduce-grads-in-fp32', + action='store_true', + help='Gradient accumulation and all-reduce in fp32.') + group.add_argument('--fp16-lm-cross-entropy', action='store_true', + help='Move the cross entropy unreduced loss calculation' + 'for lm head to fp16.') + + return parser + + +def _add_distributed_args(parser): + group = parser.add_argument_group(title='distributed') + + group.add_argument('--tensor-model-parallel-size', type=int, default=1, + help='Degree of tensor model parallelism.') + group.add_argument('--encoder-tensor-model-parallel-size', type=int, default=0, + help='Degree of tensor model parallelism for the encoder.') + group.add_argument('--pipeline-model-parallel-size', type=int, default=1, + help='Degree of pipeline model parallelism.') + group.add_argument('--encoder-pipeline-model-parallel-size', type=int, default=0, + help=('Degree of pipeline model parallelism in the encoder. This is ' + 'independent of the amount of pipeline in the decoder.')) + group.add_argument('--pipeline-model-parallel-split-rank', + type=int, default=None, + help=('Rank where encoder and decoder should be split. ' + 'Deprecated; use --encoder-pipeline-model-parallel-size instead.')) + group.add_argument('--decoder-first-pipeline-num-layers', + type=int, default=None, + help=('The number of transformer layers on the first pipeline stage of the decoder. ' + 'Default None is even split of transformer layers across all pipeline stages')) + group.add_argument('--decoder-last-pipeline-num-layers', + type=int, default=None, + help=('The number of transformer layers on the last pipeline stage of the decoder. ' + 'Default None is even split of transformer layers across all pipeline stages')) + group.add_argument('--model-parallel-size', type=int, default=None, + help='Old model parallel argument, do not use. 
Use ' + '--tensor-model-parallel-size instead.') + group.add_argument('--num-layers-per-virtual-pipeline-stage', type=int, default=None, + help='Number of layers per virtual pipeline stage') + group.add_argument('--no-overlap-p2p-communication', action='store_false', + help='overlap pipeline parallel communication with forward and backward chunks', + dest='overlap_p2p_comm') + group.add_argument('--distributed-backend', default='nccl', + choices=['nccl', 'gloo'], + help='Which backend to use for distributed training.') + group.add_argument('--distributed-timeout-minutes', type=int, default=10, + help='Timeout minutes for torch.distributed.') + group.add_argument('--overlap-grad-reduce', action='store_true', + default=False, help='If set, overlap DDP grad reduce.') + group.add_argument('--defer-embedding-wgrad-compute', action='store_true', + default=False, help='If set, defers the vocabulary projection linear layer weight' + 'gradient compute to pipeline flush.', dest='defer_embedding_wgrad_compute') + group.add_argument('--wgrad-deferral-limit', type=int, default=0, help='Number of micro-batches for which' + 'weight gradient computation of vocabulary projection is deferred, defaults to 0 which' + 'means all the micro-batches are deferred. Invalid if `defer-embedding-wgrad-compute`' + 'is not set') + group.add_argument('--no-align-grad-reduce', action='store_false', + help='If not set, all PP stages will launch gradient reduces simultaneously. ' + 'Otherwise, each PP stage will independently launch as needed.', + dest='align_grad_reduce') + group.add_argument('--ddp-bucket-size', type=int, default=None, + help='Bucket size for data-parallel communication') + group.add_argument('--ddp-average-in-collective', action='store_true', + default=False, help='If set, average directly in data-parallel communication collective.') + group.add_argument('--overlap-param-gather', action='store_true', + default=False, help='If set, overlap param all-gather in distributed optimizer.') + group.add_argument('--overlap-param-gather-with-optimizer-step', action='store_true', + default=False, help='If set, overlap param all-gather of first bucket with optimizer step.') + group.add_argument('--no-align-param-gather', action='store_false', + help='If not set, all PP stages will launch param all-gathers simultaneously. ' + 'Otherwise, each PP stage will independently launch as needed.', + dest='align_param_gather') + group.add_argument('--no-scatter-gather-tensors-in-pipeline', action='store_false', + help='If not set, use scatter/gather to optimize communication of tensors in pipeline.', + dest='scatter_gather_tensors_in_pipeline') + group.add_argument('--use-ring-exchange-p2p', action='store_true', + default=False, help='If set, use custom-built ring exchange ' + 'for p2p communications. Note that this option will require ' + 'a custom built image that support ring-exchange p2p.') + group.add_argument('--local-rank', type=int, default=int(os.getenv('LOCAL_RANK', '0')), + help='local rank passed from distributed launcher.') + group.add_argument('--lazy-mpu-init', type=bool, required=False, + help='If set to True, initialize_megatron() ' + 'skips DDP initialization and returns function to ' + 'complete it instead.Also turns on ' + '--use-cpu-initialization flag. This is for ' + 'external DDP manager.' ) + group.add_argument('--standalone-embedding-stage', action='store_true', + default=False, help='If set, *input* embedding layer ' + 'is placed on its own pipeline stage, without any ' + 'transformer layers. 
(For T5, this flag currently only ' + 'affects the encoder embedding.)') + group.add_argument('--use-distributed-optimizer', action='store_true', + help='Use distributed optimizer.') + group.add_argument('--context-parallel-size', type=int, default=1, + help='Degree of context parallelism.') + group.add_argument('--nccl-communicator-config-path', type=str, default=None, + help='Path to the yaml file with NCCL communicator ' + 'configurations. The number of min/max thread groups and thread ' + 'group cluster size of each communicator can be configured by ' + 'setting `min_ctas`, `max_ctas`, and `cga_cluster_size`.') + group.add_argument('--use-tp-pp-dp-mapping', action='store_true', default=False, + help='If set, distributed ranks initialize order is changed ' + 'from tp-dp-pp to tp-pp-dp. Make sure EP and CP aren\'t used ' + 'with this option enabled') + return parser + + +def _add_validation_args(parser): + group = parser.add_argument_group(title='validation') + + group.add_argument('--eval-iters', type=int, default=100, + help='Number of iterations to run for evaluation' + 'validation/test for.') + group.add_argument('--eval-interval', type=int, default=1000, + help='Interval between running evaluation on ' + 'validation set.') + group.add_argument("--test-mode", action="store_true", help='Run all real-time test alongside the experiment.') + group.add_argument('--skip-train', action='store_true', + default=False, help='If set, bypass the training loop, ' + 'optionally do evaluation for validation/test, and exit.') + + return parser + + +def _add_data_args(parser): + group = parser.add_argument_group(title='data and dataloader') + + group.add_argument('--data-path', nargs='*', default=None, + help='The weight and prefix list for a set of train, validation, and test' + 'datasets which split according to --split. The accepted formats are: ' + '(1) a single prefix, ' + '(2) a list of weight prefix pairs e.g. weight1 prefix1 weight2 prefix2, ' + '(3) a list of prefixes e.g. prefix1 prefix2. ' + 'For (3), weights are inferred from the lengths of the contributing datasets. ' + 'This argument is exclusive to the other independent --*-data-path arguments.') + group.add_argument('--renormalize-blend-weights', action='store_true', + help='Renormalize the blend weights to account for the mid-level dataset ' + 'oversampling done to ensure fulfillment of the requested number of ' + 'samples. Use this option if prompted. Defaults to False for backward ' + 'comparability in the data sample order.') + group.add_argument('--split', type=str, default=None, + help='Comma-separated list of proportions for training,' + ' validation, and test split. For example the split ' + '`90,5,5` will use 90%% of data for training, 5%% for ' + 'validation and 5%% for test.') + group.add_argument('--train-data-path', nargs='*', default=None, + help='The weight and prefix list for an independent train dataset. ' + 'Follows the same pattern rules as --data-path.') + group.add_argument('--valid-data-path', nargs='*', default=None, + help='The weight and prefix list for an independent validation dataset. ' + 'Follows the same pattern rules as --data-path.') + group.add_argument('--test-data-path', nargs='*', default=None, + help='The weight and prefix list for an independent test dataset. 
' + 'Follows the same pattern rules as --data-path.') + group.add_argument('--data-cache-path', default=None, + help='Path to a directory to hold cached index files.') + group.add_argument('--no-mmap-bin-files', action='store_false', + help='Disable mmap-ing of .bin files.', + dest='mmap_bin_files') + group.add_argument('--mock-data', action='store_true', + help='Skip data loading and validation and opt for artificial ' + 'generation of mock data when an implementation is available.') + group.add_argument('--vocab-size', type=int, default=None, + help='Size of vocab before EOD or padding.') + group.add_argument('--vocab-file', type=str, default=None, + help='Path to the vocab file.') + group.add_argument('--merge-file', type=str, default=None, + help='Path to the BPE merge file.') + group.add_argument('--vocab-extra-ids', type=int, default=0, + help='Number of additional vocabulary tokens. ' + 'They are used for span masking in the T5 model') + group.add_argument('--seq-length', type=int, default=None, + help='Maximum sequence length to process.') + group.add_argument('--encoder-seq-length', type=int, default=None, + help='Maximum encoder sequence length to process.' + 'This should be exclusive of --seq-length') + group.add_argument('--decoder-seq-length', type=int, default=None, + help="Maximum decoder sequence length to process.") + group.add_argument('--retriever-seq-length', type=int, default=256, + help='Maximum sequence length for the biencoder model ' + 'for retriever') + group.add_argument('--sample-rate', type=float, default=1.0, + help='sample rate for training data. Supposed to be 0 ' + ' < sample_rate < 1') + group.add_argument('--mask-prob', type=float, default=0.15, + help='Probability of replacing a token with mask.') + group.add_argument('--short-seq-prob', type=float, default=0.1, + help='Probability of producing a short sequence.') + group.add_argument('--num-workers', type=int, default=2, + help="Dataloader number of workers.") + group.add_argument('--tokenizer-type', type=str, + default=None, + choices=['BertWordPieceLowerCase', + 'BertWordPieceCase', + 'GPT2BPETokenizer', + 'SentencePieceTokenizer', + 'GPTSentencePieceTokenizer', + 'HuggingFaceTokenizer', + 'Llama2Tokenizer', + 'TikTokenizer', + 'NullTokenizer'], + help='What type of tokenizer to use.') + group.add_argument('--tokenizer-model', type=str, default=None, + help='Sentencepiece tokenizer model.') + group.add_argument('--tiktoken-pattern', type=str, default=None, + help='Which tiktoken pattern to use. 
Options: [v1, v2]') + group.add_argument('--tiktoken-num-special-tokens', type=int, default=1000, + help='Number of special tokens in tiktoken tokenizer') + group.add_argument('--tiktoken-special-tokens', type=str, nargs='+', default=None, + help='List of tiktoken special tokens, needs to have ["", "", ""]') + group.add_argument('--reset-position-ids', action='store_true', + help='Reset posistion ids after end-of-document token.') + group.add_argument('--reset-attention-mask', action='store_true', + help='Reset self attention maske after ' + 'end-of-document token.') + group.add_argument('--eod-mask-loss', action='store_true', + help='Mask loss for the end of document tokens.') + group.add_argument('--no-create-attention-mask-in-dataloader', action='store_false', + help='If set, do not create attention_masks in dataloader.', + dest='create_attention_mask_in_dataloader') + group.add_argument('--num-dataset-builder-threads', type=int, default=1, + help='Number of parallel threads per rank for dataset builder') + group.add_argument('--s3-cache-path', type=str, default=None, + help='Path to cache index files when using s3 dataloader') + return parser + + +def _add_autoresume_args(parser): + group = parser.add_argument_group(title='autoresume') + + group.add_argument('--adlr-autoresume', action='store_true', + help='Enable autoresume on adlr cluster.') + group.add_argument('--adlr-autoresume-interval', type=int, default=1000, + help='Intervals over which check for autoresume' + 'termination signal') + + return parser + + +def _add_biencoder_args(parser): + group = parser.add_argument_group(title='biencoder') + + # network size + group.add_argument('--ict-head-size', type=int, default=None, + help='Size of block embeddings to be used in ICT and ' + 'REALM (paper default: 128)') + group.add_argument('--biencoder-projection-dim', type=int, default=0, + help='Size of projection head used in biencoder (paper' + ' default: 128)') + group.add_argument('--biencoder-shared-query-context-model', action='store_true', + help='Whether to share the parameters of the query ' + 'and context models or not') + + # checkpointing + group.add_argument('--ict-load', type=str, default=None, + help='Directory containing an ICTBertModel checkpoint') + group.add_argument('--bert-load', type=str, default=None, + help='Directory containing an BertModel checkpoint ' + '(needed to start ICT and REALM)') + + # data + group.add_argument('--titles-data-path', type=str, default=None, + help='Path to titles dataset used for ICT') + group.add_argument('--query-in-block-prob', type=float, default=0.1, + help='Probability of keeping query in block for ' + 'ICT dataset') + group.add_argument('--use-one-sent-docs', action='store_true', + help='Whether to use one sentence documents in ICT') + group.add_argument('--evidence-data-path', type=str, default=None, + help='Path to Wikipedia Evidence frm DPR paper') + + # training + group.add_argument('--retriever-report-topk-accuracies', nargs='+', type=int, + default=[], help="Which top-k accuracies to report " + "(e.g. 
'1 5 20')") + group.add_argument('--retriever-score-scaling', action='store_true', + help='Whether to scale retriever scores by inverse ' + 'square root of hidden size') + + # faiss index + group.add_argument('--block-data-path', type=str, default=None, + help='Where to save/load BlockData to/from') + group.add_argument('--embedding-path', type=str, default=None, + help='Where to save/load Open-Retrieval Embedding' + ' data to/from') + + # indexer + group.add_argument('--indexer-batch-size', type=int, default=128, + help='How large of batches to use when doing indexing ' + 'jobs') + group.add_argument('--indexer-log-interval', type=int, default=1000, + help='After how many batches should the indexer ' + 'report progress') + return parser + + +def _add_vision_args(parser): + group = parser.add_argument_group(title="vision") + + # general vision arguements + group.add_argument('--num-classes', type=int, default=1000, + help='num of classes in vision classificaiton task') + group.add_argument('--img-h', type=int, default=224, + help='Image height for vision classification task') + group.add_argument('--img-w', type=int, default=224, + help='Image height for vision classification task') + group.add_argument('--num-channels', type=int, default=3, + help='Number of channels in input image data') + group.add_argument('--patch-dim', type=int, default=16, + help='patch dimension') + group.add_argument('--classes-fraction', type=float, default=1.0, + help='training with fraction of classes.') + group.add_argument('--data-per-class-fraction', type=float, default=1.0, + help='training with fraction of data per class.') + group.add_argument('--no-data-sharding', action='store_false', + help='Disable data sharding.', + dest='data_sharding') + group.add_argument('--head-lr-mult', type=float, default=1.0, + help='learning rate multiplier for head during finetuning') + + # pretraining type and backbone selection` + group.add_argument('--vision-pretraining', action='store_true', + help='flag to indicate vision pretraining') + group.add_argument('--vision-pretraining-type', type=str, default='classify', + choices=['classify', 'inpaint', 'dino'], + help='pretraining objectives') + group.add_argument('--vision-backbone-type', type=str, default='vit', + choices=['vit', 'mit', 'swin'], + help='backbone types types') + group.add_argument('--swin-backbone-type', type=str, default='tiny', + choices=['tiny', 'base', 'h3'], + help='pretraining objectives') + # inpainting arguments + group.add_argument('--mask-type', type=str, default='random', + choices=['random', 'row'], + help='mask types') + group.add_argument('--mask-factor', type=float, default=1.0, + help='mask size scaling parameter') + + # dino arguments + group.add_argument('--iter-per-epoch', type=int, default=1250, + help='iterations per epoch') + group.add_argument('--dino-local-img-size', type=int, default=96, + help='Image size for vision classification task') + group.add_argument('--dino-local-crops-number', type=int, default=10, + help='Number of local crops') + group.add_argument('--dino-head-hidden-size', type=int, default=2048, + help='Hidden dimension size in dino head') + group.add_argument('--dino-bottleneck-size', type=int, default=256, + help='Bottle neck dimension in dino head ') + group.add_argument('--dino-freeze-last-layer', type=float, default=1, + help='Freezing last layer weights') + group.add_argument('--dino-norm-last-layer', action='store_true', + help='Disable Norm in last layer.') + group.add_argument('--dino-warmup-teacher-temp', 
type=float, default=0.04, + help='warump teacher temperature') + group.add_argument('--dino-teacher-temp', type=float, default=0.07, + help='teacher temperature') + group.add_argument('--dino-warmup-teacher-temp-epochs', type=int, default=30, + help='warmup teacher temperaure epochs') + + # regularization arguments + group.add_argument('--qk-layernorm', action='store_true', + help='Whether to layer normalize the q and k attention embeddings.') + + return parser + +def _add_moe_args(parser): + group = parser.add_argument_group(title="moe") + group.add_argument('--expert-model-parallel-size', type=int, default=1, + help='Degree of expert model parallelism.') + group.add_argument('--num-experts', type=int, default=None, + help='Number of Experts in MoE (None means no MoE)') + group.add_argument('--moe-shared-expert-intermediate-size', type=int, default=None, + help='Shared expert total ffn hidden size. ' + 'It should be equal to "num_shared_experts * ffn_size_of_each_shared_expert" if there are multiple shared experts. ' + 'None means no shared expert.') + group.add_argument('--moe-shared-expert-overlap', action='store_true', + help='Enable overlapping between shared expert computations and dispatcher communications. ' + 'Without this, the shared epxerts execute after the routed experts. ' + 'Only effective when moe-shared-expert-intermediate-size is set.') + group.add_argument('--moe-router-load-balancing-type', type=str, + choices=['aux_loss', 'sinkhorn', 'none'], + default='aux_loss', + help='Determines the load balancing strategy for the router. "aux_loss" corresponds to the load balancing loss used in GShard and SwitchTransformer, "sinkhorn" corresponds to the balancing algorithm used in S-BASE, and "none" implies no load balancing. The default is "aux_loss".') + group.add_argument('--moe-router-topk', type=int, default=2, + help='Number of experts to route to for each token. The default is 2.') + group.add_argument('--moe-router-pre-softmax', action='store_true', + help='Enable pre-softmax routing for MoE, which means softmax is before the top-k selection. By default, softmax is done after top-k.') + group.add_argument('--moe-grouped-gemm', action='store_true', + help='When there are multiple experts per rank, launch multiple local GEMM kernels in multiple streams to improve the utilization and performance with GroupedLinear in TransformerEngine.') + group.add_argument('--moe-aux-loss-coeff', type=float, default=0.0, + help='Scaling coefficient for the aux loss: a starting value of 1e-2 is recommended.') + group.add_argument('--moe-z-loss-coeff', type=float, default=None, + help='Scaling coefficient for the z-loss: a starting value of 1e-3 is recommended.') + group.add_argument('--moe-input-jitter-eps', type=float, default=None, + help='Add noise to the input tensor by applying jitter with a specified epsilon value.') + group.add_argument('--moe-token-dispatcher-type', type=str, + choices=['allgather', 'alltoall', 'alltoall_seq'], + default='allgather', + help="The type of token dispatcher to use. The default is 'allgather'. Options are 'allgather', 'alltoall' and 'alltoall_seq'. We recommend using 'alltoall' when applying expert parallelism. 
For more information, please refer to the documentation in core/moe/README.") + group.add_argument('--moe-per-layer-logging', action='store_true', + help='Enable per-layer logging for MoE, currently supports auxiliary loss and z loss.') + # Token dropping arguments + group.add_argument('--moe-expert-capacity-factor', type=float, default=None, + help='The capacity factor for each expert, None means no token will be dropped.') + group.add_argument('--moe-pad-expert-input-to-capacity', action='store_true', + help='Pads the input for each expert to match the expert capacity length, effective only after the --moe-expert-capacity-factor is set.') + group.add_argument('--moe-token-drop-policy', type=str, default='probs', choices=['probs', 'position'], + help='The policy to drop tokens. Can be either "probs" or "position". If "probs", the tokens with the lowest probabilities will be dropped. If "position", tokens at the end of each batch will be dropped.') + group.add_argument('--moe-layer-recompute', action='store_true', + help='Enable checkpointing for moe_layer, should be used when memory is not sufficient.') + group.add_argument('--moe-extended-tp', action='store_true', + help='Alternative to expert parallelism, all experts are sharded across TPXEP domain.') + group.add_argument('--moe-use-upcycling', action='store_true', + help='Load a checkpoint of a dense model, convert it into an MoE model, and save the converted model to the path specified by --save. ' + 'Upcycling is implemented on the top of distributed checkpointing, so it supports parallel modes different from the dense model.') + + return parser + +def _add_mla_args(parser): + group = parser.add_argument_group(title="mla") + group.add_argument('--q-lora-rank', type=int, default=None, + help="Rank of Query tensor's low rank representation.") + group.add_argument('--kv-lora-rank', type=int, default=32, + help="Rank of Key and Value tensors' low rank representation.") + group.add_argument('--qk-head-dim', type=int, default=128, + help="Dimension of the head in the QK projection. q_head_dim = qk_head_dim + qk_pos_emb_head_dim") + group.add_argument('--qk-pos-emb-head-dim', type=int, default=64, + help="Dimension of the position embedding in the QK projection.") + group.add_argument('--v-head-dim', type=int, default=128, + help="Dimension of the head in the V projection.") + group.add_argument('--rotary-scaling-factor', type=float, default=1.0, + help="Rotary scaling factor for the rotary embeddings.") + + return parser + +def _add_experimental_args(parser): + group = parser.add_argument_group(title='experimental') + + group.add_argument('--spec', type=str, default=None, nargs='*', + help='Specify the pair ' + 'that returns a spec to customize a model, transformer ' + 'block, or transformer layer, depending on the use case.' + 'To use local spec specify local as the argument.' + 'For more details, see the model class, ' + '`transformer_block.py`, or `transformer_layer.py`') + group.add_argument('--hybrid-attention-ratio', type=float, default=0.0, + help='Ratio of attention layers to total layers, in the ' + 'range [0.0, 1.0].') + group.add_argument('--hybrid-mlp-ratio', type=float, default=0.0, + help='Ratio of mlp layers to total layers, in the ' + 'range [0.0, 1.0].') + group.add_argument('--hybrid-override-pattern', type=str, default=None, + help='Force a specific hybrid layer pattern. 
If a value' + 'greater than 0.0 is supplied to any of the hybrid ratio' + 'arguments, then the number of each type of layer in the' + 'override pattern must match number in the overidden' + 'pattern') + group.add_argument('--yaml-cfg', type=str, default=None, + help = 'Config file to add additional arguments') + return parser diff --git a/megatron/training/async_utils.py b/megatron/training/async_utils.py new file mode 100644 index 0000000000..44530ad9d9 --- /dev/null +++ b/megatron/training/async_utils.py @@ -0,0 +1,43 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +""" +This module provides a singleton instance of AsyncCallsQueue which manages +the async checkpoint save calls. +""" +import logging + +from megatron.core.dist_checkpointing.strategies.async_utils import AsyncCallsQueue, AsyncRequest +from megatron.training import get_args +from megatron.training.utils import print_rank_0 + +logger = logging.getLogger(__name__) + +# Singleton manager of async calls +_async_calls_queue = AsyncCallsQueue() + + +def schedule_async_save(async_request: AsyncRequest): + """ Schedule the async save request. + + Args: + async_request (AsyncRequest): the async save request. + """ + _async_calls_queue.schedule_async_request(async_request) + + +def maybe_finalize_async_save(blocking: bool = False): + """ Finalizes active async save calls. + + Args: + blocking (bool, optional): if True, will wait until all active requests + are done. Otherwise, finalizes only the async request that already + finished. Defaults to False. + """ + args = get_args() + if not args.async_save: + return + + if blocking and _async_calls_queue.get_num_unfinalized_calls() > 0: + print_rank_0('Unfinalized async checkpoint saves. Finalizing them synchronously now.') + + _async_calls_queue.maybe_finalize_async_calls(blocking) diff --git a/megatron/training/checkpointing.py b/megatron/training/checkpointing.py new file mode 100644 index 0000000000..3de49f6c57 --- /dev/null +++ b/megatron/training/checkpointing.py @@ -0,0 +1,1290 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
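+#
+# This module implements checkpoint saving and loading for Megatron training in
+# several formats: legacy per-rank torch.save files, global distributed checkpoints
+# (megatron.core.dist_checkpointing), and non-persistent checkpoints ('global' is
+# supported here; 'local' is not yet integrated). Saves can optionally be made
+# asynchronous via megatron.training.async_utils.
+#
+# Typical training-loop usage (a sketch only; `ckpt_ctx` is a hypothetical name and
+# real call sites in the training code may differ):
+#
+#   ckpt_ctx = {}
+#   iteration, num_fp_ops = load_checkpoint(model, optimizer, opt_param_scheduler,
+#                                           checkpointing_context=ckpt_ctx)
+#   ...
+#   save_checkpoint(iteration, model, optimizer, opt_param_scheduler,
+#                   num_fp_ops, checkpointing_context=ckpt_ctx)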
+ +"""Input/output checkpointing.""" + +from enum import Enum, auto +from logging import getLogger +import os +import random +import shutil +import sys +import threading +from pathlib import Path + +import numpy as np +from time import time + +import torch + +from megatron.core import mpu, tensor_parallel, dist_checkpointing +from megatron.core.dist_checkpointing.mapping import ShardedObject +from megatron.core.dist_checkpointing.serialization import get_default_load_sharded_strategy +from megatron.core.dist_checkpointing.state_dict_transformation import ( + prepare_state_dict_for_save, + recreate_state_dict_after_load, +) +from megatron.core.dist_checkpointing.strategies.fully_parallel import \ + FullyParallelSaveStrategyWrapper, FullyParallelLoadStrategyWrapper +from megatron.core.num_microbatches_calculator import update_num_microbatches +from megatron.core.utils import is_float8tensor +from .async_utils import schedule_async_save +from .global_vars import get_args, get_one_logger +from .utils import unwrap_model, print_rank_0, append_to_progress_log, is_last_rank +from ..core.dist_checkpointing.serialization import \ + get_default_save_sharded_strategy +from .one_logger_utils import on_save_checkpoint_start, on_save_checkpoint_success + +# [ModelOpt]: Import +try: + from modelopt.torch.opt.plugins import ( + save_modelopt_state, + save_sharded_modelopt_state, + restore_modelopt_state, + restore_sharded_modelopt_state, + ) + has_nvidia_modelopt = True +except Exception: + has_nvidia_modelopt = False + +_CHECKPOINT_VERSION = None + +logger = getLogger(__name__) +_NON_PERSISTENT_CKPT_SUBDIR = 'non_persistent' + +def set_checkpoint_version(value): + global _CHECKPOINT_VERSION + if _CHECKPOINT_VERSION is not None: + assert _CHECKPOINT_VERSION == value, \ + "checkpoint versions do not match" + _CHECKPOINT_VERSION = value + + +def get_checkpoint_version(): + global _CHECKPOINT_VERSION + return _CHECKPOINT_VERSION + + +def check_checkpoint_args(checkpoint_args): + """Ensure fixed arguments for a model are the same for the input + arguments and the one retrieved from checkpoint.""" + args = get_args() + + def _compare(arg_name, old_arg_name=None, default=None): + if old_arg_name is not None: + ckpt_arg_name = old_arg_name + else: + ckpt_arg_name = arg_name + if default is not None: + checkpoint_value = getattr(checkpoint_args, ckpt_arg_name, default) + else: + checkpoint_value = getattr(checkpoint_args, ckpt_arg_name) + args_value = getattr(args, arg_name) + error_message = '{} value from checkpoint ({}) is not equal to the ' \ + 'input argument value ({}).'.format( + arg_name, checkpoint_value, args_value) + assert checkpoint_value == args_value, error_message + + _compare('num_layers') + _compare('hidden_size') + _compare('num_attention_heads') + _compare('add_position_embedding', default=True) + if args.vocab_file: + _compare('max_position_embeddings') + _compare('make_vocab_size_divisible_by') + if not args.use_dist_ckpt: + _compare('padded_vocab_size') + _compare('tokenizer_type') + if args.data_parallel_random_init: + _compare('data_parallel_random_init') + if get_checkpoint_version() < 3.0: + _compare('tensor_model_parallel_size', + old_arg_name='model_parallel_size') + if get_checkpoint_version() >= 3.0 and not args.use_dist_ckpt: + _compare('tensor_model_parallel_size') + _compare('pipeline_model_parallel_size') + + +def ensure_directory_exists(filename, check_parent=True): + """Build filename's path if it does not already exists.""" + dirname = os.path.dirname(filename) if 
check_parent else filename + os.makedirs(dirname, exist_ok=True) + + +def get_checkpoint_name(checkpoints_path, iteration, release=False, + pipeline_parallel=None, + tensor_rank=None, pipeline_rank=None, + expert_parallel=None, expert_rank=None, + return_base_dir=False, basename="model_optim_rng.pt"): + """Determine the directory name for this rank's checkpoint.""" + if release: + directory = 'release' + else: + directory = 'iter_{:07d}'.format(iteration) + if return_base_dir: + common_path = os.path.join(checkpoints_path, directory) + return common_path + + # Use both the tensor and pipeline MP rank. + if pipeline_parallel is None: + pipeline_parallel = (mpu.get_pipeline_model_parallel_world_size() > 1) + if tensor_rank is None: + tensor_rank = mpu.get_tensor_model_parallel_rank() + if pipeline_rank is None: + pipeline_rank = mpu.get_pipeline_model_parallel_rank() + if expert_parallel is None: + expert_parallel = (mpu.get_expert_model_parallel_world_size() > 1) + if expert_rank is None: + expert_rank = mpu.get_expert_model_parallel_rank() + + # Use both the tensor and pipeline MP rank. If using the distributed + # optimizer, then the optimizer's path must additionally include the + # data parallel rank. + if not pipeline_parallel: + common_path = os.path.join(checkpoints_path, directory, + f'mp_rank_{tensor_rank:02d}') + else: + common_path = os.path.join(checkpoints_path, directory, + f'mp_rank_{tensor_rank:02d}_{pipeline_rank:03d}') + + if expert_parallel: + common_path = common_path + f'_{expert_rank:03d}' + + return os.path.join(common_path, basename) + + +def get_distributed_optimizer_checkpoint_name(model_checkpoint_name): + return os.path.join(os.path.dirname(model_checkpoint_name), + "distrib_optim.pt") + + +def find_checkpoint_rank_0(checkpoints_path, iteration, release=False): + """Finds the checkpoint for rank 0 without knowing if we are using + pipeline parallelism/expert parallelism or not. + + Since the checkpoint naming scheme changes if pipeline or expert + parallelism is present, we need to look for both naming schemes if + we don't know if the checkpoint has pipeline or expert parallelism. 
+ """ + + # Look for checkpoint with no pipelining and no expert parallelism + filename = get_checkpoint_name(checkpoints_path, iteration, release, + pipeline_parallel=False, + tensor_rank=0, pipeline_rank=0, + expert_parallel=False, expert_rank=0) + if os.path.isfile(filename): + return filename + + # Look for checkpoint with no pipelining and expert parallelism + filename = get_checkpoint_name(checkpoints_path, iteration, release, + pipeline_parallel=False, + tensor_rank=0, pipeline_rank=0, + expert_parallel=True, expert_rank=0) + if os.path.isfile(filename): + return filename + + # Look for checkpoint with pipelining and no expert parallelism + filename = get_checkpoint_name(checkpoints_path, iteration, release, + pipeline_parallel=True, + tensor_rank=0, pipeline_rank=0, + expert_parallel=False, expert_rank=0) + if os.path.isfile(filename): + return filename + + # Look for checkpoint with pipelining and expert parallelism + filename = get_checkpoint_name(checkpoints_path, iteration, release, + pipeline_parallel=True, + tensor_rank=0, pipeline_rank=0, + expert_parallel=True, expert_rank=0) + if os.path.isfile(filename): + return filename + + # Look for a distributed checkpoint + filename = get_checkpoint_name(checkpoints_path, iteration, release, + pipeline_parallel=True, + return_base_dir=True) + if dist_checkpointing.check_is_distributed_checkpoint(filename): + return filename + + return None + + +def get_checkpoint_tracker_filename(checkpoints_path): + + """Tracker file rescords the latest chckpoint during + training to restart from.""" + return os.path.join(checkpoints_path, 'latest_checkpointed_iteration.txt') + + +def checkpoint_exists(checkpoints_path): + if checkpoints_path is None: + return False + load_step = 'latest_checkpointed_iteration.txt' + return os.path.exists(os.path.join(checkpoints_path, load_step)) + + +def read_metadata(tracker_filename): + # Read the tracker file and either set the iteration or + # mark it as a release checkpoint. + iteration = 0 + release = False + with open(tracker_filename, 'r') as f: + metastring = f.read().strip() + try: + iteration = int(metastring) + except ValueError: + release = metastring == 'release' + if not release: + print_rank_0('ERROR: Invalid metadata file {}. Exiting'.format( + tracker_filename)) + sys.exit() + assert iteration > 0 or release, 'error parsing metadata file {}'.format( + tracker_filename) + + # Get the max iteration retrieved across the ranks. + if torch.distributed.is_initialized(): + iters_cuda = torch.tensor([iteration], dtype=torch.long, device='cuda') + torch.distributed.all_reduce(iters_cuda, op=torch.distributed.ReduceOp.MAX) + max_iter = iters_cuda[0].item() + + # We should now have all the same iteration. + # If not, print a warning and chose the maximum + # iteration across all ranks. 
+ if iteration != max_iter: + rank = torch.distributed.get_rank() + print('WARNING: on rank {} found iteration {} in the ' + 'metadata while max iteration across the ranks ' + 'is {}, replacing it with max iteration.'.format( + rank, iteration, max_iter), flush=True) + else: + # When loading a checkpoint outside of training (for example, + # when editing it), we might not have torch distributed + # initialized, in this case, just assume we have the latest + max_iter = iteration + return max_iter, release + + +def get_rng_state(use_dist_ckpt: bool = False): + """ collect rng state across data parallel ranks """ + args = get_args() + rng_state = { + 'random_rng_state': random.getstate(), + 'np_rng_state': np.random.get_state(), + 'torch_rng_state': torch.get_rng_state(), + 'cuda_rng_state': torch.cuda.get_rng_state(), + 'rng_tracker_states': tensor_parallel.get_cuda_rng_tracker().get_states()} + + rng_state_list = None + if torch.distributed.is_initialized() and \ + mpu.get_data_parallel_world_size() > 1 and \ + args.data_parallel_random_init: + rng_state_list = \ + [None for i in range(mpu.get_data_parallel_world_size())] + torch.distributed.all_gather_object( + rng_state_list, + rng_state, + group=mpu.get_data_parallel_group()) + else: + rng_state_list = [rng_state] + + if use_dist_ckpt: + pp_rank = mpu.get_pipeline_model_parallel_rank() + pp_size = mpu.get_pipeline_model_parallel_world_size() + tp_rank = mpu.get_tensor_model_parallel_rank() + tp_size = mpu.get_tensor_model_parallel_world_size() + rng_state_list = ShardedObject('rng_state', rng_state_list, (pp_size, tp_size), (pp_rank, tp_rank), + replica_id=mpu.get_data_parallel_rank(with_context_parallel=True)) + + return rng_state_list + +class CheckpointType(Enum): + LEGACY = auto() + LOCAL = auto() + GLOBAL = auto() + +def save_checkpoint(iteration, model, optimizer, opt_param_scheduler, num_floating_point_operations_so_far, + checkpointing_context=None, pipeline_rank=None, expert_rank=None, tensor_rank=None, pipeline_parallel=None, expert_parallel=None, non_persistent_ckpt=False, + train_data_iterator=None, ft_client=None): + """Save a model, optimizer and optionally dataloader checkpoint. + + Checkpointing context is used to persist some checkpointing state + throughout a single job. Must be initialized externally (not used if None). + + If non_persistent_ckpt is True, + the checkpoint will be saved with special functionality for removing old checkpoints. + There are several types of non-persistent checkpoints: + "global" - Saved as a standard checkpoint (e.g., on Lustre) with old checkpoints being removed. + "local" - [TBD] Each rank saves a portion of the checkpoint locally (e.g., on SSD/ramdisk). + "in_memory" - [TBD] A special kind of local checkpoint that avoids serialization. + + Dataloader checkpoint is only saved if the dataloader supports it. Currently this applies only + to the Megatron Energon dataloader (multimodal) and not the built-in Megatron dataloader (text-only). + """ + start_ckpt = time() + args = get_args() + + # Prepare E2E metrics at start of save checkpoint + productive_metrics = on_save_checkpoint_start(args.async_save) + + # Only rank zero of the data parallel writes to the disk. + model = unwrap_model(model) + + # Handle non_persistent_ckpt flag. 
Besides overwriting `args.save` and + # `args.use_dist_ckpt`, non-persistent global ckpt requires no additional logic + ckpt_type = CheckpointType.GLOBAL if args.use_dist_ckpt else CheckpointType.LEGACY + save_dir = args.save + if non_persistent_ckpt: + if args.non_persistent_ckpt_type == 'global': + ckpt_type = CheckpointType.GLOBAL + save_dir = ( + args.non_persistent_global_ckpt_dir + if args.non_persistent_global_ckpt_dir + else os.path.join(save_dir, _NON_PERSISTENT_CKPT_SUBDIR) + ) + # TODO Can we ensure the previous checkpoint is saved? We don't want to allow two saves in parallel. + cleanup_old_non_persistent_checkpoint( + save_dir, leave_ckpt_num=1, do_async=args.async_save + ) + elif args.non_persistent_ckpt_type == 'local': + raise RuntimeError('LocalCheckpointManagers are not yet integrated') + ckpt_type = CheckpointType.LOCAL + save_dir = checkpointing_context['local_checkpoint_manager'].local_ckpt_dir + else: + assert False, 'Please use local or global non-persistent checkpoints' \ + f'(got: {args.non_persistent_ckpt_type})' + + ckpt_format = args.ckpt_format if ckpt_type == CheckpointType.GLOBAL else 'torch' + print_rank_0('saving checkpoint at iteration {:7d} to {} in {} format'.format( + iteration, save_dir, ckpt_format)) + + # Collect rng state across data parallel ranks. + rng_state = get_rng_state(ckpt_type != CheckpointType.LEGACY) + + # Checkpoint name. + return_base_dir = (ckpt_type != CheckpointType.LEGACY) + checkpoint_name = get_checkpoint_name(save_dir, iteration, release=False, pipeline_parallel=pipeline_parallel, + tensor_rank=tensor_rank, pipeline_rank=pipeline_rank, expert_parallel=expert_parallel, expert_rank=expert_rank, return_base_dir=return_base_dir) + + # Save dataloader state if the dataloader supports it (currently only Megatron Energon). + save_dataloader_state(train_data_iterator, iteration, getattr(args, "dataloader_save", None)) + + # Save distributed optimizer's custom parameter state. + if ( + args.use_distributed_optimizer + and not args.no_save_optim + and optimizer is not None + and ckpt_type == CheckpointType.LEGACY + ): + optim_checkpoint_name = \ + get_distributed_optimizer_checkpoint_name(checkpoint_name) + ensure_directory_exists(optim_checkpoint_name) + optimizer.save_parameter_state(optim_checkpoint_name) + + async_save_request = None + if args.async_save: + if ckpt_type == CheckpointType.LEGACY: + raise NotImplementedError('Async checkpoint save not implemented for legacy checkpoints') + elif ckpt_type == CheckpointType.GLOBAL and args.ckpt_format != 'torch_dist': + raise NotImplementedError(f'Async checkpoint save not implemented for {args.ckpt_format} distributed checkpoint format') + + rank = torch.distributed.get_rank() if torch.distributed.is_initialized() else 0 + + # Collect args, model, RNG. 
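+ # Legacy checkpoints are written only by the first rank of each
+ # data-modulo-expert-parallel group; distributed (non-legacy) checkpoints are
+ # collective, so every rank builds its shard of the state dict.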
+ if not torch.distributed.is_initialized() \ + or mpu.get_data_modulo_expert_parallel_rank(with_context_parallel=True) == 0 \ + or ckpt_type != CheckpointType.LEGACY: + optim_sd_kwargs = {} + if ckpt_type != CheckpointType.LEGACY and args.use_distributed_optimizer: + optim_sd_kwargs['sharding_type'] = ('fully_sharded_model_space' + if args.ckpt_fully_parallel_save + else 'dp_zero_gather_scatter') + print_rank_0(f'Storing distributed optimizer sharded state of type {optim_sd_kwargs["sharding_type"]}') + state_dict = generate_state_dict( + args, + model, + optimizer, + opt_param_scheduler, + rng_state, + ckpt_type != CheckpointType.LEGACY, + iteration, + optim_sd_kwargs=optim_sd_kwargs, + ) + + if args.enable_ft_package and ft_client is not None: + state_dict["ft_state"] = ft_client.state_dict() + state_dict['num_floating_point_operations_so_far'] = num_floating_point_operations_so_far + if ckpt_type == CheckpointType.GLOBAL: + if not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0: + # TODO Handle non-empty directories (e.g., after a crash during saving). + ensure_directory_exists(checkpoint_name, check_parent=False) + if checkpointing_context is not None and 'save_strategy' in checkpointing_context: + save_strategy = checkpointing_context['save_strategy'] + # Already saved once before - don't need to rerun sharding validation + validate_sharding_integrity = not args.ckpt_assume_constant_structure + else: + validate_sharding_integrity = True + save_strategy = get_default_save_sharded_strategy(args.ckpt_format) + if args.ckpt_assume_constant_structure and args.ckpt_format == 'torch_dist': + save_strategy.use_cached_ckpt_structure = args.ckpt_assume_constant_structure + if args.ckpt_fully_parallel_save: + save_strategy = FullyParallelSaveStrategyWrapper(save_strategy, mpu.get_data_parallel_group(with_context_parallel=True), + args.ckpt_assume_constant_structure) + # Store save strategy for future checkpoint saves + if checkpointing_context is not None: + checkpointing_context['save_strategy'] = save_strategy + end_ckpt = time() + logger.debug(f"rank: {rank}, takes {end_ckpt - start_ckpt} to prepare state dict for ckpt ") + async_save_request = dist_checkpointing.save(state_dict, checkpoint_name, save_strategy, + async_sharded_save=args.async_save, + validate_access_integrity=validate_sharding_integrity) + # [ModelOpt]: save sharded modelopt_state + if has_nvidia_modelopt: + save_sharded_modelopt_state(model, checkpoint_name, (args.ckpt_format, 1)) + else: + # [ModelOpt]: Inject modelopt_state into state_dict + if has_nvidia_modelopt: + save_modelopt_state(model, state_dict) + + if ckpt_type == CheckpointType.LOCAL: + state_dict_for_save = prepare_state_dict_for_save( + state_dict, algo=args.non_persistent_local_ckpt_algo + ) + async_save_request = checkpointing_context['local_checkpoint_manager'].save( + state_dict_for_save, iteration, is_async=bool(args.async_save) + ) + else: + assert ckpt_type == CheckpointType.LEGACY + # Save. 
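+ # Legacy format: each writing rank serializes its state dict to its own file
+ # with torch.save.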
+ ensure_directory_exists(checkpoint_name) + torch.save(state_dict, checkpoint_name) + start_misc = time() + if not args.async_save: + assert async_save_request is None + # Wait so everyone is done (necessary) + if torch.distributed.is_initialized(): + torch.distributed.barrier() + + # And update the latest iteration + if not torch.distributed.is_initialized() \ + or torch.distributed.get_rank() == 0: + tracker_filename = get_checkpoint_tracker_filename(save_dir) + + if ckpt_type == CheckpointType.LOCAL: + def iter_finalize_fn(): + print_rank_0(' successfully saved local checkpoint from iteration {:7d}' + .format(iteration)) + if args.log_progress and args.async_save: + append_to_progress_log(f'Saved async local checkpoint\tIteration: {iteration}', + barrier=False) + else: + def iter_finalize_fn(): + with open(tracker_filename, 'w') as f: + f.write(str(iteration)) + print_rank_0(' successfully saved checkpoint from iteration {:7d} to {}' + .format(iteration, args.save)) + if args.log_progress and args.async_save: + append_to_progress_log(f'Saved async checkpoint\tIteration: {iteration}', + barrier=False) + + if args.async_save: + assert async_save_request is not None + async_save_request.add_finalize_fn(iter_finalize_fn) + else: + iter_finalize_fn() + + # Additional callback for one_logger (last rank) + if not torch.distributed.is_initialized() \ + or is_last_rank(): + def onelogger_finalize_fn(): + on_save_checkpoint_success(productive_metrics, args.async_save) + if args.async_save: + assert async_save_request is not None + async_save_request.add_finalize_fn(onelogger_finalize_fn) + else: + onelogger_finalize_fn() + + if args.async_save: + schedule_async_save(async_save_request) + print_rank_0(' scheduled an async checkpoint save at iteration {:7d} to {}' \ + .format(iteration, save_dir)) + + # Wait so everyone is done (not necessary) + if torch.distributed.is_initialized(): + torch.distributed.barrier() + + end_misc = time() + logger.debug(f"rank: {rank}, takes {end_misc - start_misc} to finalize ckpt save ") + + +def cleanup_old_non_persistent_checkpoint(save_dir, leave_ckpt_num=1, do_async=False): + if torch.distributed.is_initialized() and torch.distributed.get_rank() != 0: + return + save_dir = Path(save_dir) + + iter_prefix = "iter_" + iter_ckpts = save_dir.rglob(f'{iter_prefix}*') + sorted_iter_ckpts = sorted(iter_ckpts, key=lambda ckpt_name: int(ckpt_name.name[len(iter_prefix):])) + if not sorted_iter_ckpts: + return + rm_iter_ckpts = sorted_iter_ckpts[:-leave_ckpt_num] + print_rank_0(f'Non-persistent checkpoints scheduled for removal: {rm_iter_ckpts}') + print_rank_0(f'Non-persistent checkpoints to be kept: {sorted_iter_ckpts[-leave_ckpt_num:]}') + + def remove_iter_ckpts(_iter_ckpts): + for ckpt in _iter_ckpts: + shutil.rmtree(ckpt) + if do_async: + threading.Thread(target=remove_iter_ckpts, args=(rm_iter_ckpts,)).start() + else: + remove_iter_ckpts(rm_iter_ckpts) + + +def save_dataloader_state(train_iterator, iteration, dataloader_save_path): + """Saves dataloader state if the dataloader supports it. + + Currently, this is only used by Megatron Energon dataloader (multimodal) to store its state at a + specific iteration. The Megatron built-in dataloader (text-only) creates index files upfront + to track its state. + + If the provided dataloader has `save_state` method, then it is called to save the state. + Otherwise, no state is saved. + + Args: + train_iterator (iterable): Train dataloader. + iteration (int): Current iteration. 
+ dataloader_save_path (str): Path where the dataloader state is saved. + """ + # If no dataloader or saving path is provided, then exit early. + if train_iterator is None or dataloader_save_path is None: + return + + # If dataloader doesn't support saving state, exit early. + if not hasattr(train_iterator, "save_state"): + return + + # Save dataloader state for each data parallel rank only once. + first_rank = mpu.is_pipeline_first_stage(ignore_virtual=True) and mpu.get_tensor_model_parallel_rank() == 0 + if not first_rank: + return + + dp_rank = mpu.get_data_parallel_rank() + print(f"saving dataloader checkpoint at iteration {iteration} to {dataloader_save_path}") + train_dataloader_state_dict = train_iterator.save_state() + data_state_save_path = get_checkpoint_name( + dataloader_save_path, iteration, + basename=f'train_dataloader_dprank{dp_rank:03d}.pt' + ) + + torch.distributed.barrier(group=mpu.get_data_parallel_group()) + + if mpu.get_data_parallel_rank() == 0: + ensure_directory_exists(data_state_save_path) + + torch.distributed.barrier(group=mpu.get_data_parallel_group()) + + dataloader_save_dict = {} + dataloader_save_dict['dataloader_state_dict'] = train_dataloader_state_dict + torch.save(dataloader_save_dict, data_state_save_path) + + +def generate_state_dict(args, model, optimizer, opt_param_scheduler, + rng_state, use_dist_ckpt=False, iteration=None, + optim_sd_kwargs=None): + # Arguments, iteration, and model. + state_dict = {} + state_dict['args'] = args + state_dict['checkpoint_version'] = 3.0 + if iteration is not None: + state_dict['iteration'] = iteration + + if len(model) == 1: + state_dict['model'] = (model[0].sharded_state_dict() + if use_dist_ckpt else + model[0].state_dict_for_save_checkpoint()) + else: + for i in range(len(model)): + mpu.set_virtual_pipeline_model_parallel_rank(i) + state_dict['model%d' % i] = ( + model[i].sharded_state_dict() + if use_dist_ckpt else + model[i].state_dict_for_save_checkpoint()) + # Optimizer stuff. + if not args.no_save_optim: + if optimizer is not None: + state_dict['optimizer'] = (optimizer.sharded_state_dict(state_dict, **(optim_sd_kwargs or {})) + if use_dist_ckpt else + optimizer.state_dict()) + if opt_param_scheduler is not None: + state_dict['opt_param_scheduler'] = \ + opt_param_scheduler.state_dict() + # RNG states. 
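+ # rng_state is collected by the caller (see get_rng_state above) and is skipped
+ # when --no-save-rng is set.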
+ if not args.no_save_rng: + state_dict["rng_state"] = rng_state + return state_dict + + +def _transpose_first_dim(t, num_splits, num_splits_first, model): + input_shape = t.size() + # We use a self_attention module but the values extracted aren't + # specific to self attention so should work for cross attention as well + while hasattr(model, 'module'): + model = model.module + attention_module = model.language_model.encoder.layers[0].self_attention + hidden_size_per_attention_head = attention_module.hidden_size_per_attention_head + num_attention_heads_per_partition = attention_module.num_attention_heads_per_partition + if num_splits_first: + """[num_splits * np * hn, h] + -->(view) [num_splits, np, hn, h] + -->(tranpose) [np, num_splits, hn, h] + -->(view) [np * num_splits * hn, h] """ + + intermediate_shape = \ + (num_splits, num_attention_heads_per_partition, + hidden_size_per_attention_head) + input_shape[1:] + + t = t.view(*intermediate_shape) + t = t.transpose(0, 1).contiguous() + else: + """[np * hn * num_splits, h] + -->(view) [np, hn, num_splits, h] + -->(tranpose) [np, num_splits, hn, h] + -->(view) [np * num_splits * hn, h] """ + + intermediate_shape = \ + (num_attention_heads_per_partition, + hidden_size_per_attention_head, num_splits) +\ + input_shape[1:] + + t = t.view(*intermediate_shape) + t = t.transpose(1, 2).contiguous() + t = t.view(*input_shape) + + return t + + +def fix_query_key_value_ordering(model, checkpoint_version): + """Fix up query/key/value matrix ordering if checkpoint + version is smaller than 2.0 + """ + if checkpoint_version < 2.0: + if isinstance(model, list): + assert len(model)==1 + model = model[0] + for name, param in model.named_parameters(): + if name.endswith(('.query_key_value.weight', '.query_key_value.bias')): + if checkpoint_version == 0: + fixed_param = _transpose_first_dim(param.data, 3, True, model) + elif checkpoint_version == 1.0: + fixed_param = _transpose_first_dim(param.data, 3, False, model) + else: + print_rank_0(f"Invalid checkpoint version {checkpoint_version}.") + sys.exit() + param.data.copy_(fixed_param) + if name.endswith(('.key_value.weight', '.key_value.bias')): + if checkpoint_version == 0: + fixed_param = _transpose_first_dim(param.data, 2, True, model) + elif checkpoint_version == 1.0: + fixed_param = _transpose_first_dim(param.data, 2, False, model) + else: + print_rank_0(f"Invalid checkpoint version {checkpoint_version}.") + sys.exit() + param.data.copy_(fixed_param) + print_rank_0(" successfully fixed query-key-values ordering for" + " checkpoint version {}".format(checkpoint_version)) + + +def _get_non_persistent_iteration(non_persistent_global_dir, args, checkpointing_context=None): + if args.non_persistent_ckpt_type is None: + return -1 + elif args.non_persistent_ckpt_type == "global": + tracker_filename = get_checkpoint_tracker_filename(non_persistent_global_dir) + if os.path.isfile(tracker_filename): + iteration, release = read_metadata(tracker_filename) + if release: + raise RuntimeError('Non-persistent checkpoint can\'t be a release checkpoint') + else: + iteration = -1 + print_rank_0('WARNING: could not find the metadata file {}'.format(tracker_filename)) + print_rank_0(' will not load any non-persistent checkpoint') + return iteration + elif args.non_persistent_ckpt_type == "local": + raise RuntimeError('LocalCheckpointManagers are not yet integrated') + return checkpointing_context['local_checkpoint_manager'].get_latest_checkpoint_iteration() + else: + assert False, 'Please use local or global non-persistent 
checkpoints' \ + f'(got: {args.non_persistent_ckpt_type})' + + +def _load_non_persistent_base_checkpoint( + non_persistent_global_dir, + args, + rank0, + sharded_state_dict, + non_persistent_iteration, + checkpointing_context=None, +): + """ Load the base state_dict from a non-persistent distributed checkpoint. + Depending on the non_persistent_ckpt_type, different logic may be required. + """ + assert args.non_persistent_ckpt_type is not None + if args.non_persistent_ckpt_type == "global": + if not rank0: + print_rank_0( + f'Loading from a non-persistent checkpoint (non-persistent iter {non_persistent_iteration})' + ) + return _load_global_dist_base_checkpoint( + non_persistent_global_dir, args, rank0, sharded_state_dict, non_persistent_iteration, False + ) + elif args.non_persistent_ckpt_type == "local": + raise RuntimeError('LocalCheckpointManagers are not yet integrated') + intermediate_state_dict, checkpoint_name = checkpointing_context[ + 'local_checkpoint_manager' + ].load() + state_dict = recreate_state_dict_after_load( + sharded_state_dict, + intermediate_state_dict, + algo=args.non_persistent_local_ckpt_algo, + ) + return state_dict, checkpoint_name, False, CheckpointType.LOCAL + else: + assert False, 'Please use local or global non-persistent checkpoints' \ + f'(got: {args.non_persistent_ckpt_type})' + + +def _load_global_dist_base_checkpoint( + load_dir, args, rank0, sharded_state_dict, iteration, release +): + """ Load the base state_dict from the given directory containing the global distributed checkpoint """ + if rank0: + checkpoint_name = find_checkpoint_rank_0(load_dir, iteration, release) + state_dict = dist_checkpointing.load_common_state_dict(checkpoint_name) + return state_dict, checkpoint_name, release, CheckpointType.GLOBAL + + if sharded_state_dict is None: + assert not args.auto_detect_ckpt_format and not args.use_dist_ckpt, ( + args.auto_detect_ckpt_format, + args.use_dist_ckpt, + ) + raise RuntimeError( + 'Detected load from a distributed checkpoint, but neither --use-dist-ckpt nor --auto-detect-ckpt-format is set.' + ) + + checkpoint_name = get_checkpoint_name(load_dir, iteration, release, return_base_dir=True) + load_strategy = get_default_load_sharded_strategy(checkpoint_name) + # NOTE: `args.ckpt_fully_parallel_load` applies to both persistent and non-persistent checkpoints. + if args.ckpt_fully_parallel_load: + load_strategy = FullyParallelLoadStrategyWrapper( + load_strategy, mpu.get_data_parallel_group(with_context_parallel=True) + ) + state_dict = dist_checkpointing.load(sharded_state_dict, checkpoint_name, load_strategy, strict=args.dist_ckpt_strictness) + return state_dict, checkpoint_name, release, CheckpointType.GLOBAL + + +def _load_base_checkpoint( + load_dir, + args, + rank0=False, + sharded_state_dict=None, + checkpointing_context=None, +): + """ Load the base state_dict from the given directory + + If rank0 is true, just loads rank 0 checkpoint, ignoring arguments. 
+ """ + # Try to load non-persistent checkpoint first + non_persistent_global_dir = ( + args.non_persistent_global_ckpt_dir + if args.non_persistent_global_ckpt_dir or load_dir is None + else os.path.join(load_dir, _NON_PERSISTENT_CKPT_SUBDIR) + ) + non_persistent_iteration = _get_non_persistent_iteration( + non_persistent_global_dir, args, checkpointing_context + ) + iteration, release = -1, False + tracker_filename = 'because load directory is not defined' + if load_dir is not None: + tracker_filename = get_checkpoint_tracker_filename(load_dir) + if os.path.isfile(tracker_filename): + iteration, release = read_metadata(tracker_filename) + if non_persistent_iteration != -1: # there is a non-persistent checkpoint + if non_persistent_iteration >= iteration: + return _load_non_persistent_base_checkpoint( + non_persistent_global_dir, + args, + rank0, + sharded_state_dict, + non_persistent_iteration, + checkpointing_context, + ) + else: + print_rank_0('WARNING: non-persistent checkpoints are older than persistent checkpoint') + + # Otherwise we are dealing with global checkpoints + # If no tracker file, return nothing + if iteration == -1: + if not rank0: + print_rank_0('WARNING: could not find the metadata file {}'.format(tracker_filename)) + print_rank_0(' will not load any checkpoints and will start from random') + # Conditionally exit if checkpoint not found. + if args.exit_on_missing_checkpoint: + print_rank_0(">> '--exit-on-missing-checkpoint' set ... exiting. <<") + if torch.distributed.is_initialized(): + torch.distributed.barrier() + sys.exit() + + return None, "", False, None + + # Determine the type of the checkpoint + checkpoint_name = get_checkpoint_name(load_dir, iteration, release, return_base_dir=True) + is_dist_ckpt = dist_checkpointing.check_is_distributed_checkpoint(checkpoint_name) + if not rank0: + dist_infix = "distributed " if is_dist_ckpt else "" + if release: + print_rank_0(f' loading release {dist_infix}checkpoint from {load_dir}') + else: + print_rank_0( + f' loading {dist_infix}checkpoint from {load_dir} at iteration {iteration}' + ) + + # Handle global distributed checkpoint + if is_dist_ckpt: + return _load_global_dist_base_checkpoint( + load_dir, args, rank0, sharded_state_dict, iteration, release + ) + # Handle global legacy checkpoint + if rank0: + checkpoint_name = find_checkpoint_rank_0(load_dir, iteration, release) + else: + checkpoint_name = get_checkpoint_name(load_dir, iteration, release, return_base_dir=False) + try: + state_dict = torch.load(checkpoint_name, map_location='cpu') + except ModuleNotFoundError: + from megatron.legacy.fp16_deprecated import loss_scaler + + # For backward compatibility. 
+ if not rank0: + print_rank_0(' > deserializing using the old code structure ...') + sys.modules['fp16.loss_scaler'] = sys.modules['megatron.legacy.fp16_deprecated.loss_scaler'] + sys.modules['megatron.fp16.loss_scaler'] = sys.modules[ + 'megatron.legacy.fp16_deprecated.loss_scaler' + ] + sys.modules['megatron.model'] = sys.modules['megatron.legacy.model'] + state_dict = torch.load(checkpoint_name, map_location='cpu') + sys.modules.pop('fp16.loss_scaler', None) + sys.modules.pop('megatron.fp16.loss_scaler', None) + sys.modules.pop('megatron.model', None) + except Exception as e: + print('could not load the checkpoint') + print(e) + sys.exit() + + return state_dict, checkpoint_name, release, CheckpointType.LEGACY + + +def load_args_from_checkpoint( + args, load_arg='load', checkpointing_context=None +): + """Set required arguments from the checkpoint specified in the + arguments. + + Will overwrite arguments that have a non-None default value, but + will leave any arguments that default to None as set. + + Returns the same args NameSpace with the new values added/updated. + + If no checkpoint is specified in args, or if the checkpoint is + there but invalid, the arguments will not be modified + + """ + load_dir = getattr(args, load_arg) + + if load_dir is None: + print_rank_0('No load directory specified, using provided arguments.') + return args + + state_dict, checkpoint_name, release, ckpt_type = _load_base_checkpoint( + load_dir, + args, + rank0=True, + checkpointing_context=checkpointing_context, + ) + + # Args. + if not state_dict: + print_rank_0('Checkpoint not found to provide arguments, using provided arguments.') + return args + + if 'args' not in state_dict: + print_rank_0('Checkpoint provided does not have arguments saved, using provided arguments.') + return args + + checkpoint_args = state_dict['args'] + checkpoint_version = state_dict.get('checkpoint_version', 0) + args.iteration = state_dict['iteration'] + + # One-off conversion for foundation models + if hasattr(checkpoint_args, 'disable_bias_linear'): + setattr( + checkpoint_args, 'add_bias_linear', not getattr(checkpoint_args, 'disable_bias_linear') + ) + + def _set_arg(arg_name, old_arg_name=None, force=False): + if not force and getattr(args, arg_name, None) is not None: + return + + if old_arg_name is not None: + checkpoint_value = getattr(checkpoint_args, old_arg_name, None) + else: + checkpoint_value = getattr(checkpoint_args, arg_name, None) + + if checkpoint_value is not None: + print_rank_0(f"Setting {arg_name} to {checkpoint_value} from checkpoint") + setattr(args, arg_name, checkpoint_value) + else: + print_rank_0(f"Checkpoint did not provide arguments {arg_name}") + + _set_arg('num_layers') + _set_arg('hidden_size') + _set_arg('ffn_hidden_size') + _set_arg('seq_length') + _set_arg('num_attention_heads') + _set_arg('num_query_groups', force=True) + _set_arg('group_query_attention', force=True) + _set_arg('kv_channels') + _set_arg('max_position_embeddings') + _set_arg('position_embedding_type', force=True) + _set_arg('add_position_embedding', force=True) + _set_arg('use_rotary_position_embeddings', force=True) + _set_arg('rotary_percent', force=True) + _set_arg('rotary_interleaved', force=True) + _set_arg('add_bias_linear', force=True) + _set_arg('add_qkv_bias', force=True) + _set_arg('swiglu', force=True) + _set_arg('untie_embeddings_and_output_weights', force=True) + _set_arg('apply_layernorm_1p', force=True) + _set_arg('normalization', force=True) + _set_arg('tokenizer_type') + _set_arg('padded_vocab_size') + 
_set_arg('apply_query_key_layer_scaling', force=True) + if checkpoint_version < 3.0: + _set_arg('tensor_model_parallel_size', 'model_parallel_size') + else: + _set_arg('tensor_model_parallel_size', force=True) + _set_arg('pipeline_model_parallel_size', force=True) + _set_arg('virtual_pipeline_model_parallel_size', force=True) + _set_arg('num_layers_per_virtual_pipeline_stage') + return args, checkpoint_args + + +def fix_fp8_params_lose_precision_when_loading_dist_ckpt(state_dict): + """ + When "--fp8-param-gather" and "--use-dist-ckpt" are both enabled, the state dict read from + dist-checkpoint loses precision (the weights read from checkpoint go through the process of + bf16/fp16 -> fp8 -> bf16/fp16). This function is implemented to solve this problem. + When "--fp8-param-gather" is disabled, this function doesn't modify anything. + """ + for key in state_dict.keys(): + if key.startswith('model'): + for _, sharded_tensor in state_dict[key].items(): + if is_float8tensor(sharded_tensor.data): + sharded_tensor.data = sharded_tensor.data.from_float8().cpu() + + +def load_checkpoint(model, optimizer, opt_param_scheduler, load_arg='load', strict=True, + ft_client=None, checkpointing_context=None): + """Load a model checkpoint and return the iteration. + strict (bool): whether to strictly enforce that the keys in + :attr:`state_dict` of the checkpoint match the names of + parameters and buffers in model. + """ + args = get_args() + load_dir = getattr(args, load_arg) + + # Finetuning directories + pretrained_dir = getattr(args, 'pretrained_checkpoint', None) + if pretrained_dir is not None and not checkpoint_exists(load_dir): + print_rank_0( + f'Checkpoint file not found in load directory {load_dir} attempting to finetune with checkpoint in {pretrained_dir}' + ) + load_dir = pretrained_dir + if not checkpoint_exists(load_dir): + raise FileNotFoundError("No checkpoint found in load directory or pretrained directory") + args.finetune = True + + model = unwrap_model(model) + + load_kwargs = {} + is_dist_ckpt = False + if ( + args.auto_detect_ckpt_format + or args.use_dist_ckpt + or args.non_persistent_save_interval is not None + ): + state_dict, checkpoint_name, release, ckpt_type = _load_base_checkpoint( + load_dir, + args, + rank0=True, + checkpointing_context=checkpointing_context, + ) + if args.enable_ft_package and ft_client is not None and state_dict is not None: + if 'ft_state' in state_dict: + ft_client.load_state_dict(state_dict['ft_state']) + else: + print_rank_0("ft_state is not present in state_dict") + is_dist_ckpt = ( + ckpt_type == CheckpointType.LOCAL + or dist_checkpointing.check_is_distributed_checkpoint(checkpoint_name) + ) + if is_dist_ckpt: + ckpt_tp_pp = ( + state_dict['args'].tensor_model_parallel_size, + state_dict['args'].pipeline_model_parallel_size, + ) + run_tp_pp = ( + mpu.get_tensor_model_parallel_world_size(), + mpu.get_pipeline_model_parallel_world_size(), + ) + mismatch_msg = "(TP, PP) mismatch after resume ({} vs {} from checkpoint)".format( + ckpt_tp_pp, run_tp_pp + ) + + # Determine if RNG state will be loaded + if (ckpt_tp_pp == run_tp_pp and not release and not args.finetune and not args.no_load_rng + and not getattr(state_dict['args'], 'no_save_rng', False)): + gen_sd_rng_state = get_rng_state(True) # we can load the rng state + else: + gen_sd_rng_state = None + if ckpt_tp_pp != run_tp_pp: + print_rank_0("{}: RNG state will be ignored".format(mismatch_msg)) + + optim_sd_kwargs = dict(is_loading=True) + # Determine if optimizer state will be loaded + if (not 
release and not args.finetune and not args.no_load_optim + and not getattr(state_dict['args'], 'no_save_optim', False)): + gen_sd_optim = optimizer + gen_sd_opt_param_scheduler = opt_param_scheduler + + if args.use_distributed_optimizer: + optim_sd_kwargs['sharding_type'] = ('fully_sharded_model_space' + if getattr(state_dict['args'], 'ckpt_fully_parallel_save', False) + else 'dp_zero_gather_scatter') + # This is for backwards-compatibility. Can be removed once 'fully_sharded_bucket_space' loading is removed + for maybe_dist_opt_optim_state in (state_dict['optimizer'], *state_dict['optimizer'].values()): + if 'param_state_sharding_type' in maybe_dist_opt_optim_state: + if maybe_dist_opt_optim_state['param_state_sharding_type'] == 'fully_sharded_bucket_space': + print_rank_0('Detected deprecated `fully_sharded_bucket_space` DistributedOptimizer checkpoint format') + optim_sd_kwargs['sharding_type'] = maybe_dist_opt_optim_state['param_state_sharding_type'] + break + + if ckpt_tp_pp != run_tp_pp and optim_sd_kwargs['sharding_type'] != 'fully_sharded_model_space': + raise RuntimeError(f"{mismatch_msg}: not supported for DistributedOptimizer with sharding type {optim_sd_kwargs['sharding_type']}." + f" Please use `--ckpt-fully-parallel-save` flag during checkpoint saving.") + else: + gen_sd_optim = None + gen_sd_opt_param_scheduler = None + load_kwargs['sharded_state_dict'] = generate_state_dict(args, model, gen_sd_optim, gen_sd_opt_param_scheduler, + gen_sd_rng_state, True, optim_sd_kwargs=optim_sd_kwargs) + # When "--fp8-param-gather" is disabled, this function doesn't modify anything. + fix_fp8_params_lose_precision_when_loading_dist_ckpt(load_kwargs['sharded_state_dict']) + + state_dict, checkpoint_name, release, ckpt_type = _load_base_checkpoint( + load_dir, args, rank0=False, checkpointing_context=checkpointing_context, + **load_kwargs + ) + + if args.enable_ft_package and ft_client is not None and state_dict is not None: + if 'ft_state' in state_dict: + ft_client.load_state_dict(state_dict['ft_state']) + else: + print_rank_0("ft_state is not present in state_dict") + + # Checkpoint not loaded. + if state_dict is None: + # Iteration and num_floating_point_operations_so_far default to 0. + return 0, 0 + + # Set checkpoint version. + set_checkpoint_version(state_dict.get('checkpoint_version', 0)) + + # Set iteration. + if args.finetune or release: + iteration = 0 + else: + try: + iteration = state_dict['iteration'] + except KeyError: + try: # Backward compatible with older checkpoints + iteration = state_dict['total_iters'] + except KeyError: + print_rank_0('A metadata file exists but unable to load ' + 'iteration from checkpoint {}, exiting'.format(checkpoint_name)) + sys.exit() + num_floating_point_operations_so_far = state_dict.get('num_floating_point_operations_so_far', 0) + + # Check arguments. 
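+ # These sample counters must still be at their initial (zero) values here; unless
+ # finetuning, they are restored from the checkpoint args below so that data consumption
+ # resumes where the previous run stopped.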
+ assert args.consumed_train_samples == 0 + assert args.skipped_train_samples == 0 + assert args.consumed_valid_samples == 0 + if 'args' in state_dict and not args.finetune: + checkpoint_args = state_dict['args'] + check_checkpoint_args(checkpoint_args) + args.consumed_train_samples = getattr(checkpoint_args, + 'consumed_train_samples', 0) + args.skipped_train_samples = getattr(checkpoint_args, + 'skipped_train_samples', 0) + update_num_microbatches(consumed_samples=args.consumed_train_samples, verbose=True) + args.consumed_valid_samples = getattr(checkpoint_args, + 'consumed_valid_samples', 0) + else: + print_rank_0('could not find arguments in the checkpoint ...') + + # [ModelOpt]: loading modelopt_state (sharded or not) + if has_nvidia_modelopt: + if ckpt_type == CheckpointType.LOCAL: + raise NotImplementedError('Local checkpointing does not support model opt') + if not args.use_dist_ckpt: + restore_modelopt_state(model, state_dict) + else: + restore_sharded_modelopt_state(model, checkpoint_name) + + # Model. + strict = False if args.retro_add_retriever else strict + if len(model) == 1: + model[0].load_state_dict(state_dict['model'], strict=strict) + else: + for i in range(len(model)): + mpu.set_virtual_pipeline_model_parallel_rank(i) + model[i].load_state_dict(state_dict['model%d' % i], strict=strict) + + # Fix up query/key/value matrix ordering if needed. + checkpoint_version = get_checkpoint_version() + print_rank_0(f' checkpoint version {checkpoint_version}') + fix_query_key_value_ordering(model, checkpoint_version) + + # Optimizer. + if not release and not args.finetune and not args.no_load_optim: + try: + # Load state dict. + if optimizer is not None: + optimizer.load_state_dict(state_dict['optimizer']) + + # Load distributed optimizer's custom parameter state. + # For distributed checkpoint it's already loaded in load_state_dict above + if args.use_distributed_optimizer and not is_dist_ckpt: + # NOTE: this is a manual read of the tracker file. + # This code should not be reached when reading from a non_persistent checkpoint + assert not is_dist_ckpt + tracker_filename = get_checkpoint_tracker_filename(load_dir) + iteration, release = read_metadata(tracker_filename) + model_checkpoint_name = \ + get_checkpoint_name(load_dir, iteration, release) + optim_checkpoint_name = \ + get_distributed_optimizer_checkpoint_name( + model_checkpoint_name) + optimizer.load_parameter_state(optim_checkpoint_name, + update_legacy_format=args.ckpt_convert_update_legacy_dist_opt_format) + + # Load scheduler. + if opt_param_scheduler is not None: + if 'lr_scheduler' in state_dict: # backward compatbility + opt_param_scheduler.load_state_dict(state_dict['lr_scheduler']) + else: + opt_param_scheduler.load_state_dict(state_dict['opt_param_scheduler']) + except KeyError as e: + print_rank_0('Unable to load optimizer from checkpoint {}. ' + 'Specify --no-load-optim or --finetune to prevent ' + 'attempting to load the optimizer state, ' + 'exiting ...'.format(checkpoint_name)) + raise e + else: + if (args.fp16 or args.bf16) and optimizer is not None: + optimizer.reload_model_params() + + # rng states. 
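+ # Restore the Python, NumPy, torch and CUDA RNG states (plus the tensor-parallel RNG
+ # tracker) so that randomness such as dropout continues deterministically after a resume;
+ # skipped for release checkpoints, finetuning, or when --no-load-rng is given.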
+ if not release and not args.finetune and not args.no_load_rng: + try: + if 'rng_state' in state_dict: + # access rng_state for data parallel rank + if args.data_parallel_random_init: + rng_state = state_dict['rng_state'][mpu.get_data_parallel_rank()] + else: + rng_state = state_dict['rng_state'][0] + random.setstate(rng_state['random_rng_state']) + np.random.set_state(rng_state['np_rng_state']) + torch.set_rng_state(rng_state['torch_rng_state']) + torch.cuda.set_rng_state(rng_state['cuda_rng_state']) + # Check for empty states array + if not rng_state['rng_tracker_states']: + raise KeyError + tensor_parallel.get_cuda_rng_tracker().set_states( + rng_state['rng_tracker_states']) + else: # backward compatability + random.setstate(state_dict['random_rng_state']) + np.random.set_state(state_dict['np_rng_state']) + torch.set_rng_state(state_dict['torch_rng_state']) + torch.cuda.set_rng_state(state_dict['cuda_rng_state']) + # Check for empty states array + if not state_dict['rng_tracker_states']: + raise KeyError + tensor_parallel.get_cuda_rng_tracker().set_states( + state_dict['rng_tracker_states']) + except KeyError: + print_rank_0('Unable to load rng state from checkpoint {}. ' + 'Specify --no-load-rng or --finetune to prevent ' + 'attempting to load the rng state, ' + 'exiting ...'.format(checkpoint_name)) + sys.exit() + + # Some utilities want to load a checkpoint without distributed being initialized + if torch.distributed.is_initialized(): + torch.distributed.barrier() + + print_rank_0(f' successfully loaded checkpoint from {load_dir} ' + f'[ t {mpu.get_tensor_model_parallel_rank()}, ' + f'p {mpu.get_pipeline_model_parallel_rank()} ] ' + f'at iteration {iteration}') + + torch.cuda.empty_cache() + return iteration, num_floating_point_operations_so_far + + +def load_biencoder_checkpoint(model, only_query_model=False, + only_context_model=False, custom_load_path=None): + """ + selectively load retrieval models for indexing/retrieving + from saved checkpoints + """ + + args = get_args() + + model = unwrap_model(model) + + load_path = custom_load_path if custom_load_path is not None else args.load + + tracker_filename = get_checkpoint_tracker_filename(load_path) + with open(tracker_filename, 'r') as f: + iteration = int(f.read().strip()) + + checkpoint_name = get_checkpoint_name(load_path, iteration, + args.use_distributed_optimizer, + release=False) + + if mpu.get_data_parallel_rank() == 0: + print('global rank {} is loading checkpoint {}'.format( + torch.distributed.get_rank(), checkpoint_name)) + + state_dict = torch.load(checkpoint_name, map_location='cpu') + ret_state_dict = state_dict['model'] + + if only_query_model: + ret_state_dict.pop('context_model') + if only_context_model: + ret_state_dict.pop('query_model') + + assert len(model) == 1 + model[0].load_state_dict(ret_state_dict) + torch.distributed.barrier() + + if mpu.get_data_parallel_rank() == 0: + print(' successfully loaded {}'.format(checkpoint_name)) + + return model diff --git a/megatron/dist_signal_handler.py b/megatron/training/dist_signal_handler.py similarity index 97% rename from megatron/dist_signal_handler.py rename to megatron/training/dist_signal_handler.py index a60204f004..f4b4fbf5c0 100644 --- a/megatron/dist_signal_handler.py +++ b/megatron/training/dist_signal_handler.py @@ -1,3 +1,4 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
import signal
import torch
diff --git a/megatron/training/ft_integration.py b/megatron/training/ft_integration.py
new file mode 100644
index 0000000000..250262775e
--- /dev/null
+++ b/megatron/training/ft_integration.py
@@ -0,0 +1,110 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+
+"""
+FT Package Integration
+
+This file is part of the integration process for the FT package, a custom heartbeat-based
+system developed by NVIDIA. The FT package monitors the ranks to detect hangs, gracefully
+terminates the workload, and respawns it from the last checkpoints. It includes an auto
+config feature that automatically sets up timeouts based on the observed time of iterations.
+
+Note: This tool is an internal NVIDIA tool and is not open source. This file does not
+contain the FT package itself but supports its integration.
+"""
+
+import types
+from enum import Enum, auto
+from . import global_vars
+
+class StateMachineActions(Enum):
+ NONE = auto()
+ SAVE_CHECKPOINT = auto()
+ TRAIN_HEARTBEAT = auto()
+ EVAL_HEARTBEAT = auto()
+ UPDATE_TIMEOUT = auto()
+
+class _TrainingStateMachine:
+ """
+ This class encapsulates logic for determining when:
+ - FT timeouts can be updated (`.can_update_timeouts` property)
+
+ `on_ ...` methods update the state and should be called from the corresponding places.
+ """
+
+ MIN_ITERS_FOR_TIMEOUT_UPDATE = 2
+
+ def __init__(self):
+ self.num_tr_iters_total = 0
+ self.num_tr_iter_at_last_save = None
+ self.seen_checkpointing = False
+ self.timeouts_updated = False
+
+ def on_save_checkpoint(self):
+ self.num_tr_iter_at_last_save = self.num_tr_iters_total
+
+ def on_train_heartbeat(self):
+ self.num_tr_iters_total += 1
+ if not self.seen_checkpointing and self.num_tr_iter_at_last_save is not None:
+ # detect mid-epoch checkpointing that makes the heartbeat interval longer
+ iters_pre_save = self.num_tr_iter_at_last_save
+ iters_post_save = self.num_tr_iters_total - self.num_tr_iter_at_last_save
+ self.seen_checkpointing = iters_pre_save > 0 and iters_post_save > 0
+
+ def on_eval_heartbeat(self):
+ pass
+
+ def on_timeouts_updated(self):
+ self.timeouts_updated = True
+
+ @property
+ def can_update_timeouts(self) -> bool:
+ """
+ Returns True if new timeouts can be computed.
+ `.on_timeouts_updated()` resets this property back to False.
+ """
+ if self.timeouts_updated:
+ # timeouts are updated at most once per training run
+ return False
+ if self.num_tr_iters_total < self.MIN_ITERS_FOR_TIMEOUT_UPDATE:
+ # need a few training iters
+ return False
+ # check if there was checkpoint saving;
+ # this makes the heartbeat interval longer than usual.
+ return self.seen_checkpointing
+
+ def perform_action(self, action: StateMachineActions):
+ if action == StateMachineActions.TRAIN_HEARTBEAT:
+ self.on_train_heartbeat()
+ elif action == StateMachineActions.SAVE_CHECKPOINT:
+ self.on_save_checkpoint()
+ elif action == StateMachineActions.EVAL_HEARTBEAT:
+ self.on_eval_heartbeat()
+ elif action == StateMachineActions.UPDATE_TIMEOUT:
+ self.on_timeouts_updated()
+ assert not self.can_update_timeouts
+ # No action for StateMachineActions.NONE
+
+
+_GLOBAL_RANK_MONITOR_CLIENT = None
+_GLOBAL_STATE_MACHINE = _TrainingStateMachine()
+
+def _set_rank_monitor_client():
+ from nvidia_resiliency_ext.fault_tolerance import RankMonitorClient
+ cli = RankMonitorClient()
+ global _GLOBAL_RANK_MONITOR_CLIENT
+ global_vars._ensure_var_is_not_initialized(_GLOBAL_RANK_MONITOR_CLIENT, 'rank monitor client')
+ _GLOBAL_RANK_MONITOR_CLIENT = cli
+
+def get_rank_monitor_client(action=StateMachineActions.NONE):
+ global _GLOBAL_RANK_MONITOR_CLIENT, _GLOBAL_STATE_MACHINE
+ if _GLOBAL_RANK_MONITOR_CLIENT is None:
+ try:
+ _set_rank_monitor_client()
+ except ImportError:
+ _GLOBAL_RANK_MONITOR_CLIENT = None
+ _GLOBAL_STATE_MACHINE.perform_action(action)
+ return _GLOBAL_RANK_MONITOR_CLIENT
+
+def can_update_timeouts():
+ global _GLOBAL_STATE_MACHINE
+ return _GLOBAL_STATE_MACHINE.can_update_timeouts
diff --git a/megatron/global_vars.py b/megatron/training/global_vars.py
similarity index 54%
rename from megatron/global_vars.py
rename to megatron/training/global_vars.py
index e3831167fd..6c1b551d1d 100644
--- a/megatron/global_vars.py
+++ b/megatron/training/global_vars.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.

"""Megatron global variables."""

@@ -6,16 +6,16 @@ import sys

import torch

-from megatron import dist_signal_handler
-from megatron.tokenizer import build_tokenizer
-from .microbatches import build_num_microbatches_calculator
-from .timers import Timers
+from megatron.core import Timers
+from megatron.core.num_microbatches_calculator import init_num_microbatches_calculator
+from megatron.training import dist_signal_handler
+from megatron.training.tokenizer import build_tokenizer

_GLOBAL_ARGS = None
-_GLOBAL_RETRO_ARGS = None
-_GLOBAL_NUM_MICROBATCHES_CALCULATOR = None
_GLOBAL_TOKENIZER = None
_GLOBAL_TENSORBOARD_WRITER = None
+_GLOBAL_WANDB_WRITER = None
+_GLOBAL_ONE_LOGGER = None
_GLOBAL_ADLR_AUTORESUME = None
_GLOBAL_TIMERS = None
_GLOBAL_SIGNAL_HANDLER = None
@@ -26,24 +26,6 @@ def get_args():
 return _GLOBAL_ARGS


-def get_retro_args():
- """Return retro arguments."""
- return _GLOBAL_RETRO_ARGS
-
-
-def get_num_microbatches():
- return _GLOBAL_NUM_MICROBATCHES_CALCULATOR.get()
-
-
-def get_current_global_batch_size():
- return _GLOBAL_NUM_MICROBATCHES_CALCULATOR.get_current_global_batch_size()
-
-
-def update_num_microbatches(consumed_samples, consistency_check=True):
- _GLOBAL_NUM_MICROBATCHES_CALCULATOR.update(consumed_samples,
- consistency_check)
-
-
 def get_tokenizer():
 """Return tokenizer."""
 _ensure_var_is_initialized(_GLOBAL_TOKENIZER, 'tokenizer')
@@ -56,6 +38,17 @@ def get_tensorboard_writer():
 return _GLOBAL_TENSORBOARD_WRITER

+
+def get_wandb_writer():
+ """Return wandb writer. It can be None so no need
+ to check if it is initialized."""
+ return _GLOBAL_WANDB_WRITER
+
+
+def get_one_logger():
+ """Return one logger.
It can be None so no need + to check if it is initialized.""" + return _GLOBAL_ONE_LOGGER + def get_adlr_autoresume(): """ADLR autoresume object. It can be None so no need to check if it is initialized.""" @@ -80,7 +73,7 @@ def _set_signal_handler(): -def set_global_variables(args): +def set_global_variables(args, build_tokenizer=True): """Set args, tokenizer, tensorboard-writer, adlr-autoresume, and timers.""" assert args is not None @@ -88,36 +81,31 @@ def set_global_variables(args): _ensure_var_is_not_initialized(_GLOBAL_ARGS, 'args') set_args(args) - _build_num_microbatches_calculator(args) - _ = _build_tokenizer(args) + init_num_microbatches_calculator( + args.rank, + args.rampup_batch_size, + args.global_batch_size, + args.micro_batch_size, + args.data_parallel_size, + args.decrease_batch_size_if_needed, + ) + if build_tokenizer: + _ = _build_tokenizer(args) _set_tensorboard_writer(args) + _set_wandb_writer(args) + _set_one_logger(args) _set_adlr_autoresume(args) _set_timers(args) if args.exit_signal_handler: _set_signal_handler() - + def set_args(args): global _GLOBAL_ARGS _GLOBAL_ARGS = args -def set_retro_args(retro_args): - global _GLOBAL_RETRO_ARGS - _GLOBAL_RETRO_ARGS = retro_args - - -def _build_num_microbatches_calculator(args): - - global _GLOBAL_NUM_MICROBATCHES_CALCULATOR - _ensure_var_is_not_initialized(_GLOBAL_NUM_MICROBATCHES_CALCULATOR, - 'num microbatches calculator') - - _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator( - args) - - def _build_tokenizer(args): """Initialize tokenizer.""" global _GLOBAL_TOKENIZER @@ -152,6 +140,54 @@ def _set_tensorboard_writer(args): 'no TensorBoard logs will be written.', flush=True) +def _set_wandb_writer(args): + global _GLOBAL_WANDB_WRITER + _ensure_var_is_not_initialized(_GLOBAL_WANDB_WRITER, + 'wandb writer') + if getattr(args, 'wandb_project', '') and args.rank == (args.world_size - 1): + if args.wandb_exp_name == '': + raise ValueError("Please specify the wandb experiment name!") + + import wandb + if args.wandb_save_dir: + save_dir = args.wandb_save_dir + else: + # Defaults to the save dir. + save_dir = os.path.join(args.save, 'wandb') + wandb_kwargs = { + 'dir': save_dir, + 'name': args.wandb_exp_name, + 'project': args.wandb_project, + 'config': vars(args)} + os.makedirs(wandb_kwargs['dir'], exist_ok=True) + wandb.init(**wandb_kwargs) + _GLOBAL_WANDB_WRITER = wandb + + +def _set_one_logger(args): + global _GLOBAL_ONE_LOGGER + _ensure_var_is_not_initialized(_GLOBAL_ONE_LOGGER, 'one logger') + + if args.enable_one_logger and args.rank == (args.world_size - 1): + if args.one_logger_async or getattr(args, 'wandb_project', ''): + one_logger_async = True + else: + one_logger_async = False + try: + from one_logger import OneLogger + config = { + 'project': args.one_logger_project, + 'name': args.one_logger_run_name, + 'async': one_logger_async, + } + one_logger = OneLogger(config=config) + _GLOBAL_ONE_LOGGER = one_logger + except Exception: + print('WARNING: one_logger package is required to enable e2e metrics ' + 'tracking. 
please go to ' + 'https://confluence.nvidia.com/display/MLWFO/Package+Repositories' + ' for details to install it') + def _set_adlr_autoresume(args): """Initialize ADLR autoresume.""" global _GLOBAL_ADLR_AUTORESUME @@ -163,7 +199,7 @@ def _set_adlr_autoresume(args): sys.path.append(os.environ.get('SUBMIT_SCRIPTS', '.')) try: from userlib.auto_resume import AutoResume - except BaseException: + except ImportError: print('ADLR autoresume is not available, exiting ...') sys.exit() @@ -186,5 +222,27 @@ def _ensure_var_is_not_initialized(var, name): """Make sure the input variable is not None.""" assert var is None, '{} is already initialized.'.format(name) +def destroy_global_vars(): + global _GLOBAL_ARGS + _GLOBAL_ARGS = None + global _GLOBAL_TOKENIZER + _GLOBAL_TOKENIZER = None + global _GLOBAL_TENSORBOARD_WRITER + _GLOBAL_TENSORBOARD_WRITER = None + + global _GLOBAL_WANDB_WRITER + _GLOBAL_WANDB_WRITER = None + + global _GLOBAL_ONE_LOGGER + _GLOBAL_ONE_LOGGER = None + + global _GLOBAL_ADLR_AUTORESUME + _GLOBAL_ADLR_AUTORESUME = None + + global _GLOBAL_TIMERS + _GLOBAL_TIMERS = None + + global _GLOBAL_SIGNAL_HANDLER + _GLOBAL_SIGNAL_HANDLER = None diff --git a/megatron/training/initialize.py b/megatron/training/initialize.py new file mode 100644 index 0000000000..24982205f5 --- /dev/null +++ b/megatron/training/initialize.py @@ -0,0 +1,453 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +"""Megatron initialization.""" +import logging +import random +import os +import time +import warnings + +import numpy as np +import torch +from datetime import timedelta + +from megatron.legacy import fused_kernels +from megatron.training import get_adlr_autoresume +from megatron.training import get_args +from megatron.training import get_tensorboard_writer +from megatron.core import mpu, tensor_parallel +from megatron.training.arguments import parse_args, validate_args +from megatron.training.yaml_arguments import validate_yaml +from megatron.training.checkpointing import load_args_from_checkpoint +from megatron.training.global_vars import set_global_variables +from megatron.core.fusions.fused_bias_dropout import bias_dropout_add_fused_train +from megatron.core.fusions.fused_bias_gelu import bias_gelu +from megatron.core.fusions.fused_bias_swiglu import bias_swiglu +from megatron.core.utils import get_te_version, is_te_min_version + +logger = logging.getLogger(__name__) + + +def initialize_megatron( + extra_args_provider=None, + args_defaults={}, + ignore_unknown_args=False, + allow_no_cuda=False, + skip_mpu_initialization=False, + get_embedding_ranks=None, + get_position_embedding_ranks=None +): + """Set global variables, initialize distributed, and + set autoresume and random seeds. + `allow_no_cuda` should not be set unless using megatron for cpu only + data processing. In general this arg should not be set unless you know + what you are doing. + Returns a function to finalize distributed env initialization + (optionally, only when args.lazy_mpu_init == True) + """ + if not allow_no_cuda: + # Make sure cuda is available. + assert torch.cuda.is_available(), "Megatron requires CUDA." + + # Parse arguments + args = parse_args(extra_args_provider, ignore_unknown_args) + + # Prep for checkpoint conversion. 
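+ # A conversion run needs both a source checkpoint (--load) and a destination
+ # (--ckpt-convert-save); a missing source is treated as a hard error rather than a fresh start.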
+ if args.ckpt_convert_format is not None: + assert args.ckpt_convert_save is not None + assert args.load is not None + args.exit_on_missing_checkpoint = True + + if args.use_checkpoint_args or args_defaults.get("use_checkpoint_args", False): + assert args.load is not None, "--use-checkpoint-args requires --load argument" + load_args_from_checkpoint(args) + + if args.yaml_cfg is not None: + args = validate_yaml(args, args_defaults) + else: + validate_args(args, args_defaults) + + + # set global args, build tokenizer, and set adlr-autoresume, + # tensorboard-writer, and timers. + set_global_variables(args) + + # set logging level + setup_logging() + + # torch.distributed initialization + def finish_mpu_init(): + args = get_args() + # Pytorch distributed. + _initialize_distributed(get_embedding_ranks, get_position_embedding_ranks) + + # Random seeds for reproducibility. + if args.rank == 0: + print("> setting random seeds to {} ...".format(args.seed)) + _set_random_seed(args.seed, args.data_parallel_random_init) + + if skip_mpu_initialization: + return None + + args = get_args() + if args.lazy_mpu_init: + # TODO is this still a necessary option? + args.use_cpu_initialization = True + # delayed initialization of DDP-related stuff + # We only set basic DDP globals + mpu.set_tensor_model_parallel_world_size(args.tensor_model_parallel_size) + # and return function for external DDP manager + # to call when it has DDP initialized + mpu.set_tensor_model_parallel_rank(args.rank) + return finish_mpu_init + else: + # Megatron's MPU is the master. Complete initialization right away. + finish_mpu_init() + + # Autoresume. + _init_autoresume() + + # Compile dependencies. + _compile_dependencies() + + if args.tp_comm_overlap: + #TODO: Should this be activated with just decoder-tp-comm-overlap too? + _initialize_tp_communicators() + + # No continuation function + return None + + +def _compile_dependencies(): + + args = get_args() + + # ========================= + # Compile dataset C++ code. + # ========================= + # TODO: move this to ninja + if torch.distributed.get_rank() == 0: + start_time = time.time() + print("> compiling dataset index builder ...") + from megatron.core.datasets.utils import compile_helpers + + compile_helpers() + print( + ">>> done with dataset index builder. Compilation time: {:.3f} " + "seconds".format(time.time() - start_time), + flush=True, + ) + + # ================== + # Load fused kernels + # ================== + + # Custom kernel constraints check. + seq_len = args.seq_length + attn_batch_size = ( + args.num_attention_heads / args.tensor_model_parallel_size + ) * args.micro_batch_size + # Constraints on sequence length and attn_batch_size to enable warp based + # optimization and upper triangular optimization (for causal mask) + custom_kernel_constraint = ( + seq_len > 16 + and seq_len <= 16384 + and seq_len % 4 == 0 + and attn_batch_size % 4 == 0 + ) + # Print a warning. + if not ( + (args.fp16 or args.bf16) + and custom_kernel_constraint + and args.masked_softmax_fusion + ): + if args.rank == 0: + print( + "WARNING: constraints for invoking optimized" + " fused softmax kernel are not met. We default" + " back to unfused kernel invocations.", + flush=True, + ) + + # Always build on rank zero first. 
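+ # Rank 0 compiles and caches the fused-kernel extensions first; the remaining ranks wait at
+ # the barrier and then pick up the already-built extensions instead of compiling concurrently.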
+ if torch.distributed.get_rank() == 0: + start_time = time.time() + print("> compiling and loading fused kernels ...", flush=True) + fused_kernels.load(args) + torch.distributed.barrier() + else: + torch.distributed.barrier() + fused_kernels.load(args) + # Simple barrier to make sure all ranks have passed the + # compilation phase successfully before moving on to the + # rest of the program. We think this might ensure that + # the lock is released. + torch.distributed.barrier() + if torch.distributed.get_rank() == 0: + print( + ">>> done with compiling and loading fused kernels. " + "Compilation time: {:.3f} seconds".format(time.time() - start_time), + flush=True, + ) + +def _initialize_tp_communicators(): + """ initializing the communicators with user buffers for high-performance tensor-model-parallel + communication overlap """ + + try: + import yaml + + import transformer_engine + from transformer_engine.pytorch import module as te_module + + except ImportError: + raise RuntimeError("Tensor Parallel Communication/GEMM Overlap optimization needs 'yaml' and " + "'transformer_engine' packages") + + args = get_args() + + if args.tp_comm_overlap_cfg is not None: + with open(args.tp_comm_overlap_cfg,"r") as stream: + ub_cfgs = yaml.safe_load(stream) + else: + ub_cfgs = {} + + if getattr(args, 'decoder_tp_comm_overlap', False): + input_shape = [(args.decoder_seq_length * args.micro_batch_size) // args.context_parallel_size , args.hidden_size] + else: + input_shape = [(args.seq_length * args.micro_batch_size) // args.context_parallel_size , args.hidden_size] + + if is_te_min_version("1.9.0"): + # The process group with the target bootstrap backend is created in Transformer Engine. + te_module.base.initialize_ub(shape = input_shape, tp_size = args.tensor_model_parallel_size, + use_fp8 = (args.fp8 is not None) , ub_cfgs = ub_cfgs, + bootstrap_backend = args.tp_comm_bootstrap_backend) + else: + if args.tp_comm_bootstrap_backend != 'mpi': + warnings.warn( + f"Transformer Engine v{get_te_version()} supports only MPI bootstrap backend." + ) + # Create a MPI process group to help with TP communication overlap bootstrap. + torch.distributed.new_group(backend='mpi') + + te_module.base.initialize_ub(shape = input_shape, tp_size = args.tensor_model_parallel_size, + use_fp8 = (args.fp8 is not None) , ub_cfgs = ub_cfgs) + +def _initialize_distributed(get_embedding_ranks, get_position_embedding_ranks): + """Initialize torch.distributed and core model parallel.""" + args = get_args() + + device_count = torch.cuda.device_count() + if torch.distributed.is_initialized(): + + if args.rank == 0: + print( + "torch distributed is already initialized, " + "skipping initialization ...", + flush=True, + ) + args.rank = torch.distributed.get_rank() + args.world_size = torch.distributed.get_world_size() + + else: + + if args.rank == 0: + print("> initializing torch distributed ...", flush=True) + # Manually set the device ids. + if device_count > 0: + torch.cuda.set_device(args.local_rank) + device_id = torch.device(f'cuda:{args.local_rank}') + else: + device_id = None + + # Call the init process + init_process_group_kwargs = { + 'backend' : args.distributed_backend, + 'world_size': args.world_size, + 'rank': args.rank, + 'timeout': timedelta(minutes=args.distributed_timeout_minutes), + } + + torch.distributed.init_process_group(**init_process_group_kwargs) + + # Set the tensor model-parallel, pipeline model-parallel, and + # data-parallel communicators. 
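+ # Model-parallel process groups are only built when GPUs are present (device_count > 0),
+ # and at most once per process.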
+ if device_count > 0:
+ if mpu.model_parallel_is_initialized():
+ print("model parallel is already initialized")
+ else:
+ mpu.initialize_model_parallel(
+ args.tensor_model_parallel_size,
+ args.pipeline_model_parallel_size,
+ args.virtual_pipeline_model_parallel_size,
+ args.pipeline_model_parallel_split_rank,
+ context_parallel_size=args.context_parallel_size,
+ expert_model_parallel_size=args.expert_model_parallel_size,
+ distributed_timeout_minutes=args.distributed_timeout_minutes,
+ nccl_communicator_config_path=args.nccl_communicator_config_path,
+ order='tp-cp-ep-dp-pp' if not args.use_tp_pp_dp_mapping else 'tp-pp-dp',
+ encoder_tensor_model_parallel_size=args.encoder_tensor_model_parallel_size,
+ encoder_pipeline_model_parallel_size=args.encoder_pipeline_model_parallel_size,
+ get_embedding_ranks=get_embedding_ranks,
+ get_position_embedding_ranks=get_position_embedding_ranks,
+ )
+ if args.rank == 0:
+ print(
+ f"> initialized tensor model parallel with size "
+ f"{mpu.get_tensor_model_parallel_world_size()}"
+ )
+ print(
+ f"> initialized pipeline model parallel with size "
+ f"{mpu.get_pipeline_model_parallel_world_size()}"
+ )
+
+
+def _init_autoresume():
+ """Set autoresume start time."""
+ autoresume = get_adlr_autoresume()
+ if autoresume:
+ torch.distributed.barrier()
+ autoresume.init()
+ torch.distributed.barrier()
+
+
+def _set_random_seed(seed_, data_parallel_random_init=False):
+ """Set random seed for reproducibility."""
+ if seed_ is not None and seed_ > 0:
+ # Ensure that different pipeline MP stages get different seeds.
+ seed = seed_ + (100 * mpu.get_pipeline_model_parallel_rank())
+ # Ensure different data parallel ranks get different seeds.
+ if data_parallel_random_init:
+ seed = seed + (10 * mpu.get_data_parallel_rank())
+ random.seed(seed)
+ np.random.seed(seed)
+ torch.manual_seed(seed)
+ if torch.cuda.device_count() > 0:
+ tensor_parallel.model_parallel_cuda_manual_seed(seed)
+ else:
+ raise ValueError("Seed ({}) should be a positive integer.".format(seed))
+
+
+def write_args_to_tensorboard():
+ """Write arguments to tensorboard."""
+ args = get_args()
+ writer = get_tensorboard_writer()
+ if writer:
+ for arg in vars(args):
+ writer.add_text(arg, str(getattr(args, arg)), global_step=args.iteration)
+
+
+def set_jit_fusion_options():
+ """Set PyTorch JIT layer fusion options."""
+ # flags required to enable jit fusion kernels
+ TORCH_MAJOR = int(torch.__version__.split(".")[0])
+ TORCH_MINOR = int(torch.__version__.split(".")[1])
+ if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR >= 10):
+ # nvfuser
+ torch._C._jit_set_profiling_executor(True)
+ torch._C._jit_set_profiling_mode(True)
+ torch._C._jit_override_can_fuse_on_cpu(False)
+ torch._C._jit_override_can_fuse_on_gpu(False)
+ torch._C._jit_set_texpr_fuser_enabled(False)
+ torch._C._jit_set_nvfuser_enabled(True)
+ torch._C._debug_set_autodiff_subgraph_inlining(False)
+ else:
+ # legacy pytorch fuser
+ torch._C._jit_set_profiling_mode(False)
+ torch._C._jit_set_profiling_executor(False)
+ torch._C._jit_override_can_fuse_on_cpu(True)
+ torch._C._jit_override_can_fuse_on_gpu(True)
+
+ _warmup_jit_function()
+
+
+def _warmup_jit_function():
+ """Compile JIT functions before the main training steps."""
+ args = get_args()
+ if args.bf16:
+ dtype = torch.bfloat16
+ elif args.fp16:
+ dtype = torch.float16
+ else:
+ dtype = torch.float32
+
+ # Warmup fused bias+gelu
+ bias = torch.rand(
+ args.ffn_hidden_size // args.tensor_model_parallel_size,
+ dtype=dtype,
+ device="cuda",
+ )
+ input = torch.rand(
+ ( + args.seq_length // args.context_parallel_size, + args.micro_batch_size, + args.ffn_hidden_size // args.tensor_model_parallel_size, + ), + dtype=dtype, + device="cuda", + ) + # Warmup JIT fusions with the input grad_enable state of both forward + # prop and recomputation + for bias_grad, input_grad in zip([True, True], [False, True]): + bias.requires_grad, input.requires_grad = bias_grad, input_grad + for _ in range(5): + if args.swiglu: + output = bias_swiglu(input, bias) + else: + output = bias_gelu(bias, input) + del bias, input, output + + # Warmup fused bias+dropout+add + if args.sequence_parallel: + seq_length = args.seq_length // mpu.get_tensor_model_parallel_world_size() + else: + seq_length = args.seq_length + input = torch.rand( + (seq_length // args.context_parallel_size, args.micro_batch_size, args.hidden_size), + dtype=dtype, + device="cuda", + ) + residual = torch.rand( + (seq_length // args.context_parallel_size, args.micro_batch_size, args.hidden_size), + dtype=dtype, + device="cuda", + ) + bias = torch.rand((args.hidden_size), dtype=dtype, device="cuda").expand_as( + residual + ) + dropout_rate = 0.1 + # Warmup JIT fusions with the input grad_enable state of both forward + # prop and recomputation + for input_grad, bias_grad, residual_grad in zip( + [False, True], [True, True], [True, True] + ): + input.requires_grad = input_grad + bias.requires_grad = bias_grad + residual.requires_grad = residual_grad + for _ in range(5): + output = bias_dropout_add_fused_train([input, bias], residual, dropout_rate) + del bias, input, residual, output + torch.cuda.empty_cache() + + +def setup_logging() -> None: + """ Sets the default logging level based on cmdline args and env vars. + + Precedence: + 1. Command line argument `--logging-level` + 2. Env var `MEGATRON_LOGGING_LEVEL` + 3. Default logging level (INFO) + + Returns: None + """ + args = get_args() + logging_level = None + env_logging_level = os.getenv('MEGATRON_LOGGING_LEVEL', None) + if env_logging_level is not None: + logging_level = int(env_logging_level) + if args.logging_level is not None: + logging_level = args.logging_level + + if logging_level is not None: + logger.info(f'Setting logging level to {logging_level}') + logging.getLogger().setLevel(logging_level) diff --git a/megatron/training/log_handler.py b/megatron/training/log_handler.py new file mode 100644 index 0000000000..06f5d1842d --- /dev/null +++ b/megatron/training/log_handler.py @@ -0,0 +1,24 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +import sys +from logging import LogRecord, StreamHandler + +BLACKLISTED_MODULES = ["torch.distributed"] + + +class CustomHandler(StreamHandler): + """ + Custom handler to filter out logging from code outside of + Megatron Core, and dump to stdout. + """ + + def __init__(self): + super().__init__(stream=sys.stdout) + + def filter(self, record: LogRecord) -> bool: + # Prevent log entries that come from the blacklisted modules + # through (e.g., PyTorch Distributed). + for blacklisted_module in BLACKLISTED_MODULES: + if record.name.startswith(blacklisted_module): + return False + return True diff --git a/megatron/training/one_logger_utils.py b/megatron/training/one_logger_utils.py new file mode 100644 index 0000000000..3a45712b72 --- /dev/null +++ b/megatron/training/one_logger_utils.py @@ -0,0 +1,463 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
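+"""Utilities for tracking end-to-end (E2E) training metrics through the one_logger client."""
+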
+import time, os + +from .global_vars import get_one_logger, get_args + + +def get_timestamp_in_ms(): + """Helper function to get timestamp in ms + + Returns: + [int]: [timestamp in ms] + """ + return round(time.time() * 1000.0) + + +def on_train_start(iteration, consumed_train_samples, train_samples, seq_length, + train_iters, save, async_save, log_throughput, + num_floating_point_operations_so_far): + """Function will be called at the start of train function to prepare and track E2E metrics. + + Args: + iteration (int): current iteration number + consumed_train_samples (int): consumed sample numbers so far + train_samples (int): total train sample number + seq_length (int): sequence length + train_iters (type): target iteration + save (str): output directory to save checkpoints to + async_save (bool): apply async checkpointing save + log_throughput (bool): log throughput or not + num_floating_point_operations_so_far (int): flops so far + """ + one_logger = get_one_logger() + + if one_logger: + with one_logger.get_context_manager(): + # Get app train loop start time + app_train_loop_start_time = get_timestamp_in_ms() + one_logger.store_set('app_train_loop_start_time', app_train_loop_start_time) + + # Set up initial values in store + one_logger.store_set('iteration_start', iteration) + one_logger.store_set('train_samples_start', consumed_train_samples) + + # Init accumulative metric values in one-logger store + one_logger.store_set('train_iterations_time_msecs_total', 0) + one_logger.store_set('tracked_train_iterations', iteration) + one_logger.store_set('validation_iterations_time_msecs_total', 0) + one_logger.store_set('tracked_validation_iterations', 0) + one_logger.store_set('save_checkpoint_count', 0) + one_logger.store_set('save_checkpoint_sync_time_total', 0.0) + + train_samples_target = train_samples + train_tokens_target = seq_length * train_samples_target + e2e_metrics = { + 'train_samples_start': consumed_train_samples, + 'train_iterations_start': iteration, + 'train_samples_target': train_samples_target, + 'train_iterations_target': train_iters, + 'train_tokens_target': train_tokens_target, + 'app_train_loop_start_time': app_train_loop_start_time, + 'is_save_checkpoint_enabled': save is not None, + 'save_checkpoint_strategy': 'async' if async_save else 'sync', + } + if log_throughput: + e2e_metrics.update({ + 'train_tflop_start': float(num_floating_point_operations_so_far) / (10**12), + }) + one_logger.log_metrics(e2e_metrics) + + +def _produce_e2e_metrics(log_throughput=False, throughput=None): + """ Generate APP metrics for E2E tracking + NOTE: always call this function after barrier call + + Args: + log_throughput (bool, optional): if log throughput or not. Defaults to False. + throughput (int, optional): throughput value to log. Defaults to None. 
+ + Returns: + dict: all E2E metrics + """ + one_logger = get_one_logger() + + if one_logger: + with one_logger.get_context_manager(): + # Unpack and assign local vars + base_metrics = one_logger.store_get('get_e2e_base_metrics')() + (iteration, train_duration, eval_duration, eval_iterations, + total_flops, num_floating_point_operations_so_far, + consumed_train_samples, world_size, seq_length) = base_metrics.values() + + iteration_start = one_logger.store_get('iteration_start') + train_samples_start = one_logger.store_get('train_samples_start') + + train_samples = consumed_train_samples - train_samples_start + train_iterations = iteration - iteration_start + train_iterations_time_msecs_avg = (train_duration * 1000.0) / train_iterations + if eval_iterations: + validation_iterations_time_msecs_avg = (eval_duration * 1000.0) / eval_iterations + else: + validation_iterations_time_msecs_avg = None + + if not one_logger.store_has_key('first_logged_train_iterations_finish_time'): + one_logger.store_set( + 'first_logged_train_iterations_finish_time', + get_timestamp_in_ms() + ) + + train_tokens = train_samples * seq_length + + e2e_metrics = { + 'first_logged_train_iterations_finish_time': \ + one_logger.store_get('first_logged_train_iterations_finish_time'), + 'train_iterations_end': iteration, + 'train_samples_end': consumed_train_samples, + 'train_iterations': train_iterations, + 'train_samples': train_samples, + 'train_iterations_time_msecs_avg': train_iterations_time_msecs_avg, + 'validation_iterations_time_total': eval_duration, + 'validation_iterations_time_msecs_avg': validation_iterations_time_msecs_avg, + 'train_tokens': train_tokens, + 'train_iterations_time_total': train_duration, + 'last_logged_train_iterations_finish_time': get_timestamp_in_ms(), + } + + if log_throughput: + if train_duration: + train_throughput_per_gpu = total_flops / (train_duration * 10**12 * world_size) + else: + train_throughput_per_gpu = 0.0 + + train_throughput_per_gpu_max = one_logger.store_get('train_throughput_per_gpu_max') + if throughput: + train_throughput_per_gpu_max = max(throughput, train_throughput_per_gpu_max) + one_logger.store_set('train_throughput_per_gpu_max', train_throughput_per_gpu_max) + + throughput_metrics = { + 'train_tflop_end': float(num_floating_point_operations_so_far) / (10**12), + 'train_tflop': float(total_flops) / (10**12), + 'train_throughput_per_gpu': train_throughput_per_gpu, + 'train_throughput_per_gpu_max': train_throughput_per_gpu_max, + } + e2e_metrics.update(throughput_metrics) + + # Tracking minimal train/validation iteration duration metrics + # Minimal train iteration duration + current_train_iterations_time_msecs_total = train_duration * 1000.0 + current_train_iteration = iteration + prev_train_iterations_time_msecs_total = one_logger.store_get('train_iterations_time_msecs_total') + tracked_train_iterations = one_logger.store_get('tracked_train_iterations') + + if current_train_iteration > tracked_train_iterations: + train_iterations_time_msecs = ( + (current_train_iterations_time_msecs_total - prev_train_iterations_time_msecs_total) / + (current_train_iteration - tracked_train_iterations) + ) + + if not one_logger.store_has_key('train_iterations_time_msecs_min'): + train_iterations_time_msecs_min = train_iterations_time_msecs + else: + train_iterations_time_msecs_min = min( + one_logger.store_get('train_iterations_time_msecs_min'), + train_iterations_time_msecs + ) + one_logger.store_set('train_iterations_time_msecs_min', train_iterations_time_msecs_min) + 
one_logger.store_set('train_iterations_time_msecs_total', current_train_iterations_time_msecs_total) + one_logger.store_set('tracked_train_iterations', current_train_iteration) + + e2e_metrics.update({ + 'train_iterations_time_msecs_min': train_iterations_time_msecs_min + }) + + # Minimal validation iteration duration + current_validation_iterations_time_msecs_total = eval_duration * 1000.0 + current_validation_iteration = eval_iterations + prev_validation_iterations_time_msecs_total = \ + one_logger.store_get('validation_iterations_time_msecs_total') + tracked_validation_iterations = one_logger.store_get('tracked_validation_iterations') + + if current_validation_iteration > tracked_validation_iterations: + validation_iterations_time_msecs = ( + (current_validation_iterations_time_msecs_total - prev_validation_iterations_time_msecs_total) / + (current_validation_iteration - tracked_validation_iterations) + ) + + # Cache minimal validation iteration duration + if not one_logger.store_has_key('validation_iterations_time_msecs_min'): + validation_iterations_time_msecs_min = validation_iterations_time_msecs + else: + validation_iterations_time_msecs_min = min( + one_logger.store_get('validation_iterations_time_msecs_min'), + validation_iterations_time_msecs + ) + one_logger.store_set('validation_iterations_time_msecs_min', validation_iterations_time_msecs_min) + one_logger.store_set('validation_iterations_time_msecs_total', current_validation_iterations_time_msecs_total) + one_logger.store_set('tracked_validation_iterations', current_validation_iteration) + + e2e_metrics.update({ + 'validation_iterations_time_msecs_min': validation_iterations_time_msecs_min + }) + return e2e_metrics + + +def track_e2e_metrics(log_throughput=False, throughput=None): + """Track E2E application metrics with one-logger + + NOTE: the function should be called after barrier call. + + Args: + log_throughput (bool, optional): if log throughput or not. Defaults to False. + throughput (int, optional): throughput value to log. Defaults to None. + """ + one_logger = get_one_logger() + + if one_logger: + with one_logger.get_context_manager(): + e2e_metrics = _produce_e2e_metrics(log_throughput, throughput) + one_logger.log_metrics(e2e_metrics) + + +def on_save_checkpoint_start(async_save): + """Function to be called before save-checkpoint start to generate productive metrics to log after ckpt succeeds. 
+ + Args: + async_save (bool): apply async checkpointing save + + Returns: + dict: productive metrics to be stored to DB after ckpt succeeds + """ + one_logger = get_one_logger() + + if one_logger: + with one_logger.get_context_manager(): + # Unpack and assign local vars + base_metrics = one_logger.store_get('get_e2e_base_metrics')() + (iteration, train_duration, eval_duration, eval_iterations, + total_flops, num_floating_point_operations_so_far, + consumed_train_samples, world_size, seq_length) = base_metrics.values() + + save_checkpoint_count = one_logger.store_get('save_checkpoint_count') + 1 + one_logger.store_set('save_checkpoint_count', save_checkpoint_count) + one_logger.log_metrics({ + 'train_iterations_save_checkpoint_end': iteration, + 'save_checkpoint_count': save_checkpoint_count, + }) + productive_metrics = { + 'train_tflop_productive_end': float(num_floating_point_operations_so_far) / (10**12), + 'train_iterations_productive_end': iteration, + 'train_samples_productive_end': consumed_train_samples, + 'train_iterations_time_total_productive': train_duration, + 'validation_iterations_time_total_productive': eval_duration, + } + if async_save: + productive_metrics.update({ + 'save_checkpoint_async_count': save_checkpoint_count, + }) + return productive_metrics + + +def on_pretrain_start(): + """ Function to be called at the start of pretrain function to track E2E meta data + """ + args = get_args() + one_logger = get_one_logger() + + if one_logger: + with one_logger.get_context_manager(): + job_name = os.environ.get('SLURM_JOB_NAME', None) + app_tag_run_name = job_name if not args.app_tag_run_name else args.app_tag_run_name + app_tag_run_version = args.app_tag_run_version + one_logger.store_set('app_tag_run_name', app_tag_run_name) + one_logger.store_set('app_tag_run_version', app_tag_run_version) + one_logger.store_set('train_throughput_per_gpu_max', 0.0) + + one_logger.log_metrics({ + 'train_iterations_warmup': 5, + 'data_parallel_size' : args.data_parallel_size, + 'context_parallel_size': args.context_parallel_size, + 'global_batch_size': args.global_batch_size, + 'micro_batch_size': args.micro_batch_size, + 'pipeline_model_parallel_size': args.pipeline_model_parallel_size, + 'tensor_model_parallel_size': args.tensor_model_parallel_size, + 'expert_model_parallel_size' : args.expert_model_parallel_size, + 'world_size': args.world_size, + 'model_seq_length': args.seq_length, + 'app_tag_run_name': app_tag_run_name, + 'app_tag_run_version': app_tag_run_version, + 'is_log_throughput_enabled': args.log_throughput, + 'app_run_type': 'training', + 'summary_data_schema_version': '1.0.0', + 'app_metrics_feature_tags': 'full', + }) + +def track_config_flags(train_iters, skip_train, do_train, do_valid, do_test, + dataloader_type, retro_project_dir, retro_cyclic_train_iters): + """Track flags about train/validation/test enablement + + Args: + train_iters (int): target train iteration number + skip_train (bool): flag to skip train iterations + do_train (bool): flags to do train + do_valid (bool): flags to do validation + do_test (bool): flags to do test + dataloader_type (str): dataloader type + retro_project_dir (str): Retro project directory + retro_cyclic_train_iters (int): iteration number for cyclic retro training + """ + one_logger = get_one_logger() + if one_logger: + with one_logger.get_context_manager(): + # Update train_iters for cyclic loader + if dataloader_type == 'cyclic' and retro_project_dir: + assert retro_cyclic_train_iters is not None + train_iters = 
retro_cyclic_train_iters + # Track if training is enabled. Can only be done once args.do_train is assigned after dataloader is built. + train_enabled = train_iters and (not skip_train) and do_train and train_iters > 0 + one_logger.log_metrics({ + 'is_train_iterations_enabled': train_enabled, + 'is_validation_iterations_enabled': bool(do_valid), + 'is_test_iterations_enabled': bool(do_test), + }) + +def on_save_checkpoint_success(productive_metrics, async_save): + """Function to be called after checkpointing succeeds and checkpoint is persisted for storing productive metrics + + Args: + productive_metrics (dict): productive related E2E metrics generated at the start of save checkpoint + async_save (bool): apply async checkpointing save + """ + one_logger = get_one_logger() + + if one_logger: + with one_logger.get_context_manager(): + # Accumulate train_iterations_time_total_productive for current iteration + prod_iteration = productive_metrics['train_iterations_productive_end'] + + # Log start timestamp of first iteration that was successfully checkpointed + if not one_logger.store_has_key('first_checkpoint_success'): + app_train_loop_start_time = one_logger.store_get('app_train_loop_start_time') + one_logger.store_set('first_checkpoint_success', True) + one_logger.log_metrics({ + 'first_saved_train_iterations_start_time': app_train_loop_start_time + }) + + # Handle possible out-of-order async checkpoint callbacks + need_update = True + if one_logger.store_has_key('iters_prod_max'): + need_update = prod_iteration > one_logger.store_get('iters_prod_max') + + if need_update: + # Update cache + one_logger.store_set('iters_prod_max', prod_iteration) + + if async_save: + save_checkpoint_sync_time_total_productive = \ + one_logger.store_pop(f'save_checkpoint_sync_time_total_productive:{prod_iteration}') + last_successful_save_checkpoint_sync_finish_time = \ + one_logger.store_pop(f'save_checkpoint_sync_finish_time:{prod_iteration}') + # Update productive metrics and log to DB + productive_metrics.update({ + 'save_checkpoint_sync_time_total_productive': save_checkpoint_sync_time_total_productive, + 'last_successful_save_checkpoint_sync_finish_time': last_successful_save_checkpoint_sync_finish_time + }) + one_logger.log_metrics(productive_metrics) + + +def on_save_checkpoint_end(save_checkpoint_duration, current_iteration, async_save): + """Function to be called after checkpointing ends + + Args: + save_checkpoint_duration (float): duration of current save checkpoint process + current_iteration (int): current train iteration step number + async_save (bool): apply async checkpointing save + """ + one_logger = get_one_logger() + if one_logger: + with one_logger.get_context_manager(): + save_checkpoint_sync_finish_time = get_timestamp_in_ms() + + # Track finish timestamp of the sync part of first successful save checkpoint + if (one_logger.store_has_key('first_checkpoint_success') + and not one_logger.store_has_key('first_successful_checkpoint_end')): + one_logger.store_set('first_successful_checkpoint_end', True) + one_logger.log_metrics({ + 'first_successful_save_checkpoint_sync_finish_time': save_checkpoint_sync_finish_time + }) + + save_checkpoint_sync_count = one_logger.store_get('save_checkpoint_count') + + # accumulate total sync checkpointing duration + save_checkpoint_sync_time_total = \ + one_logger.store_get('save_checkpoint_sync_time_total') + save_checkpoint_duration + one_logger.store_set('save_checkpoint_sync_time_total', save_checkpoint_sync_time_total) + + e2e_metrics = {} + if 
async_save: + # Cache total sync checkpointing duration + one_logger.store_set( + f'save_checkpoint_sync_time_total_productive:{current_iteration}', + save_checkpoint_sync_time_total + ) + # Cache finish time for current iteration + one_logger.store_set(f'save_checkpoint_sync_finish_time:{current_iteration}', + save_checkpoint_sync_finish_time) + else: + e2e_metrics.update({ + # Track productive total time directly for sync ckpt + 'save_checkpoint_sync_time_total_productive': save_checkpoint_sync_time_total, + 'last_successful_save_checkpoint_sync_finish_time': save_checkpoint_sync_finish_time, + }) + + # Tracking min & max value sync checkpointing duration + # For the first comparison + if not one_logger.store_has_key('save_checkpoint_sync_time_max'): + one_logger.store_set('save_checkpoint_sync_time_max', save_checkpoint_duration) + if not one_logger.store_has_key('save_checkpoint_sync_time_min'): + one_logger.store_set('save_checkpoint_sync_time_min', save_checkpoint_duration) + + save_checkpoint_sync_time_max = max( + one_logger.store_get('save_checkpoint_sync_time_max'), + save_checkpoint_duration + ) + save_checkpoint_sync_time_min = min( + one_logger.store_get('save_checkpoint_sync_time_min'), + save_checkpoint_duration + ) + one_logger.store_set('save_checkpoint_sync_time_max', save_checkpoint_sync_time_max) + one_logger.store_set('save_checkpoint_sync_time_min', save_checkpoint_sync_time_min) + e2e_metrics.update({ + 'save_checkpoint_sync_count': save_checkpoint_sync_count, + 'save_checkpoint_sync_time_max': save_checkpoint_sync_time_max, + 'save_checkpoint_sync_time_min': save_checkpoint_sync_time_min, + 'save_checkpoint_sync_time_total': save_checkpoint_sync_time_total, + }) + one_logger.log_metrics(e2e_metrics) + + +def track_app_tag(batch_size, world_size, seq_length): + """Track app_tag and app_tag ID + + Args: + batch_size (int): current batch size + world_size (int): the number of processes of current job + seq_length (int): current sequence length + """ + # Track app tag & app tag ID + one_logger = get_one_logger() + if one_logger: + with one_logger.get_context_manager(): + app_tag_run_name = one_logger.store_get('app_tag_run_name') + app_tag_run_version = one_logger.store_get('app_tag_run_version') + current_app_tag = (f'{app_tag_run_name}_{app_tag_run_version}_{batch_size}' + f'_{world_size}_{seq_length}') + one_logger.log_app_tag(current_app_tag) + + +def finish(): + """Flush E2E metrics to remote server + """ + one_logger = get_one_logger() + if one_logger: + with one_logger.get_context_manager(): + one_logger.finish() diff --git a/megatron/training/theoretical_memory_usage.py b/megatron/training/theoretical_memory_usage.py new file mode 100644 index 0000000000..f9b75031ae --- /dev/null +++ b/megatron/training/theoretical_memory_usage.py @@ -0,0 +1,187 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +"""Computes theoretical memory footprint for model training.""" + + +import math + +NUM_BYTES_IN_MEGABYTE = 1024 * 1024 + + +def compute_weight_and_optimizer_memory(args, verbose=False): + # Attention projection size. + query_projection_size = args.kv_channels * args.num_attention_heads + query_projection_to_hidden_size_ratio = query_projection_size / args.hidden_size + # Group Query Attention. + if not args.group_query_attention: + args.num_query_groups = args.num_attention_heads + # MoE. 
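+ # Each expert carries its own copy of the MLP weights, and SwiGLU adds a third (gate)
+ # projection, which is the 3/2 multiplier applied to the MLP term below.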
+ num_experts = 1 if args.num_experts is None else args.num_experts + gated_linear_multiplier = 3 / 2 if args.swiglu else 1 + num_parameters_in_transformer_layers = ( + 2 + * args.num_layers + * args.hidden_size + * args.hidden_size + * ( + # Attention. + ( + (1 + (args.num_query_groups / args.num_attention_heads)) + * query_projection_to_hidden_size_ratio + ) + # MLP. + + ((args.ffn_hidden_size / args.hidden_size) * num_experts * gated_linear_multiplier) + # Transformer layernorms. + + (2 / args.hidden_size) + # Final layernorm. + + (1 / (args.num_layers * args.hidden_size)) + ) + ) + embedding_size = args.hidden_size * args.padded_vocab_size + if args.untie_embeddings_and_output_weights: + num_parameters_in_embedding_layers = 2 * embedding_size + else: + num_parameters_in_embedding_layers = embedding_size + num_total_parameters = num_parameters_in_transformer_layers + num_parameters_in_embedding_layers + if verbose: + print( + f"Number of parameters in transformer layers in billions: " + f"{num_parameters_in_transformer_layers / 10**9: .2f}" + ) + print( + f"Number of parameters in embedding layers in billions: " + f"{num_parameters_in_embedding_layers / 10**9:.2f}" + ) + print(f"Total number of parameters in billions: {num_total_parameters / 10**9:.2f}") + + # Most loaded model shard has (1/pp_size transformer layers + 1 embedding layer) / tp_size. + num_parameters_on_most_loaded_model_shard = ( + (num_parameters_in_transformer_layers / args.pipeline_model_parallel_size) + embedding_size + ) / args.tensor_model_parallel_size + if args.untie_embeddings_and_output_weights and args.pipeline_model_parallel_size == 1: + num_parameters_on_most_loaded_model_shard += ( + embedding_size / args.tensor_model_parallel_size + ) + if verbose: + print( + f"Number of parameters in most loaded shard in billions: " + f"{num_parameters_on_most_loaded_model_shard / 10**9:.4f}" + ) + + if args.pipeline_model_parallel_size > 1: + # Other shards just have (1/pp_size transformer layers) / tp_size. + num_parameters_on_other_model_shards = num_parameters_in_transformer_layers / ( + args.pipeline_model_parallel_size * args.tensor_model_parallel_size + ) + if verbose: + print( + f"Number of parameters in other shards in billions: " + f"{num_parameters_on_other_model_shards / 10**9:.4f}" + ) + + num_bytes_per_parameter = ( + 18 if not args.use_distributed_optimizer else 6 + (12 / args.data_parallel_size) + ) + weight_and_optimizer_memory = ( + num_parameters_on_most_loaded_model_shard * num_bytes_per_parameter + ) + + return weight_and_optimizer_memory + + +def compute_activation_memory(args, num_microbatches, verbose=False): + # Using formula in Table 2 of https://arxiv.org/pdf/2205.05198.pdf. + # We are trying to compute the maximum activation footprint, so all calculations in this + # function are for the first pipeline stage. + + # TODO: This function needs to take into account query_projection_size potentially being + # different from hidden_size. + + # Memory footprint from transformer layer (self-attention and MLP). + activation_memory = (args.seq_length * args.micro_batch_size * args.hidden_size) * ( + 18 + (4 * (args.ffn_hidden_size / args.hidden_size)) + ) + if verbose: + print( + f"Activation memory footprint per transformer layer: " + f"{activation_memory / NUM_BYTES_IN_MEGABYTE / args.tensor_model_parallel_size:.1f} MB" + ) + activation_memory *= args.num_layers + + # Now add activation memory required for input embeddings, last LayerNorm and output layer. 
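# Illustrative aside (not part of the original change): a minimal sketch of where the
# "18" and "6 + 12/dp" byte counts in num_bytes_per_parameter above plausibly come from
# for fp16/bf16 training with Adam. The per-component breakdown is an assumption made
# for illustration, not something stated in this file.
def bytes_per_param_sketch(use_distributed_optimizer: bool, data_parallel_size: int) -> float:
    model_param = 2    # fp16/bf16 copy of the parameter
    main_grad = 4      # fp32 gradient accumulation buffer
    main_param = 4     # fp32 master copy held by the optimizer
    adam_moments = 8   # fp32 exp_avg + exp_avg_sq
    if not use_distributed_optimizer:
        return model_param + main_grad + main_param + adam_moments  # 18 bytes per parameter
    # With the distributed optimizer, the fp32 master params and Adam moments
    # (12 bytes) are sharded across data-parallel ranks.
    return model_param + main_grad + (main_param + adam_moments) / data_parallel_size

assert bytes_per_param_sketch(False, 8) == 18
assert bytes_per_param_sketch(True, 8) == 6 + 12 / 8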
+ + # Input to embedding (pp_size microbatches in flight). + activation_memory += ( + 8 * args.seq_length * args.micro_batch_size * args.pipeline_model_parallel_size + ) + # Dropout in embedding layer (pp_size microbatches in flight). + activation_memory += ( + args.seq_length + * args.micro_batch_size + * args.hidden_size + * args.pipeline_model_parallel_size + ) + + # Multiply by interleaved PP memory factor. + if args.virtual_pipeline_model_parallel_size is not None: + interleaved_schedule_memory_penalty = 1 + ( + (args.pipeline_model_parallel_size - 1) + / (args.pipeline_model_parallel_size * args.virtual_pipeline_model_parallel_size) + ) + in_flight_microbatches = math.ceil( + interleaved_schedule_memory_penalty * args.pipeline_model_parallel_size + ) + if verbose: + print( + f"Memory penalty from interleaved schedule: {interleaved_schedule_memory_penalty:.2f}" + ) + print(f"Number of in-flight microbatches: {in_flight_microbatches}") + activation_memory *= interleaved_schedule_memory_penalty + + # If using non-interleaved schedule, number of microbatches in pipeline can be less than pp_size, + # so discount accordingly. + if args.virtual_pipeline_model_parallel_size is None and args.pipeline_model_parallel_size > 1: + if num_microbatches is not None: + activation_memory *= min(1, num_microbatches / args.pipeline_model_parallel_size) + in_flight_microbatches = min(num_microbatches, args.pipeline_model_parallel_size) + else: + in_flight_microbatches = args.pipeline_model_parallel_size + if verbose: + print(f"Number of in-flight microbatches: {in_flight_microbatches}") + + if args.pipeline_model_parallel_size == 1: + # Inputs to output layer and CE loss. + activation_memory += ( + args.seq_length + * args.micro_batch_size + * args.hidden_size + * 4 + * (1 + (args.padded_vocab_size / args.hidden_size)) + ) + + # Activation memory is partitioned by TP size due to tensor and sequence model parallelism. + return activation_memory / args.tensor_model_parallel_size + + +def report_theoretical_memory(args, num_microbatches=None, verbose=False): + weight_and_optimizer_memory = ( + compute_weight_and_optimizer_memory(args, verbose=verbose) / NUM_BYTES_IN_MEGABYTE + ) + + # Formulae here assume sequence parallelism and selective activation recomputation. 
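# Illustrative usage sketch (not part of the original change): the functions in this file
# only read plain attributes from `args`, so they can be exercised outside a full Megatron
# run with a simple namespace. The config values below are arbitrary examples.
from types import SimpleNamespace

example_args = SimpleNamespace(
    num_layers=24, hidden_size=2048, ffn_hidden_size=8192,
    num_attention_heads=16, kv_channels=128,
    group_query_attention=False, num_query_groups=16,
    num_experts=None, swiglu=False,
    padded_vocab_size=50304, untie_embeddings_and_output_weights=False,
    seq_length=2048, micro_batch_size=1,
    tensor_model_parallel_size=2, pipeline_model_parallel_size=1,
    virtual_pipeline_model_parallel_size=None,
    data_parallel_size=4, use_distributed_optimizer=True,
    sequence_parallel=True, recompute_granularity='selective',
)
# report_theoretical_memory(example_args, num_microbatches=8, verbose=True)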
+ if not args.sequence_parallel or args.recompute_granularity != 'selective': + print( + f"Theoretical memory footprints: weight and optimizer={weight_and_optimizer_memory:.2f} MB" + ) + return + + activation_memory = ( + compute_activation_memory(args, num_microbatches=num_microbatches, verbose=verbose) + / NUM_BYTES_IN_MEGABYTE + ) + total_memory = weight_and_optimizer_memory + activation_memory + + print( + f"Theoretical memory footprints: weight and optimizer={weight_and_optimizer_memory:.2f} MB, " + f"activation={activation_memory:.2f} MB, total={total_memory:.2f} MB\n" + ) diff --git a/megatron/tokenizer/__init__.py b/megatron/training/tokenizer/__init__.py similarity index 100% rename from megatron/tokenizer/__init__.py rename to megatron/training/tokenizer/__init__.py diff --git a/megatron/tokenizer/bert_tokenization.py b/megatron/training/tokenizer/bert_tokenization.py similarity index 100% rename from megatron/tokenizer/bert_tokenization.py rename to megatron/training/tokenizer/bert_tokenization.py diff --git a/megatron/tokenizer/gpt2_tokenization.py b/megatron/training/tokenizer/gpt2_tokenization.py similarity index 99% rename from megatron/tokenizer/gpt2_tokenization.py rename to megatron/training/tokenizer/gpt2_tokenization.py index 3f37e44908..4080abeebc 100644 --- a/megatron/tokenizer/gpt2_tokenization.py +++ b/megatron/training/tokenizer/gpt2_tokenization.py @@ -213,7 +213,7 @@ def bpe(self, token): j = word.index(first, i) new_word.extend(word[i:j]) i = j - except BaseException: + except Exception: new_word.extend(word[i:]) break diff --git a/megatron/training/tokenizer/tokenizer.py b/megatron/training/tokenizer/tokenizer.py new file mode 100644 index 0000000000..1ddc7a237f --- /dev/null +++ b/megatron/training/tokenizer/tokenizer.py @@ -0,0 +1,795 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +"""Megatron tokenizers.""" + +import base64 +import json +import math +import types +from abc import ABC, abstractmethod +from pathlib import Path +from typing import Dict, List, Optional + +from megatron.core.datasets.megatron_tokenizer import MegatronTokenizer + +from .bert_tokenization import FullTokenizer as FullBertTokenizer +from .gpt2_tokenization import GPT2Tokenizer + + +def build_tokenizer(args, **kwargs): + """Initialize tokenizer.""" + if args.rank == 0: + print('> building {} tokenizer ...'.format(args.tokenizer_type), flush=True) + + # Select and instantiate the tokenizer. 
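# Illustrative usage sketch (not part of the original change): build_tokenizer only reads
# plain attributes from `args`, so a lightweight namespace is enough for local
# experimentation. The vocab/merge file paths below are placeholders.
from types import SimpleNamespace

example_args = SimpleNamespace(
    rank=0,
    tokenizer_type='GPT2BPETokenizer',
    vocab_file='/path/to/gpt2-vocab.json',   # placeholder path
    merge_file='/path/to/gpt2-merges.txt',   # placeholder path
    make_vocab_size_divisible_by=128,
    tensor_model_parallel_size=1,
)
# tokenizer = build_tokenizer(example_args)
# print(tokenizer.vocab_size, example_args.padded_vocab_size)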
+ if args.tokenizer_type == 'BertWordPieceLowerCase': + assert args.vocab_file is not None + tokenizer = _BertWordPieceTokenizer( + vocab_file=args.vocab_file, lower_case=True, vocab_extra_ids=args.vocab_extra_ids + ) + elif args.tokenizer_type == 'BertWordPieceCase': + assert args.vocab_file is not None + tokenizer = _BertWordPieceTokenizer( + vocab_file=args.vocab_file, lower_case=False, vocab_extra_ids=args.vocab_extra_ids + ) + elif args.tokenizer_type == 'GPT2BPETokenizer': + assert args.vocab_file is not None + assert args.merge_file is not None + tokenizer = _GPT2BPETokenizer(args.vocab_file, args.merge_file) + elif args.tokenizer_type == 'SentencePieceTokenizer': + assert args.tokenizer_model is not None + tokenizer = _SentencePieceTokenizer( + args.tokenizer_model, vocab_extra_ids=args.vocab_extra_ids + ) + elif args.tokenizer_type == 'GPTSentencePieceTokenizer': + assert args.tokenizer_model is not None + tokenizer = _GPTSentencePieceTokenizer(args.tokenizer_model) + elif args.tokenizer_type == 'HuggingFaceTokenizer': + tokenizer = _HuggingFaceTokenizer(args.tokenizer_model, **kwargs) + elif args.tokenizer_type == 'Llama2Tokenizer': + assert args.tokenizer_model is not None + tokenizer = _Llama2Tokenizer(args.tokenizer_model) + elif args.tokenizer_type == 'TikTokenizer': + assert args.tokenizer_model is not None + assert args.tiktoken_pattern is not None + assert args.tiktoken_pattern in {"v1", "v2"} + pattern = PATTERN_TIKTOKEN if args.tiktoken_pattern == "v1" else PATTERN_TIKTOKEN_V2 + tokenizer = CustomTikTokenizer( + path=args.tokenizer_model, + pattern=pattern, + vocab_size=args.vocab_size, + num_special_tokens=args.tiktoken_num_special_tokens, + special_tokens=args.tiktoken_special_tokens, + ) + elif args.tokenizer_type == 'NullTokenizer': + assert args.vocab_size is not None + tokenizer = _NullTokenizer(args.vocab_size) + else: + raise NotImplementedError('{} tokenizer is not ' 'implemented.'.format(args.tokenizer_type)) + + # Add vocab size (if not already set from a checkpoint). 
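# Illustrative aside (not part of the original change): the padding below rounds the raw
# vocab size up to a multiple of make_vocab_size_divisible_by * tensor_model_parallel_size.
# For example, GPT-2's 50257-token vocab with the default divisor of 128 and tensor
# parallel size 1 becomes 50304:
import math

orig_vocab_size = 50257
multiple = 128 * 1   # make_vocab_size_divisible_by * tensor_model_parallel_size
padded = int(math.ceil(orig_vocab_size / multiple) * multiple)
assert padded == 50304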
+ if getattr(args, "padded_vocab_size", None) is None: + args.padded_vocab_size = _vocab_size_with_padding(tokenizer.vocab_size, args) + + return tokenizer + + +def _vocab_size_with_padding(orig_vocab_size, args, logging_enabled=True): + """Pad vocab size so it is divisible by model parallel size and + still having GPU friendly size.""" + + after = orig_vocab_size + multiple = args.make_vocab_size_divisible_by * args.tensor_model_parallel_size + after = int(math.ceil(after / multiple) * multiple) + if args.rank == 0 and logging_enabled: + print( + ' > padded vocab (size: {}) with {} dummy tokens ' + '(new size: {})'.format(orig_vocab_size, after - orig_vocab_size, after), + flush=True, + ) + return after + + +class _HuggingFaceTokenizer(MegatronTokenizer): + def __init__(self, pretrained_model_name_or_path, **kwargs): + super().__init__(pretrained_model_name_or_path, **kwargs) + try: + import transformers + except ImportError: + raise EnvironmentError( + f"The transformers library must be installed to use huggingface_tokenizer_provider" + ) + + # TODO(bnorick): download tokenizer once to lustre and use force offline to make sure all tasks read it from there + self._tokenizer = transformers.AutoTokenizer.from_pretrained( + pretrained_model_name_or_path=pretrained_model_name_or_path, **kwargs + ) + self._vocab = self._tokenizer.get_vocab() + self._inv_vocab = {token_id: token for token, token_id in self._vocab.items()} + + @property + def vocab_size(self): + return len(self._tokenizer) + + @property + def vocab(self): + """Dictionary from vocab text token to id token.""" + return self._vocab + + @property + def inv_vocab(self): + """Dictionary from vocab id token to text token.""" + return self._inv_vocab + + @property + def decoder(self): + return self._inv_vocab + + def tokenize(self, text, **kwargs): + return self._tokenizer(text, **kwargs).input_ids + + def detokenize(self, token_ids, **kwargs): + return self._tokenizer.decode(token_ids, **kwargs) + + def offsets(self, ids: list[int], text: str) -> list[int]: + retok_ids: "transformers.BatchEncoding" = self._tokenizer(text) + offsets, next_start_idx = [], 0 + for i in range(len(ids)): + span = retok_ids.token_to_chars(i) + if span is not None: + offsets.append(span.start) + next_start_idx = span.end + else: + offsets.append(next_start_idx) + return offsets + + @property + def eod(self): + return self._tokenizer.eos_token_id + + +class _BertWordPieceTokenizer(MegatronTokenizer): + """Original BERT wordpiece tokenizer.""" + + def __init__(self, vocab_file, lower_case=True, vocab_extra_ids=0): + super().__init__(vocab_file, lower_case=lower_case, vocab_extra_ids=vocab_extra_ids) + self.tokenizer = FullBertTokenizer(vocab_file, do_lower_case=lower_case) + self.cls_id = self.tokenizer.vocab['[CLS]'] + self.sep_id = self.tokenizer.vocab['[SEP]'] + self.pad_id = self.tokenizer.vocab['[PAD]'] + self.mask_id = self.tokenizer.vocab['[MASK]'] + self._additional_special_tokens = [] + + # (dsachan) Add BOS and EOS tokens + SPECIAL_TOKENS = {'eos_token': '[EOS]', 'bos_token': '[BOS]'} + self._bos_token = '[BOS]' + self.add_token(self._bos_token) + self._bos_token_id = self.vocab.get(self._bos_token) + + self._eos_token = '[EOS]' + self.add_token(self._eos_token) + self._eos_token_id = self.vocab.get(self._eos_token) + + # (dsachan) Add additional special tokens + # These can be used as sentinel tokens in T5 model inputs + additional_special_tokens = [] + additional_special_tokens.extend( + ["".format(i) for i in range(vocab_extra_ids)] + ) + 
self.add_additional_special_tokens(additional_special_tokens) + + def add_token(self, token): + if token not in self.vocab: + self.inv_vocab[self.vocab_size] = token + # self.vocab_size comes from len(vocab) + # and it will increase as we add elements + self.vocab[token] = self.vocab_size + + def add_additional_special_tokens(self, tokens_list): + setattr(self, "additional_special_tokens", tokens_list) + for value in tokens_list: + self.add_token(value) + + @property + def vocab_size(self): + return self.tokenizer.vocab_size() + + @property + def vocab(self): + return self.tokenizer.vocab + + @property + def inv_vocab(self): + return self.tokenizer.inv_vocab + + def tokenize(self, text): + text_tokens = self.tokenizer.tokenize(text) + return self.tokenizer.convert_tokens_to_ids(text_tokens) + + def decode(self, ids): + tokens = self.tokenizer.convert_ids_to_tokens(ids) + return self.tokenizer.convert_tokens_to_string(tokens) + + def detokenize(self, token_ids): + """Copy of decode() method for inference pipeline compatibility""" + return self.decode(token_ids) + + def decode_token_ids(self, token_ids): + tokens = self.tokenizer.convert_ids_to_tokens(token_ids) + exclude_list = ['[PAD]', '[CLS]'] + non_pads = [t for t in tokens if t not in exclude_list] + + result = "" + for s in non_pads: + if s.startswith("##"): + result += s[2:] + else: + result += " " + s + + return result + + @property + def cls(self): + return self.cls_id + + @property + def sep(self): + return self.sep_id + + @property + def pad(self): + return self.pad_id + + @property + def mask(self): + return self.mask_id + + @property + def bos(self): + """Id of the beginning of sentence token in the vocabulary.""" + return self._bos_token_id + + @property + def eos(self): + """Id of the end of sentence token in the vocabulary.""" + return self._eos_token_id + + @property + def eod(self): + """Copy of eod property for inference pipeline compatibility""" + return self.eos + + @property + def bos_token(self): + """Beginning of sentence token id""" + return self._bos_token + + @property + def eos_token(self): + """End of sentence token id""" + return self._eos_token + + @property + def additional_special_tokens(self): + """All the additional special tokens you may want to use (list of strings).""" + return self._additional_special_tokens + + @property + def additional_special_tokens_ids(self): + """Ids of all the additional special tokens in the vocabulary (list of integers).""" + return [self.vocab.get(token) for token in self._additional_special_tokens] + + @additional_special_tokens.setter + def additional_special_tokens(self, value): + self._additional_special_tokens = value + + +class _GPT2BPETokenizer(MegatronTokenizer): + """Original GPT2 BPE tokenizer.""" + + def __init__(self, vocab_file, merge_file): + super().__init__(vocab_file, merge_file) + + self.tokenizer = GPT2Tokenizer( + vocab_file, merge_file, errors='replace', special_tokens=[], max_len=None + ) + self.eod_id = self.tokenizer.encoder['<|endoftext|>'] + + @property + def vocab_size(self): + return len(self.tokenizer.encoder) + + @property + def vocab(self): + return self.tokenizer.encoder + + @property + def inv_vocab(self): + return self.tokenizer.decoder + + def tokenize(self, text): + return self.tokenizer.encode(text) + + def detokenize(self, token_ids): + return self.tokenizer.decode(token_ids) + + @property + def eod(self): + return self.eod_id + + +class _SentencePieceTokenizer(MegatronTokenizer): + """SentencePieceTokenizer-Megatron wrapper""" + + def 
__init__(self, model_file, vocab_extra_ids=0): + super().__init__(model_file, vocab_extra_ids=vocab_extra_ids) + + import sentencepiece + + self.tokenizer = sentencepiece.SentencePieceProcessor(model_file=model_file) + self._initalize(vocab_extra_ids) + + def _populate_vocab(self): + self._vocab = {} + self._inv_vocab = {} + + for i in range(len(self.tokenizer)): + t = self.tokenizer.id_to_piece(i) + self._inv_vocab[i] = t + self._vocab[t] = i + + def _initalize(self, vocab_extra_ids): + self._populate_vocab() + self._special_tokens = {} + self._inv_special_tokens = {} + + self._t5_tokens = [] + + def _add_special_token(t): + if t not in self._vocab: + next_id = len(self._vocab) + self._vocab[t] = next_id + self._inv_vocab[next_id] = t + self._special_tokens[t] = self._vocab[t] + self._inv_special_tokens[self._vocab[t]] = t + + _add_special_token('') + self._cls_id = self._vocab[''] + _add_special_token('') + self._sep_id = self._vocab[''] + _add_special_token('') + self._eod_id = self._vocab[''] + _add_special_token('') + self._mask_id = self._vocab[''] + + pad_id = self.tokenizer.pad_id() + try: + pad_token = self.tokenizer.id_to_piece(pad_id) + except IndexError: + pad_token = '' + _add_special_token(pad_token) + self._pad_id = self._vocab[pad_token] + + bos_id = self.tokenizer.bos_id() + try: + bos_token = self.tokenizer.id_to_piece(bos_id) + except IndexError: + bos_token = '' + _add_special_token(bos_token) + self._bos_id = self._vocab[bos_token] + + eos_id = self.tokenizer.eos_id() + try: + eos_token = self.tokenizer.id_to_piece(eos_id) + except IndexError: + eos_token = '' + _add_special_token(eos_token) + self._eos_id = self._vocab[eos_token] + + for i in range(vocab_extra_ids): + t = "".format(i) + _add_special_token(t) + self._t5_tokens += [t] + + @property + def vocab_size(self): + return len(self._vocab) + + @property + def vocab(self): + return self._vocab + + @property + def inv_vocab(self): + return self._inv_vocab + + @property + def decoder(self): + return self._inv_vocab + + @property + def encoder(self): + return self._vocab + + # From: + # https://github.com/NVIDIA/NeMo/blob/c8fa217e811d60d11d014827c7f3845ff6c99ae7/nemo/collections/common/tokenizers/sentencepiece_tokenizer.py#L89 + def tokenize(self, text): + ids = [] + idx = 0 + + while 1: + indices = {} + for token in self._special_tokens: + try: + indices[token] = text[idx:].index(token) + except ValueError: + continue + if len(indices) == 0: + break + + next_token = min(indices, key=indices.get) + next_idx = idx + indices[next_token] + + ids.extend(self.tokenizer.encode_as_ids(text[idx:next_idx])) + ids.append(self._special_tokens[next_token]) + idx = next_idx + len(next_token) + + ids.extend(self.tokenizer.encode_as_ids(text[idx:])) + return ids + + # From: + # https://github.com/NVIDIA/NeMo/blob/c8fa217e811d60d11d014827c7f3845ff6c99ae7/nemo/collections/common/tokenizers/sentencepiece_tokenizer.py#L125 + def detokenize(self, ids): + text = "" + last_i = 0 + + for i, id in enumerate(ids): + if id in self._inv_special_tokens: + text += self.tokenizer.decode_ids(ids[last_i:i]) + " " + text += self._inv_special_tokens[id] + " " + last_i = i + 1 + + text += self.tokenizer.decode_ids(ids[last_i:]) + return text + + def offsets(self, ids: list[int], text: str) -> list[int]: + return [p.begin for p in self.tokenizer.decode_ids_as_immutable_proto(ids).pieces] + + @property + def cls(self): + return self._cls_id + + @property + def sep(self): + return self._sep_id + + @property + def pad(self): + return self._pad_id + + 
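# Illustrative aside (not part of the original change): the tokenize() loop above splits the
# input on registered special tokens before handing the plain spans to SentencePiece. A
# standalone sketch of that scan follows; the '<CLS>'/'<SEP>' strings are example special
# tokens, and the SentencePiece encoding step is replaced by returning the raw text spans.
def split_on_special_tokens(text, special_tokens):
    segments = []
    idx = 0
    while True:
        indices = {}
        for token in special_tokens:
            pos = text.find(token, idx)
            if pos != -1:
                indices[token] = pos - idx
        if not indices:
            break
        next_token = min(indices, key=indices.get)
        next_idx = idx + indices[next_token]
        segments.append(('text', text[idx:next_idx]))
        segments.append(('special', next_token))
        idx = next_idx + len(next_token)
    segments.append(('text', text[idx:]))
    return segments

assert split_on_special_tokens('a b<SEP>c', {'<CLS>', '<SEP>'}) == [
    ('text', 'a b'), ('special', '<SEP>'), ('text', 'c')]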
@property + def bos(self): + return self._bos_id + + @property + def eod(self): + return self._eod_id + + @property + def eos(self): + return self._eos_id + + @property + def mask(self): + return self._mask_id + + @property + def additional_special_tokens_ids(self): + return [self.vocab[k] for k in self._t5_tokens] + + +class _GPTSentencePieceTokenizer(_SentencePieceTokenizer): + """SentencePieceTokenizer-Megatron wrapper""" + + def __init__(self, model_file): + super().__init__(model_file, vocab_extra_ids=0) + + def _initalize(self, vocab_extra_ids): + self._populate_vocab() + + self._pad_id = self.tokenizer.pad_id() + self._bos_id = self.tokenizer.bos_id() + self._eos_id = self.tokenizer.eos_id() + + def tokenize(self, text): + return self.tokenizer.encode_as_ids(text) + + def detokenize(self, ids): + return self.tokenizer.decode_ids(ids) + + @property + def cls(self): + return -1 + + @property + def sep(self): + return -1 + + @property + def mask(self): + return -1 + + @property + def eod(self): + return self._eos_id + + @property + def additional_special_tokens_ids(self): + return None + + +class _Llama2Tokenizer(_SentencePieceTokenizer): + """SentencePieceTokenizer-Megatron wrapper""" + + def __init__(self, model_file): + super().__init__(model_file, vocab_extra_ids=0) + + def _initalize(self, vocab_extra_ids): + self._populate_vocab() + + # BOS / EOS token IDs + self.n_words: int = self.tokenizer.vocab_size() + self.bos_id: int = self.tokenizer.bos_id() + self.eos_id: int = self.tokenizer.eos_id() + self.pad_id: int = self.tokenizer.pad_id() + assert self.tokenizer.vocab_size() == self.tokenizer.get_piece_size() + + def tokenize(self, s: str, bos=True, eos=False): + '''Default args for text completion, not chat/dialog.''' + assert type(s) is str + t = self.tokenizer.encode(s) + if bos: + t = [self.bos_id] + t + if eos: + t = t + [self.eos_id] + return t + + def detokenize(self, ids): + return self.tokenizer.decode_ids(ids) + + @property + def cls(self): + return -1 + + @property + def sep(self): + return -1 + + @property + def mask(self): + return -1 + + @property + def eod(self): + return self.eos_id + + @property + def additional_special_tokens_ids(self): + return None + + +def reload_mergeable_ranks(path: str, max_vocab: Optional[int] = None) -> Dict[bytes, int]: + """ + Reload our tokenizer JSON file and convert it to Tiktoken format. + """ + from ..utils import print_rank_0 # To prevent circular import. 
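# Illustrative aside (not part of the original change): judging from the asserts below in
# this function, the tokenizer JSON is expected to be a list of
# {"rank", "token_bytes", "token_str"} entries with consecutive ranks, whose first 256
# entries cover the single bytes. A minimal sketch that writes such a file (a real
# vocabulary would append merged tokens after the byte entries):
import base64
import json

def write_toy_tiktoken_vocab(path: str) -> None:
    assert path.endswith(".json")
    vocab = []
    for i in range(256):
        token_bytes = bytes([i])
        vocab.append({
            "rank": i,
            "token_bytes": base64.b64encode(token_bytes).decode("ascii"),
            "token_str": token_bytes.decode("utf-8", errors="replace"),
        })
    with open(path, "w") as f:
        json.dump(vocab, f)

# write_toy_tiktoken_vocab("/tmp/toy_vocab.json")
# reload_mergeable_ranks("/tmp/toy_vocab.json")  # -> {b'\x00': 0, ..., b'\xff': 255}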
+ + assert path.endswith(".json") + + # reload vocab + with open(path, "r") as f: + vocab = json.load(f) + assert isinstance(vocab, list) + print_rank_0(f"Vocab size: {len(vocab)}") + if max_vocab is not None: + vocab = vocab[:max_vocab] + print_rank_0(f"Cutting vocab to first {len(vocab)} tokens.") + + # build ranks + ranks: Dict[bytes, int] = {} + for i, x in enumerate(vocab): + assert x.keys() == {"rank", "token_bytes", "token_str"} + assert x["rank"] == i + merge = base64.b64decode(x["token_bytes"]) + assert i >= 256 or merge == bytes([i]) + ranks[merge] = x["rank"] + + # sanity check + assert len(ranks) == len(vocab) + assert set(ranks.values()) == set(range(len(ranks))) + + return ranks + + +PATTERN_TIKTOKEN = ( + r"[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+" +) +PATTERN_TIKTOKEN_V2 = "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+" + + +class CustomTikTokenizer(MegatronTokenizer): + def __init__( + self, + path: str, + pattern: str, + vocab_size: Optional[int], + num_special_tokens: int, + special_tokens: Optional[List[str]], + ): + super().__init__( + path, + pattern=pattern, + vocab_size=vocab_size, + num_special_tokens=num_special_tokens, + special_tokens=special_tokens, + ) + import tiktoken + + from .. import print_rank_0 # To prevent circular import. + + if vocab_size is None: + vocab_size = 2**17 # Fallback vocab size is 131072. + self._vocab_size = vocab_size + + SPECIAL_TOKENS = ["", "", ""] + if special_tokens is None: + special_tokens = SPECIAL_TOKENS.copy() + assert len(special_tokens) == len( + set(special_tokens) + ), f"Special tokens should be unique: {special_tokens}" + assert len(special_tokens) <= num_special_tokens < self._vocab_size + assert set(SPECIAL_TOKENS) <= set( + special_tokens + ), f"Custom special tokens should include {SPECIAL_TOKENS}" + + special_filler = [ + "".format(id=i) for i in range(len(special_tokens), num_special_tokens) + ] + if special_filler: + print_rank_0(f"Adding special tokens {special_filler[0]}, ..., {special_filler[-1]}") + special_tokens = special_tokens + special_filler + assert len(set(special_tokens)) == len(special_tokens) == num_special_tokens, special_tokens + inner_vocab_size = self._vocab_size - num_special_tokens + + token_to_id_without_special_tokens = reload_mergeable_ranks( + path, max_vocab=inner_vocab_size + ) + # Create space for special tokens. + token_to_id_without_special_tokens = { + t: i + num_special_tokens for t, i in token_to_id_without_special_tokens.items() + } + + special_tokens = {t: i for i, t in enumerate(special_tokens)} + self._unk_id = special_tokens[""] + self._bos_id = special_tokens[""] + self._eos_id = special_tokens[""] + + # Create tiktoken model. + self._model = tiktoken.Encoding( + name=Path(path).parent.name, + pat_str=pattern, + mergeable_ranks=token_to_id_without_special_tokens, + special_tokens=special_tokens, + ) + + # Create final _id_to_token and _token_to_id data structures with special tokens inserted + # into appropriate locations. 
+ assert set(token_to_id_without_special_tokens.keys()).isdisjoint(set(special_tokens.keys())) + self._token_to_id = token_to_id_without_special_tokens.copy() + self._token_to_id.update(special_tokens) + self._id_to_token = {v: k for k, v in self._token_to_id.items()} + assert set(range(self._vocab_size)) == set(self._id_to_token.keys()) + + @property + def bos(self) -> int: + return self._bos_id + + @property + def eos(self) -> int: + return self._eos_id + + @property + def unk(self) -> int: + return self._unk_id + + @property + def eod(self) -> int: + return self._eos_id + + @property + def vocab(self): + return self._token_to_id + + @property + def inv_vocab(self): + return self._id_to_token + + def tokenize(self, s: str, bos: bool = False, eos: bool = False) -> List[int]: + tokens = self._model.encode_ordinary(s) + if bos: + tokens = [self.bos, *tokens] + if eos: + tokens = [*tokens, self.eos] + + return tokens + + def detokenize(self, tokens: List[int]) -> str: + return self._model.decode(tokens) + + def offsets(self, ids: list[int], text: str) -> list[int]: + return self._model.decode_with_offsets(ids)[1] + + @property + def vocab_size(self) -> int: + return self._vocab_size + + @property + def encoder(self): + return self._token_to_id + + @property + def decoder(self): + return self._id_to_token + + +class _NullTokenizer(MegatronTokenizer): + def __init__(self, vocab_size): + super().__init__(None, vocab_size=vocab_size) + self._vocab_size_without_eod = int(vocab_size) + self._eod_id = self._vocab_size_without_eod + + def tokenize(self, text): + return [int(x) for x in text.split(' ')] + + def detokenize(self, ids): + text = [str(x) for x in ids] + return ' '.join(text) + + def offsets(self, ids: list[int], text: str) -> list[int]: + offsets, start_idx = [], 0 + for id_ in ids: + offsets.append(start_idx) + start_idx += 1 + len(str(id_)) + return offsets + + @property + def vocab_size(self): + return self._vocab_size_without_eod + 1 + + @property + def vocab(self): + raise NotImplementedError + + @property + def inv_vocab(self): + raise NotImplementedError + + @property + def cls(self): + return -1 + + @property + def sep(self): + return -1 + + @property + def mask(self): + return -1 + + @property + def eod(self): + return self._eod_id + + @property + def additional_special_tokens_ids(self): + return None diff --git a/megatron/training/training.py b/megatron/training/training.py new file mode 100644 index 0000000000..d5ee16be5f --- /dev/null +++ b/megatron/training/training.py @@ -0,0 +1,1766 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +"""Pretrain utilities.""" + +import dataclasses +from datetime import datetime +import functools +import gc +import logging +import math +import os +import sys +from .log_handler import CustomHandler +# Make default logging level INFO, but filter out all log messages not from MCore. +logging.basicConfig(handlers=[CustomHandler()], level=logging.INFO) +from .theoretical_memory_usage import report_theoretical_memory +import time +# The earliest we can measure the start time. 
+_TRAIN_START_TIME = time.time() +import torch + +from megatron.core import mpu, tensor_parallel +from megatron.core.utils import ( + check_param_hashes_across_dp_replicas, + get_model_config, + StragglerDetector, + is_float8tensor, +) +from megatron.training.checkpointing import load_checkpoint +from megatron.training.checkpointing import save_checkpoint +from megatron.training.checkpointing import checkpoint_exists +from megatron.legacy.model import Float16Module +from megatron.core.distributed import DistributedDataParallelConfig +from megatron.core.distributed import DistributedDataParallel as DDP +from megatron.core.distributed import finalize_model_grads +from megatron.core.enums import ModelType +from megatron.core.optimizer import get_megatron_optimizer, OptimizerConfig +from megatron.training.initialize import initialize_megatron +from megatron.training.initialize import write_args_to_tensorboard +from megatron.training.initialize import set_jit_fusion_options +from megatron.legacy.data.data_samplers import build_pretraining_data_loader +from megatron.core.optimizer_param_scheduler import OptimizerParamScheduler +from megatron.core.transformer.moe import upcycling_utils +from megatron.core.transformer.moe.moe_utils import track_moe_metrics +from megatron.core.parallel_state import ( + destroy_global_memory_buffer, + destroy_model_parallel, +) +from megatron.core.pipeline_parallel import get_forward_backward_func +from megatron.core.num_microbatches_calculator import ( + destroy_num_microbatches_calculator, + get_current_global_batch_size, + get_current_running_global_batch_size, + get_num_microbatches, + update_num_microbatches) + +from .async_utils import maybe_finalize_async_save +from .utils import ( + calc_params_l2_norm, + check_adlr_autoresume_termination, + is_last_rank, + print_rank_0, + print_rank_last, + report_memory, + unwrap_model, + append_to_progress_log, + update_use_dist_ckpt, +) +from .global_vars import ( + destroy_global_vars, + get_args, + get_signal_handler, + get_timers, + get_tensorboard_writer, + get_wandb_writer, + get_one_logger) +from . import one_logger_utils + +from . import ft_integration + +stimer = StragglerDetector() + + +def destroy_global_state(): + destroy_global_vars() + destroy_num_microbatches_calculator() + destroy_global_memory_buffer() + destroy_model_parallel() + + +def print_datetime(string): + """Note that this call will sync across all ranks.""" + torch.distributed.barrier() + time_str = datetime.now().strftime('%Y-%m-%d %H:%M:%S') + print_rank_0('[' + string + '] datetime: {} '.format(time_str)) + + +def num_floating_point_operations(args, batch_size): + # Attention projection size. + query_projection_size = args.kv_channels * args.num_attention_heads + query_projection_to_hidden_size_ratio = query_projection_size / args.hidden_size + # Group Query Attention. + if not args.group_query_attention: + args.num_query_groups = args.num_attention_heads + # MoE. + num_experts_routed_to = 1 if args.num_experts is None else args.moe_router_topk + gated_linear_multiplier = 3 / 2 if args.swiglu else 1 + shared_expert_ffn_hidden_size = ( + 0 + if args.moe_shared_expert_intermediate_size is None + else args.moe_shared_expert_intermediate_size + ) + + # The 12x term below comes from the following factors; for more details, see + # "APPENDIX: FLOATING-POINT OPERATIONS" in https://arxiv.org/abs/2104.04473. 
+ # - 3x: Each GEMM in the model needs to be performed 3 times (forward pass, + # backward wgrad [weight gradient], backward dgrad [data gradient]). + # - 2x: GEMMs of a particular size are stacked twice in the standard Transformer model + # architectures implemented in this codebase (e.g., h->ffn_h GEMM and ffn_h->h GEMM + # in MLP layer). + # - 2x: A GEMM of a m*n tensor with a n*k tensor requires 2mnk floating-point operations. + expansion_factor = 3 * 2 * 2 + + return ( + expansion_factor + * batch_size + * args.seq_length + * args.num_layers + * args.hidden_size + * args.hidden_size + * ( + # Attention. + ( + ( + 1 + + (args.num_query_groups / args.num_attention_heads) + + (args.seq_length / args.hidden_size) + ) * query_projection_to_hidden_size_ratio + ) + # MLP. + + ( + (args.ffn_hidden_size / args.hidden_size) + * num_experts_routed_to + * gated_linear_multiplier + ) + # Shared Experts. + + ((shared_expert_ffn_hidden_size / args.hidden_size) * gated_linear_multiplier) + # Logit. + + (args.padded_vocab_size / (2 * args.num_layers * args.hidden_size)) + ) + ) + + +def get_start_time_from_progress_log(): + """ + Gets start time of earliest job with same world size. Also returns the number + of floating-point operations completed in last saved checkpoint. + """ + args = get_args() + assert args.save is not None + progress_log_filename = os.path.join(args.save, "progress.txt") + + # start_time is time when job with same world size started. + # start_num_floating_point_operations is the number of floating-point operations + # completed when this job started. + # latest_num_floating_point_operations is the number of floating-point operations + # completed in most recent saved checkpoint. + start_time = None + start_num_floating_point_operations = None + latest_num_floating_point_operations = 0 + + def _get_field(string, type): + return type(string.split(': ')[1]) + + with open(progress_log_filename, 'r') as f: + for line in f: + line = line.strip() + line_tokens = line.split('\t') + world_size_in_line = _get_field(line_tokens[2], int) + if line_tokens[3] == "Saved checkpoint": + latest_num_floating_point_operations = \ + _get_field(line_tokens[7], float) + if world_size_in_line != args.world_size: + # Re-start search if we see a different world size. + start_time = None + start_num_floating_point_operations = None + continue + if line_tokens[3] == "Starting job": + if start_time is None: + start_time = line_tokens[0] + start_num_floating_point_operations = \ + latest_num_floating_point_operations + assert start_time is not None and start_num_floating_point_operations is not None, \ + "Should have seen at least one 'Starting job' entry with same world_size" + return datetime.strptime(start_time, '%Y-%m-%d %H:%M:%S'), \ + start_num_floating_point_operations + + +def pretrain( + train_valid_test_dataset_provider, + model_provider, + model_type, + forward_step_func, + process_non_loss_data_func=None, + extra_args_provider=None, + args_defaults={}, + get_embedding_ranks=None, + get_position_embedding_ranks=None, + non_loss_data_func=None, +): + """Main training program. + + This function will run the followings in the order provided: + 1) initialize Megatron. + 2) setup model, optimizer and lr schedule using the model_provider. + 3) call train_val_test_data_provider to get train/val/test datasets. + 4) train the modle using the forward_step_func. 
+ + Args: + train_valid_test_dataset_provider: a function that takes the size of + train/valid/test dataset and returns `train, valid, test` datasets. + model_provider: a function that returns a vanilla version of the + model. By vanilla we mean a simple model on cpu with no fp16 or ddp. + model_type: an enum that specifies the type of model being trained. + forward_step_func: a function that takes a `data iterator` and `model`, + and returns a `loss` scalar with a dictionary with key:values being + the info we would like to monitor during training, for example + `lm-loss: value`. We also require that this function add + `batch generator` to the timers class. + process_non_loss_data_func: a function to post process outputs of the + network. It can be used for dumping output tensors (e.g images) to + tensorboard. It takes `collected data`(list of tensors), + `current iteration index` and `tensorboard writer` as arguments. + extra_args_provider: a function that takes a parser and adds arguments + to it. It is used for programs to add their own arguments. + args_defaults: a dictionary from argument-name to argument-value. It + to set already parse arguments. + get_embedding_ranks (TODO): + get_position_embedding_ranks (TODO): + non_loss_data_func (callable): A custom function to call during evaluation. + It can run e.g. benchmarks. + """ + + # Initalize and get arguments, timers, and Tensorboard writer. + initialize_megatron( + extra_args_provider=extra_args_provider, + args_defaults=args_defaults, + get_embedding_ranks=get_embedding_ranks, + get_position_embedding_ranks=get_position_embedding_ranks + ) + + args = get_args() + timers = get_timers() + + if args.log_progress: + append_to_progress_log("Starting job") + + # Set pytorch JIT layer fusion options and warmup JIT functions. + set_jit_fusion_options() + + # Adjust the startup time so it reflects the largest value. + # This will be closer to what scheduler will see (outside of + # image ... launches. + global _TRAIN_START_TIME + start_time_tensor = torch.tensor([_TRAIN_START_TIME], + dtype=torch.double, + device='cuda') + torch.distributed.all_reduce(start_time_tensor, + op=torch.distributed.ReduceOp.MIN) + _TRAIN_START_TIME = start_time_tensor.item() + + app_metrics = {} + app_metrics['app_start_time'] = round(_TRAIN_START_TIME * 1000.0) + app_metrics['app_model_init_start_time'] = round(_TRAIN_START_TIME * 1000.0) + + print_rank_0('time to initialize megatron (seconds): {:.3f}'.format( + time.time() - _TRAIN_START_TIME)) + print_datetime('after megatron is initialized') + app_metrics['app_model_init_finish_time'] = one_logger_utils.get_timestamp_in_ms() + + args = get_args() + timers = get_timers() + + # Track E2E metrics on pretrain start + one_logger_utils.on_pretrain_start() + + # Context used for persisting some state between checkpoint saves. + if args.non_persistent_ckpt_type == 'local': + raise RuntimeError('LocalCheckpointManagers are not yet integrated') + checkpointing_context = { + 'local_checkpoint_manager': BasicLocalCheckpointManager( + args.non_persistent_local_ckpt_dir + ) + } + else: + checkpointing_context = {} + + # Model, optimizer, and learning rate. 
+ timers('model-and-optimizer-setup', log_level=0).start(barrier=True) + app_metrics['app_build_optimizer_start_time'] = one_logger_utils.get_timestamp_in_ms() + model, optimizer, opt_param_scheduler = setup_model_and_optimizer( + model_provider, model_type, checkpointing_context=checkpointing_context) + + timers('model-and-optimizer-setup').stop() + print_datetime('after model, optimizer, and learning rate ' + 'scheduler are built') + app_metrics['app_build_optimizer_finish_time'] = one_logger_utils.get_timestamp_in_ms() + config = get_model_config(model[0]) + + # Data stuff. + app_metrics['app_build_dataiters_start_time'] = one_logger_utils.get_timestamp_in_ms() + timers('train/valid/test-data-iterators-setup', log_level=0).start( + barrier=True) + if args.virtual_pipeline_model_parallel_size is not None: + train_data_iterator = [] + valid_data_iterator = [] + test_data_iterator = [] + for i in range(len(model)): + mpu.set_virtual_pipeline_model_parallel_rank(i) + iterators = build_train_valid_test_data_iterators( + train_valid_test_dataset_provider) + train_data_iterator.append(iterators[0]) + valid_data_iterator.append(iterators[1]) + test_data_iterator.append(iterators[2]) + else: + train_data_iterator, valid_data_iterator, test_data_iterator \ + = build_train_valid_test_data_iterators( + train_valid_test_dataset_provider) + timers('train/valid/test-data-iterators-setup').stop() + print_datetime('after dataloaders are built') + app_metrics['app_build_dataiters_finish_time'] = one_logger_utils.get_timestamp_in_ms() + + # Track if training is enabled. Can only be done once args.do_train is assigned after dataloader is built. + one_logger_utils.track_config_flags(args.train_iters, args.skip_train, args.do_train, + args.do_valid, args.do_test, args.dataloader_type, + args.retro_project_dir, args.retro_cyclic_train_iters) + + if args.enable_ft_package and ft_integration.get_rank_monitor_client() is not None: + ft_integration.get_rank_monitor_client().init_workload_monitoring() + ft_timeouts = ft_integration.get_rank_monitor_client().timeouts + print_rank_0(f"Fault tolerance client initialized. Timeouts: {ft_timeouts}") + + # Print setup timing. 
+ print_rank_0('done with setup ...') + timers.log(['model-and-optimizer-setup', + 'train/valid/test-data-iterators-setup'], barrier=True) + + one_logger = get_one_logger() + one_logger and one_logger.log_metrics(app_metrics) + + if not args.skip_train: + print_rank_0('training ...') + + if args.dataloader_type == 'cyclic' and args.retro_project_dir: + assert args.retro_cyclic_train_iters is not None + args.train_iters = args.retro_cyclic_train_iters + print_rank_0("retro cyclic train iters : %d" % args.train_iters) + + iteration = 0 + if args.do_train and args.train_iters > 0: + iteration, num_floating_point_operations_so_far = train( + forward_step_func, + model, optimizer, opt_param_scheduler, + train_data_iterator, valid_data_iterator, + process_non_loss_data_func, config, checkpointing_context, + non_loss_data_func) + + print_datetime('after training is done') + + if args.save and iteration != 0 and iteration % args.save_interval != 0: + save_checkpoint(iteration, model, optimizer, opt_param_scheduler, + num_floating_point_operations_so_far, checkpointing_context, + train_data_iterator=train_data_iterator, + ft_client=ft_integration.get_rank_monitor_client( + ft_integration.StateMachineActions.SAVE_CHECKPOINT)) + + one_logger and one_logger.log_metrics({ + 'app_train_loop_finish_time': one_logger_utils.get_timestamp_in_ms() + }) + + else: + print_rank_0('skipping training (--skip-train is on) ...') + + iteration = args.iteration + + if args.do_valid: + prefix = f'iteration {iteration} on validation set' + evaluate_and_print_results(prefix, forward_step_func, + valid_data_iterator, model, + iteration, process_non_loss_data_func, config, + verbose=True, write_to_tensorboard=not args.skip_train, + non_loss_data_func=non_loss_data_func) + + if args.do_test: + prefix = f'iteration {iteration} on test set' + evaluate_and_print_results(prefix, forward_step_func, + test_data_iterator, model, + iteration, process_non_loss_data_func, config, + verbose=True, write_to_tensorboard=not args.skip_train, + non_loss_data_func=non_loss_data_func) + + wandb_writer = get_wandb_writer() + if wandb_writer: + wandb_writer.finish() + maybe_finalize_async_save(blocking=True) + + one_logger and one_logger.log_metrics({ + 'app_finish_time': one_logger_utils.get_timestamp_in_ms() + }) + one_logger_utils.finish() + + +def update_train_iters(args): + + # For iteration-based training, we don't need to do anything + if args.train_iters: + return + + # Constant batch size with sample-based training. + if args.rampup_batch_size is None: + args.train_iters = args.train_samples // args.global_batch_size + + else: + # Sample based training with rampup batch size. + iterations = 0 + consumed_samples = 0 + # Rampup phase. + while consumed_samples <= int(args.rampup_batch_size[2]) and consumed_samples <= args.train_samples: + update_num_microbatches(consumed_samples, consistency_check=False) + consumed_samples += get_current_global_batch_size() + iterations += 1 + # Reset + update_num_microbatches(0, consistency_check=False) + # Constant phase + # Note that we throw away any partial last batch. + if args.train_samples > consumed_samples: + iterations += (args.train_samples - consumed_samples) // \ + args.global_batch_size + args.train_iters = iterations + + print_rank_0('setting training iterations to {}'.format(args.train_iters)) + + +def get_model(model_provider_func, model_type=ModelType.encoder_or_decoder, wrap_with_ddp=True): + """Build the model.""" + args = get_args() + args.model_type = model_type + + # Build model. 
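# Illustrative aside (not part of the original change): for sample-based training without
# batch-size rampup, update_train_iters() above reduces to integer division, and any
# partial final batch is dropped. For example:
train_samples = 1_000_000
global_batch_size = 1024
train_iters = train_samples // global_batch_size
assert train_iters == 976   # 976 * 1024 = 999,424 samples; the remaining 576 are discarded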
+ if mpu.get_pipeline_model_parallel_world_size() > 1 and \ + args.virtual_pipeline_model_parallel_size is not None: + assert model_type != ModelType.encoder_and_decoder, \ + "Interleaved schedule not supported for model with both encoder and decoder" + model = [] + for i in range(args.virtual_pipeline_model_parallel_size): + mpu.set_virtual_pipeline_model_parallel_rank(i) + # Set pre_process and post_process only after virtual rank is set. + pre_process = mpu.is_pipeline_first_stage() + post_process = mpu.is_pipeline_last_stage() + this_model = model_provider_func( + pre_process=pre_process, + post_process=post_process + ) + this_model.model_type = model_type + model.append(this_model) + else: + pre_process = mpu.is_pipeline_first_stage() + post_process = mpu.is_pipeline_last_stage() + add_encoder = True + add_decoder = True + if model_type == ModelType.encoder_and_decoder: + if mpu.get_pipeline_model_parallel_world_size() > 1: + rank = mpu.get_pipeline_model_parallel_rank() + first_decoder_rank = args.encoder_pipeline_model_parallel_size + world_size = mpu.get_pipeline_model_parallel_world_size() + pre_process = rank == 0 or rank == first_decoder_rank + post_process = (rank == (first_decoder_rank - 1)) or (rank == (world_size - 1)) + add_encoder = mpu.is_inside_encoder(rank) + add_decoder = mpu.is_inside_decoder(rank) + model = model_provider_func( + pre_process=pre_process, + post_process=post_process, + add_encoder=add_encoder, + add_decoder=add_decoder) + else: + model = model_provider_func( + pre_process=pre_process, + post_process=post_process + ) + model.model_type = model_type + + if not isinstance(model, list): + model = [model] + + # Set tensor model parallel attributes if not set. + # Only parameters that are already tensor model parallel have these + # attributes set for them. We should make sure the default attributes + # are set for all params so the optimizer can use them. + for model_module in model: + for param in model_module.parameters(): + tensor_parallel.set_defaults_if_not_set_tensor_model_parallel_attributes(param) + + # Print number of parameters. + if mpu.get_data_parallel_rank() == 0: + print(' > number of parameters on (tensor, pipeline) ' + 'model parallel rank ({}, {}): {}'.format( + mpu.get_tensor_model_parallel_rank(), + mpu.get_pipeline_model_parallel_rank(), + sum([sum([p.nelement() for p in model_module.parameters()]) + for model_module in model])), flush=True) + + # GPU allocation. + for model_module in model: + model_module.cuda(torch.cuda.current_device()) + + # Fp16 conversion. + if args.fp16 or args.bf16: + model = [Float16Module(model_module, args) for model_module in model] + + # The model_module.bfloat16()/model_module.half() above will call the inplace copy of TE's + # Float8Tensor, which will write an unwanted value (amax calculated from the current fp8 + # param) to its amax_history. The following logic will correct the amax_history back. 
+ for model_module in model: + for param in model_module.parameters(): + if is_float8tensor(param) and param._fp8_meta is not None: + fp8_meta = param._fp8_meta['scaling_fwd'] + fp8_meta_index = param._fp8_meta_index + if hasattr(param, 'get_high_precision_init_val'): + fp8_meta.amax_history[0][fp8_meta_index].copy_( + param.get_high_precision_init_val().abs().max() + ) + else: + fp8_meta.amax_history[0][fp8_meta_index] = 0 + + if wrap_with_ddp: + config = get_model_config(model[0]) + + kwargs = {} + for f in dataclasses.fields(DistributedDataParallelConfig): + if hasattr(args, f.name): + kwargs[f.name] = getattr(args, f.name) + kwargs['grad_reduce_in_fp32'] = args.accumulate_allreduce_grads_in_fp32 + kwargs['check_for_nan_in_grad'] = args.check_for_nan_in_loss_and_grad + kwargs['bucket_size'] = args.ddp_bucket_size + kwargs['average_in_collective'] = args.ddp_average_in_collective + ddp_config = DistributedDataParallelConfig(**kwargs) + + overlap_param_gather_with_optimizer_step = getattr(args, 'overlap_param_gather_with_optimizer_step', False) + model = [DDP(config, + ddp_config, + model_chunk, + # Turn off bucketing for model_chunk 2 onwards, since communication for these + # model chunks is overlapped with compute anyway. + disable_bucketing=(model_chunk_idx > 0) or overlap_param_gather_with_optimizer_step) + for (model_chunk_idx, model_chunk) in enumerate(model)] + + # Broadcast params from data parallel src rank to other data parallel ranks. + if args.data_parallel_random_init: + for model_module in model: + model_module.broadcast_params() + + return model + + +def get_optimizer_param_scheduler(optimizer): + """Build the learning rate scheduler.""" + args = get_args() + + # Iteration-based training. + if args.train_iters: + if args.lr_decay_iters is None: + args.lr_decay_iters = args.train_iters + lr_decay_steps = args.lr_decay_iters * args.global_batch_size + wd_incr_steps = args.train_iters * args.global_batch_size + wsd_decay_steps = None + if args.lr_wsd_decay_iters is not None: + wsd_decay_steps = args.lr_wsd_decay_iters * args.global_batch_size + if args.lr_warmup_fraction is not None: + lr_warmup_steps = args.lr_warmup_fraction * lr_decay_steps + else: + lr_warmup_steps = args.lr_warmup_iters * args.global_batch_size + # Sample-based training. + elif args.train_samples: + # We need to set training iters for later use. Technically + # we need to adjust the training samples too (due to last + # batch being incomplete) but we leave it as is for now. 
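# Illustrative aside (not part of the original change): in the iteration-based branch above
# the scheduler operates in units of consumed samples, so the step counts are iteration
# counts scaled by the global batch size. Example values:
train_iters = 500_000
global_batch_size = 1024
lr_warmup_fraction = 0.01

lr_decay_steps = train_iters * global_batch_size        # 512,000,000 samples
wd_incr_steps = train_iters * global_batch_size
lr_warmup_steps = lr_warmup_fraction * lr_decay_steps   # 5,120,000 samples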
+ update_train_iters(args) + if args.lr_decay_samples is None: + args.lr_decay_samples = args.train_samples + lr_decay_steps = args.lr_decay_samples + wd_incr_steps = args.train_samples + wsd_decay_steps = args.lr_wsd_decay_samples + if args.lr_warmup_fraction is not None: + lr_warmup_steps = args.lr_warmup_fraction * lr_decay_steps + else: + lr_warmup_steps = args.lr_warmup_samples + else: + raise Exception( + 'either train-iters or train-samples should be provided.') + + opt_param_scheduler = OptimizerParamScheduler( + optimizer, + init_lr=args.lr_warmup_init, + max_lr=args.lr, + min_lr=args.min_lr, + lr_warmup_steps=lr_warmup_steps, + lr_decay_steps=lr_decay_steps, + lr_decay_style=args.lr_decay_style, + start_wd=args.start_weight_decay, + end_wd=args.end_weight_decay, + wd_incr_steps=wd_incr_steps, + wd_incr_style=args.weight_decay_incr_style, + use_checkpoint_opt_param_scheduler=args.use_checkpoint_opt_param_scheduler, + override_opt_param_scheduler=args.override_opt_param_scheduler, + wsd_decay_steps=wsd_decay_steps, + lr_wsd_decay_style=args.lr_wsd_decay_style) + + return opt_param_scheduler + + +def setup_model_and_optimizer(model_provider_func, + model_type, + no_wd_decay_cond=None, + scale_lr_cond=None, + lr_mult=1.0, + checkpointing_context=None): + """Setup model and optimizer.""" + args = get_args() + timers = get_timers() + one_logger = get_one_logger() + + model = get_model(model_provider_func, model_type) + unwrapped_model = unwrap_model(model) + + kwargs = {} + for f in dataclasses.fields(OptimizerConfig): + if hasattr(args, f.name): + kwargs[f.name] = getattr(args, f.name) + config = OptimizerConfig(**kwargs) + config.timers = timers + optimizer = get_megatron_optimizer(config, model, no_wd_decay_cond, + scale_lr_cond, lr_mult) + opt_param_scheduler = get_optimizer_param_scheduler(optimizer) + + if args.moe_use_upcycling: + torch.distributed.barrier() + assert not checkpoint_exists( + args.save + ), ("The upcycling destination directory already exists. " + "Please check if --moe-use-upcycling is mistakenly enabled. " + "Upcycling should only be set for the first run when converting the dense model. " + "All subsequent runs should remove this flag. 
") + num_experts = args.num_experts + args.num_experts = None + expert_model_parallel_size = args.expert_model_parallel_size + args.expert_model_parallel_size = 1 + dense_model_for_upcycling = get_model(model_provider_func, model_type) + args.num_experts = num_experts + args.expert_model_parallel_size = expert_model_parallel_size + _, args.num_floating_point_operations_so_far = upcycling_utils.load_and_upcycle_model( + load_checkpoint, + unwrapped_model, + dense_model_for_upcycling, + load_kwargs = {'model': dense_model_for_upcycling, 'optimizer': None, 'opt_param_scheduler': None} + ) + args.iteration = 1 + save_checkpoint(args.iteration, model, None, None, args.num_floating_point_operations_so_far) + torch.distributed.barrier() + del dense_model_for_upcycling + if (args.fp16 or args.bf16) and optimizer is not None: + optimizer.reload_model_params() + print_rank_0(f'Upcycled checkpoint saved to {args.save}') + + if (args.load is not None or args.pretrained_checkpoint is not None) and not args.moe_use_upcycling: + one_logger and one_logger.log_metrics({ + 'load_checkpoint_start_time': one_logger_utils.get_timestamp_in_ms() + }) + timers('load-checkpoint', log_level=0).start(barrier=True) + + args.iteration, args.num_floating_point_operations_so_far = load_checkpoint( + model, optimizer, opt_param_scheduler, + ft_client=ft_integration.get_rank_monitor_client(), checkpointing_context=checkpointing_context) + timers('load-checkpoint').stop(barrier=True) + timers.log(['load-checkpoint']) + one_logger and one_logger.log_metrics({ + 'load_checkpoint_finish_time': one_logger_utils.get_timestamp_in_ms(), + 'load_checkpoint_time': timers('load-checkpoint').active_time() + }) + else: + args.iteration = 0 + args.num_floating_point_operations_so_far = 0 + + # get model without FP16 and/or DDP wrappers + if args.iteration == 0 and len(unwrapped_model) == 1 \ + and hasattr(unwrapped_model[0], 'init_state_dict_from_bert'): + print_rank_0("Initializing ICT from pretrained BERT model") + unwrapped_model[0].init_state_dict_from_bert() + if args.fp16: + optimizer.reload_model_params() + + # Convert checkpoint format. + if args.ckpt_convert_format is not None: + load_ckpt_format = args.ckpt_format + args.ckpt_format = args.ckpt_convert_format + args.save = os.path.join(args.ckpt_convert_save, args.ckpt_convert_format) + update_use_dist_ckpt(args) + + save_checkpoint(args.iteration, model, optimizer, opt_param_scheduler, + args.num_floating_point_operations_so_far) + + print_rank_0("> converted checkpoint: %s -> %s." % (load_ckpt_format, args.ckpt_format)) + torch.distributed.barrier() + exit() + + return model, optimizer, opt_param_scheduler + + +def train_step(forward_step_func, data_iterator, + model, optimizer, opt_param_scheduler, config): + """Single training step.""" + args = get_args() + timers = get_timers() + + # Set grad to zero. + for model_chunk in model: + model_chunk.zero_grad_buffer() + optimizer.zero_grad() + + # Forward pass. + forward_backward_func = get_forward_backward_func() + losses_reduced = forward_backward_func( + forward_step_func=forward_step_func, + data_iterator=data_iterator, + model=model, + num_microbatches=get_num_microbatches(), + seq_length=args.seq_length, + micro_batch_size=args.micro_batch_size, + decoder_seq_length=args.decoder_seq_length, + forward_only=False) + + # Empty unused memory. + if args.empty_unused_memory_level >= 1: + torch.cuda.empty_cache() + + # Vision gradients. 
+ if getattr(args, 'vision_pretraining', False) and args.vision_pretraining_type == "dino": + unwrapped_model = unwrap_model(model[0]) + unwrapped_model.cancel_gradients_last_layer(args.curr_iteration) + + # Update parameters. + timers('optimizer', log_level=1).start(barrier=args.barrier_with_L1_time) + update_successful, grad_norm, num_zeros_in_grad = optimizer.step() + timers('optimizer').stop() + + # Vision momentum. + if getattr(args, 'vision_pretraining', False) and args.vision_pretraining_type == "dino": + unwrapped_model = unwrap_model(model[0]) + unwrapped_model.update_momentum(args.curr_iteration) + + # Update learning rate. + if update_successful: + increment = get_num_microbatches() * \ + args.micro_batch_size * \ + args.data_parallel_size + opt_param_scheduler.step(increment=increment) + skipped_iter = 0 + else: + skipped_iter = 1 + + # Empty unused memory. + if args.empty_unused_memory_level >= 2: + torch.cuda.empty_cache() + + if mpu.is_pipeline_last_stage(ignore_virtual=True): + # Average loss across microbatches. + loss_reduced = {} + for key in losses_reduced[0].keys(): + numerator = 0 + denominator = 0 + for x in losses_reduced: + val = x[key] + # there is one dict per microbatch. in new reporting, we average + # over the total number of tokens across the global batch. + if isinstance(val, tuple) or isinstance(val, list): + numerator += val[0] + denominator += val[1] + else: + # legacy behavior. we average over the number of microbatches, + # and so the denominator is 1. + numerator += val + denominator += 1 + loss_reduced[key] = numerator / denominator + return loss_reduced, skipped_iter, grad_norm, num_zeros_in_grad + return {}, skipped_iter, grad_norm, num_zeros_in_grad + + +def training_log(loss_dict, total_loss_dict, learning_rate, decoupled_learning_rate, iteration, + loss_scale, report_memory_flag, skipped_iter, + grad_norm, params_norm, num_zeros_in_grad): + """Log training information such as losses, timing, ....""" + args = get_args() + timers = get_timers() + writer = get_tensorboard_writer() + wandb_writer = get_wandb_writer() + one_logger = get_one_logger() + + # Advanced, skipped, and Nan iterations. + advanced_iters_key = 'advanced iterations' + skipped_iters_key = 'skipped iterations' + nan_iters_key = 'nan iterations' + # Advanced iterations. + if not skipped_iter: + total_loss_dict[advanced_iters_key] = total_loss_dict.get( + advanced_iters_key, 0) + 1 + else: + if advanced_iters_key not in total_loss_dict: + total_loss_dict[advanced_iters_key] = 0 + # Skipped iterations. + total_loss_dict[skipped_iters_key] = total_loss_dict.get( + skipped_iters_key, 0) + skipped_iter + # Update losses and set nan iterations + got_nan = False + for key in loss_dict: + if not skipped_iter: + total_loss_dict[key] = total_loss_dict.get( + key, torch.tensor([0.0], dtype=torch.float, device='cuda')) + loss_dict[key] + else: + value = loss_dict[key].float().sum().item() + is_nan = value == float('inf') or \ + value == -float('inf') or \ + value != value + got_nan = got_nan or is_nan + total_loss_dict[nan_iters_key] = total_loss_dict.get( + nan_iters_key, 0) + int(got_nan) + + # Logging. 
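# Illustrative aside (not part of the original change): the reduction above supports two
# per-microbatch formats. With (numerator, num_tokens) tuples the loss is averaged over all
# tokens in the global batch; with plain scalars it is averaged over microbatches. A small
# example with plain floats in place of the CUDA tensors used by the real code:
losses_reduced = [{'lm loss': (2816.0, 2048)}, {'lm loss': (2662.4, 2048)}]
numerator = sum(x['lm loss'][0] for x in losses_reduced)
denominator = sum(x['lm loss'][1] for x in losses_reduced)
average_loss = numerator / denominator   # (2816.0 + 2662.4) / 4096 = 1.3375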
+ timers_to_log = [ + 'forward-backward', + 'forward-compute', + 'backward-compute', + 'batch-generator', + 'forward-recv', + 'forward-send', + 'backward-recv', + 'backward-send', + 'forward-send-forward-recv', + 'forward-send-backward-recv', + 'backward-send-forward-recv', + 'backward-send-backward-recv', + 'forward-backward-send-forward-backward-recv', + 'layernorm-grads-all-reduce', + 'embedding-grads-all-reduce', + 'all-grads-sync', + 'params-all-gather', + 'optimizer-copy-to-main-grad', + 'optimizer-unscale-and-check-inf', + 'optimizer-clip-main-grad', + 'optimizer-count-zeros', + 'optimizer-inner-step', + 'optimizer-copy-main-to-model-params', + 'optimizer'] + + # Calculate batch size. + batch_size = args.micro_batch_size * args.data_parallel_size * \ + get_num_microbatches() + + # Track app tag & app tag ID + one_logger_utils.track_app_tag(batch_size, args.world_size, args.seq_length) + + total_iterations = total_loss_dict[advanced_iters_key] + \ + total_loss_dict[skipped_iters_key] + + # Tensorboard values. + # Timer requires all the ranks to call. + if args.log_timers_to_tensorboard and \ + (iteration % args.tensorboard_log_interval == 0): + timers.write(timers_to_log, writer, iteration, + normalizer=total_iterations) + if writer and (iteration % args.tensorboard_log_interval == 0): + if wandb_writer: + wandb_writer.log({'samples vs steps': args.consumed_train_samples}, + iteration) + writer.add_scalar('learning-rate', learning_rate, iteration) + if args.decoupled_lr is not None: + writer.add_scalar('decoupled-learning-rate', decoupled_learning_rate, iteration) + writer.add_scalar('learning-rate vs samples', learning_rate, + args.consumed_train_samples) + if wandb_writer: + wandb_writer.log({'learning-rate': learning_rate}, iteration) + if args.skipped_train_samples > 0: + writer.add_scalar('skipped-train-samples', args.skipped_train_samples, iteration) + if wandb_writer: + wandb_writer.log({'skipped-train-samples': args.skipped_train_samples}, iteration) + writer.add_scalar('batch-size', batch_size, iteration) + writer.add_scalar('batch-size vs samples', batch_size, + args.consumed_train_samples) + if wandb_writer: + wandb_writer.log({'batch-size': batch_size}, iteration) + for key in loss_dict: + writer.add_scalar(key , loss_dict[key], iteration) + writer.add_scalar(key + ' vs samples', loss_dict[key], + args.consumed_train_samples) + if wandb_writer: + wandb_writer.log({key: loss_dict[key]}, iteration) + if args.log_loss_scale_to_tensorboard: + writer.add_scalar('loss-scale', loss_scale, iteration) + writer.add_scalar('loss-scale vs samples', loss_scale, + args.consumed_train_samples) + if wandb_writer: + wandb_writer.log({'loss-scale': loss_scale}, iteration) + if args.log_world_size_to_tensorboard: + writer.add_scalar('world-size', args.world_size, iteration) + writer.add_scalar('world-size vs samples', args.world_size, + args.consumed_train_samples) + if wandb_writer: + wandb_writer.log({'world-size': args.world_size}, iteration) + if grad_norm is not None: + writer.add_scalar('grad-norm', grad_norm, iteration) + writer.add_scalar('grad-norm vs samples', grad_norm, + args.consumed_train_samples) + if wandb_writer: + wandb_writer.log({'grad-norm': grad_norm}, iteration) + if num_zeros_in_grad is not None: + writer.add_scalar('num-zeros', num_zeros_in_grad, iteration) + writer.add_scalar('num-zeros vs samples', num_zeros_in_grad, + args.consumed_train_samples) + if wandb_writer: + wandb_writer.log({'num-zeros': num_zeros_in_grad}, iteration) + if params_norm is not None: + 
writer.add_scalar('params-norm', params_norm, iteration) + writer.add_scalar('params-norm vs samples', params_norm, + args.consumed_train_samples) + if wandb_writer: + wandb_writer.log({'params-norm': params_norm}, iteration) + if args.log_memory_to_tensorboard: + mem_stats = torch.cuda.memory_stats() + writer.add_scalar( + "mem-reserved-bytes", + mem_stats["reserved_bytes.all.current"], + iteration, + ) + writer.add_scalar( + "mem-allocated-bytes", + mem_stats["allocated_bytes.all.current"], + iteration, + ) + writer.add_scalar( + "mem-allocated-count", + mem_stats["allocation.all.current"], + iteration, + ) + if args.num_experts is not None: + moe_loss_scale = 1 / get_num_microbatches() + track_moe_metrics(moe_loss_scale, iteration, writer, wandb_writer, total_loss_dict, args.moe_per_layer_logging) + + if iteration % args.log_interval == 0: + elapsed_time = timers('interval-time').elapsed(barrier=True) + elapsed_time_per_iteration = elapsed_time / total_iterations + + throughput = num_floating_point_operations(args, batch_size) / ( + elapsed_time_per_iteration * 10**12 * args.world_size) + + one_logger_utils.track_e2e_metrics(args.log_throughput, throughput) + + if args.log_timers_to_tensorboard: + if writer: + writer.add_scalar('iteration-time', + elapsed_time_per_iteration, iteration) + if wandb_writer: + wandb_writer.log({'iteration-time': elapsed_time_per_iteration}, + iteration) + log_string = f" [{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}]" + log_string += ' iteration {:8d}/{:8d} |'.format( + iteration, args.train_iters) + log_string += ' consumed samples: {:12d} |'.format( + args.consumed_train_samples) + if args.skipped_train_samples > 0: + log_string += ' skipped samples: {:12d} |'.format( + args.skipped_train_samples) + log_string += ' elapsed time per iteration (ms): {:.1f} |'.format( + elapsed_time_per_iteration * 1000.0) + if args.log_throughput: + log_string += f' throughput per GPU (TFLOP/s/GPU): {throughput:.1f} |' + if args.log_timers_to_tensorboard: + if writer: + writer.add_scalar('throughput', throughput, iteration) + if wandb_writer: + wandb_writer.log({'throughput': throughput}, iteration) + assert learning_rate is not None + # Decoupled_learning_rate should be not None only on first and last pipeline stage. 
+ log_string += ' learning rate: {:.6E} |'.format(learning_rate) + if args.decoupled_lr is not None and (mpu.is_pipeline_first_stage(ignore_virtual=True) or + mpu.is_pipeline_last_stage(ignore_virtual=True)): + assert decoupled_learning_rate is not None + log_string += ' decoupled learning rate: {:.6E} |'.format(decoupled_learning_rate) + else: + assert decoupled_learning_rate is None + log_string += ' global batch size: {:5d} |'.format(batch_size) + for key in total_loss_dict: + if key not in [advanced_iters_key, skipped_iters_key, + nan_iters_key]: + avg = total_loss_dict[key].item() / \ + float(max(1, total_loss_dict[advanced_iters_key])) + if avg > 0.0: + log_string += ' {}: {:.6E} |'.format(key, avg) + total_loss_dict[key] = torch.tensor([0.0], dtype=torch.float, device='cuda') + log_string += ' loss scale: {:.1f} |'.format(loss_scale) + if grad_norm is not None: + log_string += ' grad norm: {:.3f} |'.format(grad_norm) + if num_zeros_in_grad is not None: + log_string += ' num zeros: {:.1f} |'.format(num_zeros_in_grad) + if params_norm is not None: + log_string += ' params norm: {:.3f} |'.format(params_norm) + log_string += ' number of skipped iterations: {:3d} |'.format( + total_loss_dict[skipped_iters_key]) + log_string += ' number of nan iterations: {:3d} |'.format( + total_loss_dict[nan_iters_key]) + total_loss_dict[advanced_iters_key] = 0 + total_loss_dict[skipped_iters_key] = 0 + total_loss_dict[nan_iters_key] = 0 + print_rank_last(log_string) + if report_memory_flag and learning_rate > 0.: + # Report memory after optimizer state has been initialized. + if torch.distributed.get_rank() == 0: + num_microbatches = get_num_microbatches() + report_theoretical_memory(args, num_microbatches=num_microbatches, verbose=True) + report_memory('(after {} iterations)'.format(iteration)) + report_memory_flag = False + timers.log(timers_to_log, normalizer=args.log_interval) + + return report_memory_flag + + +def compute_throughputs_and_append_to_progress_log(iteration, + num_floating_point_operations_so_far): + args = get_args() + if args.save is None: + return + + # Compute job throughput. + # args.num_floating_point_operations_so_far keeps track of floating-point operations + # completed at the start of job. + global _TRAIN_START_TIME + job_throughput = \ + (num_floating_point_operations_so_far - + args.num_floating_point_operations_so_far) / ( + (time.time() - _TRAIN_START_TIME) * 10**12 * args.world_size) + + # Compute cumulative throughput since jobs of this world size were launched. + # `get_start_time_from_progress_log` returns start time and number of floating-point + # operations of first job of this world size. 
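The throughput figures logged above and appended to the progress log reduce to total floating-point work divided by elapsed time and GPU count. A small sketch with assumed numbers (in the real code the FLOP count comes from `num_floating_point_operations(args, batch_size)`):

```python
# Hypothetical numbers, only to show the unit conversion.
flops_per_global_batch = 4.7e15   # FLOPs needed for one global batch
elapsed_time_per_iteration = 3.2  # seconds, averaged over the log interval
world_size = 64                   # total number of GPUs

# TFLOP/s per GPU, matching the formula used in training_log():
throughput = flops_per_global_batch / (
    elapsed_time_per_iteration * 10**12 * world_size)
print(f"throughput per GPU (TFLOP/s/GPU): {throughput:.1f}")  # ~23.0
```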
+ start_time, start_num_floating_point_operations = get_start_time_from_progress_log() + elapsed_time = (datetime.now() - start_time).total_seconds() + cumulative_throughput = \ + (num_floating_point_operations_so_far - + start_num_floating_point_operations) / ( + elapsed_time * 10**12 * args.world_size) + + tokens_so_far = args.consumed_train_samples * args.seq_length + saved_ckpt_prefix = 'Saving async checkpoint' if args.async_save else 'Saved checkpoint' + append_to_progress_log(f"{saved_ckpt_prefix}\tIteration: {iteration}\t" + f"Job throughput: {job_throughput:.1f} TFLOP/s/GPU\t" + f"Cumulative throughput: {cumulative_throughput:.1f} TFLOP/s/GPU\t" + f"Floating-point operations: {num_floating_point_operations_so_far:.2e}\t" + f"Tokens (in billions): {tokens_so_far / 10**9:.2f}") + + +def save_checkpoint_and_time(iteration, model, optimizer, opt_param_scheduler, + num_floating_point_operations_so_far, checkpointing_context, + non_persistent_ckpt=False, train_data_iterator=None): + args = get_args() + timers = get_timers() + + # Stop timer to get accurate train interval time and exclude checkpointing duration + timers('interval-time').stop() + # Extra barrier is added to make sure all ranks report the max time. + timer_key = 'save-checkpoint-non-persistent' if non_persistent_ckpt else 'save-checkpoint' + timers(timer_key, log_level=0).start(barrier=True) + save_checkpoint_start_time = timers('save-checkpoint').active_time() + + # Log E2E metrics before save-checkpoint + one_logger_utils.track_e2e_metrics() + if args.use_distributed_optimizer and args.overlap_param_gather: + optimizer.disable_pre_hook() + save_checkpoint(iteration, model, optimizer, opt_param_scheduler, + num_floating_point_operations_so_far, checkpointing_context, + non_persistent_ckpt=non_persistent_ckpt, train_data_iterator=train_data_iterator, + ft_client=ft_integration.get_rank_monitor_client( + ft_integration.StateMachineActions.SAVE_CHECKPOINT)) + if args.use_distributed_optimizer and args.overlap_param_gather: + optimizer.enable_pre_hook() + timers(timer_key).stop(barrier=True) + timers.log([timer_key]) + save_checkpoint_finish_time = timers('save-checkpoint').active_time() + + # Log E2E metrics after save-checkpoint + one_logger_utils.track_e2e_metrics() + save_checkpoint_duration = save_checkpoint_finish_time - save_checkpoint_start_time + one_logger_utils.on_save_checkpoint_end(save_checkpoint_duration, iteration, args.async_save) + + if args.log_progress and not non_persistent_ckpt: + compute_throughputs_and_append_to_progress_log(iteration, + num_floating_point_operations_so_far) + + # Recover timing + timers('interval-time', log_level=0).start(barrier=True) + + +def train(forward_step_func, model, optimizer, opt_param_scheduler, + train_data_iterator, valid_data_iterator, + process_non_loss_data_func, config, checkpointing_context, non_loss_data_func): + """Train the model function.""" + args = get_args() + timers = get_timers() + one_logger = get_one_logger() + + # Write args to tensorboard + write_args_to_tensorboard() + + # Turn on training mode which enables dropout. + for model_module in model: + model_module.train() + + # Tracking loss. + total_loss_dict = {} + + # Iterations. 
+ iteration = args.iteration + + # Track E2E metrics at the start of training + one_logger_utils.on_train_start(iteration=iteration, consumed_train_samples=args.consumed_train_samples, + train_samples=args.train_samples, seq_length=args.seq_length, + train_iters=args.train_iters, save=args.save, async_save=args.async_save, + log_throughput=args.log_throughput, + num_floating_point_operations_so_far=args.num_floating_point_operations_so_far) + + num_floating_point_operations_so_far = args.num_floating_point_operations_so_far + + # Setup some training config params + config.grad_scale_func = optimizer.scale_loss + config.timers = timers + if isinstance(model[0], DDP) and args.overlap_grad_reduce: + assert config.no_sync_func is None, \ + ('When overlap_grad_reduce is True, config.no_sync_func must be None; ' + 'a custom no_sync_func is not supported when overlapping grad-reduce') + config.no_sync_func = [model_chunk.no_sync for model_chunk in model] + if len(model) == 1: + config.no_sync_func = config.no_sync_func[0] + if args.align_grad_reduce: + config.grad_sync_func = [model_chunk.start_grad_sync for model_chunk in model] + if len(model) == 1: + config.grad_sync_func = config.grad_sync_func[0] + if args.overlap_param_gather and args.align_param_gather: + config.param_sync_func = [model_chunk.start_param_sync for model_chunk in model] + if len(model) == 1: + config.param_sync_func = config.param_sync_func[0] + config.finalize_model_grads_func = finalize_model_grads + + timers('interval-time', log_level=0).start(barrier=True) + print_datetime('before the start of training step') + report_memory_flag = True + exit = False + + if args.manual_gc: + # Disable the default garbage collector and perform the collection manually. + # This is to align the timing of garbage collection across ranks. + assert args.manual_gc_interval >= 0, \ + 'Manual garbage collection interval should be laerger than or equal to 0.' + gc.disable() + gc.collect() + + # Singleton Initialization + if args.log_straggler: + global stimer + world = torch.distributed.get_world_size() + rank = torch.distributed.get_rank() + mmcnt = args.straggler_minmax_count + stimer.configure(world, rank, + mmcnt = mmcnt, + enabled = not args.disable_straggler_on_startup, + port = args.straggler_ctrlr_port) + total_flops = 0.0 + + num_microbatches = get_num_microbatches() + eval_duration = 0.0 + eval_iterations = 0 + + def get_e2e_base_metrics(): + """Get base metrics values for one-logger to calculate E2E tracking metrics. 
+ """ + return { + 'iteration': iteration, + 'train_duration': timers('interval-time').active_time(), + 'eval_duration': eval_duration, + 'eval_iterations': eval_iterations, + 'total_flops': total_flops, + 'num_floating_point_operations_so_far': num_floating_point_operations_so_far, + 'consumed_train_samples': args.consumed_train_samples, + 'world_size': args.world_size, + 'seq_length': args.seq_length + } + # Cache into one-logger for callback + if one_logger: + with one_logger.get_context_manager(): + one_logger.store_set('get_e2e_base_metrics', get_e2e_base_metrics) + + if args.profile and torch.distributed.get_rank() in args.profile_ranks and args.use_pytorch_profiler: + prof = torch.profiler.profile( + schedule=torch.profiler.schedule( + wait=max(args.profile_step_start-1, 0), + warmup=1 if args.profile_step_start > 0 else 0, + active=args.profile_step_end-args.profile_step_start, + repeat=1), + on_trace_ready=torch.profiler.tensorboard_trace_handler(args.tensorboard_dir), + record_shapes=True, + with_stack=True) + prof.start() + + while iteration < args.train_iters: + if args.profile and torch.distributed.get_rank() in args.profile_ranks: + if args.use_pytorch_profiler: + prof.step() + elif iteration == args.profile_step_start: + torch.cuda.cudart().cudaProfilerStart() + torch.autograd.profiler.emit_nvtx(record_shapes=True).__enter__() + + maybe_finalize_async_save(False) + + # Update number of microbatches first without consistency check to decide if a + # checkpoint should be saved. If the number of microbatches is different + # from the previous iteration, save a checkpoint. Then run consistency check + # to make sure training configuration is still valid. + update_num_microbatches(args.consumed_train_samples, consistency_check=False, verbose=True) + if get_num_microbatches() != num_microbatches and iteration != 0: + assert get_num_microbatches() > num_microbatches, \ + "number of microbatches should be increasing due to batch size rampup ... %d -> %d." % (num_microbatches, get_num_microbatches()) + if args.save is not None: + save_checkpoint_and_time(iteration, model, optimizer, + opt_param_scheduler, + num_floating_point_operations_so_far, + checkpointing_context, train_data_iterator=train_data_iterator) + num_microbatches = get_num_microbatches() + update_num_microbatches(args.consumed_train_samples, consistency_check=True, verbose=True) + + args.curr_iteration = iteration + loss_dict, skipped_iter, grad_norm, num_zeros_in_grad = \ + train_step(forward_step_func, + train_data_iterator, + model, + optimizer, + opt_param_scheduler, + config) + iteration += 1 + batch_size = mpu.get_data_parallel_world_size() * \ + args.micro_batch_size * \ + get_num_microbatches() + args.consumed_train_samples += batch_size + num_skipped_samples_in_batch = (get_current_global_batch_size() - + get_current_running_global_batch_size()) + if args.decrease_batch_size_if_needed: + assert num_skipped_samples_in_batch >= 0 + else: + assert num_skipped_samples_in_batch == 0 + args.skipped_train_samples += num_skipped_samples_in_batch + num_fp_ops = num_floating_point_operations(args, batch_size) + num_floating_point_operations_so_far += num_fp_ops + total_flops += num_fp_ops + + # Send heartbeat to FT package and update timeouts. 
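The sample accounting in the loop above is simple bookkeeping: each iteration consumes `data_parallel_size * micro_batch_size * num_microbatches` samples. A toy illustration with assumed sizes:

```python
# Hypothetical configuration, only to illustrate the bookkeeping.
data_parallel_size = 8
micro_batch_size = 4
num_microbatches = 16

# Samples consumed by one global batch (one iteration):
batch_size = data_parallel_size * micro_batch_size * num_microbatches  # 512

consumed_train_samples = 0
for iteration in range(3):
    # ... train_step(...) would run here ...
    consumed_train_samples += batch_size

print(consumed_train_samples)  # 1536 samples after 3 iterations
```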
+ if args.enable_ft_package: + ft_client = ft_integration.get_rank_monitor_client( + ft_integration.StateMachineActions.TRAIN_HEARTBEAT) + if ft_client is not None: + ft_client.send_heartbeat() + # TODO we are always calculating timeouts in the current implementation + # if we want to rely on manually setup then we need to add additional argument + # to training and pass it here + if ft_integration.can_update_timeouts(): + ft_integration.get_rank_monitor_client( + ft_integration.StateMachineActions.UPDATE_TIMEOUT).calculate_and_set_timeouts() + print_rank_0(f'Updated FT timeouts. New values: \ + {ft_integration.get_rank_monitor_client().timeouts}') + + # Bring CPU and GPU back in sync if on right iteration. + if ( + args.train_sync_interval + and iteration % args.train_sync_interval == 0 + ): + torch.cuda.synchronize() + + # Logging. + loss_scale = optimizer.get_loss_scale().item() + params_norm = None + if args.log_params_norm: + params_norm = calc_params_l2_norm(model) + + learning_rate = None + decoupled_learning_rate = None + for param_group in optimizer.param_groups: + if param_group['is_decoupled_lr']: + decoupled_learning_rate = param_group['lr'] + else: + learning_rate = param_group['lr'] + report_memory_flag = training_log(loss_dict, total_loss_dict, + learning_rate, + decoupled_learning_rate, + iteration, loss_scale, + report_memory_flag, skipped_iter, + grad_norm, params_norm, num_zeros_in_grad) + + # StragglerDetector + if iteration % args.log_interval == 0 and args.log_straggler: + stimer.report(total_flops, args.log_interval) + total_flops = 0.0 + + if args.check_weight_hash_across_dp_replicas_interval is not None and \ + iteration % args.check_weight_hash_across_dp_replicas_interval == 0: + if args.use_distributed_optimizer and args.overlap_param_gather: + optimizer.disable_pre_hook() + assert check_param_hashes_across_dp_replicas(model, cross_check=True), \ + "Parameter hashes not matching across DP replicas" + torch.distributed.barrier() + print_rank_0(f">>> Weight hashes match after {iteration} iterations...") + if args.use_distributed_optimizer and args.overlap_param_gather: + optimizer.enable_pre_hook() + + # Autoresume + if args.adlr_autoresume and \ + (iteration % args.adlr_autoresume_interval == 0): + check_adlr_autoresume_termination(iteration, model, optimizer, + opt_param_scheduler) + + # Evaluation + if args.eval_interval and iteration % args.eval_interval == 0 and \ + args.do_valid: + timers('interval-time').stop() + if args.use_distributed_optimizer and args.overlap_param_gather: + optimizer.disable_pre_hook() + if args.manual_gc and args.manual_gc_eval: + # Collect all objects. + gc.collect() + prefix = 'iteration {}'.format(iteration) + timers('eval-time', log_level=0).start(barrier=True) + evaluate_and_print_results(prefix, forward_step_func, + valid_data_iterator, model, + iteration, process_non_loss_data_func, + config, verbose=False, write_to_tensorboard=True, + non_loss_data_func=non_loss_data_func) + eval_duration += timers('eval-time').elapsed() + eval_iterations += args.eval_iters + timers('eval-time').stop() + one_logger_utils.track_e2e_metrics() + + if args.manual_gc and args.manual_gc_eval: + # Collect only the objects created and used in evaluation. 
+ gc.collect(generation=0) + if args.use_distributed_optimizer and args.overlap_param_gather: + optimizer.enable_pre_hook() + timers('interval-time', log_level=0).start(barrier=True) + + + if args.enable_ft_package and ft_integration.get_rank_monitor_client() is not None: + ft_integration.get_rank_monitor_client( + ft_integration.StateMachineActions.EVAL_HEARTBEAT).send_heartbeat() + + # Checkpointing + saved_checkpoint = False + if args.exit_signal_handler: + signal_handler = get_signal_handler() + if any(signal_handler.signals_received()): + if args.save: + save_checkpoint_and_time(iteration, model, optimizer, + opt_param_scheduler, + num_floating_point_operations_so_far, + checkpointing_context, train_data_iterator=train_data_iterator) + print_datetime('exiting program after receiving SIGTERM.') + exit = True + break + + if args.save and args.save_interval and \ + iteration % args.save_interval == 0: + save_checkpoint_and_time(iteration, model, optimizer, + opt_param_scheduler, + num_floating_point_operations_so_far, + checkpointing_context, train_data_iterator=train_data_iterator) + saved_checkpoint = True + + elif args.save and args.non_persistent_save_interval and \ + iteration % args.non_persistent_save_interval == 0: + timers('interval-time').stop() + save_checkpoint_and_time(iteration, model, optimizer, + opt_param_scheduler, + num_floating_point_operations_so_far, + checkpointing_context, + non_persistent_ckpt=True, train_data_iterator=train_data_iterator) + saved_checkpoint = True + timers('interval-time', log_level=0).start(barrier=True) + + # Exiting based on duration + if args.exit_duration_in_mins: + train_time = (time.time() - _TRAIN_START_TIME) / 60.0 + done_cuda = torch.tensor( + [train_time > args.exit_duration_in_mins], + dtype=torch.int, device='cuda') + torch.distributed.all_reduce( + done_cuda, op=torch.distributed.ReduceOp.MAX) + done = done_cuda.item() + if done: + if args.save and not saved_checkpoint: + save_checkpoint_and_time(iteration, model, optimizer, + opt_param_scheduler, + num_floating_point_operations_so_far, + checkpointing_context, train_data_iterator=train_data_iterator) + print_datetime('exiting program after {} minutes'.format(train_time)) + exit = True + break + + # Exiting based on iterations + if args.exit_interval and iteration % args.exit_interval == 0: + if args.save and not saved_checkpoint: + save_checkpoint_and_time(iteration, model, optimizer, + opt_param_scheduler, + num_floating_point_operations_so_far, + checkpointing_context, train_data_iterator=train_data_iterator) + torch.distributed.barrier() + print_datetime('exiting program at iteration {}'.format(iteration)) + exit = True + break + + if args.profile and \ + iteration == args.profile_step_end and \ + torch.distributed.get_rank() in args.profile_ranks: + if args.use_pytorch_profiler: + prof.stop() + else: + torch.cuda.cudart().cudaProfilerStop() + + if args.manual_gc: + if args.manual_gc_interval != 0 and iteration % args.manual_gc_interval == 0: + gc.collect() + + one_logger_utils.track_e2e_metrics() + + # Flush TensorBoard, WandB writers and one-logger + writer = get_tensorboard_writer() + if writer: + writer.flush() + + # Close out pre-hooks if using distributed optimizer and overlapped param gather. 
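The duration-based exit above works because every rank contributes its local "time is up" flag to a MAX all-reduce, so all ranks reach the same decision and none is left waiting inside a collective. A minimal sketch of that pattern (assumes an already initialized process group and a CUDA device; the function name is illustrative):

```python
import time
import torch

def should_exit(train_start_time, exit_duration_in_mins):
    """Return True on every rank if any rank has hit the time limit."""
    train_time_mins = (time.time() - train_start_time) / 60.0
    done = torch.tensor([train_time_mins > exit_duration_in_mins],
                        dtype=torch.int, device='cuda')
    # MAX across ranks: if one rank is out of time, everyone exits together.
    torch.distributed.all_reduce(done, op=torch.distributed.ReduceOp.MAX)
    return bool(done.item())
```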
+ if args.use_distributed_optimizer and args.overlap_param_gather: + optimizer.disable_pre_hook() + + if args.enable_ft_package and ft_integration.get_rank_monitor_client() is not None: + ft_integration.get_rank_monitor_client().shutdown_workload_monitoring() + + maybe_finalize_async_save(True) + + # If any exit conditions (signal handler, duration, iterations) have been reached, exit. + if exit: + wandb_writer = get_wandb_writer() + if wandb_writer: + wandb_writer.finish() + sys.exit() + + return iteration, num_floating_point_operations_so_far + + +def evaluate(forward_step_func, + data_iterator, + model, + process_non_loss_data_func, + config, + verbose=False, + non_loss_data_func=None): + """Evaluation.""" + args = get_args() + timers = get_timers() + + timers('evaluate', log_level=0).start(barrier=True) + + if args.vision_pretraining and args.vision_pretraining_type == "dino": + from megatron.legacy.model.vision.knn_monitor import compute_feature_bank + compute_feature_bank(model) + + # Turn on evaluation mode which disables dropout. + for model_module in model: + model_module.eval() + + total_loss_dict = {} + + # make validation batch size independent from training batch size + eval_batch_size = args.global_batch_size + eval_num_microbatches = eval_batch_size // \ + (args.micro_batch_size * args.data_parallel_size) + + with torch.no_grad(): + iteration = 0 + if verbose: + print_rank_0(f'Evaluating on {args.eval_iters * eval_batch_size} samples') + while iteration < args.eval_iters: + iteration += 1 + if verbose: + print_rank_0(f'Evaluating iter {iteration}/{args.eval_iters}') + + forward_backward_func = get_forward_backward_func() + # Don't care about timing during evaluation + config.timers = None + loss_dicts = forward_backward_func( + forward_step_func=forward_step_func, + data_iterator=data_iterator, + model=model, + num_microbatches=eval_num_microbatches, + seq_length=args.seq_length, + micro_batch_size=args.micro_batch_size, + decoder_seq_length=args.decoder_seq_length, + forward_only=True) + config.timers = get_timers() + + # Empty unused memory + if args.empty_unused_memory_level >= 1: + torch.cuda.empty_cache() + + if mpu.is_pipeline_last_stage(ignore_virtual=True): + # Reduce across processes. 
+ for loss_dict in loss_dicts: + for key in loss_dict: + if key not in total_loss_dict: + total_loss_dict[key] = torch.tensor([0.0, 0.0], dtype=torch.float).cuda() + val = loss_dict[key] + if isinstance(val, tuple) or isinstance(val, list): + total_loss_dict[key][0] += val[0] + total_loss_dict[key][1] += val[1] + else: + total_loss_dict[key][0] += val + total_loss_dict[key][1] += 1 + + args.consumed_valid_samples += eval_batch_size + + if args.exit_duration_in_mins: + train_time = (time.time() - _TRAIN_START_TIME) / 60.0 + done_cuda = torch.tensor( + [train_time > args.exit_duration_in_mins], + dtype=torch.int, device='cuda') + torch.distributed.all_reduce( + done_cuda, op=torch.distributed.ReduceOp.MAX) + done = done_cuda.item() + if done: + print_rank_0('Exiting during evaluation, timelimit reached') + return None, None, True + + collected_non_loss_data = None + if non_loss_data_func is not None: + collected_non_loss_data = non_loss_data_func(model) + elif process_non_loss_data_func is not None and is_last_rank(): + collected_non_loss_data = forward_backward_func( + forward_step_func=forward_step_func, + data_iterator=data_iterator, + model=model, + num_microbatches=get_num_microbatches(), + seq_length=args.seq_length, + micro_batch_size=args.micro_batch_size, + decoder_seq_length=args.decoder_seq_length, + forward_only=True, + collect_non_loss_data=True) + + # Move model back to the train mode. + for model_module in model: + model_module.train() + + for key in total_loss_dict: + numerator, denominator = total_loss_dict[key] + total_loss_dict[key] = numerator / denominator + + timers('evaluate').stop() + timers.log(['evaluate']) + + return total_loss_dict, collected_non_loss_data, False + +def evaluate_and_print_results(prefix, forward_step_func, + data_iterator, model, + iteration, process_non_loss_data_func, config, + verbose=False, write_to_tensorboard=True, non_loss_data_func=None): + """Helper function to evaluate and dump results on screen.""" + args = get_args() + if write_to_tensorboard: + writer = get_tensorboard_writer() + else: + writer = None + + wandb_writer = get_wandb_writer() + + total_loss_dict, collected_non_loss_data, timelimit = evaluate( + forward_step_func, data_iterator, model, + process_non_loss_data_func, config, verbose, non_loss_data_func) + # Timelimit hit during evaluation + if timelimit: + return + string = ' validation loss at {} | '.format(prefix) + for key in total_loss_dict: + string += '{} value: {:.6E} | '.format(key, total_loss_dict[key].item()) + ppl = math.exp(min(20, total_loss_dict[key].item())) + string += '{} PPL: {:.6E} | '.format(key, ppl) + if writer: + writer.add_scalar('{} validation'.format(key), + total_loss_dict[key].item(), + iteration) + writer.add_scalar('{} validation vs samples'.format(key), + total_loss_dict[key].item(), + args.consumed_train_samples) + if args.log_validation_ppl_to_tensorboard: + writer.add_scalar('{} validation ppl'.format(key), ppl, + iteration) + writer.add_scalar('{} validation ppl vs samples'.format(key), + ppl, args.consumed_train_samples) + if wandb_writer and is_last_rank(): + wandb_writer.log({ + '{} validation'.format(key): total_loss_dict[key].item()}, + iteration) + + if process_non_loss_data_func is not None and writer and is_last_rank(): + process_non_loss_data_func(collected_non_loss_data, iteration, writer) + + length = len(string) + 1 + print_rank_last('-' * length) + print_rank_last(string) + print_rank_last('-' * length) + + +def cyclic_iter(iter): + while True: + for x in iter: + yield x + + 
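`evaluate_and_print_results` above reports perplexity as `exp(min(20, loss))`, clamping the exponent so a diverged run cannot overflow. A tiny illustration with made-up losses:

```python
import math

# Hypothetical validation losses; the exponent is clamped at 20 exactly as in
# evaluate_and_print_results().
for loss in (2.05, 7.3, 25.0):
    ppl = math.exp(min(20, loss))
    print(f"loss {loss:6.2f} -> PPL {ppl:.3E}")
# loss 2.05 -> ~7.77, loss 7.30 -> ~1.48E+03, loss 25.0 -> clamped to e**20
```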
+def get_train_valid_test_num_samples(): + """Train/valid/test num samples.""" + + args = get_args() + + # Number of train/valid/test samples. + if args.train_samples: + train_samples = args.train_samples + else: + train_samples = args.train_iters * args.global_batch_size + eval_iters = (args.train_iters // args.eval_interval + 1) * \ + args.eval_iters + test_iters = args.eval_iters + + return ( + train_samples, + eval_iters * args.global_batch_size, + test_iters * args.global_batch_size, + ) + + +def build_train_valid_test_datasets(build_train_valid_test_datasets_provider): + """Build pretraining datasets.""" + train_valid_test_num_samples = get_train_valid_test_num_samples() + print_rank_0(' > datasets target sizes (minimum size):') + print_rank_0(' train: {}'.format(train_valid_test_num_samples[0])) + print_rank_0(' validation: {}'.format(train_valid_test_num_samples[1])) + print_rank_0(' test: {}'.format(train_valid_test_num_samples[2])) + return build_train_valid_test_datasets_provider(train_valid_test_num_samples) + + +def build_train_valid_test_data_loaders( + build_train_valid_test_datasets_provider): + """Build pretraining data loaders.""" + + args = get_args() + + (train_dataloader, valid_dataloader, test_dataloader) = (None, None, None) + + print_rank_0('> building train, validation, and test datasets ...') + + # Backward compatibility, assume fixed batch size. + if args.iteration > 0 and args.consumed_train_samples == 0: + assert args.train_samples is None, \ + 'only backward compatiblity support for iteration-based training' + args.consumed_train_samples = args.iteration * args.global_batch_size + if args.iteration > 0 and args.consumed_valid_samples == 0: + if args.train_samples is None: + args.consumed_valid_samples = (args.iteration // args.eval_interval) * \ + args.eval_iters * args.global_batch_size + + # Rely on distributed-aware core datasets, temporary + is_distributed = getattr(build_train_valid_test_datasets_provider, "is_distributed", False) + + # Construct the data pipeline + if is_distributed or mpu.get_tensor_model_parallel_rank() == 0: + + # Build datasets. + train_ds, valid_ds, test_ds = build_train_valid_test_datasets( + build_train_valid_test_datasets_provider) + # Build dataloders. + train_dataloader = build_pretraining_data_loader( + train_ds, args.consumed_train_samples) + if args.skip_train: + valid_dataloader = build_pretraining_data_loader(valid_ds, 0) + else: + valid_dataloader = build_pretraining_data_loader( + valid_ds, args.consumed_valid_samples) + test_dataloader = build_pretraining_data_loader(test_ds, 0) + + # Flags to know if we need to do training/validation/testing. + do_train = train_dataloader is not None and args.train_iters > 0 + do_valid = valid_dataloader is not None and args.eval_iters > 0 + do_test = test_dataloader is not None and args.eval_iters > 0 + flags = torch.tensor( + [int(do_train), int(do_valid), int(do_test)], + dtype=torch.long, device='cuda') + else: + flags = torch.tensor([0, 0, 0], dtype=torch.long, device='cuda') + + torch.distributed.broadcast(flags, 0) + + args.do_train = getattr(args, "do_train", False) or flags[0].item() + args.do_valid = getattr(args, "do_valid", False) or flags[1].item() + args.do_test = getattr(args, "do_test", False) or flags[2].item() + + return train_dataloader, valid_dataloader, test_dataloader + + +def build_train_valid_test_data_iterators( + build_train_valid_test_datasets_provider): + """Build pretraining data iterators.""" + + args = get_args() + + # Build loaders. 
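The sample counts returned by `get_train_valid_test_num_samples` above follow directly from the schedule: training consumes `train_iters * global_batch_size` samples, and validation runs `eval_iters` iterations once per `eval_interval` plus once at the end. A worked example with assumed settings:

```python
# Hypothetical settings, mirroring get_train_valid_test_num_samples().
train_iters = 10000
global_batch_size = 512
eval_interval = 1000     # evaluate every 1000 iterations
eval_iters = 10          # iterations per evaluation run

train_samples = train_iters * global_batch_size                 # 5,120,000
valid_iters = (train_iters // eval_interval + 1) * eval_iters   # 110
test_iters = eval_iters                                         # 10

print(train_samples,
      valid_iters * global_batch_size,   # 56,320 validation samples
      test_iters * global_batch_size)    # 5,120 test samples
```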
+ train_dataloader, valid_dataloader, test_dataloader = \ + build_train_valid_test_data_loaders( + build_train_valid_test_datasets_provider) + + # Build iterators. + dl_type = args.dataloader_type + assert dl_type in ['single', 'cyclic', 'external'] + + def _get_iterator(dataloader_type, dataloader): + """Return dataset iterator.""" + if dataloader_type == "single": + return iter(dataloader) + elif dataloader_type == "cyclic": + return iter(cyclic_iter(dataloader)) + elif dataloader_type == "external": + # External dataloader is passed through. User is expected to define how to iterate. + return dataloader + else: + raise RuntimeError("unexpected dataloader type") + + if train_dataloader is not None: + train_data_iterator = _get_iterator(dl_type, train_dataloader) + else: + train_data_iterator = None + + if valid_dataloader is not None: + valid_data_iterator = _get_iterator(dl_type, valid_dataloader) + else: + valid_data_iterator = None + + if test_dataloader is not None: + test_data_iterator = _get_iterator(dl_type, test_dataloader) + else: + test_data_iterator = None + + return train_data_iterator, valid_data_iterator, test_data_iterator diff --git a/megatron/training/utils.py b/megatron/training/utils.py new file mode 100644 index 0000000000..4c3223d0de --- /dev/null +++ b/megatron/training/utils.py @@ -0,0 +1,390 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +"""General utilities.""" +import os +import sys +from datetime import datetime + +import torch + +try: + from transformer_engine.pytorch.optimizers import multi_tensor_applier, multi_tensor_l2norm +except ImportError: + try: + from apex.multi_tensor_apply import multi_tensor_applier + except ImportError: + multi_tensor_applier = None + + try: + from amp_C import multi_tensor_l2norm + except ImportError: + import warnings + warnings.warn( + f'Transformer Engine and Apex are not installed. ' + 'Falling back to local implementations of ' + 'multi_tensor_applier and multi_tensor_l2norm' + ) + + from megatron.core.utils import ( + local_multi_tensor_l2_norm as multi_tensor_l2norm, + local_multi_tensor_applier as multi_tensor_applier, + ) + +from megatron.training import ( + get_args, + get_adlr_autoresume, +) +from megatron.core import DistributedDataParallel as DDP +from megatron.core import mpu +from megatron.core.tensor_parallel import param_is_not_tensor_parallel_duplicate +from megatron.legacy.model import Float16Module +from megatron.legacy.model.module import param_is_not_shared + + +ALL_MODULE_WRAPPER_CLASSNAMES = (DDP, Float16Module) + + +def unwrap_model(model, module_instances=ALL_MODULE_WRAPPER_CLASSNAMES): + return_list = True + if not isinstance(model, list): + model = [model] + return_list = False + unwrapped_model = [] + for model_module in model: + while isinstance(model_module, module_instances): + model_module = model_module.module + unwrapped_model.append(model_module) + if not return_list: + return unwrapped_model[0] + return unwrapped_model + + +def calc_params_l2_norm(model): + """Calculate l2 norm of parameters """ + args = get_args() + if not isinstance(model, list): + model = [model] + # Remove duplicate params. 
+ params_data = [] + for model_ in model: + for param in model_.parameters(): + is_not_tp_duplicate = param_is_not_tensor_parallel_duplicate(param) + if mpu.get_expert_model_parallel_rank() > 0: + if not getattr(param, 'allreduce', True) and is_not_tp_duplicate: + assert param_is_not_shared(param) + params_data.append(param.data.float() if args.bf16 else param.data) + else: + is_not_shared = param_is_not_shared(param) + if is_not_shared and is_not_tp_duplicate: + params_data.append(param.data.float() if args.bf16 else param.data) + + # Calculate norm + dummy_overflow_buf = torch.tensor([0], dtype=torch.int, device='cuda') + norm, _ = multi_tensor_applier( + multi_tensor_l2norm, + dummy_overflow_buf, + [params_data], + False # no per-parameter norm + ) + norm_2 = norm * norm + if mpu.get_expert_model_parallel_world_size() == 1: + # Sum across all model-parallel GPUs(tensor + pipeline). + torch.distributed.all_reduce(norm_2, + op=torch.distributed.ReduceOp.SUM, + group=mpu.get_model_parallel_group()) + else: + # Sum across tensor, pipeline and expert model-parallel GPUs. + torch.distributed.all_reduce(norm_2, + op=torch.distributed.ReduceOp.SUM, + group=mpu.get_tensor_and_expert_parallel_group()) + torch.distributed.all_reduce(norm_2, + op=torch.distributed.ReduceOp.SUM, + group=mpu.get_pipeline_model_parallel_group()) + return norm_2.item() ** 0.5 + + +def average_losses_across_data_parallel_group(losses): + """Reduce a tensor of losses across all GPUs.""" + averaged_losses = torch.cat( + [loss.clone().detach().view(1) for loss in losses]) + torch.distributed.all_reduce(averaged_losses, + group=mpu.get_data_parallel_group()) + averaged_losses = averaged_losses / \ + torch.distributed.get_world_size(group=mpu.get_data_parallel_group()) + + return averaged_losses + + +def report_memory(name): + """Simple GPU memory report.""" + mega_bytes = 1024.0 * 1024.0 + string = name + ' memory (MB)' + string += ' | allocated: {}'.format( + torch.cuda.memory_allocated() / mega_bytes) + string += ' | max allocated: {}'.format( + torch.cuda.max_memory_allocated() / mega_bytes) + string += ' | reserved: {}'.format( + torch.cuda.memory_reserved() / mega_bytes) + string += ' | max reserved: {}'.format( + torch.cuda.max_memory_reserved() / mega_bytes) + if mpu.get_data_parallel_rank() == 0: + print("[Rank {}] {}".format(torch.distributed.get_rank(), string), + flush=True) + + +def print_params_min_max_norm(optimizer, iteration): + """Print min, max, and norm of all parameters.""" + index = 0 + rank = torch.distributed.get_rank() + string = 'iteration, rank, index, tensor-model-parallel, min, max, norm\n' + optimizer_ = optimizer.optimizer + for param_group in optimizer_.param_groups: + for param in param_group['params']: + index += 1 + min_ = param.data.min() + max_ = param.data.max() + norm = torch.linalg.norm(param.data) + string += '{:7d}, {:4d}, {:4d}, {:2d}, '.format( + iteration, rank, index, int(param.tensor_model_parallel)) + string += '{:.6E}, {:.6E}, {:.6E}\n'.format(min_, max_, norm) + print(string, flush=True) + + +def check_adlr_autoresume_termination(iteration, model, + optimizer, opt_param_scheduler): + """Check for autoresume signal and exit if it is received.""" + from megatron.training.checkpointing import save_checkpoint + + args = get_args() + autoresume = get_adlr_autoresume() + # Add barrier to ensure consistnecy. 
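`calc_params_l2_norm` above uses the usual distributed-norm trick: square the local norm, sum the squares across model-parallel ranks, then take the root. A simplified sketch with plain tensor ops (no `multi_tensor_applier`, no duplicate or expert-parallel handling; the function name is illustrative):

```python
import torch
import torch.distributed as dist

def global_param_l2_norm(local_params, model_parallel_group):
    """Sum of squares locally, SUM all-reduce across the group, then sqrt."""
    local_sq = sum(p.float().pow(2).sum() for p in local_params)
    dist.all_reduce(local_sq, op=dist.ReduceOp.SUM, group=model_parallel_group)
    return local_sq.item() ** 0.5
```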
+    torch.distributed.barrier()
+    if autoresume.termination_requested():
+        if args.save:
+            save_checkpoint(iteration, model, optimizer, opt_param_scheduler)
+        print_rank_0(">>> autoresume termination request found!")
+        if torch.distributed.get_rank() == 0:
+            autoresume.request_resume()
+        print_rank_0(">>> training terminated. Returning")
+        sys.exit(0)
+
+
+def get_ltor_masks_and_position_ids(data,
+                                    eod_token,
+                                    reset_position_ids,
+                                    reset_attention_mask,
+                                    eod_mask_loss):
+    """Build masks and position ids for a left-to-right model."""
+
+    # Extract batch size and sequence length.
+    micro_batch_size, seq_length = data.size()
+
+    # Attention mask (lower triangular).
+    if reset_attention_mask:
+        att_mask_batch = micro_batch_size
+    else:
+        att_mask_batch = 1
+    attention_mask = torch.tril(torch.ones(
+        (att_mask_batch, seq_length, seq_length), device=data.device)).view(
+            att_mask_batch, 1, seq_length, seq_length)
+
+    # Loss mask.
+    loss_mask = torch.ones(data.size(), dtype=torch.float, device=data.device)
+    if eod_mask_loss:
+        loss_mask[data == eod_token] = 0.0
+
+    # Position ids.
+    position_ids = torch.arange(seq_length, dtype=torch.long,
+                                device=data.device)
+    position_ids = position_ids.unsqueeze(0).expand_as(data)
+    # We need to clone as the ids will be modified based on batch index.
+    if reset_position_ids:
+        position_ids = position_ids.clone()
+
+    if reset_position_ids or reset_attention_mask:
+        # Loop through the batches:
+        for b in range(micro_batch_size):
+
+            # Find indices where the EOD token is.
+            eod_index = position_ids[b, data[b] == eod_token]
+            # Detach indices from positions if going to modify positions.
+            if reset_position_ids:
+                eod_index = eod_index.clone()
+
+            # Loop through EOD indices:
+            prev_index = 0
+            for j in range(eod_index.size()[0]):
+                i = eod_index[j]
+                # Mask attention loss.
+                if reset_attention_mask:
+                    attention_mask[b, 0, (i + 1):, :(i + 1)] = 0
+                # Reset positions.
+                if reset_position_ids:
+                    position_ids[b, (i + 1):] -= (i + 1 - prev_index)
+                    prev_index = i + 1
+
+    # Convert attention mask to binary:
+    attention_mask = (attention_mask < 0.5)
+
+    return attention_mask, loss_mask, position_ids
+
+
+def get_batch_on_this_cp_rank(batch):
+    """ Slice batch input along sequence dimension into multiple chunks,
+        which are parallelized across GPUs in a context parallel group.
+    """
+
+    # With causal masking, each token only attends to its prior tokens. Simply splitting
+    # the sequence into CP chunks can result in severe load imbalance: chunks at the end
+    # of the sequence have a bigger workload than the others. To address this issue,
+    # we split the sequence into 2*CP chunks. Assuming CP=2, we then get 4 chunks: chunk_0
+    # and chunk_3 are assigned to GPU0, chunk_1 and chunk_2 are assigned to GPU1, so
+    # that we get a balanced workload among GPUs in the context parallel group.
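The comment above describes the load-balancing scheme; here is a toy, single-process illustration of the same chunk selection (the helper name and shapes are made up for the example):

```python
import torch

def cp_chunks_for_rank(seq, cp_size, cp_rank, seq_dim=1):
    """Split the sequence into 2*cp_size chunks and give rank r the pair
    (r, 2*cp_size - 1 - r), so early and late causal chunks are balanced."""
    chunks = 2 * cp_size
    seq_len = seq.shape[seq_dim]
    assert seq_len % chunks == 0
    view = seq.view(*seq.shape[:seq_dim], chunks, seq_len // chunks,
                    *seq.shape[seq_dim + 1:])
    index = torch.tensor([cp_rank, chunks - cp_rank - 1])
    picked = view.index_select(seq_dim, index)
    return picked.view(*picked.shape[:seq_dim], -1, *picked.shape[seq_dim + 2:])

# CP=2, one sample of 8 tokens: rank 0 keeps chunks 0 and 3, i.e. positions
# [0, 1, 6, 7]; rank 1 would keep chunks 1 and 2, i.e. positions [2, 3, 4, 5].
tokens = torch.arange(8).unsqueeze(0)                     # shape [1, 8]
print(cp_chunks_for_rank(tokens, cp_size=2, cp_rank=0))   # tensor([[0, 1, 6, 7]])
```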
+ args = get_args() + cp_size = args.context_parallel_size + if cp_size > 1: + cp_rank = mpu.get_context_parallel_rank() + for key, val in batch.items(): + if val is not None: + seq_dim = 1 if key != 'attention_mask' else 2 + val = val.view( + *val.shape[0:seq_dim], + 2 * cp_size, + val.shape[seq_dim] // (2 * cp_size), + *val.shape[(seq_dim + 1) :], + ) + index = torch.tensor([cp_rank, (2 * cp_size - cp_rank - 1)], + device="cpu", pin_memory=True).cuda(non_blocking=True) + val = val.index_select(seq_dim, index) + val = val.view(*val.shape[0:seq_dim], -1, *val.shape[(seq_dim + 2) :]) + batch[key] = val + + return batch + + +def print_rank_0(message): + """If distributed is initialized, print only on rank 0.""" + if torch.distributed.is_initialized(): + if torch.distributed.get_rank() == 0: + print(message, flush=True) + else: + print(message, flush=True) + +def is_last_rank(): + return torch.distributed.get_rank() == ( + torch.distributed.get_world_size() - 1) + +def print_rank_last(message): + """If distributed is initialized, print only on last rank.""" + if torch.distributed.is_initialized(): + if is_last_rank(): + print(message, flush=True) + else: + print(message, flush=True) + + +def append_to_progress_log(string, barrier=True): + """ Append given string to progress log. """ + args = get_args() + if args.save is None: + return + progress_log_filename = os.path.join(args.save, "progress.txt") + if barrier: + torch.distributed.barrier() + if torch.distributed.get_rank() == 0: + with open(progress_log_filename, 'a') as f: + job_id = os.getenv('SLURM_JOB_ID', '') + num_gpus = args.world_size + f.write(f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\tJob ID: {job_id}\t" + f"# GPUs: {num_gpus}\t{string}\n") + + +def get_batch_on_this_tp_rank(data_iterator): + + args = get_args() + + def _broadcast(item): + if item is not None: + torch.distributed.broadcast(item, mpu.get_tensor_model_parallel_src_rank(), group=mpu.get_tensor_model_parallel_group()) + + if mpu.get_tensor_model_parallel_rank() == 0: + + if data_iterator is not None: + data = next(data_iterator) + else: + data = None + + batch = { + 'tokens': data["tokens"].cuda(non_blocking = True), + 'labels': data["labels"].cuda(non_blocking = True), + 'loss_mask': data["loss_mask"].cuda(non_blocking = True), + 'attention_mask': None if "attention_mask" not in data else data["attention_mask"].cuda(non_blocking = True), + 'position_ids': data["position_ids"].cuda(non_blocking = True) + } + + if args.pipeline_model_parallel_size == 1: + _broadcast(batch['tokens']) + _broadcast(batch['labels']) + _broadcast(batch['loss_mask']) + _broadcast(batch['attention_mask']) + _broadcast(batch['position_ids']) + + elif mpu.is_pipeline_first_stage(): + _broadcast(batch['tokens']) + _broadcast(batch['attention_mask']) + _broadcast(batch['position_ids']) + + elif mpu.is_pipeline_last_stage(): + _broadcast(batch['labels']) + _broadcast(batch['loss_mask']) + _broadcast(batch['attention_mask']) + + else: + + tokens=torch.empty((args.micro_batch_size,args.seq_length), dtype = torch.int64 , device = torch.cuda.current_device()) + labels=torch.empty((args.micro_batch_size,args.seq_length), dtype = torch.int64 , device = torch.cuda.current_device()) + loss_mask=torch.empty((args.micro_batch_size,args.seq_length), dtype = torch.float32 , device = torch.cuda.current_device()) + if args.create_attention_mask_in_dataloader: + attention_mask=torch.empty( + (args.micro_batch_size,1,args.seq_length,args.seq_length), dtype = torch.bool , device = 
torch.cuda.current_device() + ) + else: + attention_mask=None + position_ids=torch.empty((args.micro_batch_size,args.seq_length), dtype = torch.int64 , device = torch.cuda.current_device()) + + if args.pipeline_model_parallel_size == 1: + _broadcast(tokens) + _broadcast(labels) + _broadcast(loss_mask) + _broadcast(attention_mask) + _broadcast(position_ids) + + elif mpu.is_pipeline_first_stage(): + labels=None + loss_mask=None + + _broadcast(tokens) + _broadcast(attention_mask) + _broadcast(position_ids) + + elif mpu.is_pipeline_last_stage(): + tokens=None + position_ids=None + + _broadcast(labels) + _broadcast(loss_mask) + _broadcast(attention_mask) + + batch = { + 'tokens': tokens, + 'labels': labels, + 'loss_mask': loss_mask, + 'attention_mask': attention_mask, + 'position_ids': position_ids + } + + return batch + + +def update_use_dist_ckpt(args): + args.use_dist_ckpt = args.ckpt_format != "torch" diff --git a/megatron/training/yaml_arguments.py b/megatron/training/yaml_arguments.py new file mode 100644 index 0000000000..3c6c39b07f --- /dev/null +++ b/megatron/training/yaml_arguments.py @@ -0,0 +1,459 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +"""Megatron arguments.""" + +import argparse +import dataclasses +import json +import os +import torch +import types + +from itertools import chain, starmap +from types import SimpleNamespace +import yaml, re, os +from types import SimpleNamespace + +import torch.nn.functional as F + +from megatron.core.transformer import TransformerConfig, MLATransformerConfig + +# Taken from https://stackoverflow.com/questions/65414773/parse-environment-variable-from-yaml-with-pyyaml +# Allows for yaml to use environment variables +env_pattern = re.compile(r".*?\${(.*?)}.*?") +def env_constructor(loader, node): + value = loader.construct_scalar(node) + for group in env_pattern.findall(value): + assert os.environ.get(group) is not None, f"environment variable {group} in yaml not found" + value = value.replace(f"${{{group}}}", os.environ.get(group)) + return value +yaml.add_implicit_resolver("!pathex", env_pattern) +yaml.add_constructor("!pathex", env_constructor) + + +str_dtype_to_torch = { + "float32" : torch.float32, + "float16" : torch.float16, + "bfloat16" : torch.bfloat16 +} + +def validate_yaml(args, defaults={}): + + # This is for legacy script env var setting + if type(args.data_path) is str: + # If no white space its a single path + split_data_path = args.data_path.split() + if len(split_data_path) != 1: + args.data_path = split_data_path + + # Tensor model parallel size. + args.model_parallel.tensor_model_parallel_size = min( + args.model_parallel.tensor_model_parallel_size, args.world_size) + assert args.world_size % args.model_parallel.tensor_model_parallel_size == 0, 'world size'\ + ' ({}) is not divisible by tensor model parallel size ({})'.format( + args.world_size, args.model_parallel.tensor_model_parallel_size) + # Pipeline model parallel size. + args.model_parallel.pipeline_model_parallel_size = min( + args.model_parallel.pipeline_model_parallel_size, + (args.world_size // args.model_parallel.tensor_model_parallel_size)) + args.model_parallel.transformer_pipeline_model_parallel_size = ( + args.model_parallel.pipeline_model_parallel_size - 1 + if args.standalone_embedding_stage else + args.model_parallel.pipeline_model_parallel_size + ) + # Checks. 
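The checks that follow reduce to one divisibility constraint: tensor, pipeline, and context parallelism must divide the world size, and the remainder becomes the data-parallel size. A worked example with assumed sizes:

```python
# Hypothetical launch: 64 GPUs with TP=4, PP=2, CP=2.
world_size = 64
tensor_model_parallel_size = 4
pipeline_model_parallel_size = 2
context_parallel_size = 2

model_parallel_size = tensor_model_parallel_size * pipeline_model_parallel_size
assert world_size % (model_parallel_size * context_parallel_size) == 0, \
    "world size must be divisible by TP x PP x CP"

# Whatever is left over becomes data parallelism.
data_parallel_size = world_size // (model_parallel_size * context_parallel_size)
print(data_parallel_size)  # 4
```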
+ model_parallel_size = args.model_parallel.pipeline_model_parallel_size * \ + args.model_parallel.tensor_model_parallel_size + assert args.world_size % (model_parallel_size * args.model_parallel.context_parallel_size) == 0, \ + 'world size ({}) is not divisible by tensor parallel size ({}) times ' \ + 'pipeline parallel size ({}) times context parallel size ({})'.format( + args.world_size, args.model_parallel.tensor_model_parallel_size, + args.model_parallel.pipeline_model_parallel_size, args.model_parallel.context_parallel_size) + + # data_parallel_size is not in model parallel config + args.data_parallel_size = args.world_size // (model_parallel_size * args.model_parallel.context_parallel_size) + if args.rank == 0: + print('using world size: {}, data-parallel size: {}, ' + 'context-parallel size: {} ' + 'tensor-model-parallel size: {}, ' + 'pipeline-model-parallel size: {} '.format( + args.world_size, args.data_parallel_size, + args.model_parallel.context_parallel_size, + args.model_parallel.tensor_model_parallel_size, + args.model_parallel.pipeline_model_parallel_size), flush=True) + if args.model_parallel.pipeline_model_parallel_size > 1: + if args.model_parallel.pipeline_model_parallel_split_rank is not None: + assert args.model_parallel.pipeline_model_parallel_split_rank < \ + args.model_parallel.pipeline_model_parallel_size, 'split rank needs'\ + ' to be less than pipeline model parallel size ({})'.format( + args.model_parallel.pipeline_model_parallel_size) + + if args.model_parallel.tp_comm_overlap: + assert args.model_parallel.sequence_parallel == True, 'Tensor parallel communication/GEMM overlap can happen only when sequence parallelism is enabled' + + # Set input defaults. + for key in defaults: + # For default to be valid, it should not be provided in the + # arguments that are passed to the program. We check this by + # ensuring the arg is set to None. + if getattr(args, key, None) is not None: + if args.rank == 0: + print('WARNING: overriding default arguments for {key}:{v} \ + with {key}:{v2}'.format(key=key, v=defaults[key], + v2=getattr(args, key)), + flush=True) + else: + setattr(args, key, defaults[key]) + + # Batch size. 
+ assert args.micro_batch_size is not None + assert args.micro_batch_size > 0 + if args.global_batch_size is None: + args.global_batch_size = args.micro_batch_size * args.data_parallel_size + if args.rank == 0: + print('setting global batch size to {}'.format( + args.global_batch_size), flush=True) + assert args.global_batch_size > 0 + + # num_layers_per_virtual_pipeline_stage is not insde model parallel for checkpointing + if args.num_layers_per_virtual_pipeline_stage is not None: + assert args.model_parallel.pipeline_model_parallel_size > 2, \ + 'pipeline-model-parallel size should be greater than 2 with ' \ + 'interleaved schedule' + assert args.language_model.num_layers % args.model_parallel.transformer_pipeline_model_parallel_size == 0, \ + 'number of layers should be divisible by the pipeline parallel size' + num_layers_per_pipeline_stage = args.language_model.num_layers // args.model_parallel.transformer_pipeline_model_parallel_size + assert num_layers_per_pipeline_stage % args.num_layers_per_virtual_pipeline_stage == 0, \ + 'number of layers per pipeline stage must be divisible number of layers per virtual pipeline stage' + args.model_parallel.virtual_pipeline_model_parallel_size = num_layers_per_pipeline_stage // \ + args.num_layers_per_virtual_pipeline_stage + else: + args.model_parallel.virtual_pipeline_model_parallel_size = None + # Overlap P2P communication is disabled if not using the interleaved schedule. + args.model_parallel.overlap_p2p_comm = False + if args.rank == 0: + print('WARNING: Setting args.overlap_p2p_comm to False since non-interleaved ' + 'schedule does not support overlapping p2p communication') + + if args.overlap_param_gather: + assert args.use_distributed_optimizer, \ + '--overlap-param-gather only supported with distributed optimizer' + assert args.overlap_grad_reduce, \ + '--overlap-grad-reduce should be turned on when using --overlap-param-gather' + + # Parameters dtype. + if args.model_parallel.fp16: + assert not args.model_parallel.bf16 + args.model_parallel.params_dtype = torch.half + if args.model_parallel.bf16: + assert not args.model_parallel.fp16 + args.model_parallel.params_dtype = torch.bfloat16 + # bfloat16 requires gradient accumulation and all-reduce to + # be done in fp32. + if not args.accumulate_allreduce_grads_in_fp32: + args.accumulate_allreduce_grads_in_fp32 = True + if args.rank == 0: + print('accumulate and all-reduce gradients in fp32 for ' + 'bfloat16 data type.', flush=True) + + if args.rank == 0: + print('using {} for parameters ...'.format(args.model_parallel.params_dtype), + flush=True) + + if args.dataloader_type is None: + args.dataloader_type = 'single' + + # Consumed tokens. + args.consumed_train_samples = 0 + args.consumed_valid_samples = 0 + + # Support for variable sequence lengths across batches/microbatches. + # set it if the dataloader supports generation of variable sequence lengths + # across batches/microbatches. Due to additional communication overhead + # during pipeline parallelism, it should not be set if sequence length + # is constant during training. + args.model_parallel.variable_seq_lengths = False + + # Iteration-based training. + if args.train_iters: + # If we use iteration-based training, make sure the + # sample-based options are off. 
+        assert args.train_samples is None, \
+            'expected iteration-based training'
+        assert args.lr_decay_samples is None, \
+            'expected iteration-based learning rate decay'
+        assert args.lr_warmup_samples == 0, \
+            'expected iteration-based learning rate warmup'
+        assert args.rampup_batch_size is None, \
+            'expected no batch-size rampup for iteration-based training'
+        if args.lr_warmup_fraction is not None:
+            assert args.lr_warmup_iters == 0, \
+                'can only specify one of lr-warmup-fraction and lr-warmup-iters'
+
+    # Sample-based training.
+    if args.train_samples:
+        # If we use sample-based training, make sure the
+        # iteration-based options are off.
+        assert args.train_iters is None, \
+            'expected sample-based training'
+        assert args.lr_decay_iters is None, \
+            'expected sample-based learning rate decay'
+        assert args.lr_warmup_iters == 0, \
+            'expected sample-based learning rate warmup'
+        if args.lr_warmup_fraction is not None:
+            assert args.lr_warmup_samples == 0, \
+                'can only specify one of lr-warmup-fraction ' \
+                'and lr-warmup-samples'
+
+    # How to handle this better
+    if args.language_model.num_layers is not None:
+        assert args.encoder_num_layers is None, \
+            'cannot have both num-layers and encoder-num-layers specified'
+        args.encoder_num_layers = args.language_model.num_layers
+    else:
+        assert args.encoder_num_layers is not None, \
+            'either num-layers or encoder-num-layers should be specified'
+        args.language_model.num_layers = args.encoder_num_layers
+
+    # Check required arguments.
+    # removed max_position_embeddings from reqs
+    required_args = ['num_layers', 'hidden_size', 'num_attention_heads']
+    for req_arg in required_args:
+        _check_arg_is_not_none(args.language_model, req_arg)
+
+    # Checks.
+    if args.language_model.ffn_hidden_size is None:
+        if args.language_model.activation_func == "swiglu":
+            # Reduce the dimension for the MLP since the projection happens on
+            # two linear layers; this keeps the number of parameters in
+            # the same ballpark as the counterpart with 4*h size.
+            # We keep it a multiple of 64, which means the actual tensor size
+            # will be a multiple of 64 / tp_size.
+            args.language_model.ffn_hidden_size = int((4 * args.language_model.hidden_size * 2 / 3) / 64) * 64
+        else:
+            args.language_model.ffn_hidden_size = 4 * args.language_model.hidden_size
+
+    if args.language_model.kv_channels is None:
+        assert args.language_model.hidden_size % args.language_model.num_attention_heads == 0
+        args.language_model.kv_channels = args.language_model.hidden_size // args.language_model.num_attention_heads
+
+    #TODO: Implement arguments for encoder-decoder
+    if args.seq_length is not None:
+        assert args.encoder_seq_length is None
+        args.encoder_seq_length = args.seq_length
+    else:
+        assert args.encoder_seq_length is not None
+        args.seq_length = args.encoder_seq_length
+
+    if args.seq_length is not None:
+        assert args.max_position_embeddings >= args.seq_length
+    if args.decoder_seq_length is not None:
+        assert args.max_position_embeddings >= args.decoder_seq_length
+    if args.lr is not None:
+        assert args.min_lr <= args.lr
+    if args.save is not None:
+        assert args.save_interval is not None
+    # Mixed precision checks.
+    if args.fp16_lm_cross_entropy:
+        assert args.fp16, 'lm cross entropy in fp16 is only supported in fp16 mode.'
+    if args.language_model.fp32_residual_connection:
+        assert args.model_parallel.fp16 or args.model_parallel.bf16, \
+            'residual connection in fp32 only supported when using fp16 or bf16.'
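As a worked example of the SwiGLU FFN sizing rule above (the hidden size is assumed, not taken from the source):

```python
# Hypothetical model width; mirrors the SwiGLU sizing rule in validate_yaml().
hidden_size = 4096

# Plain MLP default: 4 * h.
dense_ffn = 4 * hidden_size                            # 16384

# SwiGLU has two input projections, so shrink by 2/3 and round
# down to a multiple of 64 to keep tensor shapes friendly.
swiglu_ffn = int((4 * hidden_size * 2 / 3) / 64) * 64  # 10880

print(dense_ffn, swiglu_ffn)
```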
+
+    if args.language_model.moe_grouped_gemm:
+        assert args.model_parallel.bf16, 'Currently GroupedGEMM for MoE only supports bf16 dtype.'
+        dc = torch.cuda.get_device_capability()
+        assert dc[0] >= 8, "Unsupported compute capability for GroupedGEMM kernels."
+
+    if args.weight_decay_incr_style == 'constant':
+        assert args.start_weight_decay is None
+        assert args.end_weight_decay is None
+        args.start_weight_decay = args.weight_decay
+        args.end_weight_decay = args.weight_decay
+    else:
+        assert args.start_weight_decay is not None
+        assert args.end_weight_decay is not None
+
+    TORCH_MAJOR = int(torch.__version__.split('.')[0])
+    TORCH_MINOR = int(torch.__version__.split('.')[1])
+    # Persistent fused layer norm.
+    if TORCH_MAJOR < 1 or (TORCH_MAJOR == 1 and TORCH_MINOR < 11):
+        args.language_model.persist_layer_norm = False
+        if args.rank == 0:
+            print('Persistent fused layer norm kernel is supported from '
+                  'pytorch v1.11 (nvidia pytorch container paired with v1.11). '
+                  'Defaulting to no_persist_layer_norm=True')
+
+    # Activation recomputing.
+    if args.language_model.distribute_saved_activations:
+        assert args.model_parallel.tensor_model_parallel_size > 1, 'can distribute ' \
+            'recomputed activations only across tensor model ' \
+            'parallel groups'
+        assert args.language_model.recompute_granularity == 'full', \
+            'distributed recompute activations is only '\
+            'applicable to full recompute granularity'
+        assert args.language_model.recompute_method is not None, \
+            'for distributed recompute activations to work you '\
+            'need to use a recompute method '
+        assert (TORCH_MAJOR, TORCH_MINOR) >= (1, 10), \
+            'distributed recompute activations are supported for pytorch ' \
+            'v1.10 and above (Nvidia Pytorch container >= 21.07). Current ' \
+            'pytorch version is v%s.%s.' % (TORCH_MAJOR, TORCH_MINOR)
+
+    if args.language_model.recompute_granularity == 'selective':
+        assert args.language_model.recompute_method is None, \
+            'recompute method is not yet supported for ' \
+            'selective recomputing granularity'
+
+    # Disable sequence parallelism when tp=1
+    # to avoid a change in numerics when
+    # sequence parallelism is enabled.
+    if args.model_parallel.tensor_model_parallel_size == 1:
+        args.model_parallel.sequence_parallel = False
+
+    # Disable async_tensor_model_parallel_allreduce when
+    # model parallel memory optimization is enabled.
+    if args.model_parallel.sequence_parallel:
+        args.model_parallel.async_tensor_model_parallel_allreduce = False
+
+    if os.environ.get('CUDA_DEVICE_MAX_CONNECTIONS') != "1":
+        if args.model_parallel.sequence_parallel:
+            raise RuntimeError(
+                "Using sequence parallelism requires setting the environment variable "
+                "CUDA_DEVICE_MAX_CONNECTIONS to 1")
+        if args.model_parallel.async_tensor_model_parallel_allreduce:
+            raise RuntimeError(
+                "Using async gradient all reduce requires setting the environment "
+                "variable CUDA_DEVICE_MAX_CONNECTIONS to 1")
+
+    # Retro checks.
+    if getattr(args, 'retro_add_retriever', False):
+        raise Exception("Retro untested for yaml args. See arguments.py.")
+
+        # Sequence parallelism unsupported.
+        assert not args.sequence_parallel, \
+            "retro currently does not support sequence parallelism."
+
+        # Pipeline parallelism unsupported.
+        assert args.pipeline_model_parallel_size == 1, \
+            "retro currently does not support pipeline parallelism."
+
+    # TODO: Retro args loading not tested.
+    # Load retro args (used by both Retro & GPT).
+    if getattr(args, 'retro_project_dir', None) is not None:
+        raise Exception("Retro untested for yaml args. See arguments.py.")
+
+    if args.language_model.rotary_interleaved and args.language_model.apply_rope_fusion:
+        raise RuntimeError('--rotary-interleaved does not work with rope_fusion.')
+
+    # MoE Spec check
+    if args.language_model.num_moe_experts is not None:
+        assert args.spec is None, "Model Spec must be None when using MoEs"
+        if args.model_parallel.tensor_model_parallel_size > 1:
+            assert args.model_parallel.sequence_parallel, \
+                "When using MoE and tensor parallelism, sequence parallelism must be used."
+
+    # Expert parallelism check
+    if args.model_parallel.expert_model_parallel_size > 1:
+        assert args.language_model.num_moe_experts is not None, "num_moe_experts must not be None to use expert model parallelism"
+        assert args.language_model.num_moe_experts % args.model_parallel.expert_model_parallel_size == 0, \
+            "Number of experts should be a multiple of expert model parallel_size."
+        assert not args.model_parallel.fp16, \
+            "Expert parallelism is not supported with fp16 training."
+
+    # Print arguments.
+    _print_args("arguments", args)
+
+    # TODO: Added because much of the global initialization requires the model parallel arguments.
+    args = SimpleNamespace(**args.__dict__, **args.model_parallel.__dict__)
+    args = SimpleNamespace(**args.__dict__, **args.language_model.__dict__)
+    # For GPT Layer spec in pretrain_gpt
+    args.num_experts = args.language_model.num_moe_experts
+
+    return args
+
+def _print_args(title, args):
+    """Print arguments."""
+    if args.rank == 0:
+        print(f'------------------------ {title} ------------------------',
+              flush=True)
+        str_list = []
+        for arg in vars(args):
+            dots = '.' * (48 - len(arg))
+            str_list.append('  {} {} {}'.format(arg, dots, getattr(args, arg)))
+        for arg in sorted(str_list, key=lambda x: x.lower()):
+            print(arg, flush=True)
+        print(f'-------------------- end of {title} ---------------------',
+              flush=True)
+
+def core_config_from_args(args, dataclass=TransformerConfig):
+    """Builds a core config object from namespace args for the given dataclass.
+
+    Raises an exception if an argument is missing from args.
+
+    Args:
+        args (SimpleNamespace, optional): Namespace to pull argument values from
+        dataclass (dataclass, optional): Core dataclass config to pull argument names from
+
+    Returns:
+        dict: Keyword arguments used to construct the core config
+    """
+    kw_args = {}
+    for f in dataclasses.fields(dataclass):
+        if hasattr(args, f.name):
+            kw_args[f.name] = getattr(args, f.name)
+        else:
+            raise Exception(f"Missing argument {f.name} for {str(dataclass)} config")
+    return kw_args
+
+def _check_arg_is_not_none(args, arg):
+    assert getattr(args, arg) is not None, '{} argument is None'.format(arg)
+
+def core_transformer_config_from_yaml(args, transformer_key="language_model"):
+    # Combine transformer config with model parallel args
+    args = SimpleNamespace(**vars(getattr(args, transformer_key)), **vars(args.model_parallel))
+    # Translate args to core transformer configuration
+    kw_args = core_config_from_args(args, TransformerConfig)
+
+    # Hardcoded
+    kw_args['deallocate_pipeline_outputs'] = True
+    kw_args['pipeline_dtype'] = kw_args['params_dtype']
+    kw_args['batch_p2p_comm'] = not args.overlap_p2p_comm
+
+    assert args.activation_func in ["swiglu", "squaredrelu", "gelu"], f"{args.activation_func} is not a supported activation function"
+    if args.activation_func == "swiglu":
+        kw_args['activation_func'] = F.silu
+        kw_args['gated_linear_unit'] = True
+        kw_args['bias_activation_fusion'] = args.bias_swiglu_fusion
+    elif args.activation_func == "squaredrelu":
+        def
squared_relu(x): + return torch.pow(F.relu(x), 2) + kw_args['activation_func'] = squared_relu + elif args.activation_func == "gelu": + kw_args['activation_func'] = F.gelu + if args.add_bias_linear: + kw_args['bias_activation_fusion'] = False + else: + kw_args['bias_activation_fusion'] = args.bias_activation_fusion + + if args.init_method == "xavier_uniform": + kw_args['init_method'] = torch.nn.init.xavier_uniform_ + kw_args['scaled_init_method'] = torch.nn.init.xavier_uniform_ + + # Return Transformer config. + if getattr(args, "multi_latent_attention", False): + return MLATransformerConfig(**kw_args) + else: + return TransformerConfig(**kw_args) + +def load_yaml(yaml_path): + print(f"warning using experimental yaml arguments feature, argparse arguments will be ignored") + with open(yaml_path, "r") as f: + config = yaml.load(f,Loader=yaml.FullLoader) + # Convert to nested namespace + config_namespace = json.loads(json.dumps(config), object_hook=lambda item: SimpleNamespace(**item)) + # Add config location to namespace + config_namespace.yaml_cfg = yaml_path + return config_namespace + diff --git a/megatron/utils.py b/megatron/utils.py deleted file mode 100644 index 008f89fa80..0000000000 --- a/megatron/utils.py +++ /dev/null @@ -1,213 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. - -"""General utilities.""" - -import sys - -import torch -from torch.nn.parallel import DistributedDataParallel as torchDDP - -from apex.multi_tensor_apply import multi_tensor_applier -import amp_C - -from megatron import ( - get_args, - get_adlr_autoresume, -) -from megatron.core import mpu -from megatron.core.tensor_parallel import param_is_not_tensor_parallel_duplicate -from megatron.model.module import param_is_not_shared - - -def unwrap_model(model, module_instances=(torchDDP)): - return_list = True - if not isinstance(model, list): - model = [model] - return_list = False - unwrapped_model = [] - for model_module in model: - while isinstance(model_module, module_instances): - model_module = model_module.module - unwrapped_model.append(model_module) - if not return_list: - return unwrapped_model[0] - return unwrapped_model - - -def calc_params_l2_norm(model): - """Calculate l2 norm of parameters """ - args = get_args() - if not isinstance(model, list): - model = [model] - # Remove duplicate params. - params_data = [] - for model_ in model: - for param in model_.parameters(): - is_not_shared = param_is_not_shared(param) - is_not_tp_duplicate = param_is_not_tensor_parallel_duplicate(param) - if is_not_shared and is_not_tp_duplicate: - if args.bf16: - params_data.append(param.data.float()) - else: - params_data.append(param.data) - # Calculate norm - dummy_overflow_buf = torch.cuda.IntTensor([0]) - norm, _ = multi_tensor_applier( - amp_C.multi_tensor_l2norm, - dummy_overflow_buf, - [params_data], - False # no per-parameter norm - ) - norm_2 = norm * norm - # Sum across all model-parallel GPUs. 
- torch.distributed.all_reduce(norm_2, - op=torch.distributed.ReduceOp.SUM, - group=mpu.get_model_parallel_group()) - return norm_2.item() ** 0.5 - - -def average_losses_across_data_parallel_group(losses): - """Reduce a tensor of losses across all GPUs.""" - averaged_losses = torch.cat( - [loss.clone().detach().view(1) for loss in losses]) - torch.distributed.all_reduce(averaged_losses, - group=mpu.get_data_parallel_group()) - averaged_losses = averaged_losses / \ - torch.distributed.get_world_size(group=mpu.get_data_parallel_group()) - - return averaged_losses - - -def report_memory(name): - """Simple GPU memory report.""" - mega_bytes = 1024.0 * 1024.0 - string = name + ' memory (MB)' - string += ' | allocated: {}'.format( - torch.cuda.memory_allocated() / mega_bytes) - string += ' | max allocated: {}'.format( - torch.cuda.max_memory_allocated() / mega_bytes) - string += ' | reserved: {}'.format( - torch.cuda.memory_reserved() / mega_bytes) - string += ' | max reserved: {}'.format( - torch.cuda.max_memory_reserved() / mega_bytes) - if mpu.get_data_parallel_rank() == 0: - print("[Rank {}] {}".format(torch.distributed.get_rank(), string), - flush=True) - - -def print_params_min_max_norm(optimizer, iteration): - """Print min, max, and norm of all parameters.""" - index = 0 - rank = torch.distributed.get_rank() - string = 'iteration, rank, index, tensor-model-parallel, min, max, norm\n' - optimizer_ = optimizer.optimizer - for param_group in optimizer_.param_groups: - for param in param_group['params']: - index += 1 - min_ = param.data.min() - max_ = param.data.max() - norm = torch.linalg.norm(param.data) - string += '{:7d}, {:4d}, {:4d}, {:2d}, '.format( - iteration, rank, index, int(param.tensor_model_parallel)) - string += '{:.6E}, {:.6E}, {:.6E}\n'.format(min_, max_, norm) - print(string, flush=True) - - -def check_adlr_autoresume_termination(iteration, model, - optimizer, opt_param_scheduler): - """Check for autoresume signal and exit if it is received.""" - from megatron.checkpointing import save_checkpoint - - args = get_args() - autoresume = get_adlr_autoresume() - # Add barrier to ensure consistnecy. - torch.distributed.barrier() - if autoresume.termination_requested(): - if args.save: - save_checkpoint(iteration, model, optimizer, opt_param_scheduler) - print_rank_0(">>> autoresume termination request found!") - if torch.distributed.get_rank() == 0: - autoresume.request_resume() - print_rank_0(">>> training terminated. Returning") - sys.exit(0) - - -def get_ltor_masks_and_position_ids(data, - eod_token, - reset_position_ids, - reset_attention_mask, - eod_mask_loss): - """Build masks and position id for left to right model.""" - - # Extract batch size and sequence length. - micro_batch_size, seq_length = data.size() - - # Attention mask (lower triangular). - if reset_attention_mask: - att_mask_batch = micro_batch_size - else: - att_mask_batch = 1 - attention_mask = torch.tril(torch.ones( - (att_mask_batch, seq_length, seq_length), device=data.device)).view( - att_mask_batch, 1, seq_length, seq_length) - - # Loss mask. - loss_mask = torch.ones(data.size(), dtype=torch.float, device=data.device) - if eod_mask_loss: - loss_mask[data == eod_token] = 0.0 - - # Position ids. - position_ids = torch.arange(seq_length, dtype=torch.long, - device=data.device) - position_ids = position_ids.unsqueeze(0).expand_as(data) - # We need to clone as the ids will be modifed based on batch index. 
- if reset_position_ids: - position_ids = position_ids.clone() - - if reset_position_ids or reset_attention_mask: - # Loop through the batches: - for b in range(micro_batch_size): - - # Find indecies where EOD token is. - eod_index = position_ids[b, data[b] == eod_token] - # Detach indecies from positions if going to modify positions. - if reset_position_ids: - eod_index = eod_index.clone() - - # Loop through EOD indecies: - prev_index = 0 - for j in range(eod_index.size()[0]): - i = eod_index[j] - # Mask attention loss. - if reset_attention_mask: - attention_mask[b, 0, (i + 1):, :(i + 1)] = 0 - # Reset positions. - if reset_position_ids: - position_ids[b, (i + 1):] -= (i + 1 - prev_index) - prev_index = i + 1 - - # Convert attention mask to binary: - attention_mask = (attention_mask < 0.5) - - return attention_mask, loss_mask, position_ids - - -def print_rank_0(message): - """If distributed is initialized, print only on rank 0.""" - if torch.distributed.is_initialized(): - if torch.distributed.get_rank() == 0: - print(message, flush=True) - else: - print(message, flush=True) - -def is_last_rank(): - return torch.distributed.get_rank() == ( - torch.distributed.get_world_size() - 1) - -def print_rank_last(message): - """If distributed is initialized, print only on last rank.""" - if torch.distributed.is_initialized(): - if is_last_rank(): - print(message, flush=True) - else: - print(message, flush=True) diff --git a/pretrain_bert.py b/pretrain_bert.py index d751feab86..35884ecdc4 100644 --- a/pretrain_bert.py +++ b/pretrain_bert.py @@ -7,15 +7,23 @@ import torch import torch.nn.functional as F -from megatron import get_args -from megatron import print_rank_0 -from megatron import get_timers +from megatron.training import get_args +from megatron.training import get_tokenizer +from megatron.training import print_rank_0 +from megatron.training import get_timers from megatron.core import tensor_parallel from megatron.core.enums import ModelType -from megatron.data.dataset_utils import build_train_valid_test_datasets -from megatron.model import BertModel +import megatron.legacy.model +from megatron.core.models.bert.bert_model import BertModel from megatron.training import pretrain -from megatron.utils import average_losses_across_data_parallel_group +from megatron.training.utils import average_losses_across_data_parallel_group +from megatron.training.arguments import core_transformer_config_from_args +from megatron.core.transformer.spec_utils import import_module +from megatron.core.models.bert.bert_layer_specs import bert_layer_with_transformer_engine_spec, bert_layer_local_spec +from megatron.core.datasets.blended_megatron_dataset_builder import BlendedMegatronDatasetBuilder +from megatron.core.datasets.bert_dataset import BERTMaskedWordPieceDataset, BERTMaskedWordPieceDatasetConfig +from megatron.core.datasets.utils import get_blend_from_list +from megatron.core import mpu, tensor_parallel def model_provider(pre_process=True, post_process=True): @@ -24,13 +32,37 @@ def model_provider(pre_process=True, post_process=True): print_rank_0('building BERT model ...') args = get_args() + config = core_transformer_config_from_args(args) num_tokentypes = 2 if args.bert_binary_head else 0 - model = BertModel( - num_tokentypes=num_tokentypes, - add_binary_head=args.bert_binary_head, - parallel_output=True, - pre_process=pre_process, - post_process=post_process) + + if args.use_legacy_models: + model = megatron.legacy.model.BertModel( + config=config, + num_tokentypes=num_tokentypes, + 
add_binary_head=args.bert_binary_head, + parallel_output=True, + pre_process=pre_process, + post_process=post_process) + else: + if args.spec is None: + transformer_layer_spec = bert_layer_with_transformer_engine_spec #default spec + elif args.spec[0] == 'local': + print_rank_0('Using Local spec for transformer layers') + transformer_layer_spec = bert_layer_local_spec + else : + transformer_layer_spec = import_module(args.spec) + + model = BertModel( + config=config, + transformer_layer_spec=transformer_layer_spec, + vocab_size=args.padded_vocab_size, + max_sequence_length=args.max_position_embeddings, + num_tokentypes=num_tokentypes, + add_binary_head=args.bert_binary_head, + share_embeddings_and_output_weights=not args.untie_embeddings_and_output_weights, + parallel_output=True, + pre_process=pre_process, + post_process=post_process) return model @@ -39,7 +71,8 @@ def get_batch(data_iterator): """Build the batch.""" # Items and their type. - keys = ['text', 'types', 'labels', 'is_random', 'loss_mask', 'padding_mask'] + keys = ['text', 'types', 'labels', + 'is_random', 'loss_mask', 'padding_mask'] datatype = torch.int64 # Broadcast data. @@ -78,7 +111,6 @@ def loss_func(loss_mask, sentence_order, output_tensor): [lm_loss, sop_loss]) return loss, {'lm loss': averaged_losses[0], 'sop loss': averaged_losses[1]} - else: loss = lm_loss averaged_losses = average_losses_across_data_parallel_group( @@ -101,8 +133,8 @@ def forward_step(data_iterator, model): types = None # Forward pass through the model. - output_tensor = model(tokens, padding_mask, tokentype_ids=types, - lm_labels=lm_labels) + output_tensor = model(tokens, padding_mask, + tokentype_ids=types, lm_labels=lm_labels) return output_tensor, partial(loss_func, loss_mask, sentence_order) @@ -111,19 +143,41 @@ def train_valid_test_datasets_provider(train_val_test_num_samples): """Build train, valid, and test datasets.""" args = get_args() + tokenizer = get_tokenizer() + + config = BERTMaskedWordPieceDatasetConfig( + random_seed=args.seed, + sequence_length=args.seq_length, + blend=get_blend_from_list(args.data_path), + blend_per_split=[ + get_blend_from_list(args.train_data_path), + get_blend_from_list(args.valid_data_path), + get_blend_from_list(args.test_data_path) + ], + renormalize_blend_weights=args.renormalize_blend_weights, + split=args.split, + path_to_cache=args.data_cache_path, + tokenizer=tokenizer, + masking_probability=args.mask_prob, + short_sequence_probability=args.short_seq_prob, + masking_max_ngram=3, + masking_do_full_word=True, + masking_do_permutation=False, + masking_use_longer_ngrams=False, + masking_use_geometric_distribution=False, + classification_head=args.bert_binary_head, + ) + print_rank_0('> building train, validation, and test datasets ' 'for BERT ...') - train_ds, valid_ds, test_ds = build_train_valid_test_datasets( - data_prefix=args.data_path, - data_impl=args.data_impl, - splits_string=args.split, - train_valid_test_num_samples=train_val_test_num_samples, - max_seq_length=args.seq_length, - masked_lm_prob=args.mask_prob, - short_seq_prob=args.short_seq_prob, - seed=args.seed, - skip_warmup=(not args.mmap_warmup), - binary_head=args.bert_binary_head) + + train_ds, valid_ds, test_ds = BlendedMegatronDatasetBuilder( + BERTMaskedWordPieceDataset, + train_val_test_num_samples, + lambda: mpu.get_tensor_model_parallel_rank() == 0, + config, + ).build() + print_rank_0("> finished creating BERT datasets ...") return train_ds, valid_ds, test_ds @@ -131,6 +185,9 @@ def 
train_valid_test_datasets_provider(train_val_test_num_samples): if __name__ == "__main__": + # Temporary for transition to core datasets + train_valid_test_datasets_provider.is_distributed = True + pretrain(train_valid_test_datasets_provider, model_provider, ModelType.encoder_or_decoder, forward_step, args_defaults={'tokenizer_type': 'BertWordPieceLowerCase'}) diff --git a/pretrain_gpt.py b/pretrain_gpt.py index 88913e48aa..3b7f8db012 100644 --- a/pretrain_gpt.py +++ b/pretrain_gpt.py @@ -1,110 +1,257 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. - -"""Pretrain GPT""" +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +"""Pretrain GPT.""" +import os import torch from functools import partial -from megatron import get_args -from megatron import print_rank_0 -from megatron import get_timers -from megatron import get_tokenizer -from megatron.core import tensor_parallel +from contextlib import nullcontext +import inspect + +from typing import Union +from megatron.training import get_args +from megatron.training import print_rank_0 +from megatron.training import get_timers +from megatron.training import get_tokenizer +from megatron.core import mpu from megatron.core.enums import ModelType -from megatron.data.gpt_dataset import build_train_valid_test_datasets -from megatron.model import GPTModel +from megatron.core.datasets.blended_megatron_dataset_builder import BlendedMegatronDatasetBuilder +from megatron.core.datasets.utils import get_blend_from_list +from megatron.core.datasets.gpt_dataset import GPTDatasetConfig +from megatron.core.datasets.gpt_dataset import MockGPTDataset, GPTDataset +import megatron.legacy.model +from megatron.core.models.gpt import GPTModel from megatron.training import pretrain -from megatron.utils import get_ltor_masks_and_position_ids -from megatron.utils import average_losses_across_data_parallel_group +from megatron.core.utils import StragglerDetector +from megatron.core.transformer.spec_utils import import_module +from megatron.training.utils import ( + get_batch_on_this_cp_rank, + get_batch_on_this_tp_rank, +) +from megatron.training.arguments import core_transformer_config_from_args +from megatron.training.yaml_arguments import core_transformer_config_from_yaml +from megatron.core.models.gpt.gpt_layer_specs import ( + get_gpt_layer_local_spec, + get_gpt_layer_with_transformer_engine_spec, +) + + +stimer = StragglerDetector() + +def model_provider(pre_process=True, post_process=True) -> Union[GPTModel, megatron.legacy.model.GPTModel]: + """Builds the model. + + If you set the use_legacy_models to True, it will return the legacy GPT model and if not the mcore GPT model. -def model_provider(pre_process=True, post_process=True): - """Build the model.""" + Args: + pre_process (bool, optional): Set to true if you need to compute embedings. Defaults to True. + post_process (bool, optional): Set to true if you need to want to compute output logits/loss. Defaults to True. 
+ + + Returns: + Union[GPTModel, megatron.legacy.model.GPTModel]: The returned model + """ + args = get_args() + use_te = args.transformer_impl == "transformer_engine" print_rank_0('building GPT model ...') - model = GPTModel( - num_tokentypes=0, - parallel_output=True, - pre_process=pre_process, - post_process=post_process - ) + # Experimental loading arguments from yaml + if args.yaml_cfg is not None: + config = core_transformer_config_from_yaml(args, "language_model") + else: + config = core_transformer_config_from_args(args) + + if args.use_legacy_models: + model = megatron.legacy.model.GPTModel( + config, + num_tokentypes=0, + parallel_output=True, + pre_process=pre_process, + post_process=post_process, + ) + else: # using core models + if args.spec is not None: + transformer_layer_spec = import_module(args.spec) + else: + if use_te: + transformer_layer_spec = get_gpt_layer_with_transformer_engine_spec(args.num_experts, args.moe_grouped_gemm, args.qk_layernorm, args.multi_latent_attention, args.fp8) + else: + transformer_layer_spec = get_gpt_layer_local_spec(args.num_experts, args.moe_grouped_gemm, args.qk_layernorm, args.multi_latent_attention) + + build_model_context = nullcontext + build_model_context_args = {} + if args.fp8_param_gather: + try: + from transformer_engine.pytorch import fp8_model_init + + build_model_context = fp8_model_init + build_model_context_args["enabled"] = True + + # Check if fp8_model_init supports preserve_high_precision_init_val + if "preserve_high_precision_init_val" in inspect.signature(fp8_model_init).parameters: + build_model_context_args["preserve_high_precision_init_val"] = True + except: + raise RuntimeError("--fp8-param-gather requires `fp8_model_init` from TransformerEngine, but not found.") + + with build_model_context(**build_model_context_args): + model = GPTModel( + config=config, + transformer_layer_spec=transformer_layer_spec, + vocab_size=args.padded_vocab_size, + max_sequence_length=args.max_position_embeddings, + pre_process=pre_process, + post_process=post_process, + fp16_lm_cross_entropy=args.fp16_lm_cross_entropy, + parallel_output=True, + share_embeddings_and_output_weights=not args.untie_embeddings_and_output_weights, + position_embedding_type=args.position_embedding_type, + rotary_percent=args.rotary_percent, + rotary_base=args.rotary_base, + rope_scaling=args.use_rope_scaling + ) + return model def get_batch(data_iterator): - """Generate a batch""" - args = get_args() - tokenizer = get_tokenizer() + """Generate a batch.""" - # Items and their type. - keys = ['text'] - datatype = torch.int64 + # TODO: this is pretty hacky, find a better way + if (not mpu.is_pipeline_first_stage()) and (not mpu.is_pipeline_last_stage()): + return None, None, None, None, None - # Broadcast data. - if data_iterator is not None: - data = next(data_iterator) - else: - data = None - data_b = tensor_parallel.broadcast_data(keys, data, datatype) + # get batches based on the TP rank you are on + batch = get_batch_on_this_tp_rank(data_iterator) + + # slice batch along sequence dimension for context parallelism + batch = get_batch_on_this_cp_rank(batch) + + return batch.values() - # Unpack. - tokens_ = data_b['text'].long() - labels = tokens_[:, 1:].contiguous() - tokens = tokens_[:, :-1].contiguous() - # Get the masks and postition ids. 
- attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids( - tokens, - tokenizer.eod, - args.reset_position_ids, - args.reset_attention_mask, - args.eod_mask_loss) +def loss_func(loss_mask: torch.Tensor, output_tensor: torch.Tensor): + """Loss function. - return tokens, labels, loss_mask, attention_mask, position_ids + Args: + loss_mask (torch.Tensor): Used to mask out some portions of the loss + output_tensor (torch.Tensor): The tensor with the losses + + Returns: + the loss scalar for this micro-batch + the number of non-padded tokens in this microbatch + a dict containing reporting metrics on the loss and number of tokens across + the data parallel ranks + """ + args = get_args() -def loss_func(loss_mask, output_tensor): losses = output_tensor.float() loss_mask = loss_mask.view(-1).float() - loss = torch.sum(losses.view(-1) * loss_mask) / loss_mask.sum() + total_tokens = loss_mask.sum() + loss = torch.cat([torch.sum(losses.view(-1) * loss_mask).view(1), total_tokens.view(1)]) + + if args.context_parallel_size > 1: + torch.distributed.all_reduce(loss, group=mpu.get_context_parallel_group()) + + # Check individual rank losses are not NaN prior to DP all-reduce. + if args.check_for_nan_in_loss_and_grad: + global_rank = torch.distributed.get_rank() + assert not loss[0].isnan(), ( + f'Rank {global_rank}: found NaN in local forward loss calculation. ' + f'Device: {torch.cuda.current_device()}, node: {os.uname()[1]}' + ) # Reduce loss for logging. - averaged_loss = average_losses_across_data_parallel_group([loss]) + reporting_loss = loss.clone().detach() + torch.distributed.all_reduce(reporting_loss, group=mpu.get_data_parallel_group()) + + local_num_tokens = loss[1].clone().detach().to(torch.int) + return ( + loss[0] * args.context_parallel_size, + local_num_tokens, + {'lm loss': (reporting_loss[0], reporting_loss[1])}, + ) - return loss, {'lm loss': averaged_loss[0]} +def forward_step(data_iterator, model: GPTModel): + """Forward training step. -def forward_step(data_iterator, model): - """Forward step.""" + Args: + data_iterator : Input data iterator + model (GPTModel): The GPT Model + """ args = get_args() timers = get_timers() # Get the batch. 
timers('batch-generator', log_level=2).start() - tokens, labels, loss_mask, attention_mask, position_ids = get_batch( - data_iterator) + global stimer + with stimer(bdata=True): + tokens, labels, loss_mask, attention_mask, position_ids = get_batch( + data_iterator) timers('batch-generator').stop() - output_tensor = model(tokens, position_ids, attention_mask, - labels=labels) + with stimer: + output_tensor = model(tokens, position_ids, attention_mask, + labels=labels) return output_tensor, partial(loss_func, loss_mask) +def is_dataset_built_on_rank(): + return ( + mpu.is_pipeline_first_stage() or mpu.is_pipeline_last_stage() + ) and mpu.get_tensor_model_parallel_rank() == 0 + + +def core_gpt_dataset_config_from_args(args): + tokenizer = get_tokenizer() + + return GPTDatasetConfig( + random_seed=args.seed, + sequence_length=args.seq_length, + blend=get_blend_from_list(args.data_path), + blend_per_split=[ + get_blend_from_list(args.train_data_path), + get_blend_from_list(args.valid_data_path), + get_blend_from_list(args.test_data_path) + ], + renormalize_blend_weights=args.renormalize_blend_weights, + split=args.split, + num_dataset_builder_threads=args.num_dataset_builder_threads, + path_to_cache=args.data_cache_path, + mmap_bin_files=args.mmap_bin_files, + tokenizer=tokenizer, + reset_position_ids=args.reset_position_ids, + reset_attention_mask=args.reset_attention_mask, + eod_mask_loss=args.eod_mask_loss, + create_attention_mask=args.create_attention_mask_in_dataloader, + s3_cache_path = args.s3_cache_path + ) + + def train_valid_test_datasets_provider(train_val_test_num_samples): - """Build train, valid, and test datasets.""" + """Build the train test and validation datasets. + + Args: + train_val_test_num_samples : A list containing the number of samples in train test and validation. 
+ """ args = get_args() - print_rank_0('> building train, validation, and test datasets ' - 'for GPT ...') - train_ds, valid_ds, test_ds = build_train_valid_test_datasets( - data_prefix=args.data_path, - data_impl=args.data_impl, - splits_string=args.split, - train_valid_test_num_samples=train_val_test_num_samples, - seq_length=args.seq_length, - seed=args.seed, - skip_warmup=(not args.mmap_warmup), - train_data_prefix=args.train_data_path, - valid_data_prefix=args.valid_data_path, - test_data_prefix=args.test_data_path) + config = core_gpt_dataset_config_from_args(args) + + if args.mock_data: + dataset_type = MockGPTDataset + else: + dataset_type = GPTDataset + + print_rank_0("> building train, validation, and test datasets for GPT ...") + + train_ds, valid_ds, test_ds = BlendedMegatronDatasetBuilder( + dataset_type, + train_val_test_num_samples, + is_dataset_built_on_rank, + config + ).build() + print_rank_0("> finished creating GPT datasets ...") return train_ds, valid_ds, test_ds @@ -112,8 +259,13 @@ def train_valid_test_datasets_provider(train_val_test_num_samples): if __name__ == "__main__": - pretrain(train_valid_test_datasets_provider, - model_provider, - ModelType.encoder_or_decoder, - forward_step, - args_defaults={'tokenizer_type': 'GPT2BPETokenizer'}) + # Temporary for transition to core datasets + train_valid_test_datasets_provider.is_distributed = True + + pretrain( + train_valid_test_datasets_provider, + model_provider, + ModelType.encoder_or_decoder, + forward_step, + args_defaults={'tokenizer_type': 'GPT2BPETokenizer'}, + ) diff --git a/pretrain_ict.py b/pretrain_ict.py index b9aa4eaf56..205588b5e9 100644 --- a/pretrain_ict.py +++ b/pretrain_ict.py @@ -9,16 +9,16 @@ import torch.distributed as dist import torch.nn.functional as F -from megatron import get_args -from megatron import print_rank_0 -from megatron import get_timers +from megatron.training import get_args +from megatron.training import print_rank_0 +from megatron.training import get_timers from megatron.core import mpu from megatron.core.enums import ModelType -from megatron.data.biencoder_dataset_utils import get_ict_batch -from megatron.data.dataset_utils import build_train_valid_test_datasets -from megatron.model.biencoder_model import biencoder_model_provider +from megatron.legacy.data.biencoder_dataset_utils import get_ict_batch +from megatron.legacy.data.dataset_utils import build_train_valid_test_datasets +from megatron.legacy.model.biencoder_model import biencoder_model_provider from megatron.training import pretrain -from megatron.utils import average_losses_across_data_parallel_group +from megatron.training.utils import average_losses_across_data_parallel_group def pretrain_ict_model_provider(pre_process=True, post_process=True): @@ -144,14 +144,12 @@ def train_valid_test_datasets_provider(train_val_test_num_samples): train_ds, valid_ds, test_ds = build_train_valid_test_datasets( data_prefix=args.data_path, - data_impl=args.data_impl, splits_string=args.split, train_valid_test_num_samples=train_val_test_num_samples, max_seq_length=args.seq_length, masked_lm_prob=args.mask_prob, short_seq_prob=args.short_seq_prob, seed=args.seed, - skip_warmup=(not args.mmap_warmup), binary_head=False, dataset_type='ict') print_rank_0("> finished creating BERT ICT datasets ...") @@ -160,6 +158,7 @@ def train_valid_test_datasets_provider(train_val_test_num_samples): if __name__ == "__main__": + print_rank_0("WARNING : This script is DEPRECATED. 
Will be removed in mcore release 0.9") pretrain(train_valid_test_datasets_provider, pretrain_ict_model_provider, ModelType.encoder_or_decoder, diff --git a/pretrain_mamba.py b/pretrain_mamba.py new file mode 100644 index 0000000000..f8202b6eac --- /dev/null +++ b/pretrain_mamba.py @@ -0,0 +1,242 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +"""Pretrain Mamba.""" + +import os +import torch +from functools import partial + +from megatron.training import get_args +from megatron.training import print_rank_0 +from megatron.training import get_timers +from megatron.training import get_tokenizer +from megatron.core import mpu +# from megatron.core import parallel_state +from megatron.core.enums import ModelType +from megatron.core.datasets.blended_megatron_dataset_builder import BlendedMegatronDatasetBuilder +from megatron.core.datasets.utils import get_blend_from_list +from megatron.core.datasets.gpt_dataset import GPTDatasetConfig +from megatron.core.datasets.gpt_dataset import MockGPTDataset, GPTDataset +from megatron.core.models.mamba import MambaModel +from megatron.training import pretrain +from megatron.core.utils import StragglerDetector +from megatron.core.transformer.spec_utils import import_module +from megatron.training.utils import ( + get_batch_on_this_cp_rank, + get_batch_on_this_tp_rank, +) +from megatron.training.arguments import core_transformer_config_from_args +from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_with_transformer_engine_spec + + +stimer = StragglerDetector() + +def count_parameters_in_layer(model, layer_name): + num_params = 0 + for name, param in model.named_parameters(): + if layer_name in name: + num_params += param.numel() + print_rank_0(f" - {name}: {param.numel()}") + return num_params + + +def model_provider(pre_process=True, post_process=True) -> MambaModel: + """Builds the model. + + Args: + pre_process (bool, optional): Set to true if you need to compute embedings. Defaults to True. + post_process (bool, optional): Set to true if you need to want to compute output logits/loss. Defaults to True. + + + Returns: + MambaModel: The returned model + """ + args = get_args() + + print_rank_0('building Mamba model ...') + config = core_transformer_config_from_args(get_args()) + + assert args.use_legacy_models == False, "Mamba only supported in Mcore!" 
+
+    if args.spec is not None:
+        mamba_stack_spec = import_module(args.spec)
+    else:
+        raise Exception("You must provide a valid Mamba layer spec!")
+
+    model = MambaModel(
+        config=config,
+        mamba_stack_spec=mamba_stack_spec,
+        vocab_size=args.padded_vocab_size,
+        max_sequence_length=args.max_position_embeddings,
+        pre_process=pre_process,
+        hybrid_attention_ratio=args.hybrid_attention_ratio,
+        hybrid_mlp_ratio=args.hybrid_mlp_ratio,
+        hybrid_override_pattern=args.hybrid_override_pattern,
+        post_process=post_process,
+        fp16_lm_cross_entropy=args.fp16_lm_cross_entropy,
+        parallel_output=True,
+        share_embeddings_and_output_weights=not args.untie_embeddings_and_output_weights,
+        position_embedding_type=args.position_embedding_type,
+        rotary_percent=args.rotary_percent,
+        rotary_base=args.rotary_base
+    )
+
+    for l in range(model.decoder.num_layers_per_pipeline_rank):
+        layer_params = count_parameters_in_layer(model, f'decoder.layers.{l}.')
+        print_rank_0(f" == params layer {l}: {layer_params}")
+
+    return model
+
+
+def get_batch(data_iterator):
+    """Generate a batch."""
+
+    # TODO: this is pretty hacky, find a better way
+    if (not mpu.is_pipeline_first_stage()) and (not mpu.is_pipeline_last_stage()):
+        return None, None, None, None, None
+
+    # get batches based on the TP rank you are on
+    batch = get_batch_on_this_tp_rank(data_iterator)
+
+    # slice batch along sequence dimension for context parallelism
+    batch = get_batch_on_this_cp_rank(batch)
+
+    return batch.values()
+
+def loss_func(loss_mask: torch.Tensor, output_tensor: torch.Tensor):
+    """Loss function.
+
+    Args:
+        loss_mask (torch.Tensor): Used to mask out some portions of the loss
+        output_tensor (torch.Tensor): The tensor with the losses
+
+    Returns:
+        the loss scalar for this micro-batch
+        the number of non-padded tokens in this microbatch
+        a dict containing reporting metrics on the loss and number of tokens across
+        the data parallel ranks
+    """
+    args = get_args()
+
+    losses = output_tensor.float()
+    loss_mask = loss_mask.view(-1).float()
+    total_tokens = loss_mask.sum()
+    loss = torch.cat([torch.sum(losses.view(-1) * loss_mask).view(1), total_tokens.view(1)])
+
+    if args.context_parallel_size > 1:
+        torch.distributed.all_reduce(loss, group=mpu.get_context_parallel_group())
+
+    # Check individual rank losses are not NaN prior to DP all-reduce.
+    if args.check_for_nan_in_loss_and_grad:
+        global_rank = torch.distributed.get_rank()
+        assert not loss[0].isnan(), (
+            f'Rank {global_rank}: found NaN in local forward loss calculation. '
+            f'Device: {torch.cuda.current_device()}, node: {os.uname()[1]}'
+        )
+
+    # Reduce loss for logging.
+    reporting_loss = loss.clone().detach()
+    torch.distributed.all_reduce(reporting_loss, group=mpu.get_data_parallel_group())
+
+    local_num_tokens = loss[1].clone().detach().to(torch.int)
+    return (
+        loss[0] * args.context_parallel_size,
+        local_num_tokens,
+        {'lm loss': (reporting_loss[0], reporting_loss[1])},
+    )
+
+
+def forward_step(data_iterator, model: MambaModel):
+    """Forward training step.
+
+    Args:
+        data_iterator : Input data iterator
+        model (MambaModel): The Mamba Model
+    """
+    args = get_args()
+    timers = get_timers()
+
+    # Get the batch.
+ timers('batch-generator', log_level=2).start() + global stimer + with stimer(bdata=True): + tokens, labels, loss_mask, attention_mask, position_ids = get_batch( + data_iterator) + timers('batch-generator').stop() + + with stimer: + output_tensor = model(tokens, position_ids, attention_mask, + labels=labels) + + return output_tensor, partial(loss_func, loss_mask) + + +def is_dataset_built_on_rank(): + return ( + mpu.is_pipeline_first_stage() or mpu.is_pipeline_last_stage() + ) and mpu.get_tensor_model_parallel_rank() == 0 + + +def core_gpt_dataset_config_from_args(args): + tokenizer = get_tokenizer() + + return GPTDatasetConfig( + random_seed=args.seed, + sequence_length=args.seq_length, + blend=get_blend_from_list(args.data_path), + blend_per_split=[ + get_blend_from_list(args.train_data_path), + get_blend_from_list(args.valid_data_path), + get_blend_from_list(args.test_data_path) + ], + renormalize_blend_weights=args.renormalize_blend_weights, + split=args.split, + num_dataset_builder_threads=args.num_dataset_builder_threads, + path_to_cache=args.data_cache_path, + mmap_bin_files=args.mmap_bin_files, + tokenizer=tokenizer, + reset_position_ids=args.reset_position_ids, + reset_attention_mask=args.reset_attention_mask, + eod_mask_loss=args.eod_mask_loss, + create_attention_mask=args.create_attention_mask_in_dataloader, + ) + + +def train_valid_test_datasets_provider(train_val_test_num_samples): + """Build the train test and validation datasets. + + Args: + train_val_test_num_samples : A list containing the number of samples in train test and validation. + """ + args = get_args() + + config = core_gpt_dataset_config_from_args(args) + + if args.mock_data: + dataset_type = MockGPTDataset + else: + dataset_type = GPTDataset + + print_rank_0("> building train, validation, and test datasets for GPT ...") + + train_ds, valid_ds, test_ds = BlendedMegatronDatasetBuilder( + dataset_type, + train_val_test_num_samples, + is_dataset_built_on_rank, + config + ).build() + + print_rank_0("> finished creating GPT datasets ...") + + return train_ds, valid_ds, test_ds + + +if __name__ == "__main__": + + # Temporary for transition to core datasets + train_valid_test_datasets_provider.is_distributed = True + + pretrain(train_valid_test_datasets_provider, + model_provider, + ModelType.encoder_or_decoder, + forward_step, + args_defaults={'tokenizer_type': 'GPT2BPETokenizer'}) diff --git a/pretrain_retro.py b/pretrain_retro.py index 597bbf0f6a..0aecbf14ce 100644 --- a/pretrain_retro.py +++ b/pretrain_retro.py @@ -1,40 +1,95 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
"""Pretrain Retro.""" from functools import partial import torch -from megatron import get_args, get_retro_args -from megatron import get_timers -from megatron import get_tokenizer -from megatron import print_rank_0 -from megatron.core import mpu, tensor_parallel +from megatron.training import get_args +from megatron.training import get_timers +from megatron.training import get_tokenizer +from megatron.training import print_rank_0 +from megatron.training.arguments import core_transformer_config_from_args +from megatron.core import tensor_parallel +from megatron.core.datasets.blended_megatron_dataset_builder import BlendedMegatronDatasetBuilder +from megatron.core.datasets.utils import get_blend_from_list +from megatron.core.datasets.retro.query.retro_dataset import get_retro_datasets +from megatron.core.datasets.retro.query.multi_split_gpt_dataset import MultiSplitGPTDataset, MultiSplitGPTDatasetConfig from megatron.core.enums import ModelType -from megatron.model import GPTModel +from megatron.core.models.retro import get_retro_decoder_block_spec, RetroConfig, RetroModel +from megatron.core.models.retro.utils import get_all_true_mask from megatron.training import pretrain -from megatron.utils import get_ltor_masks_and_position_ids -from tools.retro.query.retro_dataset import get_retro_datasets - +from megatron.training.utils import get_ltor_masks_and_position_ids from pretrain_gpt import ( + is_dataset_built_on_rank, loss_func, - model_provider, - train_valid_test_datasets_provider as standard_datasets_provider, + model_provider as default_model_provider, + train_valid_test_datasets_provider as gpt_train_valid_test_datasets_provider, ) +def get_retro_config(): + return core_transformer_config_from_args(get_args(), RetroConfig) + + +def core_model_provider(pre_process=True, post_process=True): + """Build the model using Megatron-Core.""" + + args = get_args() + config = get_retro_config() + + # NOTE: Experimental customization feature + if args.spec is not None: + block_spec = import_module(args.spec)() + else: + block_spec = get_retro_decoder_block_spec(config, use_transformer_engine=True) + + print_rank_0('building GPT model ...') + model = RetroModel( + config=config, + transformer_layer_spec=block_spec, + vocab_size=args.padded_vocab_size, + max_sequence_length=args.max_position_embeddings, + pre_process=pre_process, + post_process=post_process, + fp16_lm_cross_entropy=args.fp16_lm_cross_entropy, + parallel_output=True, + share_embeddings_and_output_weights=not args.untie_embeddings_and_output_weights, + position_embedding_type=args.position_embedding_type, + rotary_percent=args.rotary_percent + ) + return model + + +def model_provider(pre_process=True, post_process=True): + """Build the model. + + Select between two different model classes: + 1. Default model (uses megatron.legacy.models/gpt_model.py). + 2. Core model (uses megatron/core/models/retro/model.py). + """ + + args = get_args() + if not args.use_legacy_models and args.retro_add_retriever: + provider = core_model_provider + else: + provider = default_model_provider + model = provider(pre_process=pre_process, post_process=post_process) + return model + + def get_batch(data_iterator): """Generate a batch""" + args = get_args() - retro_args = get_retro_args() tokenizer = get_tokenizer() + config = get_retro_config() # Items and their type. keys = ['text'] - datatype = torch.int64 - if args.retro_add_retriever: - keys += 'neighbor_tokens', + keys.append('neighbor_tokens') + datatype = torch.int64 # Broadcast data. 
if data_iterator is not None: @@ -49,12 +104,6 @@ def get_batch(data_iterator): labels = tokens_[:, 1:].contiguous() tokens = tokens_[:, :-1].contiguous() - if args.retro_add_retriever: - # note: [bs * l * k, r] - # note: 2x == neighbor, continuation - neighbor_tokens = data_b['neighbor_tokens'] \ - .view(-1, retro_args.retro_gpt_retrieved_length).long() - # Get the masks and postition ids. attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids( tokens, @@ -64,15 +113,22 @@ def get_batch(data_iterator): args.eod_mask_loss) if args.retro_add_retriever: + # note: [bs * l * k, r] + # note: 2x == neighbor, continuation + neighbor_tokens = data_b['neighbor_tokens'] \ + .view(-1, config.retro_retrieved_length).long() _, _, neighbor_position_ids = get_ltor_masks_and_position_ids( neighbor_tokens, tokenizer.eod, args.reset_position_ids, args.reset_attention_mask, args.eod_mask_loss) - neighbor_attention_mask = None + neighbor_attention_mask = get_all_true_mask( + (1, 1, config.retro_retrieved_length, config.retro_retrieved_length), + neighbor_tokens.device) return tokens, labels, loss_mask, attention_mask, position_ids, \ neighbor_tokens, neighbor_attention_mask, neighbor_position_ids + else: return tokens, labels, loss_mask, attention_mask, position_ids @@ -95,29 +151,95 @@ def forward_step(data_iterator, model): None, None, None timers('batch-generator').stop() + # Model call. + if args.use_legacy_models: + forward_kwargs = { + "retriever_input_ids" : neighbor_tokens, + "retriever_position_ids" : neighbor_position_ids, + "retriever_attn_mask" : neighbor_attention_mask, + } + else: + if args.retro_add_retriever: + forward_kwargs = { + "context_input_ids" : neighbor_tokens, + "context_position_ids" : neighbor_position_ids, + "context_mask" : neighbor_attention_mask, + } + else: + forward_kwargs = {} + output_tensor = model(tokens, position_ids, attention_mask, - retriever_input_ids=neighbor_tokens, - retriever_position_ids=neighbor_position_ids, - retriever_attn_mask=neighbor_attention_mask, - labels=labels) + labels=labels, **forward_kwargs) return output_tensor, partial(loss_func, loss_mask) -def train_valid_test_datasets_provider(train_val_test_num_samples): +def train_valid_test_datasets_provider(train_valid_test_num_samples): """Build train, valid, and test datasets.""" args = get_args() + + # Dataset config. + retro_config = get_retro_config() + data_config = MultiSplitGPTDatasetConfig( + random_seed=args.seed, + sequence_length=args.seq_length, + blend=get_blend_from_list(args.data_path), + blend_per_split=[ + get_blend_from_list(args.train_data_path), + get_blend_from_list(args.valid_data_path), + get_blend_from_list(args.test_data_path) + ], + renormalize_blend_weights=args.renormalize_blend_weights, + split=args.split, + split_preprocessing=retro_config.retro_split_preprocessing, + path_to_cache=args.data_cache_path, + return_document_ids=False, + tokenizer=get_tokenizer(), + reset_position_ids=args.reset_position_ids, + reset_attention_mask=args.reset_attention_mask, + eod_mask_loss=args.eod_mask_loss, + ) + + # GPT datasets. + print_rank_0(" > multi-split gpt datasets.") + train_ds, valid_ds, test_ds = BlendedMegatronDatasetBuilder( + MultiSplitGPTDataset, + train_valid_test_num_samples, + is_dataset_built_on_rank, + data_config, + ).build() + + gpt_datasets = { + "train" : (train_ds, train_valid_test_num_samples[0]), + "valid" : (valid_ds, train_valid_test_num_samples[1]), + "test" : (test_ds, train_valid_test_num_samples[2]), + } + + # Retro datasets. 
if args.retro_add_retriever: - return get_retro_datasets() + return get_retro_datasets( + config=retro_config, + gpt_datasets=gpt_datasets, + sample_length=args.seq_length, + eod_token_id=get_tokenizer().eod, + ) + + # Multi-split GPT datasets. else: - return standard_datasets_provider(train_val_test_num_samples) + return ( + gpt_datasets["train"][0], + gpt_datasets["valid"][0], + gpt_datasets["test"][0], + ) if __name__ == "__main__": + # Temporary for transition to core datasets. + train_valid_test_datasets_provider.is_distributed = True + pretrain(train_valid_test_datasets_provider, model_provider, ModelType.retro_decoder, forward_step, - args_defaults={'tokenizer_type': 'GPT2BPETokenizer', - 'retro_add_retriever': True}) + args_defaults={'tokenizer_type': 'GPT2BPETokenizer'}) diff --git a/pretrain_t5.py b/pretrain_t5.py index e3ae4ad0ad..253d4b19c6 100644 --- a/pretrain_t5.py +++ b/pretrain_t5.py @@ -2,26 +2,40 @@ """Pretrain T5""" +from copy import deepcopy from functools import partial +from typing import Union import torch -from megatron import ( +from megatron.training import ( get_args, get_timers, + get_tokenizer, print_rank_0 ) -from megatron.core import tensor_parallel +from megatron.core import mpu, tensor_parallel +from megatron.core.datasets.blended_megatron_dataset_builder import BlendedMegatronDatasetBuilder +from megatron.core.datasets.t5_dataset import ( + T5MaskedWordPieceDataset, + T5MaskedWordPieceDatasetConfig, +) from megatron.core.enums import ModelType -from megatron.data.dataset_utils import build_train_valid_test_datasets -from megatron.model import T5Model +from megatron.core.models.T5 import T5Model from megatron.training import pretrain -from megatron.utils import average_losses_across_data_parallel_group - +from megatron.training.arguments import core_transformer_config_from_args +from megatron.core.datasets.blended_megatron_dataset_builder import BlendedMegatronDatasetBuilder +from megatron.core.datasets.t5_dataset import T5MaskedWordPieceDataset, T5MaskedWordPieceDatasetConfig +from megatron.core.datasets.utils import get_blend_from_list +from megatron.core.models.T5.t5_spec import (get_t5_encoder_with_transformer_engine_block_spec, + get_t5_decoder_with_transformer_engine_block_spec, + get_t5_encoder_with_local_block_spec, + get_t5_decoder_with_local_block_spec) +from megatron.legacy.model import T5Model as LegacyT5Model +from pretrain_gpt import loss_func """ Pipeline parallelism for T5 -=========================== T5 is a model architecture with both encoder and decoder blocks. Consequently, pipeline parallelism is implemented slightly differently @@ -55,25 +69,88 @@ """ -def model_provider(pre_process=True, post_process=True, - add_encoder=True, add_decoder=True): - """Build the model.""" +def model_provider( + pre_process=True, post_process=True, add_encoder=True, add_decoder=True +) -> Union[LegacyT5Model, T5Model]: + """Builds the model. + + Args: + pre_process (bool, optional): Set to true if you need to compute embedings. Defaults to True. + post_process (bool, optional): Set to true if you need to want to compute output logits/loss. Defaults to True. 
+ add_encoder (bool, optional): Defaults to True + add_decoder (bool, optional): Defaults to True + Returns: + T5Model: The returned T5 model + """ + + args = get_args() + + assert ( + args.encoder_tensor_model_parallel_size == 0 or + args.encoder_tensor_model_parallel_size == args.tensor_model_parallel_size + ), f"Because word embeddings are shared between the encoder & decoder, these have to have the same tensor parallel size." + + config = core_transformer_config_from_args(args) + if args.use_legacy_models: + model = LegacyT5Model( + config=config, + num_tokentypes=0, + parallel_output=True, + pre_process=pre_process, + post_process=post_process, + add_encoder=add_encoder, + add_decoder=add_decoder, + ) + else: + encoder_config = deepcopy(config) + encoder_config.num_layers = args.encoder_num_layers + + if args.pipeline_model_parallel_size > 1: + assert args.encoder_pipeline_model_parallel_size > 0, "Need to know how to shard the encoder & decoder." + + if args.encoder_pipeline_model_parallel_size > 0: + encoder_config.pipeline_model_parallel_size = args.encoder_pipeline_model_parallel_size + + encoder_layers_per_pipeline = encoder_config.num_layers // encoder_config.pipeline_model_parallel_size + decoder_layers_per_pipeline = config.num_layers // config.pipeline_model_parallel_size + + if args.transformer_impl == "local": + en_block_spec = get_t5_encoder_with_local_block_spec(encoder_layers_per_pipeline) + de_block_spec = get_t5_decoder_with_local_block_spec(decoder_layers_per_pipeline) + elif args.transformer_impl == "transformer_engine": + en_block_spec = get_t5_encoder_with_transformer_engine_block_spec( + encoder_layers_per_pipeline + ) + de_block_spec = get_t5_decoder_with_transformer_engine_block_spec( + decoder_layers_per_pipeline + ) + + print_rank_0('building T5 model ...') + model = T5Model( + config=config, + encoder_config=encoder_config, + transformer_encoder_layer_spec=en_block_spec, + transformer_decoder_layer_spec=de_block_spec, + vocab_size=args.padded_vocab_size, + max_sequence_length=args.max_position_embeddings, + pre_process=pre_process, + post_process=post_process, + fp16_lm_cross_entropy=args.fp16_lm_cross_entropy, + parallel_output=True, + share_embeddings_and_output_weights=not args.untie_embeddings_and_output_weights, + position_embedding_type=args.position_embedding_type, + rotary_percent=args.rotary_percent, + add_encoder=add_encoder, + add_decoder=add_decoder + ) - print_rank_0('building T5 model ...') - model = T5Model(num_tokentypes=0, - parallel_output=True, - pre_process=pre_process, - post_process=post_process, - add_encoder=add_encoder, - add_decoder=add_decoder) return model def get_batch(data_iterator): """Build the batch.""" - keys = ['text_enc', 'text_dec', 'labels', 'loss_mask', - 'enc_mask', 'dec_mask', 'enc_dec_mask'] + keys = ['text_enc', 'text_dec', 'labels', 'loss_mask', 'enc_mask', 'dec_mask', 'enc_dec_mask'] datatype = torch.int64 # Broadcast data. 
@@ -89,72 +166,134 @@ def get_batch(data_iterator): labels = data_b['labels'].long() loss_mask = data_b['loss_mask'].float() - enc_mask = (data_b['enc_mask'] < 0.5) - dec_mask = (data_b['dec_mask'] < 0.5) - enc_dec_mask = (data_b['enc_dec_mask'] < 0.5) - - return tokens_enc, tokens_dec, loss_mask, labels, \ - enc_mask, dec_mask, enc_dec_mask - + enc_mask = data_b['enc_mask'] < 0.5 + dec_mask = data_b['dec_mask'] < 0.5 + enc_dec_mask = data_b['enc_dec_mask'] < 0.5 -def loss_func(loss_mask, output_tensor): - lm_loss_ = output_tensor.float() - lm_loss = torch.sum( - lm_loss_.view(-1) * loss_mask.reshape(-1)) / loss_mask.sum() + return tokens_enc, tokens_dec, loss_mask, labels, enc_mask, dec_mask, enc_dec_mask - loss = lm_loss - averaged_losses = average_losses_across_data_parallel_group([lm_loss]) - return loss, {'lm loss': averaged_losses[0]} +def forward_step(data_iterator, model: T5Model): + """Forward training step. + Args: + data_iterator : Input data iterator + model (T5Model): The T5 Model + """ -def forward_step(data_iterator, model): - """Forward step.""" args = get_args() timers = get_timers() # Get the batch. timers('batch generator', log_level=2).start() - tokens_enc, tokens_dec, loss_mask, lm_labels, enc_mask, dec_mask, enc_dec_mask \ - = get_batch(data_iterator) + tokens_enc, tokens_dec, loss_mask, lm_labels, enc_mask, dec_mask, enc_dec_mask = get_batch( + data_iterator + ) timers('batch generator').stop() # Forward model lm_labels - output_tensor = model(tokens_enc, - tokens_dec, - enc_mask, - dec_mask, - enc_dec_mask, - tokentype_ids=None, - lm_labels=lm_labels) + output_tensor = model( + tokens_enc, tokens_dec, enc_mask, dec_mask, enc_dec_mask, lm_labels=lm_labels + ) return output_tensor, partial(loss_func, loss_mask) -def train_valid_test_datasets_provider(train_val_test_num_samples): - """Build train, valid, and test datasets.""" +def train_valid_test_datasets_provider(train_val_test_num_samples: int): + """Build the train test and validation datasets. + + Args: + train_val_test_num_samples : A list containing the number of samples in train test and validation. 
+ """ args = get_args() - print_rank_0('> building train, validation, and test datasets ' - 'for T5 ...') - train_ds, valid_ds, test_ds = build_train_valid_test_datasets( - data_prefix=args.data_path, - data_impl=args.data_impl, - splits_string=args.split, - train_valid_test_num_samples=train_val_test_num_samples, - max_seq_length=args.encoder_seq_length, - max_seq_length_dec=args.decoder_seq_length, - masked_lm_prob=args.mask_prob, - short_seq_prob=args.short_seq_prob, - seed=args.seed, - skip_warmup=(not args.mmap_warmup), - dataset_type='t5') + tokenizer = get_tokenizer() + + config = T5MaskedWordPieceDatasetConfig( + random_seed=args.seed, + sequence_length=args.encoder_seq_length, + sequence_length_decoder=args.decoder_seq_length, + blend=get_blend_from_list(args.data_path), + blend_per_split=[ + get_blend_from_list(args.train_data_path), + get_blend_from_list(args.valid_data_path), + get_blend_from_list(args.test_data_path) + ], + renormalize_blend_weights=args.renormalize_blend_weights, + split=args.split, + path_to_cache=args.data_cache_path, + tokenizer=tokenizer, + masking_probability=args.mask_prob, + short_sequence_probability=args.short_seq_prob, + masking_max_ngram=10, + masking_do_full_word=True, + masking_do_permutation=False, + masking_use_longer_ngrams=False, + masking_use_geometric_distribution=True, + ) + + print_rank_0('> building train, validation, and test datasets for T5 ...') + + train_ds, valid_ds, test_ds = BlendedMegatronDatasetBuilder( + T5MaskedWordPieceDataset, + train_val_test_num_samples, + lambda: mpu.get_tensor_model_parallel_rank() == 0, + config, + ).build() + print_rank_0("> finished creating T5 datasets ...") return train_ds, valid_ds, test_ds +def t5_embedding_ranks(pp_ranks): + """T5's embedding ranks consist of the encoder's first rank, and the decoder's first & last ranks. + Args: + pp_ranks: A list of global ranks that constitute a pipeline group. + """ + args = get_args() + + first_rank = pp_ranks[0] + last_rank = pp_ranks[-1] + + # encoder size is also the index to the first rank of the decoder. + epp = args.encoder_pipeline_model_parallel_size + + if len(pp_ranks) == 1: + return [first_rank] + elif pp_ranks[epp] not in (first_rank, last_rank): + return [first_rank, pp_ranks[epp], last_rank] + else: + return [first_rank, last_rank] + + +def t5_position_embedding_ranks(pp_ranks): + """T5's positional embeddings are the encoder & decoder first rank stages + Args: + pp_ranks: A list of global ranks that constitute a pipeline group. + """ + args = get_args() + + # encoder size is also the index to the first rank of the decoder. 
+ epp = args.encoder_pipeline_model_parallel_size + + if len(pp_ranks) == 1 or pp_ranks[0] == pp_ranks[epp]: + return [pp_ranks[0]] + else: + return [pp_ranks[0], pp_ranks[epp]] + + if __name__ == "__main__": - pretrain(train_valid_test_datasets_provider, model_provider, ModelType.encoder_and_decoder, - forward_step, args_defaults={'tokenizer_type': 'BertWordPieceLowerCase'}) + # Temporary for transition to core datasets + train_valid_test_datasets_provider.is_distributed = True + + pretrain( + train_valid_test_datasets_provider, + model_provider, + ModelType.encoder_and_decoder, + forward_step, + args_defaults={'tokenizer_type': 'BertWordPieceLowerCase'}, + get_embedding_ranks=t5_embedding_ranks, + get_position_embedding_ranks=t5_position_embedding_ranks, + ) diff --git a/pretrain_vision_classify.py b/pretrain_vision_classify.py index b5798482d2..8d9b28baeb 100644 --- a/pretrain_vision_classify.py +++ b/pretrain_vision_classify.py @@ -5,23 +5,25 @@ import torch import torch.nn.functional as F from functools import partial -from megatron import get_args, get_timers, print_rank_0 +from megatron.training import get_args, get_timers, print_rank_0 from megatron.core.enums import ModelType -from megatron.data.vit_dataset import build_train_valid_datasets -from megatron.model.vision.classification import VitClassificationModel -from megatron.model.vision.classification import MitClassificationModel +from megatron.legacy.data.vit_dataset import build_train_valid_datasets +from megatron.legacy.model.vision.classification import VitClassificationModel +from megatron.legacy.model.vision.classification import MitClassificationModel from megatron.training import pretrain -from megatron.utils import average_losses_across_data_parallel_group +from megatron.training.utils import average_losses_across_data_parallel_group +from megatron.training.arguments import core_transformer_config_from_args def model_provider(pre_process=True, post_process=True): """Build the model.""" args = get_args() - + config = core_transformer_config_from_args(args) if args.vision_backbone_type == 'vit': print_rank_0("building VIT model ...") - model = VitClassificationModel(num_classes=args.num_classes, + model = VitClassificationModel(config=config, + num_classes=args.num_classes, pre_process=pre_process, post_process=post_process) elif args.vision_backbone_type == 'mit': diff --git a/pretrain_vision_dino.py b/pretrain_vision_dino.py index ed96715bb4..f75280c42d 100644 --- a/pretrain_vision_dino.py +++ b/pretrain_vision_dino.py @@ -6,20 +6,19 @@ import numpy as np import torch.distributed as dist from functools import partial -from megatron import get_args, get_timers, print_rank_0 +from megatron.training import get_args, get_timers, print_rank_0 from megatron.core.enums import ModelType -from megatron.data.vit_dataset import build_train_valid_datasets -from megatron.model.vision.dino import DINOPretrainModel -from megatron.model.vision.knn_monitor import knn_predict, get_feature_bank +from megatron.legacy.data.vit_dataset import build_train_valid_datasets +from megatron.legacy.model.vision.dino import DINOPretrainModel +from megatron.legacy.model.vision.knn_monitor import knn_predict, get_feature_bank from megatron.training import pretrain -from megatron.utils import average_losses_across_data_parallel_group, unwrap_model -from torch.nn.parallel.distributed import DistributedDataParallel as torchDDP -from megatron.model import DistributedDataParallel as LocalDDP -from megatron.model import Float16Module +from 
megatron.training.utils import average_losses_across_data_parallel_group, unwrap_model +from megatron.training.arguments import core_transformer_config_from_args def model_provider(pre_process=True, post_process=True): """Build the model.""" - return DINOPretrainModel(pre_process=pre_process, post_process=post_process) + config = core_transformer_config_from_args(get_args()) + return DINOPretrainModel(config, pre_process=pre_process, post_process=post_process) def get_batch(data_iterator): """Build the batch.""" @@ -37,11 +36,8 @@ def get_batch(data_iterator): def loss_func(model, labels, output_tensor, collect_data=False): args = get_args() - - model = unwrap_model( - model, - (torchDDP, LocalDDP, Float16Module) - ) + + model = unwrap_model(model) if model.training: student_output, teacher_output = output_tensor loss = model.dino_loss(student_output, teacher_output, args.curr_iteration) @@ -98,6 +94,7 @@ def train_valid_test_datasets_provider(train_val_test_num_samples): if __name__ == "__main__": + pretrain( train_valid_test_datasets_provider, model_provider, diff --git a/pretrain_vision_inpaint.py b/pretrain_vision_inpaint.py index 783ad7f4b2..8570baab5b 100644 --- a/pretrain_vision_inpaint.py +++ b/pretrain_vision_inpaint.py @@ -5,23 +5,27 @@ import torch import torch.nn.functional as F from functools import partial -from megatron import get_args, get_timers, print_rank_0, print_rank_last +from megatron.training import get_args, get_timers, print_rank_0, print_rank_last from megatron.core.enums import ModelType -from megatron.data.vit_dataset import build_train_valid_datasets -from megatron.model.vision.inpainting import VitInpaintingModel -from megatron.model.vision.inpainting import MitInpaintingModel +from megatron.legacy.data.vit_dataset import build_train_valid_datasets +from megatron.legacy.model.vision.inpainting import VitInpaintingModel +from megatron.legacy.model.vision.inpainting import MitInpaintingModel from megatron.training import pretrain -from megatron.utils import average_losses_across_data_parallel_group -from tasks.vision.metrics import SSIM, PSNR +from megatron.training.utils import average_losses_across_data_parallel_group +from tasks.vision.segmentation.metrics import SSIM, PSNR +from megatron.training.arguments import core_transformer_config_from_args def model_provider(pre_process=True, post_process=True): """Build the model.""" args = get_args() + config = core_transformer_config_from_args(args) if args.vision_backbone_type == 'vit': - model = VitInpaintingModel(pre_process=pre_process, + model = VitInpaintingModel(config=config, + pre_process=pre_process, post_process=post_process) elif args.vision_backbone_type == 'mit': - model = MitInpaintingModel(pre_process=pre_process, + model = MitInpaintingModel(config=config, + pre_process=pre_process, post_process=post_process) else: raise Exception('{} vision backbone is not supported.'.format( @@ -39,7 +43,7 @@ def get_batch(data_iterator): return images, masks -def loss_func(images, masks, masked_images, outputs, collect_data=False): +def loss_func(images, masks, masked_images, outputs, non_loss_data=False): outputs = outputs.contiguous().float() masks_flip = 1-masks flip_masked_outputs = outputs.masked_fill(masks_flip.bool(), 0) @@ -48,7 +52,7 @@ def loss_func(images, masks, masked_images, outputs, collect_data=False): ssim_fun = SSIM() psnr_fun = PSNR() - if not collect_data: + if not non_loss_data: mask_count = torch.count_nonzero(masks) loss = F.mse_loss( flip_masked_outputs, diff --git a/pretrain_vlm.py 
b/pretrain_vlm.py new file mode 100644 index 0000000000..5ad0bda695 --- /dev/null +++ b/pretrain_vlm.py @@ -0,0 +1,394 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +"""Pretrain vision language model.""" +from copy import deepcopy +from functools import partial +import warnings + +import torch + +from megatron.core import parallel_state, tensor_parallel +from megatron.core.datasets.blended_megatron_dataset_builder import BlendedMegatronDatasetBuilder +from megatron.core.datasets.multimodal_dataset import MockMultimodalDataset, MultimodalDatasetConfig +from megatron.core.enums import ModelType +from megatron.core.models.vision.clip_vit_model import get_num_image_embeddings +from megatron.core.transformer.enums import AttnMaskType +from megatron.core.models.multimodal.llava_model import LLaVAModel, IMAGE_TOKEN_INDEX +from megatron.core.models.multimodal.llava_spec import ( + decoder_model_with_transformer_engine_default_spec, + decoder_model_with_local_default_spec, +) +from megatron.core.models.vision.vit_layer_specs import ( + get_vit_layer_with_transformer_engine_spec, + get_vit_layer_with_local_spec, +) +from megatron.core.transformer.spec_utils import import_module +from megatron.training import get_args, get_timers, get_tokenizer, pretrain, print_rank_0 +from megatron.training.arguments import core_transformer_config_from_args +from pretrain_gpt import loss_func + + +def model_provider( + pre_process=True, post_process=True, add_encoder=True, add_decoder=True, parallel_output=True +) -> LLaVAModel: + """Builds the model. + + Note: currently, only LLaVA model is supported. Follow-up changes will make this configurable. + + Args: + pre_process (bool): Include the embedding layer in the gpt decoder (used with pipeline parallelism). Defaults to True. + post_process (bool): Include an output layer and a layernorm in the gpt decoder (used with pipeline parallelism). Defaults to True. + add_encoder (bool): Construct the encoder module (used with pipeline parallelism). Defaults to True. When we use pipelining, the encoder + will live on only a subset of the pipeline stages (specifically, only the first stage). + add_decoder (bool): Construct the decoder module (used with pipeline parallelism). Defaults to True. When we use pipelining, the decoder + will live on only a subset of the pipeline stages (specifically, every stage after the first one). + parallel_output (bool): Enable model parallel output. + + Returns: + model (megatron.core.models.multimodal.llava_model.LLaVAModel): A multimodal model + """ + args = get_args() + vision_model_type = "clip" + + num_image_embeddings = get_num_image_embeddings( + args.img_h, args.img_w, args.patch_dim, vision_model_type, args.disable_vision_class_token, 1 + ) + + old_seq_length = args.seq_length + # dataloader-seq-length is required to determine the length of text seq len + if args.dataloader_seq_length is None: + args.dataloader_seq_length = args.seq_length + + # decoder_seq_length denotes the language model sequence length. + decoder_seq_len = args.seq_length + num_image_embeddings + + # seq_length and encoder_seq_length denote the vision model sequence length. Override if the user provided something else. 
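To illustrate the sequence-length bookkeeping above, a rough back-of-the-envelope example follows; the image size, patch size, and text length are assumptions rather than values taken from this change, and the exact count comes from `get_num_image_embeddings`.

```python
# Rough illustration of the sequence-length arithmetic (assumed values).
img_h = img_w = 336          # hypothetical CLIP-style input resolution
patch_dim = 14
class_token = 1              # dropped when --disable-vision-class-token is set

# get_num_image_embeddings is roughly one embedding per patch, plus the class token.
num_image_embeddings = (img_h // patch_dim) * (img_w // patch_dim) + class_token  # 577

text_seq_length = 1024       # args.seq_length before the override below
decoder_seq_len = text_seq_length + num_image_embeddings                           # 1601
```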
+ args.seq_length = args.encoder_seq_length = num_image_embeddings + if torch.distributed.get_rank() == 0 and old_seq_length != args.seq_length: + warnings.warn( + f"Changed seq_length and encoder_seq_length (vision model sequence length) from {old_seq_length} to num_image_tokens ({num_image_embeddings})" + ) + #Padding to multiple of 64 when using sequence parallel + sp_padding_needed = 0 + tp_size = args.tensor_model_parallel_size + if args.sequence_parallel: + assert args.transformer_impl == "transformer_engine", \ + "TransformerEngine is needed to support Sequence Parallelism implementation" + if not args.decoder_tp_comm_overlap: + args.decoder_seq_length = decoder_seq_len + sp_padding_needed = int((args.decoder_seq_length + (tp_size-1)) // tp_size * tp_size) - args.decoder_seq_length + if sp_padding_needed > 0: + args.decoder_seq_length += sp_padding_needed + else: + # If TP Comm Overlap is enabled for LM backbone, + # user needs to provide decoder_seq_length with any potential padding needed + assert args.decoder_seq_length is not None, \ + "Please provide --decoder-seq-length when using TP Comm overlap for LM backbone" + sp_padding_needed = args.decoder_seq_length - decoder_seq_len + else: + args.decoder_seq_length = decoder_seq_len + + args.max_position_embeddings = max(args.max_position_embeddings, args.decoder_seq_length) + + print_rank_0('building a multimodal model ...') + language_transformer_config = core_transformer_config_from_args(get_args()) + if args.decoder_tp_comm_overlap: + assert args.transformer_impl == "transformer_engine", \ + "TransformerEngine is needed to support Decoder TP Comm overlap" + language_transformer_config.tp_comm_overlap = args.decoder_tp_comm_overlap + + if args.spec is not None: + language_transformer_layer_spec = import_module(args.spec) + elif args.transformer_impl == "transformer_engine": + language_transformer_layer_spec = decoder_model_with_transformer_engine_default_spec( + args.num_experts, args.moe_grouped_gemm + ) + else: # transformer_impl == "local" + language_transformer_layer_spec = decoder_model_with_local_default_spec( + args.num_experts, args.moe_grouped_gemm + ) + + if sp_padding_needed > 0: + if language_transformer_layer_spec.submodules.self_attention.params.get('attn_mask_type', '') == AttnMaskType.causal: + language_transformer_layer_spec.submodules.self_attention.params['attn_mask_type'] = AttnMaskType.padding_causal + elif language_transformer_layer_spec.submodules.self_attention.params.get('attn_mask_type', '') == AttnMaskType.no_mask: + language_transformer_layer_spec.submodules.self_attention.params['attn_mask_type'] = AttnMaskType.padding + + if args.transformer_impl == "transformer_engine": + vision_transformer_layer_spec = get_vit_layer_with_transformer_engine_spec() + else: # transformer_impl == "local" + vision_transformer_layer_spec = get_vit_layer_with_local_spec() + + # TODO: Make these configurable via input .yaml config. + vision_transformer_config = deepcopy(language_transformer_config) + vision_transformer_config.num_layers = args.encoder_num_layers + vision_transformer_config.first_pipeline_num_layers = None + vision_transformer_config.last_pipeline_num_layers = None + vision_transformer_config.vision_model_type = vision_model_type + if vision_transformer_config.sequence_parallel: + print_rank_0("> Disabling Sequence parallelism in Vision Transformer. 
Not yet supported") + vision_transformer_config.sequence_parallel = False + if vision_transformer_config.tp_comm_overlap: + print_rank_0("> Disabling TP Comm overlap in Vision Transformer. Not yet supported") + vision_transformer_config.tp_comm_overlap = False + + vision_projection_type = "mlp" + vision_projection_config = deepcopy(language_transformer_config) + if vision_projection_config.sequence_parallel: + print_rank_0("> Disabling Sequence parallelism in Vision Projection. Not yet supported") + vision_projection_config.sequence_parallel = False + if vision_projection_config.tp_comm_overlap: + print_rank_0("> Disabling TP Comm overlap in Vision Projection. Not yet supported") + vision_projection_config.tp_comm_overlap = False + + if args.encoder_pipeline_model_parallel_size > 0: + assert ( + args.encoder_pipeline_model_parallel_size == 1 + ), "ViT can only live on 1 pipeline stage." + vision_transformer_config.pipeline_model_parallel_size = ( + args.encoder_pipeline_model_parallel_size + ) + vision_projection_config.pipeline_model_parallel_size = ( + args.encoder_pipeline_model_parallel_size + ) + if args.encoder_tensor_model_parallel_size > 0: + vision_transformer_config.tensor_model_parallel_size = ( + args.encoder_tensor_model_parallel_size + ) + vision_projection_config.tensor_model_parallel_size = ( + args.encoder_tensor_model_parallel_size + ) + + vision_projection_modules = deepcopy(language_transformer_layer_spec.submodules.mlp.submodules) + + if args.virtual_pipeline_model_parallel_size: + raise NotImplementedError("virtual pipeline model parallelism is not supported yet.") + + model = LLaVAModel( + language_transformer_config=language_transformer_config, + language_transformer_layer_spec=language_transformer_layer_spec, + language_vocab_size=args.padded_vocab_size, + language_max_sequence_length=args.decoder_seq_length, + vision_transformer_config=vision_transformer_config, + vision_transformer_layer_spec=vision_transformer_layer_spec, + drop_vision_class_token=args.disable_vision_class_token, + vision_projection_config=vision_projection_config, + vision_projection_layer_spec=vision_projection_modules, + vision_projection_type=vision_projection_type, + parallel_output=parallel_output, + language_position_embedding_type=args.position_embedding_type, + language_rotary_percent=args.rotary_percent, + language_rope_scaling=args.use_rope_scaling, + pre_process=pre_process, + post_process=post_process, + add_encoder=add_encoder, + add_decoder=add_decoder, + img_h=args.img_h, + img_w=args.img_w, + patch_dim=args.patch_dim, + ) + + model.freeze( + freeze_language_model=args.freeze_LM, + freeze_vision_model=args.freeze_ViT, + freeze_vision_projection=False, + ) + + return model + + +def train_valid_test_datasets_provider(train_val_test_num_samples): + """Build the train test and validation datasets. + + Args: + train_val_test_num_samples : A list containing the number of samples in train, validation, and test sets. + + Returns: + train_ds, val_ds, test_ds (megatron.core.datasets.multimodal_dataset.MockMultimodalDataset): Train, validation, and test datasets, respectively. 
+ """ + args = get_args() + + config = MultimodalDatasetConfig( + random_seed=args.seed, + split=args.split, + sequence_length=args.dataloader_seq_length, + tokenizer=get_tokenizer(), + reset_position_ids=args.reset_position_ids, + reset_attention_mask=args.reset_attention_mask, + eod_mask_loss=args.eod_mask_loss, + image_h=args.img_h, + image_w=args.img_w, + preprocess_func=_preprocess_data_for_llava, + ) + + print_rank_0("> building train, validation, and test datasets for multimodal ...") + + train_ds, valid_ds, test_ds = BlendedMegatronDatasetBuilder( + MockMultimodalDataset, + train_val_test_num_samples, + lambda: parallel_state.get_tensor_model_parallel_rank() == 0, + config, + ).build() + + print_rank_0("> finished creating multimodal datasets ...") + + return train_ds, valid_ds, test_ds + + +def _preprocess_data_for_llava(data): + """Preprocess data sample to the format expected by a LLaVA model. + + Note: This doesn't support all the different modes in the official LLaVA repo yet. + + Args: + data (dict): Data sample with keys like 'image', 'tokens', etc. + + Returns: + data (dict): Processed data sample suitable for the model. + """ + # Prepend image token index to tokens. + data["tokens"] = torch.cat( + [ + IMAGE_TOKEN_INDEX + * torch.ones(1, dtype=data["tokens"].dtype, device=data["tokens"].device), + data["tokens"], + ] + ) + # Prepend labels accordingly. + data["labels"] = torch.cat([data["tokens"][1].unsqueeze(0), data["labels"]]) + # Zero loss mask for the image token index. + data["loss_mask"] = torch.cat( + [ + torch.zeros(1, dtype=data["loss_mask"].dtype, device=data["loss_mask"].device), + data["loss_mask"], + ] + ) + # Add one more position id. + data["position_ids"] = torch.cat( + [data["position_ids"], data["position_ids"][-1].unsqueeze(0) + 1] + ) + + return data + + +def get_batch(data_iterator): + """Generate a batch. + + Args: + data_iterator: Iterable dataset. + + Returns: + sample: A data sample with images, tokens, etc. + """ + # Broadcast data. + if data_iterator is not None: + data = next(data_iterator) + else: + data = None + + data_i = tensor_parallel.broadcast_data(["tokens", "position_ids", "labels"], data, torch.int64) + data_f = tensor_parallel.broadcast_data(["image", "loss_mask"], data, torch.float32) + + tokens = data_i["tokens"].long() + position_ids = data_i["position_ids"].long() + labels = data_i["labels"].long() + images = data_f["image"].float() + loss_mask = data_f["loss_mask"].float() + attention_mask = None # Use the attention mask type defined in layer spec. Typically no mask for the vision model and causal mask for the vision model. + + return tokens, position_ids, labels, images, loss_mask, attention_mask + + +def forward_step(data_iterator, model: LLaVAModel): + """Forward training step. + + Args: + data_iterator: Iterable dataset. + model (megatron.core.models.multimodal.llava_model.LLaVAModel): Multimodal model + + Returns: + output_tensor (torch.Tensor): Loss of shape [b, s] if labels are provided, otherwise logits of shape [b, s, vocab_size]. + loss_func (callable): Loss function with a loss mask specified. + """ + timers = get_timers() + + # Get the batch. 
+ timers('batch-generator', log_level=2).start() + tokens, position_ids, labels, images, loss_mask, attention_mask = get_batch(data_iterator) + timers('batch-generator').stop() + + output_tensor, loss_mask = model( + images, tokens, position_ids, attention_mask, labels, loss_mask + ) + + return output_tensor, partial(loss_func, loss_mask) + + +def add_vlm_extra_args(parser): + """Extra arguments.""" + group = parser.add_argument_group(title='vision language model specific arguments') + group.add_argument( + '--freeze-LM', action='store_true', default=False, help="Freeze language model weights" + ) + group.add_argument( + '--freeze-ViT', action='store_true', default=False, help="Freeze vision model (ViT) weights" + ) + group.add_argument( + "--disable-vision-class-token", + action="store_true", + default=False, + help="Drop vision model class token", + ) + group.add_argument("--dataloader-seq-length", type=int, help="Make the dataloader produce sequences of a specific length.") + group.add_argument("--decoder-tp-comm-overlap", action="store_true", default=False, help="Enables the overlap of " + "Tensor parallel communication and GEMM kernels in the decoder only. " + "Please provide decoder-seq-length when using this feature.") + return parser + + +def llava_embedding_ranks(pp_ranks): + """LLaVA's embedding ranks consist of the decoder's first and last ranks (i.e., the ViT has no embeddings). + Args: + pp_ranks: A list of global ranks that constitute a pipeline group. + """ + args = get_args() + + # encoder size is also the index to the first rank of the decoder. + epp = args.encoder_pipeline_model_parallel_size + + last_rank = pp_ranks[-1] + if len(pp_ranks) == 1 or pp_ranks[epp] == last_rank: + return [last_rank] + else: + return [pp_ranks[epp], last_rank] + + +def llava_position_embedding_ranks(pp_ranks): + """LLaVA's position embedding ranks consist of the singular rank of the model or the decoder's first rank. + Args: + pp_ranks: A list of global ranks that constitute a pipeline group. + """ + args = get_args() + + # encoder size is also the index to the first rank of the decoder. + epp = args.encoder_pipeline_model_parallel_size + + last_rank = pp_ranks[-1] + if len(pp_ranks) == 1: + return [last_rank] + else: + return [pp_ranks[epp]] + + +if __name__ == "__main__": + train_valid_test_datasets_provider.is_distributed = True + + pretrain( + train_valid_test_datasets_provider, + model_provider, + ModelType.encoder_and_decoder, + forward_step, + args_defaults={'tokenizer_type': 'GPT2BPETokenizer'}, + extra_args_provider=add_vlm_extra_args, + get_embedding_ranks=llava_embedding_ranks, + get_position_embedding_ranks=llava_position_embedding_ranks, + ) diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000000..a4fb32980d --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,75 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
+ +[build-system] +requires = [ + "setuptools", + "pybind11", +] + +[project] +name = "megatron-core" +dynamic = ["dependencies", "version"] +description = "Megatron Core - a library for efficient and scalable training of transformer based models" +readme = "README.md" +license = {file = "LICENSE"} +authors = [{ name = "NVIDIA", email = "nemo-toolkit@nvidia.com" }] +maintainers = [{ name = "NVIDIA", email = "nemo-toolkit@nvidia.com" }] +keywords = [ + "NLP", + "NLU", + "deep", + "gpu", + "language", + "learning", + "machine", + "nvidia", + "pytorch", + "torch", + "transformer", +] +classifiers = [ + "Development Status :: 5 - Production/Stable", + "Environment :: Console", + "Intended Audience :: Developers", + "Intended Audience :: Information Technology", + "Intended Audience :: Science/Research", + "License :: OSI Approved :: BSD License", + "Natural Language :: English", + "Operating System :: OS Independent", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Topic :: Scientific/Engineering :: Artificial Intelligence", + "Topic :: Scientific/Engineering :: Image Recognition", + "Topic :: Scientific/Engineering :: Mathematics", + "Topic :: Scientific/Engineering", + "Topic :: Software Development :: Libraries :: Python Modules", + "Topic :: Software Development :: Libraries", + "Topic :: Utilities", +] + +[tool.setuptools.dynamic] +dependencies = { file = ["megatron/core/requirements.txt"] } + +[project.urls] +Download = "https://github.com/NVIDIA/Megatron-LM/releases" +Homepage = "https://github.com/NVIDIA/Megatron-LM/megatron/core" + +[tool.isort] +profile = "black" # black-compatible +line_length = 100 # should match black parameters +py_version = 310 # python 3.10 as a target version +known_first_party = ["megatron"] # FIRSTPARTY section +known_third_party = ["transformer_engine"] # THIRDPARTY section +sections = ["FUTURE", "STDLIB", "THIRDPARTY", "FIRSTPARTY", "LOCALFOLDER"] +default_section = "THIRDPARTY" +extend_skip = ["setup.py"] + +[tool.black] +line_length = 100 +skip_string_normalization = true +# recognized by future versions; disallows reformatting code with incompatible versions +# Matches the NeMo version so people working on both codebases don't need two different versions of black installed +required_version = "24" +skip_magic_trailing_comma = true \ No newline at end of file diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 0000000000..c75f3b9fa4 --- /dev/null +++ b/pytest.ini @@ -0,0 +1,4 @@ +# content of pytest.ini +[pytest] +markers = + internal: mark a test as a test of private/internal functions.
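For reference, the marker registered above is applied and selected like this (the test name and file path are illustrative only):

```python
# tests/unit_tests/test_example.py -- illustrative only
import pytest

@pytest.mark.internal          # marker registered in pytest.ini above
def test_private_helper():
    assert 1 + 1 == 2

# Select or exclude such tests from the command line:
#   pytest -m internal          # run only internal tests
#   pytest -m "not internal"    # run everything else
```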
\ No newline at end of file diff --git a/setup.py b/setup.py index b0bf3c1b85..adb00629ac 100644 --- a/setup.py +++ b/setup.py @@ -1,10 +1,10 @@ -from setuptools import setup, find_packages - """Setup for pip package.""" import importlib.util -import os +import subprocess + import setuptools +from setuptools import Extension spec = importlib.util.spec_from_file_location('package_info', 'megatron/core/package_info.py') package_info = importlib.util.module_from_spec(spec) @@ -23,28 +23,20 @@ __version__ = package_info.__version__ -if os.path.exists('megatron/core/README.md'): - with open("megatron/core/README.md", "r", encoding='utf-8') as fh: - long_description = fh.read() - long_description_content_type = "text/markdown" - -else: - long_description = 'See ' + __homepage__ - long_description_content_type = "text/plain" - +with open("megatron/core/README.md", "r", encoding='utf-8') as fh: + long_description = fh.read() +long_description_content_type = "text/markdown" ############################################################################### -# Dependency Loading # +# Extension Making # # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% # -def req_file(filename, folder="megatron/core"): - with open(os.path.join(folder, filename), encoding='utf-8') as f: - content = f.readlines() - # you may also want to remove whitespace characters - # Example: `\n` at the end of each line - return [x.strip() for x in content] - -install_requires = req_file("requirements.txt") +extra_compile_args = ( + subprocess.check_output(["python3", "-m", "pybind11", "--includes"]) + .decode("utf-8") + .strip() + .split() +) ############################################################################### @@ -101,11 +93,17 @@ def req_file(filename, folder="megatron/core"): 'Natural Language :: English', 'Operating System :: OS Independent', ], - packages=['megatron.core', 'megatron.core.pipeline_parallel', 'megatron.core.tensor_parallel'], - install_requires=install_requires, - + packages=setuptools.find_namespace_packages(include=["megatron.core", "megatron.core.*"]), + ext_modules=[ + Extension( + "megatron.core.datasets.helpers", + sources=["megatron/core/datasets/helpers.cpp"], + language="c++", + extra_compile_args=extra_compile_args, + ) + ], # Add in any packaged data. include_package_data=True, # PyPI package information. keywords=__keywords__, -) \ No newline at end of file +) diff --git a/tasks/eval_utils.py b/tasks/eval_utils.py index 6b29db345f..6d5d4f3d03 100644 --- a/tasks/eval_utils.py +++ b/tasks/eval_utils.py @@ -8,8 +8,8 @@ import torch -from megatron import get_args -from megatron import print_rank_last, is_last_rank +from megatron.training import get_args +from megatron.training import print_rank_last, is_last_rank from megatron.core import mpu from megatron.schedules import get_forward_backward_func from tasks.finetune_utils import build_data_loader @@ -111,7 +111,7 @@ def loss_func(output_predictions, labels, output_tensor): def correct_answers_forward_step(batch, model): try: batch_ = next(batch) - except BaseException: + except Exception: batch_ = batch tokens, types, labels, attention_mask = process_batch(batch_) @@ -159,7 +159,7 @@ def correct_answers_forward_step(batch, model): # Reduce. 
if mpu.is_pipeline_last_stage(): - unreduced = torch.cuda.LongTensor([correct, total]) + unreduced = torch.tensor([correct, total], dtype=torch.long, device='cuda') torch.distributed.all_reduce(unreduced, group=mpu.get_data_parallel_group()) diff --git a/tasks/finetune_utils.py b/tasks/finetune_utils.py index b468ca8d20..4b48f23890 100644 --- a/tasks/finetune_utils.py +++ b/tasks/finetune_utils.py @@ -6,20 +6,21 @@ import sys import torch -from megatron import get_args, get_num_microbatches -from megatron import print_rank_0 -from megatron import get_timers +from megatron.training import get_args +from megatron.core.num_microbatches_calculator import get_num_microbatches +from megatron.training import print_rank_0 +from megatron.training import get_timers from megatron.core import mpu from megatron.core.enums import ModelType -from megatron.checkpointing import load_checkpoint -from megatron.checkpointing import save_checkpoint -from megatron.training import evaluate_and_print_results -from megatron.training import setup_model_and_optimizer -from megatron.training import train_step -from megatron.training import training_log -from megatron.utils import average_losses_across_data_parallel_group -from megatron.utils import calc_params_l2_norm -from megatron.utils import check_adlr_autoresume_termination +from megatron.training.checkpointing import load_checkpoint +from megatron.training.checkpointing import save_checkpoint +from megatron.training.training import evaluate_and_print_results +from megatron.training.training import setup_model_and_optimizer +from megatron.training.training import train_step +from megatron.training.training import training_log +from megatron.training.utils import average_losses_across_data_parallel_group +from megatron.training.utils import calc_params_l2_norm +from megatron.training.utils import check_adlr_autoresume_termination def process_batch(batch): @@ -57,7 +58,7 @@ def _cross_entropy_forward_step(batch, model): timers('batch-generator', log_level=2).start() try: batch_ = next(batch) - except BaseException: + except Exception: batch_ = batch tokens, types, labels, attention_mask = process_batch(batch_) timers('batch-generator').stop() diff --git a/tasks/glue/data.py b/tasks/glue/data.py index d96f6962d9..3e2eeaa078 100644 --- a/tasks/glue/data.py +++ b/tasks/glue/data.py @@ -7,7 +7,7 @@ from torch.utils.data import Dataset -from megatron import print_rank_0 +from megatron.training import print_rank_0 from tasks.data_utils import build_sample from tasks.data_utils import build_tokens_types_paddings_from_text diff --git a/tasks/glue/finetune.py b/tasks/glue/finetune.py index 0c31b90470..7e89453dea 100644 --- a/tasks/glue/finetune.py +++ b/tasks/glue/finetune.py @@ -2,12 +2,13 @@ """GLUE finetuning/evaluation.""" -from megatron import get_args -from megatron import print_rank_0 -from megatron import get_tokenizer -from megatron.model.classification import Classification +from megatron.training import get_args +from megatron.training import print_rank_0 +from megatron.training import get_tokenizer +from megatron.legacy.model.classification import Classification from tasks.eval_utils import accuracy_func_provider from tasks.finetune_utils import finetune +from megatron.training.arguments import core_transformer_config_from_args def glue_classification(num_classes, Dataset, @@ -28,10 +29,11 @@ def train_valid_datasets_provider(): def model_provider(pre_process=True, post_process=True): """Build the model.""" args = get_args() + config = 
core_transformer_config_from_args(args) print_rank_0('building classification model for {} ...'.format( args.task)) - model = Classification(num_classes=num_classes, num_tokentypes=2, + model = Classification(config=config, num_classes=num_classes, num_tokentypes=2, pre_process=pre_process, post_process=post_process) return model diff --git a/tasks/glue/mnli.py b/tasks/glue/mnli.py index 8cecc5911e..cd4b2d6176 100644 --- a/tasks/glue/mnli.py +++ b/tasks/glue/mnli.py @@ -2,7 +2,7 @@ """MNLI dataset.""" -from megatron import print_rank_0 +from megatron.training import print_rank_0 from tasks.data_utils import clean_text from .data import GLUEAbstractDataset diff --git a/tasks/glue/qqp.py b/tasks/glue/qqp.py index 5409f5f746..f8a0e06ca0 100644 --- a/tasks/glue/qqp.py +++ b/tasks/glue/qqp.py @@ -2,7 +2,7 @@ """QQP dataset.""" -from megatron import print_rank_0 +from megatron.training import print_rank_0 from tasks.data_utils import clean_text from .data import GLUEAbstractDataset diff --git a/tasks/main.py b/tasks/main.py index cf8226b3f5..da8c4b9b96 100644 --- a/tasks/main.py +++ b/tasks/main.py @@ -7,8 +7,8 @@ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir))) -from megatron import get_args -from megatron.initialize import initialize_megatron +from megatron.training import get_args +from megatron.training.initialize import initialize_megatron def get_tasks_args(parser): @@ -20,8 +20,6 @@ def get_tasks_args(parser): group.add_argument('--epochs', type=int, default=None, help='Number of finetunning epochs. Zero results in ' 'evaluation only.') - group.add_argument('--pretrained-checkpoint', type=str, default=None, - help='Pretrained checkpoint used for finetunning.') group.add_argument('--keep-last', action='store_true', help='Keep the last batch (maybe incomplete) in' 'the data loader') diff --git a/tasks/msdp/README.md b/tasks/msdp/README.md index 27c8728eca..e606e7ec51 100644 --- a/tasks/msdp/README.md +++ b/tasks/msdp/README.md @@ -7,7 +7,7 @@ Below we present the steps to run our multi-stage dialogue prompting (MSDP) fram ### Data Preparation 1. Dataset Download: [Wizard of Wikipedia](https://parl.ai/projects/wizard_of_wikipedia/) and [Wizard of Internet](https://parl.ai/projects/sea/) -2. Data Processing: We provide the script to run the [`data processing`](../../examples/msdp/data_processing.sh) of the datatsets. +2. Data Processing: We provide the script to run the [`data processing`](../../examples/msdp/data_processing.sh) of the datasets. ### Stage-1: Prompting for Knowledge Generation 1. We provide the script to perform the [`first-stage prompting`](../../examples/msdp/prompt_knwl_gen.sh) for the knowledge generation.
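The task entry points above all follow the same pattern: a `get_tasks_args` hook is handed to `initialize_megatron` as an extra argument provider. A minimal sketch, with only the `--epochs` flag carried over from the hunk above and the rest illustrative:

```python
# Minimal sketch of the extra-args pattern used by tasks/main.py (illustrative).
from megatron.training import get_args
from megatron.training.initialize import initialize_megatron


def get_tasks_args(parser):
    """Provide extra arguments required for tasks."""
    group = parser.add_argument_group(title='tasks')
    group.add_argument('--epochs', type=int, default=None,
                       help='Number of finetuning epochs. Zero results in evaluation only.')
    return parser


if __name__ == '__main__':
    initialize_megatron(extra_args_provider=get_tasks_args)
    args = get_args()   # --epochs is now available as args.epochs
```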
diff --git a/tasks/msdp/evaluate.py b/tasks/msdp/evaluate.py index b0631d7b8f..87cfbdbd70 100644 --- a/tasks/msdp/evaluate.py +++ b/tasks/msdp/evaluate.py @@ -2,8 +2,8 @@ """Model evaluation""" -from megatron import get_args -from megatron import print_rank_0 +from megatron.training import get_args +from megatron.training import print_rank_0 from tasks.msdp.metrics import F1Metric from tqdm import tqdm diff --git a/tasks/msdp/main.py b/tasks/msdp/main.py index 6ffd944207..a0068c7b06 100644 --- a/tasks/msdp/main.py +++ b/tasks/msdp/main.py @@ -6,8 +6,8 @@ import sys sys.path.append(os.path.abspath(os.path.join( os.path.join(os.path.dirname(__file__), os.path.pardir), os.path.pardir))) -from megatron import get_args -from megatron.initialize import initialize_megatron +from megatron.training import get_args +from megatron.training.initialize import initialize_megatron def get_tasks_args(parser): diff --git a/tasks/msdp/prompt.py b/tasks/msdp/prompt.py index a4e777e0b8..c1d1651c34 100644 --- a/tasks/msdp/prompt.py +++ b/tasks/msdp/prompt.py @@ -6,15 +6,15 @@ import torch import requests from nltk import word_tokenize -from megatron import get_args -from megatron import print_rank_0 -from megatron import get_tokenizer +from megatron.training import get_args +from megatron.training import print_rank_0 +from megatron.training import get_tokenizer from megatron.core import mpu -from megatron.model import GPTModel +from megatron.legacy.model import GPTModel from megatron.training import get_model -from megatron.checkpointing import load_checkpoint -from megatron.initialize import initialize_megatron -from megatron.text_generation import generate_and_post_process +from megatron.training.checkpointing import load_checkpoint +from megatron.training.initialize import initialize_megatron +from megatron.inference.text_generation import generate_and_post_process def call_model_api(inputs, tokens_to_generate): diff --git a/tasks/orqa/README.md b/tasks/orqa/README.md index a8e8f8e6fa..58aa455b60 100644 --- a/tasks/orqa/README.md +++ b/tasks/orqa/README.md @@ -1,6 +1,6 @@ ## End-to-End Training of Neural Retrievers for Open-Domain Question Answering -Below we present the steps to run unsupervised and supervised trainining and evaluation of the retriever for [open domain question answering](https://arxiv.org/abs/2101.00408). +Below we present the steps to run unsupervised and supervised training and evaluation of the retriever for [open domain question answering](https://arxiv.org/abs/2101.00408). 
## Retriever Training diff --git a/tasks/orqa/evaluate_orqa.py b/tasks/orqa/evaluate_orqa.py index 3bcc71ba44..f960425499 100644 --- a/tasks/orqa/evaluate_orqa.py +++ b/tasks/orqa/evaluate_orqa.py @@ -2,8 +2,8 @@ """Main tasks functionality.""" -from megatron import get_args, print_rank_0 -from megatron.indexer import IndexBuilder +from megatron.training import get_args, print_rank_0 +from megatron.legacy.indexer import IndexBuilder from tasks.orqa.evaluate_utils import ORQAEvaluator def main(): diff --git a/tasks/orqa/evaluate_utils.py b/tasks/orqa/evaluate_utils.py index 6d4ba786c0..b7ce3fcd8d 100644 --- a/tasks/orqa/evaluate_utils.py +++ b/tasks/orqa/evaluate_utils.py @@ -2,11 +2,11 @@ import torch -from megatron import get_args, print_rank_0 -from megatron.checkpointing import load_biencoder_checkpoint -from megatron.data.orqa_wiki_dataset import get_open_retrieval_wiki_dataset -from megatron.data.realm_index import OpenRetreivalDataStore, FaissMIPSIndex -from megatron.model.biencoder_model import get_model_provider +from megatron.training import get_args, print_rank_0 +from megatron.training.checkpointing import load_biencoder_checkpoint +from megatron.legacy.data.orqa_wiki_dataset import get_open_retrieval_wiki_dataset +from megatron.legacy.data.realm_index import OpenRetreivalDataStore, FaissMIPSIndex +from megatron.legacy.model.biencoder_model import get_model_provider from megatron.training import get_model from tasks.orqa.unsupervised.nq import get_nq_dataset from tasks.orqa.unsupervised.nq import get_one_epoch_nq_dataloader diff --git a/tasks/orqa/supervised/data.py b/tasks/orqa/supervised/data.py index eb99e2df82..89ae60c89e 100644 --- a/tasks/orqa/supervised/data.py +++ b/tasks/orqa/supervised/data.py @@ -10,8 +10,8 @@ import numpy as np from torch.utils.data import Dataset -from megatron import print_rank_0, get_args -from megatron.data.biencoder_dataset_utils import make_attention_mask +from megatron.training import print_rank_0, get_args +from megatron.legacy.data.biencoder_dataset_utils import make_attention_mask def build_token_types_from_context_list(ctx_list, tokenizer, max_seq_length): ctx_id_list, ctx_types_list = [], [] diff --git a/tasks/orqa/supervised/eval_utils.py b/tasks/orqa/supervised/eval_utils.py index 02966362c9..27af475c8d 100644 --- a/tasks/orqa/supervised/eval_utils.py +++ b/tasks/orqa/supervised/eval_utils.py @@ -9,9 +9,9 @@ import torch.nn.functional as F from torch.utils.data import DataLoader -from megatron import get_args, print_rank_0 +from megatron.training import get_args, print_rank_0 from megatron.core import mpu -from megatron.utils import average_losses_across_data_parallel_group +from megatron.training.utils import average_losses_across_data_parallel_group from tasks.finetune_utils import build_data_loader def task_collate_fn(batch_data): diff --git a/tasks/orqa/supervised/finetune.py b/tasks/orqa/supervised/finetune.py index c186dcc518..f8b4b354c8 100644 --- a/tasks/orqa/supervised/finetune.py +++ b/tasks/orqa/supervised/finetune.py @@ -9,11 +9,11 @@ import torch import torch.nn.functional as F -from megatron import get_args, get_timers, get_tokenizer, print_rank_0 +from megatron.training import get_args, get_timers, get_tokenizer, print_rank_0 from megatron.core import mpu -from megatron.indexer import IndexBuilder -from megatron.model.biencoder_model import biencoder_model_provider -from megatron.utils import average_losses_across_data_parallel_group +from megatron.legacy.indexer import IndexBuilder +from 
megatron.legacy.model.biencoder_model import biencoder_model_provider +from megatron.training.utils import average_losses_across_data_parallel_group from pretrain_ict import get_group_world_size_rank from tasks.finetune_utils import finetune from tasks.orqa.supervised.eval_utils import accuracy_func_provider @@ -53,7 +53,7 @@ def cross_entropy_forward_step(batch, model): timers('batch generator', log_level=2).start() try: batch_ = next(batch) - except BaseException: + except Exception: batch_ = batch group, rank, world_size = get_group_world_size_rank() diff --git a/tasks/orqa/unsupervised/nq.py b/tasks/orqa/unsupervised/nq.py index 56fd77c12c..2d1bfca730 100644 --- a/tasks/orqa/unsupervised/nq.py +++ b/tasks/orqa/unsupervised/nq.py @@ -13,8 +13,8 @@ from torch.utils.data import DataLoader from torch.utils.data import Dataset, BatchSampler -from megatron import print_rank_0, get_args, get_tokenizer -from megatron.data.biencoder_dataset_utils import make_attention_mask +from megatron.training import print_rank_0, get_args, get_tokenizer +from megatron.legacy.data.biencoder_dataset_utils import make_attention_mask def get_nq_dataset(qa_data, split): args = get_args() diff --git a/tasks/orqa/unsupervised/qa_utils.py b/tasks/orqa/unsupervised/qa_utils.py index 811a05834a..3b2224c241 100644 --- a/tasks/orqa/unsupervised/qa_utils.py +++ b/tasks/orqa/unsupervised/qa_utils.py @@ -146,7 +146,7 @@ def regex_match(text, pattern): pattern, flags=re.IGNORECASE + re.UNICODE + re.MULTILINE, ) - except BaseException: + except Exception: return False return pattern.search(text) is not None diff --git a/tasks/quantize/calibrate_gpt.py b/tasks/quantize/calibrate_gpt.py new file mode 100644 index 0000000000..76840246a6 --- /dev/null +++ b/tasks/quantize/calibrate_gpt.py @@ -0,0 +1,239 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +"""Calibrate a GPT model for FP8 scaling factors.""" +import os +import sys + +sys.path.append( + os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir, os.path.pardir)) +) +import math + +import torch +import transformer_engine.pytorch as te + +from megatron.core import parallel_state, tensor_parallel +from megatron.core.models.gpt import GPTModel +from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_with_transformer_engine_spec +from megatron.core.pipeline_parallel.p2p_communication import recv_forward, send_forward +from megatron.core.transformer.spec_utils import import_module +from megatron.training import get_args, get_model, is_last_rank, print_rank_0 +from megatron.training.arguments import core_transformer_config_from_args +from megatron.training.checkpointing import load_checkpoint +from megatron.training.initialize import initialize_megatron +from megatron.training.training import save_checkpoint_and_time +from megatron.training.utils import unwrap_model +from megatron.training.yaml_arguments import core_transformer_config_from_yaml +from tasks.finetune_utils import build_data_loader +from tasks.zeroshot_gpt.datasets import build_dataset +from tasks.zeroshot_gpt.evaluate import process_batch + + +def model_provider(pre_process=True, post_process=True) -> GPTModel: + """Builds the model. + + Args: + pre_process (bool, optional): Set to true if you need to compute embeddings. Defaults to True. + post_process (bool, optional): Set to true if you need to want to compute output logits/loss. Defaults to True. + + Returns: + GPTModel: The returned model. Only works for Transformer Engine implementations. 
+ """ + + args = get_args() + + print_rank_0('building GPT model ...') + + # Experimental loading arguments from yaml + if args.yaml_cfg is not None: + config = core_transformer_config_from_yaml(args, "language_model") + else: + config = core_transformer_config_from_args(args) + + if args.use_legacy_models or args.transformer_impl != "transformer_engine": + raise NotImplementedError( + 'Calibration is only supported for models using TransformerEngine.' + ) + else: + if args.spec is not None: + transformer_layer_spec = import_module(args.spec) + else: + transformer_layer_spec = get_gpt_layer_with_transformer_engine_spec( + args.num_experts, args.moe_grouped_gemm + ) + model = GPTModel( + config=config, + transformer_layer_spec=transformer_layer_spec, + vocab_size=args.padded_vocab_size, + max_sequence_length=args.max_position_embeddings, + pre_process=pre_process, + post_process=post_process, + fp16_lm_cross_entropy=args.fp16_lm_cross_entropy, + parallel_output=True, + share_embeddings_and_output_weights=not args.untie_embeddings_and_output_weights, + position_embedding_type=args.position_embedding_type, + rotary_percent=args.rotary_percent + ) + + return model + + +def forward_step(batch, model, config): + """Forward step.""" + + # Get the batch. + tokens, labels, attention_mask, position_ids, loss_mask = process_batch(batch) + + args = get_args() + args.micro_batch_size = len(labels) + + tensor_shape = (args.seq_length, args.micro_batch_size, args.hidden_size) + input_tensor = recv_forward(tensor_shape, config) + + # Forward pass through the model. + unwrapped_model = unwrap_model(model) + unwrapped_model.set_input_tensor(input_tensor) + output = model(tokens, position_ids, attention_mask) + + send_forward(output, config) + + if parallel_state.is_pipeline_last_stage(): + losses = tensor_parallel.vocab_parallel_cross_entropy( + output.contiguous().float(), labels.contiguous() + ) + loss = torch.sum(losses.view(-1) * loss_mask.contiguous().view(-1).float()) + return loss + + return None + + +def calibrate(data_loader, model): + args = get_args() + config = core_transformer_config_from_args(args) + + # Turn on evaluation mode which disables dropout. + model.eval() + + total_output = 0.0 + num_examples = min(len(data_loader), args.calib_size) + data_loader = iter(data_loader) + + with torch.no_grad(): + iteration = 0 + while iteration < num_examples - 1: + batch = next(data_loader) + if iteration % args.log_interval == 0: + print_rank_0('> working on iteration: {}'.format(iteration)) + with te.fp8_autocast(enabled=False, calibrating=True), torch.autocast( + device_type='cuda', dtype=torch.bfloat16 + ): + output = forward_step(batch, model, config) + + # Reduce across processes. 
+ if parallel_state.is_pipeline_last_stage(): + torch.distributed.all_reduce( + output, group=parallel_state.get_data_parallel_group() + ) + + total_output += output + iteration += 1 + + print_rank_0(f"Compute scaling factors with FP8 autocast ...") + with te.fp8_autocast(enabled=True), torch.autocast( + device_type='cuda', dtype=torch.bfloat16 + ): + forward_step(batch, model, config) + + if parallel_state.is_pipeline_last_stage(): + torch.distributed.all_reduce(output, group=parallel_state.get_data_parallel_group()) + + total_output += output + + print_rank_0(f"Saving calibrated checkpoint ...") + save_checkpoint_and_time( + iteration, + [model], + optimizer=None, + opt_param_scheduler=None, + num_floating_point_operations_so_far=0, + checkpointing_context=None, + ) + + return total_output + + +def calibrate_and_print_results(task, data_loader, model): + """Calibrate and print results on screen.""" + + # Calibrate and save scaling factors + output = calibrate(data_loader, model) + + string = ' validation results on {} | '.format(task) + if is_last_rank(): + num_tokenized_tokens = data_loader.dataset.num_tokenized_tokens + num_original_tokens = data_loader.dataset.num_original_tokens + val_loss = output / (num_tokenized_tokens - 1) + ppl = math.exp(min(20, val_loss)) + token_ratio = (num_tokenized_tokens - 1) / (num_original_tokens - 1) + adjusted_ppl = math.exp(min(20, val_loss * token_ratio)) + string += 'avg loss: {:.4E} | '.format(val_loss) + string += 'ppl: {:.4E} | '.format(ppl) + string += 'adjusted ppl: {:.4E} | '.format(adjusted_ppl) + string += 'token ratio: {} |'.format(token_ratio) + + length = len(string) + 1 + print('-' * length) + print(string) + print('-' * length) + + +def add_calib_args(parser): + group = parser.add_argument_group(title='calibration') + group.add_argument("--task", type=str, help="Calibration task to run. Defaults to WIKITEXT103.") + group.add_argument('--valid-data', nargs='*', default=None, help='Calibration dataset') + group.add_argument( + '--overlapping-eval', + type=int, + default=32, # Required for reusing _build_wikitext103_dataset() + help='Sliding window for overlapping evaluation.', + ) + group.add_argument( + "--calib-size", type=int, default=512, help="Number of samples to use for calibration." + ) + return parser + + +if __name__ == "__main__": + initialize_megatron( + extra_args_provider=add_calib_args, + args_defaults={ + 'tokenizer_type': 'GPT2BPETokenizer', + 'no_load_rng': True, + 'no_load_optim': True, + }, + ) + + args = get_args() + + if args.num_layers_per_virtual_pipeline_stage is not None: + print("Interleaved pipeline schedule is not yet supported for calibration.") + exit() + + # Set up model and load checkpoint. + model = get_model(model_provider, wrap_with_ddp=False) + if args.load is not None: + _ = load_checkpoint(model, None, None) + + assert len(model) == 1, "Above condition should have caught this" + model = model[0] + + # Setup data loader. + dataset = build_dataset(args.task) + dataloader = build_data_loader( + dataset, args.micro_batch_size, args.num_workers, drop_last=False + ) + + # Run calibration. 
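Before the final call below, it may help to see the calibration recipe from `calibrate()` condensed into a sketch; the model and batch objects are stand-ins, but the TransformerEngine autocast flags mirror the ones used above.

```python
# Condensed sketch of the FP8 calibration recipe (stand-in model and batches).
import torch
import transformer_engine.pytorch as te

def calibrate_fp8(model, batches):
    model.eval()
    with torch.no_grad():
        # All but the last batch: run without FP8 while TE records amax statistics.
        for batch in batches[:-1]:
            with te.fp8_autocast(enabled=False, calibrating=True):
                model(batch)
        # Last batch: enable FP8 so the recorded statistics become scaling factors.
        with te.fp8_autocast(enabled=True):
            model(batches[-1])
    # The calibrated scaling factors live in the TE modules' FP8 state and are
    # persisted by the checkpoint save that follows in calibrate().
```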
+ calibrate_and_print_results(args.task, dataloader, model) + + print_rank_0('Calibration successfully completed.') diff --git a/tasks/race/data.py b/tasks/race/data.py index c4967a0842..0c22108daa 100644 --- a/tasks/race/data.py +++ b/tasks/race/data.py @@ -6,7 +6,7 @@ from torch.utils.data import Dataset -from megatron import print_rank_0 +from megatron.training import print_rank_0 from tasks.data_utils import build_sample from tasks.data_utils import build_tokens_types_paddings_from_ids from tasks.data_utils import clean_text diff --git a/tasks/race/finetune.py b/tasks/race/finetune.py index 18b3ff919d..09d9e739b8 100644 --- a/tasks/race/finetune.py +++ b/tasks/race/finetune.py @@ -2,13 +2,14 @@ """Race.""" -from megatron import get_args -from megatron import print_rank_0 -from megatron import get_tokenizer -from megatron.model.multiple_choice import MultipleChoice +from megatron.training import get_args +from megatron.training import print_rank_0 +from megatron.training import get_tokenizer +from megatron.legacy.model.multiple_choice import MultipleChoice from tasks.eval_utils import accuracy_func_provider from tasks.finetune_utils import finetune from tasks.race.data import RaceDataset +from megatron.training.arguments import core_transformer_config_from_args def train_valid_datasets_provider(): @@ -26,9 +27,10 @@ def train_valid_datasets_provider(): def model_provider(pre_process=True, post_process=True): """Build the model.""" - + config = core_transformer_config_from_args(get_args()) print_rank_0('building multichoice model for RACE ...') - model = MultipleChoice(num_tokentypes=2, + model = MultipleChoice(config=config, + num_tokentypes=2, pre_process=pre_process, post_process=post_process) diff --git a/tasks/vision/classification/classification.py b/tasks/vision/classification/classification.py index cc8dbe629e..efe58be9d7 100644 --- a/tasks/vision/classification/classification.py +++ b/tasks/vision/classification/classification.py @@ -4,13 +4,13 @@ import torch.nn.functional as F from functools import partial -from megatron import get_args, get_timers -from megatron import print_rank_0 -from megatron.model.vision.classification import VitClassificationModel -from megatron.data.vit_dataset import build_train_valid_datasets +from megatron.training import get_args, get_timers +from megatron.training import print_rank_0 +from megatron.legacy.model.vision.classification import VitClassificationModel +from megatron.legacy.data.vit_dataset import build_train_valid_datasets from tasks.vision.classification.eval_utils import accuracy_func_provider from tasks.vision.finetune_utils import finetune -from megatron.utils import average_losses_across_data_parallel_group +from megatron.training.utils import average_losses_across_data_parallel_group def classification(): @@ -58,7 +58,7 @@ def _cross_entropy_forward_step(batch, model): timers("batch generator", log_level=2).start() try: batch_ = next(batch) - except BaseException: + except Exception: batch_ = batch images, labels = process_batch(batch_) timers("batch generator").stop() diff --git a/tasks/vision/classification/eval_utils.py b/tasks/vision/classification/eval_utils.py index d3eaec4850..f68e0275aa 100644 --- a/tasks/vision/classification/eval_utils.py +++ b/tasks/vision/classification/eval_utils.py @@ -7,8 +7,8 @@ import torch -from megatron import get_args -from megatron import print_rank_0, print_rank_last +from megatron.training import get_args +from megatron.training import print_rank_0, print_rank_last from megatron.core import 
mpu from megatron.schedules import get_forward_backward_func from tasks.vision.finetune_utils import build_data_loader @@ -79,7 +79,7 @@ def loss_func(labels, output_tensor): def correct_answers_forward_step(batch, model): try: batch_ = next(batch) - except BaseException: + except Exception: batch_ = batch images, labels = process_batch(batch_) diff --git a/tasks/vision/finetune_utils.py b/tasks/vision/finetune_utils.py index 2e55c184e3..ced2e674e6 100644 --- a/tasks/vision/finetune_utils.py +++ b/tasks/vision/finetune_utils.py @@ -4,22 +4,19 @@ import torch import torch.nn.functional as F -from megatron import get_args -from megatron import print_rank_0 -from megatron import get_timers -from megatron import utils +from megatron.training import get_args +from megatron.training import print_rank_0 +from megatron.training import get_timers +from megatron.training import utils from megatron.core import mpu -from megatron.checkpointing import load_checkpoint -from megatron.checkpointing import save_checkpoint +from megatron.training.checkpointing import load_checkpoint +from megatron.training.checkpointing import save_checkpoint from megatron.training import evaluate_and_print_results from megatron.training import setup_model_and_optimizer from megatron.training import train_step from megatron.training import training_log -from megatron.utils import check_adlr_autoresume_termination -from megatron.utils import average_losses_across_data_parallel_group, print_params_min_max_norm -from torch.nn.parallel.distributed import DistributedDataParallel as torchDDP -from megatron.model import DistributedDataParallel as LocalDDP -from megatron.model import Float16Module +from megatron.training.utils import check_adlr_autoresume_termination +from megatron.training.utils import average_losses_across_data_parallel_group, print_params_min_max_norm from megatron.core.enums import ModelType def process_batch(batch): diff --git a/tasks/vision/main.py b/tasks/vision/main.py index 7c1b738110..7975f6e9c1 100644 --- a/tasks/vision/main.py +++ b/tasks/vision/main.py @@ -13,8 +13,8 @@ ) ) ) -from megatron import get_args -from megatron.initialize import initialize_megatron +from megatron.training import get_args +from megatron.training.initialize import initialize_megatron def get_tasks_args(parser): """Provide extra arguments required for tasks.""" diff --git a/tasks/vision/segmentation/cityscapes.py b/tasks/vision/segmentation/cityscapes.py index 1a182288f2..af63a6f616 100644 --- a/tasks/vision/segmentation/cityscapes.py +++ b/tasks/vision/segmentation/cityscapes.py @@ -41,7 +41,7 @@ from torchvision.datasets.utils import extract_archive, verify_str_arg, iterable_to_str from torchvision.datasets import VisionDataset from PIL import Image -from megatron import print_rank_0 +from megatron.training import print_rank_0 class Cityscapes(VisionDataset): diff --git a/tasks/vision/segmentation/data.py b/tasks/vision/segmentation/data.py index 292e9cab33..a0ea612cfb 100644 --- a/tasks/vision/segmentation/data.py +++ b/tasks/vision/segmentation/data.py @@ -7,11 +7,11 @@ import torchvision.transforms as T from torchvision import datasets from torch.utils.data import Dataset -from megatron.data.autoaugment import ImageNetPolicy +from megatron.legacy.data.autoaugment import ImageNetPolicy from tasks.vision.segmentation.cityscapes import Cityscapes import tasks.vision.segmentation.transforms as ET -from megatron.data.autoaugment import ImageNetPolicy -from megatron import get_args +from megatron.legacy.data.autoaugment import 
ImageNetPolicy +from megatron.training import get_args from PIL import Image, ImageOps diff --git a/tasks/vision/segmentation/finetune_segformer.py b/tasks/vision/segmentation/finetune_segformer.py index 10a4085be4..35e20c9a2c 100644 --- a/tasks/vision/segmentation/finetune_segformer.py +++ b/tasks/vision/segmentation/finetune_segformer.py @@ -6,16 +6,16 @@ import torch import torch.nn.functional as F from functools import partial -from megatron import get_args, get_timers -from megatron import print_rank_0, print_rank_last +from megatron.training import get_args, get_timers +from megatron.training import print_rank_0, print_rank_last from megatron.core import mpu from tasks.vision.finetune_utils import finetune from tasks.vision.finetune_utils import build_data_loader -from megatron.utils import average_losses_across_data_parallel_group +from megatron.training.utils import average_losses_across_data_parallel_group from megatron.schedules import get_forward_backward_func from tasks.vision.segmentation.data import build_train_valid_datasets from tasks.vision.segmentation.seg_models import SegformerSegmentationModel -from megatron.model.vision.utils import resize +from megatron.legacy.model.vision.utils import resize def calculate_iou(hist_data): @@ -154,7 +154,7 @@ def loss_func(labels, output_tensor): def correct_answers_forward_step(batch, model): try: batch_ = next(batch) - except BaseException: + except Exception: batch_ = batch images, labels = process_batch(batch_) diff --git a/tasks/vision/segmentation/finetune_setr.py b/tasks/vision/segmentation/finetune_setr.py index 7f3208d09a..b301c51374 100644 --- a/tasks/vision/segmentation/finetune_setr.py +++ b/tasks/vision/segmentation/finetune_setr.py @@ -5,12 +5,12 @@ import torch import torch.nn.functional as F from functools import partial -from megatron import get_args, get_timers -from megatron import print_rank_0, print_rank_last +from megatron.training import get_args, get_timers +from megatron.training import print_rank_0, print_rank_last from megatron.core import mpu from tasks.vision.finetune_utils import finetune from tasks.vision.finetune_utils import build_data_loader -from megatron.utils import average_losses_across_data_parallel_group +from megatron.training.utils import average_losses_across_data_parallel_group from megatron.schedules import get_forward_backward_func from tasks.vision.segmentation.metrics import CFMatrix from tasks.vision.segmentation.data import build_train_valid_datasets @@ -122,7 +122,7 @@ def correct_answers_forward_step(batch, model): args = get_args() try: batch_ = next(batch) - except BaseException: + except Exception: batch_ = batch images, labels = process_batch(batch_) diff --git a/tasks/vision/segmentation/seg_heads.py b/tasks/vision/segmentation/seg_heads.py index 61b16cdcbd..6d06cbca94 100644 --- a/tasks/vision/segmentation/seg_heads.py +++ b/tasks/vision/segmentation/seg_heads.py @@ -4,10 +4,10 @@ import torch import apex import torch.nn.functional as F -from megatron import get_args -from megatron.model import LayerNorm -from megatron.model.module import MegatronModule -from megatron.model.vision.utils import resize +from megatron.training import get_args +from megatron.legacy.model import LayerNorm +from megatron.legacy.model.module import MegatronModule +from megatron.legacy.model.vision.utils import resize class SetrSegmentationHead(MegatronModule): diff --git a/tasks/vision/segmentation/seg_models.py b/tasks/vision/segmentation/seg_models.py index 3bf0f48def..9b152d06ed 100644 --- 
a/tasks/vision/segmentation/seg_models.py +++ b/tasks/vision/segmentation/seg_models.py @@ -4,10 +4,10 @@ import torch import apex import torch.nn.functional as F -from megatron import get_args -from megatron.model.module import MegatronModule -from megatron.model.vision.vit_backbone import VitBackbone, VitMlpHead -from megatron.model.vision.mit_backbone import mit_b3, mit_b5 +from megatron.training import get_args +from megatron.legacy.model.module import MegatronModule +from megatron.legacy.model.vision.vit_backbone import VitBackbone, VitMlpHead +from megatron.legacy.model.vision.mit_backbone import mit_b3, mit_b5 from tasks.vision.segmentation.seg_heads import SetrSegmentationHead, SegformerSegmentationHead @@ -36,7 +36,7 @@ def __init__(self, ) def set_input_tensor(self, input_tensor): - """See megatron.model.transformer.set_input_tensor()""" + """See megatron.legacy.model.transformer.set_input_tensor()""" pass def forward(self, input): @@ -68,7 +68,7 @@ def __init__(self, ) def set_input_tensor(self, input_tensor): - """See megatron.model.transformer.set_input_tensor()""" + """See megatron.legacy.model.transformer.set_input_tensor()""" pass def forward(self, input): diff --git a/tasks/vision/segmentation/transforms.py b/tasks/vision/segmentation/transforms.py index 8506c53266..51e11abeca 100644 --- a/tasks/vision/segmentation/transforms.py +++ b/tasks/vision/segmentation/transforms.py @@ -12,8 +12,8 @@ import torchvision.transforms as T from torchvision import datasets from torch.utils.data import Dataset -from megatron import print_rank_0 -from megatron import get_args +from megatron.training import print_rank_0 +from megatron.training import get_args from PIL import Image, ImageOps, ImageEnhance import torchvision.transforms as torch_tr diff --git a/tasks/vision/segmentation/utils.py b/tasks/vision/segmentation/utils.py index dfc6a20148..f9cfb820cb 100644 --- a/tasks/vision/segmentation/utils.py +++ b/tasks/vision/segmentation/utils.py @@ -1,7 +1,7 @@ import math import torch import numpy as np -from megatron import get_args +from megatron.training import get_args def slidingcrops(img, mask): # img: [b c h w] diff --git a/tasks/zeroshot_gpt/datasets.py b/tasks/zeroshot_gpt/datasets.py index 92b7d78913..eafaa8dab1 100644 --- a/tasks/zeroshot_gpt/datasets.py +++ b/tasks/zeroshot_gpt/datasets.py @@ -8,9 +8,9 @@ import numpy as np import torch -from megatron import get_args -from megatron import print_rank_0 -from megatron import get_tokenizer +from megatron.training import get_args +from megatron.training import print_rank_0 +from megatron.training import get_tokenizer from .detokenizer import get_detokenizer diff --git a/tasks/zeroshot_gpt/evaluate.py b/tasks/zeroshot_gpt/evaluate.py index 43b659b92f..e42c776e83 100644 --- a/tasks/zeroshot_gpt/evaluate.py +++ b/tasks/zeroshot_gpt/evaluate.py @@ -6,23 +6,20 @@ import torch -from megatron import get_args -from megatron import print_rank_0, is_last_rank -from megatron import get_tokenizer +from megatron.training import get_args +from megatron.training import print_rank_0, is_last_rank +from megatron.training import get_tokenizer from megatron.core import parallel_state, tensor_parallel -from megatron.checkpointing import load_checkpoint -from megatron.model import GPTModel +from megatron.training.checkpointing import load_checkpoint +from megatron.legacy.model import GPTModel from megatron.training import get_model -from megatron.utils import get_ltor_masks_and_position_ids, unwrap_model -from megatron.p2p_communication import 
recv_forward, send_forward +from megatron.training.utils import get_ltor_masks_and_position_ids, unwrap_model +from megatron.core.pipeline_parallel.p2p_communication import recv_forward, send_forward +from megatron.training.arguments import core_transformer_config_from_args from tasks.finetune_utils import build_data_loader from .datasets import build_dataset -# These are needed to unwrap the model, would be nice to put these in megatron.utils if possible? -from torch.nn.parallel.distributed import DistributedDataParallel as torchDDP -from megatron.model import DistributedDataParallel as LocalDDP -from megatron.model import Float16Module def get_model_provider(eval_metric): """Based on evaluation metric set the parallel-output flag and @@ -31,6 +28,8 @@ def get_model_provider(eval_metric): def model_provider(pre_process=True, post_process=True): """Build the model.""" + config = core_transformer_config_from_args(get_args()) + if eval_metric == 'loss': parallel_output = True elif eval_metric == 'accuracy': @@ -40,7 +39,7 @@ def model_provider(pre_process=True, post_process=True): 'is not supported.'.format(eval_metric)) print_rank_0('building GPT model ...') - model = GPTModel(num_tokentypes=0, parallel_output=parallel_output, + model = GPTModel(config, num_tokentypes=0, parallel_output=parallel_output, pre_process=pre_process, post_process=post_process) return model @@ -69,7 +68,7 @@ def process_batch(batch): return tokens, labels, attention_mask, position_ids, loss_mask -def forward_step(batch, model, eval_metric): +def forward_step(batch, model, eval_metric, config): """Forward step.""" # Get the batch. @@ -80,15 +79,15 @@ def forward_step(batch, model, eval_metric): args = get_args() args.micro_batch_size = len(labels) - input_tensor = recv_forward() + tensor_shape = (args.seq_length, args.micro_batch_size, args.hidden_size) + input_tensor = recv_forward(tensor_shape, config) # Forward pass through the model. - unwrapped_model = unwrap_model( - model, (torchDDP, LocalDDP, Float16Module)) + unwrapped_model = unwrap_model(model) unwrapped_model.set_input_tensor(input_tensor) output = model(tokens, position_ids, attention_mask) - send_forward(output) + send_forward(output, config) if parallel_state.is_pipeline_last_stage(): # For loss, return the unreduced loss. @@ -115,7 +114,8 @@ def forward_step(batch, model, eval_metric): def evaluate(data_loader, model, eval_metric): """Evaluation.""" args = get_args() - + config = core_transformer_config_from_args(args) + # Turn on evaluation mode which disables dropout. model.eval() @@ -126,7 +126,7 @@ def evaluate(data_loader, model, eval_metric): if iteration % args.log_interval == 0: print_rank_0('> working on iteration: {}'.format(iteration)) # Forward evaluation. - output = forward_step(batch, model, eval_metric) + output = forward_step(batch, model, eval_metric, config) # Reduce across processes. 
if parallel_state.is_pipeline_last_stage(): diff --git a/tests/functional_tests/jet_recipes/_build-mcore-dev.yaml b/tests/functional_tests/jet_recipes/_build-mcore-dev.yaml new file mode 100644 index 0000000000..123250d746 --- /dev/null +++ b/tests/functional_tests/jet_recipes/_build-mcore-dev.yaml @@ -0,0 +1,11 @@ +type: build +format_version: 1 +maintainers: [maanug] +spec: + name: mcore-pyt-dev + platforms: [linux/amd64] + source: + # The image tag will be added via `jet-tests.yaml` + # Tags are one of {buildcache, $CI_PIPELINE_ID} + image: gitlab-master.nvidia.com/adlr/megatron-lm/mcore_ci_dev + \ No newline at end of file diff --git a/tests/functional_tests/jet_recipes/_build-mcore-lts.yaml b/tests/functional_tests/jet_recipes/_build-mcore-lts.yaml new file mode 100644 index 0000000000..d017b71c10 --- /dev/null +++ b/tests/functional_tests/jet_recipes/_build-mcore-lts.yaml @@ -0,0 +1,11 @@ +type: build +format_version: 1 +maintainers: [maanug] +spec: + name: mcore-pyt-lts + platforms: [linux/amd64] + source: + # The image tag will be added via `jet-tests.yaml` + # Tags are one of {buildcache, $CI_PIPELINE_ID} + image: gitlab-master.nvidia.com/adlr/megatron-lm/mcore_ci_lts + \ No newline at end of file diff --git a/tests/functional_tests/jet_recipes/_build-nemo.yaml b/tests/functional_tests/jet_recipes/_build-nemo.yaml new file mode 100644 index 0000000000..bca1c7a1f8 --- /dev/null +++ b/tests/functional_tests/jet_recipes/_build-nemo.yaml @@ -0,0 +1,10 @@ +type: build +format_version: 1 +maintainers: [maanug] +spec: + name: mcore-nemo-lts + platforms: [linux/amd64] + source: + # The image tag will be added via `jet-tests.yaml` + # Tags are one of {buildcache, $CI_PIPELINE_ID} + image: gitlab-master.nvidia.com/adlr/megatron-lm/nemo_ci \ No newline at end of file diff --git a/tests/functional_tests/jet_recipes/bert.yaml b/tests/functional_tests/jet_recipes/bert.yaml new file mode 100644 index 0000000000..cb8873fcb9 --- /dev/null +++ b/tests/functional_tests/jet_recipes/bert.yaml @@ -0,0 +1,52 @@ +type: basic +format_version: 1 +maintainers: [mcore] +loggers: [stdout] +spec: + name: "{test_case}" + model: bert + nodes: 1 + build: mcore-pyt-{environment} + gpus: 8 + platforms: dgx_a100 + artifacts: + /workspace/data/bert_data: text/the_pile/bert_shard00 + script: |- + ls + cd /opt/megatron-lm + + ARGUMENTS=( + "DATA_PATH=/workspace/data/bert_data" + "DATA_CACHE_PATH=/workspace/data/cache" + "OUTPUT_PATH={assets_dir}" + "TENSORBOARD_PATH={assets_dir}/tensorboard" + "CHECKPOINT_PATH=/workspace/checkpoints" + "TRAINING_SCRIPT_PATH=pretrain_bert.py" + "TRAINING_PARAMS_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/model_config.yaml" + "GOLDEN_VALUES_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/golden_values_{environment}.json" + ) + + bash ./tests/functional_tests/shell_test_utils/run_ci_test.sh ${{ARGUMENTS[@]}} + +products: + - environment: [lts, dev] + scope: [mr] + time_limit: [12000] + test_case: + - bert_mr_mcore_tp2_pp2_dgx_a100_1N8G + # - bert_mr_mcore_tp2_pp2_local_spec_dgx_a100_1N8G + - bert_mr_mcore_tp2_pp2_resume_torch_dist_dgx_a100_1N8G + - bert_mr_mcore_tp2_pp2_resume_torch_dist_local_spec_dgx_a100_1N8G + - bert_mr_tp1_pp4_vp2_dgx_a100_1N8G + - bert_mr_tp1_pp4_vp2_resume_torch_dgx_a100_1N8G + - bert_mr_tp2_pp2_dgx_a100_1N8G + - bert_mr_tp2_pp2_resume_torch_dgx_a100_1N8G + - environment: [lts, dev] + scope: [nightly] + time_limit: [12000] + test_case: + - bert_nightly_dgx_a100_1N8G_mcore_tp1_pp2 + - bert_nightly_dgx_a100_1N8G_mcore_tp1_pp4_vp2 + - 
bert_nightly_dgx_a100_1N8G_mcore_tp4_pp1 + - bert_nightly_dgx_a100_1N8G_tp1_pp2 + - bert_nightly_dgx_a100_1N8G_tp4_pp1 diff --git a/tests/functional_tests/jet_recipes/gpt-nemo.yaml b/tests/functional_tests/jet_recipes/gpt-nemo.yaml new file mode 100644 index 0000000000..3d091ba015 --- /dev/null +++ b/tests/functional_tests/jet_recipes/gpt-nemo.yaml @@ -0,0 +1,37 @@ +type: basic +format_version: 1 +maintainers: [mcore] +loggers: [stdout] +spec: + name: "{test_case}" + model: gpt-nemo + build: mcore-nemo-dev + nodes: 1 + gpus: 8 + platforms: dgx_a100 + time_limit: 12000 + scope: null + script: |- + ls + cd /opt/NeMo + + ARGUMENTS=( + "DATA_PATH='-'" + "DATA_CACHE_PATH='-'" + "OUTPUT_PATH={assets_dir}" + "TENSORBOARD_PATH={assets_dir}/tensorboard" + "CHECKPOINT_PATH=/workspace/checkpoints" + "TRAINING_SCRIPT_PATH=/opt/NeMo/examples/nlp/language_modeling/megatron_gpt_pretraining.py" + "TRAINING_PARAMS_PATH=/opt/megatron-lm/tests/functional_tests/test_cases/{model}/{test_case}/model_config.yaml" + "GOLDEN_VALUES_PATH=/opt/megatron-lm/tests/functional_tests/test_cases/{model}/{test_case}/golden_values_{environment}.json" + ) + + bash /opt/megatron-lm/tests/functional_tests/shell_test_utils/run_ci_test.sh ${{ARGUMENTS[@]}} + +products: + - environment: [dev] + scope: [mr] + test_case: + - gpt3-nemo_126m_mr_mbs1_gbs8_mcore_te_tp2_pp4_vp3_seq_par_overlap_p2p_dgx_a100_1N8G + - gpt3-nemo_126m_mr_mbs4_gbs64_mcore_te_tp1_pp1_dgx_a100_1N8G + \ No newline at end of file diff --git a/tests/functional_tests/jet_recipes/gpt.yaml b/tests/functional_tests/jet_recipes/gpt.yaml new file mode 100644 index 0000000000..e039a755ba --- /dev/null +++ b/tests/functional_tests/jet_recipes/gpt.yaml @@ -0,0 +1,154 @@ +type: basic +format_version: 1 +maintainers: [mcore] +loggers: [stdout] +spec: + name: "{test_case}" + model: gpt + build: mcore-pyt-{environment} + nodes: 1 + gpus: 8 + artifacts: + /workspace/data/gpt3_data: text/the_pile/shard00 + script: |- + ls + cd /opt/megatron-lm + + ARGUMENTS=( + "DATA_PATH=/workspace/data/gpt3_data" + "DATA_CACHE_PATH=/workspace/data/cache" + "OUTPUT_PATH={assets_dir}" + "TENSORBOARD_PATH={assets_dir}/tensorboard" + "CHECKPOINT_PATH=/workspace/checkpoints" + "TRAINING_SCRIPT_PATH=pretrain_gpt.py" + "TRAINING_PARAMS_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/model_config.yaml" + "GOLDEN_VALUES_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/golden_values_{environment}.json" + ) + + bash ./tests/functional_tests/shell_test_utils/run_ci_test.sh ${{ARGUMENTS[@]}} + +products: + - environment: [lts, dev] + scope: [mr] + platforms: [dgx_a100] + time_limit: [12000] + test_case: + - gpt3_mr_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G + - gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_dgx_a100_1N8G + - gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G + - gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute_dgx_a100_1N8G + - gpt3_mr_mcore_te_tp1_pp1_uniform_full_recompute_dgx_a100_1N8G + - gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_reshard_2x1x4_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G + - gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_dgx_a100_1N8G + - gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G + - gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_dgx_a100_1N8G + - gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G + - gpt3_mr_mcore_te_tp1_pp4_disable_bias_linear_dgx_a100_1N8G + - 
gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_disable_bias_linear_dgx_a100_1N8G + - gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_sequence_parallel_dgx_a100_1N8G + - gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_swiglu_dgx_a100_1N8G + - gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs_dgx_a100_1N8G + - gpt3_mr_mcore_te_tp1_pp4_sequence_parallel_dgx_a100_1N8G + - gpt3_mr_mcore_te_tp1_pp4_swiglu_dgx_a100_1N8G + - gpt3_mr_mcore_te_tp1_pp4_untie_embeddings_and_outputs_dgx_a100_1N8G + - gpt3_mr_mcore_te_tp1_pp4_vp1_calculate_per_token_loss_dgx_a100_1N8G + - gpt3_mr_mcore_te_tp1_pp4_vp1_decoupled_lr_dgx_a100_1N8G + - gpt3_mr_mcore_te_tp1_pp4_vp1_uneven_pipeline_dgx_a100_1N8G + - gpt3_mr_mcore_te_tp1_pp4_vp1_dgx_a100_1N8G + - gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G + - gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G + - gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer_dgx_a100_1N8G + - gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G + - gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr_dgx_a100_1N8G + - gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss_dgx_a100_1N8G + - gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dgx_a100_1N8G + - gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G + - gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G + - gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G + - gpt3_mr_mcore_te_tp2_pp1_cp2_nondeterministic_dgx_a100_1N8G + - gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G + - gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_dgx_a100_1N8G + - gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G + - gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_groupedGEMM_dgx_a100_1N8G + - gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM_dgx_a100_1N8G + - gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_top2router_dgx_a100_1N8G + - gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_dgx_a100_1N8G + - gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G + - gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_groupedGEMM_dgx_a100_1N8G + - gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM_dgx_a100_1N8G + - gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_top2router_dgx_a100_1N8G + - gpt3_mr_mcore_te_tp2_pp2_cp2_nondeterministic_dgx_a100_1N8G + - gpt3_mr_mcore_te_tp2_pp2_cross_entropy_loss_fusion_dgx_a100_1N8G + - gpt3_mr_mcore_te_tp2_pp2_ddp_average_in_collective_dgx_a100_1N8G + - gpt3_mr_mcore_te_tp2_pp2_defer_embedding_wgrad_compute_dgx_a100_1N8G + - gpt3_mr_mcore_te_tp2_pp2_dgx_a100_1N8G + - gpt3_mr_mcore_te_tp2_pp2_no_create_attention_mask_in_dataloader_dgx_a100_1N8G + - gpt3_mr_mcore_te_tp2_pp2_no_mmap_bin_files_dgx_a100_1N8G + - gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G + - gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion_dgx_a100_1N8G + - gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective_dgx_a100_1N8G + - gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute_dgx_a100_1N8G + - 
gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_dgx_a100_1N8G + - gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_no_create_attention_mask_in_dataloader_dgx_a100_1N8G + - gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_no_mmap_bin_files_dgx_a100_1N8G + - gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone_dgx_a100_1N8G + - gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G + - gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G + - gpt3_mr_mcore_te_tp4_pp1_qk_layernorm_test_mode_dgx_a100_1N8G + - gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G + - gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G + - gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode_dgx_a100_1N8G + - gpt3_mr_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone_dgx_a100_1N8G + - gpt3_mr_mcore_tp2_pp2_resume_torch_dist_uninstall_te_dgx_a100_1N8G + - gpt3_mr_mcore_tp2_pp2_uninstall_te_dgx_a100_1N8G + - gpt3_mr_te_tp2_pp2_dgx_a100_1N8G + - gpt3_mr_te_tp2_pp2_resume_torch_dgx_a100_1N8G + - gpt3_mr_tp1_pp4_vp1_dgx_a100_1N8G + - gpt3_mr_tp1_pp4_vp1_resume_torch_dgx_a100_1N8G + - gpt3_mr_tp2_pp2_dgx_a100_1N8G + - environment: [lts, dev] + scope: [nightly] + platforms: [dgx_a100] + time_limit: [12000] + test_case: + - gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_dist_optimizer_overlap_grad_reduce_param_gather + - gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather + - gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2 + - gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_resume_torch_dist + - gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4 + - gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4_resume_torch_dist + # - gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_resume_torch_dist_te_4experts2parallel + - gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_te_4experts2parallel + # - gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_resume_torch_dist_te_2experts + - gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_te_2experts + - gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1 + - gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch + - gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch_dist + - gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp1_dist_optimizer_overlap_grad_reduce + - gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp1_overlap_grad_reduce + - gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp2 + - gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp2_resume_torch + - gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4 + - gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_overlap_grad_reduce + - gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_resume_torch + - gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_vp1_overlap_grad_reduce + - gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_4experts + - gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_overlap_grad_reduce + - gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_resume_torch_4experts + - gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_resume_torch_overlap_grad_reduce + - gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1 + - gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1_overlap_grad_reduce + - gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1_resume_torch + - environment: [lts, dev] + scope: [weekly] + platforms: [dgx_h100] + time_limit: [9000] + test_case: + - gpt3_345m_weekly_dgx_h100_1N8G_mcore_nondet_tp1_pp1_fp8_no_model_parallel + - gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp1_pp1_bf16_baseline + - gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp1_pp1_fp8_no_model_parallel + - 
gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp1_pp2_fp8_pp + - gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_fp8_tp_pp + - gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_fp8_tp_pp_sp + - gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_native_fp8_tp_pp_sp + - gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp4_pp2_fp8_tp_pp diff --git a/tests/functional_tests/jet_recipes/multimodal-llava.yaml b/tests/functional_tests/jet_recipes/multimodal-llava.yaml new file mode 100644 index 0000000000..a2b6e6c3ff --- /dev/null +++ b/tests/functional_tests/jet_recipes/multimodal-llava.yaml @@ -0,0 +1,38 @@ +type: basic +format_version: 1 +maintainers: [mcore] +loggers: [stdout] +spec: + name: "{test_case}" + model: multimodal-llava + build: mcore-pyt-{environment} + nodes: 1 + gpus: 8 + platforms: dgx_a100 + time_limit: 12000 + scope: null + script: |- + ls + cd /opt/megatron-lm + + ARGUMENTS=( + "DATA_PATH='-'" + "DATA_CACHE_PATH='-'" + "OUTPUT_PATH={assets_dir}" + "TENSORBOARD_PATH={assets_dir}/tensorboard" + "CHECKPOINT_PATH=/workspace/checkpoints" + "TRAINING_SCRIPT_PATH=pretrain_vlm.py" + "TRAINING_PARAMS_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/model_config.yaml" + "GOLDEN_VALUES_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/golden_values_{environment}.json" + ) + + bash ./tests/functional_tests/shell_test_utils/run_ci_test.sh ${{ARGUMENTS[@]}} + +products: + - environment: [lts, dev] + scope: [mr] + test_case: + - multimodal_llava_mr_mcore_te_tp1_pp1_dgx_a100_1N8G + - multimodal_llava_mr_mcore_te_tp2_pp3_dgx_a100_1N8G + - multimodal_llava_mr_mcore_te_tp4_pp1_etp3_dgx_a100_1N7G + - multimodal_llava_mr_mcore_te_tp4_pp1_resume_torch_etp3_dgx_a100_1N7G diff --git a/tests/functional_tests/jet_recipes/t5.yaml b/tests/functional_tests/jet_recipes/t5.yaml new file mode 100644 index 0000000000..7d1f67337d --- /dev/null +++ b/tests/functional_tests/jet_recipes/t5.yaml @@ -0,0 +1,53 @@ +type: basic +format_version: 1 +maintainers: [mcore] +loggers: [stdout] +spec: + name: "{test_case}" + model: t5 + build: mcore-pyt-{environment} + nodes: 1 + gpus: 8 + platforms: dgx_a100 + artifacts: + /workspace/data/t5_data: text/the_pile/t5_shard00 + script: |- + ls + cd /opt/megatron-lm + + ARGUMENTS=( + "DATA_PATH=/workspace/data/t5_data" + "DATA_CACHE_PATH=/workspace/data/cache" + "OUTPUT_PATH={assets_dir}" + "TENSORBOARD_PATH={assets_dir}/tensorboard" + "CHECKPOINT_PATH=/workspace/checkpoints" + "TRAINING_SCRIPT_PATH=pretrain_t5.py" + "TRAINING_PARAMS_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/model_config.yaml" + "GOLDEN_VALUES_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/golden_values_{environment}.json" + ) + + bash ./tests/functional_tests/shell_test_utils/run_ci_test.sh ${{ARGUMENTS[@]}} + +products: + - environment: [lts, dev] + scope: [mr] + time_limit: [12000] + test_case: + - t5_220m_mr_mcore_te_tp4_pp1_dgx_a100_1N8G + - t5_220m_mr_mcore_te_tp4_pp1_resume_torch_dist_dgx_a100_1N8G + - t5_220m_mr_mcore_te_tp2_pp2_dgx_a100_1N8G + - t5_220m_mr_mcore_te_tp2_pp2_resume_torch_dgx_a100_1N8G + - t5_220m_mr_mcore_tp4_pp1_dgx_a100_1N8G + - t5_220m_mr_mcore_tp4_pp1_resume_torch_dist_dgx_a100_1N8G + - t5_220m_mr_mcore_tp2_pp2_resume_torch_dgx_a100_1N8G + - t5_220m_mr_mcore_tp2_pp2_dgx_a100_1N8G # flaky + - environment: [lts, dev] + scope: [weekly] + time_limit: [9000] + test_case: + - t5_220m_weekly_dgx_a100_1N8G_mcore_te_tp1_pp1_vp1_resume_torch + - t5_220m_weekly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1 + - t5_220m_weekly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1_sequence_parallel + - 
t5_220m_weekly_dgx_a100_1N8G_mcore_tp1_pp1_vp1 + - t5_220m_weekly_dgx_a100_1N8G_mcore_tp1_pp1_vp1_resume_torch + - t5_220m_weekly_dgx_a100_1N8G_mcore_tp2_pp1_vp1 diff --git a/tests/functional_tests/python_test_utils/check_slurm_job_completion.py b/tests/functional_tests/python_test_utils/check_slurm_job_completion.py deleted file mode 100644 index acd179a4ea..0000000000 --- a/tests/functional_tests/python_test_utils/check_slurm_job_completion.py +++ /dev/null @@ -1,19 +0,0 @@ -"""Check if a given slurm job id completed successfully - Usage: - python3 check_slurm_job_completion.py -""" - -import sys -import subprocess - - -cmd = f"sacct -j {sys.argv[1]}" -result = subprocess.check_output(cmd, shell=True).decode().split() -assert len(result) > 14, "JOB state not available." - -status = result[19] -exit_code = result[20] - -assert status == "COMPLETED", f"Job {sys.argv[1]} not completed." -assert exit_code == "0:0", f"Job {sys.argv[1]} did not exit successfully." - diff --git a/tests/functional_tests/python_test_utils/common.py b/tests/functional_tests/python_test_utils/common.py new file mode 100644 index 0000000000..3a9fd359a6 --- /dev/null +++ b/tests/functional_tests/python_test_utils/common.py @@ -0,0 +1,80 @@ +import enum +import glob +import json +import logging +import os + +from tensorboard.backend.event_processing import event_accumulator + +# By default TB tries to be smart about what to load in memory to avoid OOM +# Since we expect every step to be there when we do our comparisons, we explicitly +# set the size guidance to 0 so that we load everything. It's okay given our tests +# are small/short. +SIZE_GUIDANCE = {event_accumulator.TENSORS: 0, event_accumulator.SCALARS: 0} + +logger = logging.getLogger() + + +class TypeOfTest(enum.Enum): + APPROX = 1 + DETERMINISTIC = 2 + + +TYPE_OF_TEST_TO_METRIC = { + TypeOfTest.DETERMINISTIC: ["lm loss", "num-zeros"], + TypeOfTest.APPROX: ["lm loss", "iteration-time", "mem-allocated-bytes"], +} + +METRIC_TO_THRESHOLD = { + "iteration-time": 0.5, + "mem-allocated-bytes": 3 * 1000 * 1000, # 3MB + "lm loss": 0.05, +} + + +def read_tb_logs_as_list(path, index=0): + """Reads a TensorBoard Events file from the input path, and returns the + summary specified as input as a list. + + Args: + path: str, path to the dir where the events file is located. + summary_name: str, name of the summary to read from the TB logs. + + Returns: + summary_list: list, the values in the read summary list, formatted as a list. + """ + files = glob.glob(f"{path}/events*tfevents*") + files += glob.glob(f"{path}/results/events*tfevents*") + + summaries = {} + + if not files: + logger.info(f"File not found matching: {path}/events* || {path}/results/events*") + return summaries + + files.sort(key=lambda x: os.path.getmtime(os.path.join(path, x))) + + event_file = files[index] + ea = event_accumulator.EventAccumulator(event_file, size_guidance=SIZE_GUIDANCE) + ea.Reload() + + for scalar_name in ea.Tags()["scalars"]: + summaries[scalar_name] = [round(x.value, 5) for x in ea.Scalars(scalar_name)] + + print( + f"Extracted {len(summaries[scalar_name])} values of {scalar_name} from Tensorboard \ +logs. 
Here are the first 5 values: {summaries[scalar_name][:5]}" + ) + + return summaries + + +def load_expected_data(): + expected_metrics_file = os.getenv("EXPECTED_METRICS_FILE") + + if os.path.exists(expected_metrics_file): + with open(expected_metrics_file) as f: + return json.load(f) + else: + print(f"File {expected_metrics_file} not found!") diff --git a/tests/functional_tests/python_test_utils/get_test_results_from_tensorboard_logs.py b/tests/functional_tests/python_test_utils/get_test_results_from_tensorboard_logs.py index 362dabab78..3c0b67ed3a 100644 --- a/tests/functional_tests/python_test_utils/get_test_results_from_tensorboard_logs.py +++ b/tests/functional_tests/python_test_utils/get_test_results_from_tensorboard_logs.py @@ -1,73 +1,33 @@ import os -import sys -import json -import shutil -import glob -from tensorboard.backend.event_processing import event_accumulator - - -def read_tb_logs_as_list(path, summary_name): - """Reads a TensorBoard Events file from the input path, and returns the - summary specified as input as a list. - - Arguments: - path: str, path to the dir where the events file is located. - summary_name: str, name of the summary to read from the TB logs. - Output: - summary_list: list, the values in the read summary list, formatted as a list. - """ - files = glob.glob(f"{path}/events*tfevents*") - files += glob.glob(f"{path}/results/events*tfevents*") - files.sort(key=lambda x: os.path.getmtime(os.path.join(path, x))) - if files: - event_file = files[0] - ea = event_accumulator.EventAccumulator(event_file) - ea.Reload() - summary = ea.Scalars(summary_name) - summary_list = [round(x.value, 5) for x in summary] - print(f'\nObtained the following list for {summary_name} ------------------') - print(summary_list) - return summary_list - raise FileNotFoundError(f"File not found matching: {path}/events*") -def collect_train_test_metrics(logs_dir, run_name): - # TODO: Fetch current baseline +os.environ["OPENBLAS_NUM_THREADS"] = "1" import json - - # train loss - train_loss_list = read_tb_logs_as_list(logs_dir, "lm loss") +import click - - # num zeros - num_zeros = read_tb_logs_as_list(logs_dir, "num-zeros") +from tests.functional_tests.python_test_utils import common - - iteration_time = read_tb_logs_as_list(logs_dir, "iteration-time") - # First few iterations might take a little longer. 
So we take the last 70 percent of the timings - idx = len(iteration_time)//3 - iteration_time_avg = sum(iteration_time[idx:])/len(iteration_time[idx:]) +@click.command() +@click.option("--logs-dir", required=True, type=str, help="Path to Tensorboard logs") +@click.option("--output-path", required=False, type=str, help="Path to write golden values") +def collect_train_test_metrics(logs_dir: str, output_path: str): + summaries = common.read_tb_logs_as_list(logs_dir) train_metrics = { - "lm loss": { - "start_step": 0, - "end_step": len(train_loss_list), - "step_interval": 5, - "values": train_loss_list[0:len(train_loss_list):5], - }, - "num-zeros": { + metric_name: { "start_step": 0, - "end_step": len(num_zeros), + "end_step": len(metric_values), "step_interval": 5, - "values": num_zeros[0:len(num_zeros):5], - }, - "iteration_timing_avg": iteration_time_avg, + "values": metric_values[0 : len(metric_values) : 5], + } + for metric_name, metric_values in summaries.items() } - str_train_metrics = str(train_metrics).replace("'", "\"") - print(f"\n ----------- Store the following metrics in {run_name}.json ----------") - print(f"\n {str_train_metrics}", flush=True) -if __name__ == '__main__': - args = sys.argv[1:] - logs_dir = args[0] # eg /lustre/fsw/joc/shanmugamr/megatron/logs/ - run_name = args[1] - collect_train_test_metrics(logs_dir, run_name) + if output_path is not None: + with open(output_path, "w") as fh: + json.dump(train_metrics, fh) +if __name__ == "__main__": + collect_train_test_metrics() diff --git a/tests/functional_tests/python_test_utils/jet/common.py b/tests/functional_tests/python_test_utils/jet/common.py new file mode 100644 index 0000000000..eed22752c6 --- /dev/null +++ b/tests/functional_tests/python_test_utils/jet/common.py @@ -0,0 +1,174 @@ +import copy +import itertools +import pathlib +from typing import List, Optional + +import jetclient +import yaml + +BASE_PATH = pathlib.Path(__file__).parent.resolve() + + +def flatten_products( + workload_manifest: jetclient.JETWorkloadManifest, +) -> jetclient.JETWorkloadManifest: + """Flattens a nested dict of products""" + workload_manifest.products = [ + dict(zip(inp.keys(), values)) + for inp in workload_manifest.products + for values in itertools.product(*inp.values()) + ] + + return workload_manifest + + +def flatten_workload( + workload_manifest: jetclient.JETWorkloadManifest, +) -> List[jetclient.JETWorkloadManifest]: + """Flattens a workload with products into a list of workloads that don't have products.""" + workload_manifest = dict(workload_manifest) + products = workload_manifest.pop("products") + workload_manifests = [] + for product in products: + workload = copy.deepcopy(workload_manifest) + workload['spec'] = {k: v for k, v in workload['spec'] if k not in product.keys()} + workload['spec'] = dict(**dict(workload['spec']), **product) + workload_manifests.append(jetclient.JETWorkloadManifest(**workload)) + return workload_manifests + + +def set_build_dependency( + workload_manifests: List[jetclient.JETWorkloadManifest], +) -> List[jetclient.JETWorkloadManifest]: + for workload_manifest in workload_manifests: + workload_manifest.spec.build = workload_manifest.spec.build.format( + **dict(workload_manifest.spec) + ) + return workload_manifests + + +def load_config(config_path: str) -> jetclient.JETWorkloadManifest: + """Loads and parses a yaml file into a JETWorkloadManifest""" + with open(config_path) as stream: + try: + return jetclient.JETWorkloadManifest(**yaml.safe_load(stream)) + except yaml.YAMLError as exc: + 
raise exc + + +def load_and_flatten(config_path: str) -> List[jetclient.JETWorkloadManifest]: + """Wrapper function for doing all the fun at once.""" + return set_build_dependency( + flatten_workload(flatten_products(load_config(config_path=config_path))) + ) + + +def filter_by_test_case( + workload_manifests: List[jetclient.JETWorkloadManifest], test_case: str +) -> jetclient.JETWorkloadManifest: + """Returns a workload with matching name. Raises an error if there no or more than a single workload.""" + workload_manifests = list( + workload_manifest + for workload_manifest in workload_manifests + if workload_manifest.spec.test_case == test_case + ) + + if len(workload_manifests) > 1: + raise ValueError("Duplicate test_case found!") + + if len(workload_manifests) == 0: + raise ValueError("No test_case found!") + + return workload_manifests[0] + + +def filter_by_scope( + workload_manifests: List[jetclient.JETWorkloadManifest], scope: str +) -> List[jetclient.JETWorkloadManifest]: + """Returns all workload with matching scope.""" + workload_manifests = list( + workload_manifest + for workload_manifest in workload_manifests + if workload_manifest.spec.scope == scope + ) + + if len(workload_manifests) == 0: + raise ValueError("No test_case found!") + + return workload_manifests + + +def filter_by_environment( + workload_manifests: List[jetclient.JETWorkloadManifest], environment: str +) -> List[jetclient.JETWorkloadManifest]: + workload_manifests = list( + workload_manifest + for workload_manifest in workload_manifests + if ( + hasattr(workload_manifest.spec, "environment") + and workload_manifest.spec.environment == environment + ) + ) + + if len(workload_manifests) == 0: + raise ValueError("No test_case found!") + + return workload_manifests + + +def filter_by_model( + workload_manifests: List[jetclient.JETWorkloadManifest], model: str +) -> List[jetclient.JETWorkloadManifest]: + """Returns all workload with matching model.""" + workload_manifests = list( + workload_manifest + for workload_manifest in workload_manifests + if workload_manifest.spec.model == model + ) + + if len(workload_manifests) == 0: + raise ValueError("No test_case found!") + + return workload_manifests + + +def load_workloads( + container_tag: str, + environment: Optional[str] = None, + scope: Optional[str] = None, + model: Optional[str] = None, + test_case: Optional[str] = None, + container_image: Optional[str] = None, +) -> List[jetclient.JETWorkloadManifest]: + """Return all workloads from disk that match scope and platform.""" + recipes_dir = BASE_PATH / ".." / ".." / "jet_recipes" + local_dir = BASE_PATH / ".." / ".." 
/ "local_recipes" + + workloads: List[jetclient.JETWorkloadManifest] = [] + build_workloads: List[jetclient.JETClient] = [] + for file in list(recipes_dir.glob("*.yaml")) + list(local_dir.glob("*.yaml")): + workloads += load_and_flatten(config_path=file) + if file.stem.startswith("_build"): + build_workloads.append(load_config(config_path=file)) + + if scope: + workloads = filter_by_scope(workload_manifests=workloads, scope=scope) + + if environment: + workloads = filter_by_environment(workload_manifests=workloads, environment=environment) + + if model: + workloads = filter_by_model(workload_manifests=workloads, model=model) + + if test_case: + workloads = [filter_by_test_case(workload_manifests=workloads, test_case=test_case)] + + for workload in list(workloads): + for build_workload in build_workloads: + if ( + workload.spec.build == build_workload.spec.name + ) and build_workload not in workloads: + container_image = container_image or build_workload.spec.source.image + build_workload.spec.source.image = f"{container_image}:{container_tag}" + workloads.append(build_workload) + return workloads diff --git a/tests/functional_tests/python_test_utils/jet/generate_jet_trigger_job.py b/tests/functional_tests/python_test_utils/jet/generate_jet_trigger_job.py new file mode 100644 index 0000000000..3922de3f86 --- /dev/null +++ b/tests/functional_tests/python_test_utils/jet/generate_jet_trigger_job.py @@ -0,0 +1,97 @@ +import pathlib +from typing import Optional + +import click +import yaml + +from tests.functional_tests.python_test_utils.jet import common + +BASE_PATH = pathlib.Path(__file__).parent.resolve() + + +@click.command() +@click.option("--scope", required=True, type=str, help="Test scope") +@click.option("--environment", required=True, type=str, help="LTS or dev features") +@click.option("--a100-cluster", required=True, type=str, help="A100 Cluster to run on") +@click.option("--h100-cluster", required=True, type=str, help="H100 Cluster to run on") +@click.option("--output-path", required=True, type=str, help="Path to write GitLab job to") +@click.option("--container-image", required=True, type=str, help="LTS Container tag to use") +@click.option("--container-tag", required=True, type=str, help="Container tag to use") +@click.option( + "--run-name", required=False, type=str, help="Run name (only relevant for release tests)" +) +@click.option( + "--wandb-experiment", + required=False, + type=str, + help="Wandb experiment (only relevant for release tests)", +) +def main( + scope: str, + environment: str, + a100_cluster: str, + h100_cluster: str, + output_path: str, + container_image: str, + container_tag: str, + run_name: Optional[str] = None, + wandb_experiment: Optional[str] = None, +): + test_cases = [ + test_case + for test_case in common.load_workloads( + scope=scope, container_tag=container_tag, environment=environment + ) + if test_case.type != "build" + ] + + gitlab_pipeline = { + "stages": list(set([test_case.spec.model for test_case in test_cases])), + "default": {"interruptible": True}, + } + + for test_case in test_cases: + if test_case.spec.platforms == "dgx_a100": + cluster = a100_cluster + elif test_case.spec.platforms == "dgx_h100": + cluster = h100_cluster + else: + raise ValueError(f"Platform {test_case.spec.platforms} unknown") + + script = [ + "export PYTHONPATH=$(pwd); " + "python tests/functional_tests/python_test_utils/jet/launch_jet_workload.py", + f"--model {test_case.spec.model}", + f"--environment {test_case.spec.environment}", + f"--test-case 
{test_case.spec.test_case}", + f"--container-tag {container_tag}", + f"--cluster {cluster}", + ] + + if run_name is not None and wandb_experiment is not None: + script.append(f"--run-name {run_name}") + script.append( + f"--wandb-experiment {wandb_experiment}-{test_case.spec.model}-{test_case.spec.test_case}" + ) + + gitlab_pipeline[test_case.spec.test_case] = { + "stage": f"{test_case.spec.model}", + "image": f"{container_image}:{container_tag}", + "tags": ["mcore-docker-node-jet"], + "rules": [ + {"if": '$CI_PIPELINE_SOURCE == "parent_pipeline"'}, + {"if": '$CI_MERGE_REQUEST_ID'}, + ], + "timeout": "7 days", + "needs": [{"pipeline": '$PARENT_PIPELINE_ID', "job": "functional:configure"}], + "script": [" ".join(script)], + "artifacts": {"paths": ["results/"], "when": "always"}, + } + + with open(output_path, 'w') as outfile: + yaml.dump(gitlab_pipeline, outfile, default_flow_style=False) + + +if __name__ == "__main__": + main() diff --git a/tests/functional_tests/python_test_utils/jet/generate_local_jobs.py b/tests/functional_tests/python_test_utils/jet/generate_local_jobs.py new file mode 100644 index 0000000000..bc9ad22302 --- /dev/null +++ b/tests/functional_tests/python_test_utils/jet/generate_local_jobs.py @@ -0,0 +1,62 @@ +"""Generate launch scripts for local execution. + +This script generates pre-filled launch scripts that allow for local execution of Megatron-LM functional tests inside containerized environments (i.e. Slurm enroot or Docker). + +This script will generate scripts into `$(pwd)/test_cases`. +""" + +import pathlib +from typing import Optional + +import click +import jetclient +import yaml + +from tests.functional_tests.python_test_utils.jet import common + + +def load_script(config_path: str) -> str: + with open(config_path) as stream: + try: + return jetclient.JETWorkloadManifest(**yaml.safe_load(stream)).spec.script + except yaml.YAMLError as exc: + raise exc + + +@click.command() +@click.option("--model", required=False, type=str, help="Filters all tests by matching model") +@click.option("--scope", required=False, type=str, help="Filters all tests by matching scope") +@click.option( + "--test-case", required=False, type=str, help="Returns a single test-case with matching name." 
+) +@click.option( + "--output-path", + required=True, + type=str, + help="Directory where the functional test will write its artifacts to (Tensorboard logs)", + default="/opt/megatron-lm", +) +def main(model: Optional[str], scope: Optional[str], test_case: Optional[str], output_path: str): + workloads = common.load_workloads( + container_image='none', scope=scope, model=model, test_case=test_case, container_tag='none' + ) + + for workload in workloads: + if workload.type == "build": + continue + magic_values = dict(workload.spec) + magic_values["assets_dir"] = output_path + + file_path = ( + pathlib.Path.cwd() + / "test_cases" + / workload.spec.model + / f"{workload.spec.test_case}.sh" + ) + file_path.parent.mkdir(parents=True, exist_ok=True) + with open(file_path, "w", encoding="utf-8") as fh: + fh.write(workload.spec.script.format(**magic_values)) + + +if __name__ == "__main__": + main() diff --git a/tests/functional_tests/python_test_utils/jet/launch_jet_workload.py b/tests/functional_tests/python_test_utils/jet/launch_jet_workload.py new file mode 100644 index 0000000000..ebedea411e --- /dev/null +++ b/tests/functional_tests/python_test_utils/jet/launch_jet_workload.py @@ -0,0 +1,225 @@ +import os +import pathlib +import re +import signal +import sys +import tempfile +from typing import List, Optional, Tuple + +import click +import jetclient +import yaml +from jetclient.services.dtos.pipeline import PipelineStatus + +from tests.functional_tests.python_test_utils.jet import common + +BASE_PATH = pathlib.Path(__file__).parent.resolve() + + +def resolve_cluster_config(cluster: str) -> str: + if cluster == "dgxh100_eos": + return "mcore/eos" + if cluster == "dgxa100_dracooci": + return "mcore/draco-oci" + if cluster == "dgxa100_dracooci-ord": + return "mcore/draco-oci-ord" + if cluster == "dgxh100_coreweave": + return "mcore/coreweave" + raise ValueError(f"Unknown cluster {cluster} provided.") + + +def register_pipeline_terminator(pipeline: jetclient.JETPipeline): + def sigterm_handler(_signo, _stack_frame): + print(f"Trying to terminate pipeline {pipeline.jet_id}") + pipeline.cancel() + print(f"Pipeline {pipeline.jet_id} terminated") + sys.exit(0) + + signal.signal(signal.SIGINT, sigterm_handler) + signal.signal(signal.SIGTERM, sigterm_handler) + + +def launch_and_wait_for_completion( + test_case: str, + environment: str, + container_image: str, + container_tag: str, + cluster: str, + account: str, + run_name: Optional[str], + wandb_experiment: Optional[str], +) -> jetclient.JETPipeline: + pipeline = jetclient.JETClient( + customer='mcore', gitlab_ci_token=os.getenv("RO_API_TOKEN"), env="prod" + ).workloads.submit( + workloads=common.load_workloads( + test_case=test_case, + container_image=container_image, + container_tag=container_tag, + environment=environment, + ), + config_id=resolve_cluster_config(cluster), + custom_config={ + "launchers": {cluster: {"account": account}}, + "executors": { + "jet-ci": { + "environments": { + cluster: { + "variables": { + "RUN_NAME": run_name or "", + "WANDB_API_KEY": os.getenv("WANDB_API_KEY") or "", + "WANDB_EXPERIMENT": wandb_experiment or "", + } + } + } + } + }, + }, + wait_for_validation=True, + ) + + register_pipeline_terminator(pipeline=pipeline) + + print( + f"Pipeline triggered; inspect it here: https://gitlab-master.nvidia.com/dl/jet/ci/-/pipelines/{pipeline.jet_id}", + flush=True, + ) + + pipeline.wait(max_wait_time=60 * 60 * 24 * 7) + print(f"Pipeline terminated; status: {pipeline.get_status()}") + return pipeline + + +def 
download_job_assets(job: jetclient.JETJob, iteration: int = 0) -> List[str]: + logs = job.get_logs() + if not logs: + return [""] + + assets_base_path = BASE_PATH / ".." / ".." / ".." / ".." / "results" / f"iteration={iteration}" + + for restart_idx, log in enumerate(logs): + assets = log.get_assets() + assets_path = assets_base_path / f"restart={restart_idx}" + assets_path.mkdir(parents=True, exist_ok=True) + for log_filename in assets.keys(): + with open(assets_path / log_filename, "w") as fh: + assets[log_filename].download(pathlib.Path(fh.name)) + + +def download_job_logs(job: jetclient.JETJob) -> List[str]: + logs = job.get_logs() + if not logs: + return [""] + + assets = logs[0].get_assets() + log_filename = [key for key in assets.keys() if key.endswith(".log")][0] + + with tempfile.NamedTemporaryFile() as tmp_file: + assets[log_filename].download(pathlib.Path(tmp_file.name)) + with open(pathlib.Path(tmp_file.name), "r") as fh: + return fh.readlines() + + +def parse_iterations_from_logs(logs: List[str]) -> Optional[Tuple[int, int]]: + for log_row in logs[::-1]: + match = re.search(r"iteration\s+(\d+)\s*/\s*(\d+)", log_row) + if match is not None: + return int(match.group(1)), int(match.group(2)) + + +@click.command() +@click.option("--model", required=True, type=str, help="Model") +@click.option("--test-case", required=True, type=str, help="Test case") +@click.option( + "--environment", required=True, type=click.Choice(['dev', 'lts']), help="Pytorch LTS or DEV" +) +@click.option( + "--account", + required=False, + type=str, + help="Slurm account to use", + default="coreai_dlalgo_mcore", +) +@click.option("--cluster", required=True, type=str, help="Cluster to run on") +@click.option("--container-tag", required=True, type=str, help="Base image of Mcore image") +@click.option("--container-image", required=False, type=str, help="Base image of Mcore image") +@click.option( + "--run-name", required=False, type=str, help="Run name (only relevant for release tests)" +) +@click.option( + "--wandb-experiment", + required=False, + type=str, + help="Wandb experiment (only relevant for release tests)", +) +def main( + model: str, + test_case: str, + environment: str, + account: str, + cluster: str, + container_tag: str, + container_image: Optional[str] = None, + run_name: Optional[str] = None, + wandb_experiment: Optional[str] = None, +): + + with open( + pathlib.Path( + BASE_PATH / ".." / ".." 
/ "test_cases" / model / test_case / "model_config.yaml" + ) + ) as stream: + try: + test_case_dict = yaml.safe_load(stream) + except yaml.YAMLError as exc: + print(exc) + + test_type = test_case_dict['TEST_TYPE'] + + if test_type == "release" and (run_name is None or wandb_experiment is None): + print(f"Not all arguments provided ({run_name=}, {wandb_experiment=})") + sys.exit(1) + + n_attempts = 0 + n_iteration = 0 + while True and n_attempts < 3: + pipeline = launch_and_wait_for_completion( + test_case=test_case, + environment=environment, + container_image=container_image, + container_tag=container_tag, + cluster=cluster, + account=account, + run_name=run_name, + wandb_experiment=wandb_experiment, + ) + + main_job = [job for job in pipeline.get_jobs() if job.name.startswith("basic")][0] + + logs = download_job_logs(job=main_job) + concat_logs = "\n".join(logs) + print(f"Logs:\n{concat_logs}") + + download_job_assets(job=main_job, iteration=n_iteration) + + if test_type != "release": + success = pipeline.get_status() == PipelineStatus.SUCCESS + sys.exit(int(not success)) # invert for exit 0 + + parsed_result = parse_iterations_from_logs(logs=logs) + if not parsed_result: + print("Weird log, no iterations found") + n_attempts += 1 + continue + + current_iteration, total_iterations = parsed_result + if current_iteration == total_iterations: + + success = pipeline.get_status() == PipelineStatus.SUCCESS + sys.exit(int(not success)) # invert for exit 0 + n_iteration += 1 + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/tests/functional_tests/python_test_utils/test_ci_pipeline.py b/tests/functional_tests/python_test_utils/test_ci_pipeline.py index 829ebeec41..90662485d9 100644 --- a/tests/functional_tests/python_test_utils/test_ci_pipeline.py +++ b/tests/functional_tests/python_test_utils/test_ci_pipeline.py @@ -1,87 +1,96 @@ import os -import json +from typing import List, Union + +import numpy as np import pytest -import sys -import glob -from tensorboard.backend.event_processing import event_accumulator - -LOGS_DIR = os.getenv('LOGS_DIR') -EXPECTED_METRICS_FILE = os.getenv('EXPECTED_METRICS_FILE') - -import enum - -class TypeOfTest(enum.Enum): - APPROX = 1 - DETERMINISTIC = 2 - - -def read_tb_logs_as_list(path, summary_name): - """Reads a TensorBoard Events file from the input path, and returns the - summary specified as input as a list. - - Arguments: - path: str, path to the dir where the events file is located. - summary_name: str, name of the summary to read from the TB logs. - Output: - summary_list: list, the values in the read summary list, formatted as a list. - """ - files = glob.glob(f"{path}/events*tfevents*") - files += glob.glob(f"{path}/results/events*tfevents*") - files.sort(key=lambda x: os.path.getmtime(os.path.join(path, x))) - if files: - event_file = files[0] - ea = event_accumulator.EventAccumulator(event_file) - ea.Reload() - summary = ea.Scalars(summary_name) - summary_list = [round(x.value, 5) for x in summary] - print(f'\nObtained the following list for {summary_name} ------------------') - print(summary_list) - return summary_list - raise FileNotFoundError(f"File not found matching: {path}/events*") + +from .common import ( + METRIC_TO_THRESHOLD, + TYPE_OF_TEST_TO_METRIC, + TypeOfTest, + load_expected_data, + read_tb_logs_as_list, +) + + +@pytest.fixture(params=load_expected_data().items()) +def expected_data(request): + return request.param # If we require a variation of tests for any of the other pipelines we can just inherit this class. 
class TestCIPipeline: + allow_nondeterministic = bool(int(os.getenv("NVTE_ALLOW_NONDETERMINISTIC_ALGO"))) + logs_dir = os.getenv("LOGS_DIR") + + # Replace symbol in namespace to fix function call result for lifetime of + # this class. + + def _test_helper(self, metric_type: str, metric_dict: List[Union[int, float]], test_type): + expected_list = metric_dict['values'] + print(f"The list of expected values: {expected_list} for metric {metric_type}") + + try: + actual_list = read_tb_logs_as_list(self.logs_dir)[metric_type] + except KeyError as e: + raise KeyError( + f"Required metric {metric_type} not found in TB logs. Please make sure your model \ +exports this metric as its required by the test case/golden values file" + ) from e + + if actual_list is None: + raise ValueError(f"No values of {metric_type} found in TB logs.") + + actual_list_sliced = actual_list[ + metric_dict["start_step"] : metric_dict["end_step"] : metric_dict["step_interval"] + ] + print(f"The list of actual values: {actual_list_sliced}") + + if metric_type == "iteration-time": + actual_list_sliced = actual_list_sliced[3:] + expected_list = expected_list[3:] + print("Removing first items of values for metric_type iteration-time") + + if test_type == TypeOfTest.DETERMINISTIC: + assert np.allclose( + actual_list_sliced, expected_list, rtol=0, atol=0 + ), f"Actual is not equal to Expected for {metric_type}" + elif test_type == TypeOfTest.APPROX: + assert np.allclose( + actual_list_sliced, expected_list, rtol=1e-5, atol=METRIC_TO_THRESHOLD[metric_type] + ), f"Actual is not equal to Expected for {metric_type}" + else: + raise ValueError(f"Unexpected test_type {test_type} provided") + + def test_approx(self, expected_data): + expected_metric, expected_values = expected_data + + if expected_metric in TYPE_OF_TEST_TO_METRIC[TypeOfTest.APPROX]: + self._test_helper(expected_metric, expected_values, TypeOfTest.APPROX) + else: + print(f"Skipping metric {expected_metric} for approximate as it is deterministic only.") + + @pytest.mark.skipif(allow_nondeterministic, reason="Cannot expect exact results") + def test_deterministic(self, expected_data): + expected_metric, expected_values = expected_data + + if expected_metric in TYPE_OF_TEST_TO_METRIC[TypeOfTest.DETERMINISTIC]: + self._test_helper(expected_metric, expected_values, TypeOfTest.DETERMINISTIC) + else: + print(f"Skipping metric {expected_metric} for deterministic as it is approximate only.") + + # # @TODO: This is inactive, do we want to activate it? + # def iteration_timing_node(self): + # expected_iteration_timing_avg = self.expected["train_step_timing_avg"] + # iteration_time = read_tb_logs_as_list(LOGS_DIR)["iteration-time"] + # idx = len(iteration_time) // 3 + # iteration_time_avg = sum(iteration_time[idx:]) / len(iteration_time[idx:]) + # assert ( + # expected_iteration_timing_avg + # == pytest.approx(expected=iteration_time_avg, rel=self.margin_time) + # ), f"The time per global step must be approximately {expected_iteration_timing_avg} but " + # "it is {iteration_time_avg}." 
+ - margin_loss, margin_time = 0.05, 0.1 - expected = None - if os.path.exists(EXPECTED_METRICS_FILE): - with open(EXPECTED_METRICS_FILE) as f: - expected = json.load(f) - - def _test_helper(self, loss_type, test_type): - if self.expected is None: - raise FileNotFoundError("Expected data is none") - expected = self.expected[loss_type] - expected_list = expected["values"] - print(expected_list) - actual_list = read_tb_logs_as_list(LOGS_DIR, loss_type) - assert actual_list is not None, f"No TensorBoard events file was found in the logs for {loss_type}." - actual_list_sliced = actual_list[expected["start_step"]:expected["end_step"]:expected["step_interval"]] - for i, (expected_val, actual_val) in enumerate(zip(expected_list, actual_list_sliced)): - step = i * expected["step_interval"] - print(f"Checking step {step} against expected {i}") - if test_type == TypeOfTest.APPROX: - assert actual_val == pytest.approx(expected=expected_val, rel=self.margin_loss), f"{self.job_name} : The loss at step {step} should be approximately {expected_val} but it is {actual_val}." - else: - assert actual_val == expected_val, f"The value at step {step} should be {expected_val} but it is {actual_val}." - - @pytest.mark.xfail - def test_lm_loss_deterministic(self): - # Expected training loss curve at different global steps. - self._test_helper("lm loss", TypeOfTest.DETERMINISTIC) - - def test_lm_loss_approx(self): - # Expected training loss curve at different global steps. - self._test_helper("lm loss", TypeOfTest.APPROX) - - def test_num_zeros_deterministic(self): - # Expected validation loss curve at different global steps. - self._test_helper("num-zeros", TypeOfTest.DETERMINISTIC) - - def iteration_timing_node(self): - expected_iteration_timing_avg = self.expected["train_step_timing_avg"] - iteration_time = read_tb_logs_as_list(LOGS_DIR, "iteration-time") - idx = len(iteration_time)//3 - iteration_time_avg = sum(iteration_time[idx:])/len(iteration_time[idx:]) - assert expected_iteration_timing_avg == pytest.approx(expected=iteration_time_avg, rel=self.margin_time), f"The time per global step must be approximately {expected_iteration_timing_avg} but it is {iteration_time_avg}." +# if deterministic, then also approx +# if not determinstic, then also aprox diff --git a/tests/functional_tests/python_test_utils/test_fp8_ci_pipeline.py b/tests/functional_tests/python_test_utils/test_fp8_ci_pipeline.py new file mode 100644 index 0000000000..b6a9b61ec9 --- /dev/null +++ b/tests/functional_tests/python_test_utils/test_fp8_ci_pipeline.py @@ -0,0 +1,113 @@ +import json +import os + +import numpy as np +import pytest +import scipy.stats as ss +from scipy.integrate import trapezoid + +from .common import read_tb_logs_as_list + +LOGS_DIR = os.getenv("LOGS_DIR") +EXPECTED_METRICS_FILE = os.getenv("EXPECTED_METRICS_FILE") + + +# If we require a variation of tests for any of the other pipelines we can just inherit this class. +class TestFP8CIPipeline: + margin_loss, margin_time = 0.2, 0.1 + auc_threshold, correlation_threshold = 0.01, 0.999 + expected = None + + def _setup(self): + if os.path.exists(EXPECTED_METRICS_FILE): + with open(EXPECTED_METRICS_FILE) as f: + self.expected = json.load(f) + if self.expected is None: + raise FileNotFoundError("Expected data is none") + + def _get_actual(self, loss_type): + actual_list = read_tb_logs_as_list(LOGS_DIR)[loss_type] + assert ( + actual_list is not None + ), f"No TensorBoard events file was found in the logs for {loss_type}." 
+ return actual_list + + def _margin_test_helper(self, loss_type): + expected = self.expected[loss_type] + expected_list = np.array(expected["values"]) + actual_list = self._get_actual(loss_type) + actual_list_sliced = np.array( + actual_list[expected["start_step"] : expected["end_step"] : expected["step_interval"]] + ) + + max_diff_index = np.argmax(np.abs(actual_list_sliced - expected_list)) + max_diff = np.abs(actual_list_sliced[max_diff_index] - expected_list[max_diff_index]) + + print( + "[INFO - margin]: " + f"maximum absolute difference for {loss_type} is {max_diff} at index {max_diff_index}, " + f"Actual: {actual_list_sliced[max_diff_index]}, " + f"Expected: {expected_list[max_diff_index]}" + ) + assert np.allclose( + actual_list_sliced, expected_list, rtol=1e-5, atol=self.margin_loss + ), f"Actual is not equal to Expected for {loss_type}" + + def _auc_test_helper(self, loss_type): + expected = self.expected[loss_type] + expected_list = np.array(expected["values"]) + actual_list = self._get_actual(loss_type) + actual_list_sliced = np.array( + actual_list[expected["start_step"] : expected["end_step"] : expected["step_interval"]] + ) + + def compute_auc(y_values): + x_values = np.arange(0, len(y_values), 1) + area = trapezoid(y_values, x_values) + return round(area, 5) + + baseline_area = compute_auc(expected_list) + current_area = compute_auc(actual_list_sliced) + diff = abs(baseline_area - current_area) + + print( + f"[INFO - AUC]: AUC diff: {diff * 100 / baseline_area} %, current: {current_area}, " + f"baseline: {baseline_area}" + ) + assert (baseline_area <= 0) or (diff <= self.auc_threshold * baseline_area) + + def _correlation_test_helper(self, loss_type): + expected = self.expected[loss_type] + expected_list = np.array(expected["values"]) + actual_list = self._get_actual(loss_type) + actual_list_sliced = np.array( + actual_list[expected["start_step"] : expected["end_step"] : expected["step_interval"]] + ) + corr = ss.pearsonr(actual_list_sliced, expected_list).statistic + + print(f"[INFO - Corr]: Corr: {corr}") + assert corr > self.correlation_threshold + + @pytest.mark.xfail + def test_lm_loss_margin(self): + self._setup() + self._margin_test_helper("lm loss") + + def test_lm_loss_auc(self): + self._setup() + self._auc_test_helper("lm loss") + + @pytest.mark.xfail + def test_lm_loss_correlation(self): + self._setup() + self._correlation_test_helper("lm loss") + + def iteration_timing_node(self): + expected_iteration_timing_avg = self.expected["train_step_timing_avg"] + iteration_time = read_tb_logs_as_list(LOGS_DIR)["iteration-time"] + idx = len(iteration_time) // 3 + iteration_time_avg = sum(iteration_time[idx:]) / len(iteration_time[idx:]) + assert expected_iteration_timing_avg == pytest.approx( + expected=iteration_time_avg, rel=self.margin_time + ), f"The time per global step must be approximately {expected_iteration_timing_avg} but it \ +is {iteration_time_avg}." 
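The FP8 pipeline judges the loss curve by two curve-level statistics in addition to the pointwise margin: the area under the curve via scipy's trapezoid rule, whose difference must stay within a fraction of the baseline area, and the Pearson correlation between the actual and expected series. A short sketch of both checks, using made-up loss curves rather than golden values:

# Sketch of the curve-level checks used for FP8 runs: AUC difference within 1%
# of the baseline area, and Pearson correlation above 0.999.
import numpy as np
import scipy.stats as ss
from scipy.integrate import trapezoid

AUC_THRESHOLD, CORRELATION_THRESHOLD = 0.01, 0.999

expected = np.array([10.8, 10.5, 10.1, 9.7, 9.4, 9.2, 9.0, 8.9, 8.8, 8.7])
actual = expected + np.linspace(0.02, -0.02, expected.size)  # small, smooth drift

# AUC check: integrate each curve over the step axis and compare areas.
steps = np.arange(expected.size)
baseline_area = trapezoid(expected, steps)
current_area = trapezoid(actual, steps)
assert abs(baseline_area - current_area) <= AUC_THRESHOLD * baseline_area

# Correlation check: the two curves must move together step by step.
corr = ss.pearsonr(actual, expected).statistic
assert corr > CORRELATION_THRESHOLD
print(f"AUC diff: {abs(baseline_area - current_area):.4f}, corr: {corr:.5f}")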
diff --git a/tests/functional_tests/python_test_utils/test_resume_checkpoint_pipeline.py b/tests/functional_tests/python_test_utils/test_resume_checkpoint_pipeline.py index 5d3e69d123..61955e8f42 100644 --- a/tests/functional_tests/python_test_utils/test_resume_checkpoint_pipeline.py +++ b/tests/functional_tests/python_test_utils/test_resume_checkpoint_pipeline.py @@ -1,55 +1,63 @@ import os -import sys -import json -import shutil -import glob -from tensorboard.backend.event_processing import event_accumulator - -LOGS_DIR = os.getenv('LOGS_DIR') - -def read_tb_logs_as_list(path, summary_name, index): - files = glob.glob(f"{path}/events*tfevents*") - files += glob.glob(f"{path}/results/events*tfevents*") - files.sort(key=lambda x: os.path.getmtime(os.path.join(path, x))) - if files: - event_file = files[index] - ea = event_accumulator.EventAccumulator(event_file) - ea.Reload() - summary = ea.Scalars(summary_name) - summary_list = [round(x.value, 5) for x in summary] - print(summary_list) - return summary_list - raise FileNotFoundError(f"File not found matching: {path}/events*") + +os.environ["OPENBLAS_NUM_THREADS"] = "1" +import pytest + +from tests.functional_tests.python_test_utils.common import TypeOfTest, read_tb_logs_as_list + +LOGS_DIR = os.getenv("LOGS_DIR") +ALLOW_NONDETERMINISTIC = os.getenv("NVTE_ALLOW_NONDETERMINISTIC_ALGO") +STEP_INTERVAL = 5 + def collect_train_test_metrics(logs_dir, index): - train_loss_list = read_tb_logs_as_list(logs_dir, "lm loss", index) - train_loss_list = [round(elem,3) for elem in train_loss_list] - train_metrics = { - "lm loss": train_loss_list[0:len(train_loss_list):5], - } - str_train_metrics = str(train_metrics).replace("'", "\"") - print(f"\n ----------- The following are the metrics for ----------") + train_loss_list = read_tb_logs_as_list(logs_dir, index)["lm loss"] + train_loss_list = [round(elem, 3) for elem in train_loss_list] + train_metrics = {"lm loss": train_loss_list[0 : len(train_loss_list) : STEP_INTERVAL]} + str_train_metrics = str(train_metrics).replace("'", '"') + print("\n ----------- The following are the metrics for ----------") print(f"\n {str_train_metrics}", flush=True) return train_metrics -class TestCIPipeline: +class TestCIPipeline: + margin_loss = 0.005 + allow_nondeterministic = bool(int(ALLOW_NONDETERMINISTIC)) train_metrics_100 = collect_train_test_metrics(LOGS_DIR, 0) train_metrics_50_to_100 = collect_train_test_metrics(LOGS_DIR, 1) - def _test_helper(self, loss_type): + def _test_helper(self, loss_type, test_type): expected = self.train_metrics_100[loss_type] - print('expected : ' + str(expected)) + assert ( + len(expected) == 100 // STEP_INTERVAL + ), "Train metrics from first run (before checkpoint load) should \ +have {100 // STEP_INTERVAL} elements" + print("expected : " + str(expected)) actual = self.train_metrics_50_to_100[loss_type] - print('actual : ' + str(actual)) - # NOTE : Doing this way because in gpt3 model when I run from 0 - 100 directly, it produces 1 extra element - # i.e expected is [10.84266, 10.89696, 10.90542, 10.87498, 10.86265, 10.83608, 10.64368, 10.62319, 10.53908, 10.25005, 10.20907, 9.96542, 9.96802, 9.92436, 9.79086, 9.26718, 9.61784, 9.19018, 9.45986, 9.62168, 9.73772, 8.85732, 9.43185, 9.27912, 9.6832, 9.5127, 9.5419, 9.02549, 8.55077, 8.91355, 8.83375, 9.17722, 9.22436, 9.19436, 9.11323, 9.09711, 9.04421, 9.36795] - # actual is : [9.73772, 8.85732, 9.43185, 9.27912, 9.6832, 9.5127, 9.5419, 9.02549, 8.55077, 8.91355, 8.83375, 9.17722, 9.22435, 9.19435, 9.11322, 9.09711, 9.04422] - # That 
extra element in expected is causing some issues. So doing it this way. Need to figure out whats happening - start_idx_expected = expected.index(actual[0]) # First element of actual + assert ( + len(actual) == 50 // STEP_INTERVAL + ), "Train metrics from second run (after checkpoint load) should have \ +{50 // STEP_INTERVAL} elements" + print("actual : " + str(actual)) + start_idx_expected = len(expected) - len(actual) + print("start_idx_expected:", start_idx_expected) # Here we will just be comparing values of actual and second half (50-100) of expected - for i in range(len(actual)): - assert actual[i] == expected[start_idx_expected + i], f"The value at step {i} should be {expected[start_idx_expected + i]} but it is {actual[i]}." + for i, (expected_val, actual_val) in enumerate(zip(expected[start_idx_expected:], actual)): + step = start_idx_expected + i * STEP_INTERVAL + if test_type == TypeOfTest.APPROX: + assert actual_val == pytest.approx( + expected=expected_val, rel=self.margin_loss + ), f"The loss at step {step} should be approximately {expected_val} but it is \ +{actual_val}." + else: + assert ( + actual_val == expected_val + ), f"The value at step {step} should be {expected_val} but it is {actual_val}." + @pytest.mark.skipif(allow_nondeterministic, reason="Nondeterministic is allowed.") def test_lm_loss_deterministic(self): - self._test_helper("lm loss") \ No newline at end of file + self._test_helper("lm loss", TypeOfTest.DETERMINISTIC) + + @pytest.mark.skipif(not allow_nondeterministic, reason="Nondeterministic is not allowed.") + def test_lm_loss_nondeterministic(self): + self._test_helper("lm loss", TypeOfTest.APPROX) diff --git a/tests/functional_tests/shell_test_utils/_run_training.sh b/tests/functional_tests/shell_test_utils/_run_training.sh new file mode 100644 index 0000000000..847f93613e --- /dev/null +++ b/tests/functional_tests/shell_test_utils/_run_training.sh @@ -0,0 +1,105 @@ +#!/bin/bash + +# This script can be used for model onboarding and testing. + +# For onboarding, it extract scalars from Tensorboard logs only. +# For testing, it compares extracted Tensorboard scalars against +# a set of `GOLDEN_VALUES`. + +set -euxo pipefail + +echo "------ARGUMENTS LIST --------" +for ARGUMENT in "$@"; do + KEY=$(echo $ARGUMENT | cut -f1 -d=) + + KEY_LENGTH=${#KEY} + VALUE="${ARGUMENT:$KEY_LENGTH+1}" + + export "$KEY"="$VALUE" + echo "$KEY=$VALUE" +done +echo "---------------------------------" + +# Check that mandatory vars are set +MANDATORY_VARS=( + "TRAINING_SCRIPT_PATH" + "TRAINING_PARAMS_PATH" + "OUTPUT_PATH" + "TENSORBOARD_PATH" + "CHECKPOINT_PATH" + "DATA_PATH" + "RUN_NUMBER" +) +for mandatory_var in "${MANDATORY_VARS[@]}"; do + if [[ -z "${!mandatory_var}" ]]; then + echo 'Providing $'$mandatory_var' is mandatory.' + exit 1 + fi +done + +# Envsubst model_params +cat $TRAINING_PARAMS_PATH | envsubst "$(env | cut -d= -f1 | sed -e 's/^/$/')" >$TRAINING_PARAMS_PATH.tmp +mv $TRAINING_PARAMS_PATH.tmp $TRAINING_PARAMS_PATH + +# Pull env vars to export +ENV_VARS=$(yq '... 
comments="" | .ENV_VARS | to_entries | .[] | [.key + "=" + .value] | join(" ")' $TRAINING_PARAMS_PATH) +while IFS= read -r ARGUMENT; do + KEY=$(echo $ARGUMENT | cut -f1 -d=) + + KEY_LENGTH=${#KEY} + VALUE="${ARGUMENT:$KEY_LENGTH+1}" + + export "$KEY"="$VALUE" + echo "$KEY=$VALUE" +done <<< "$ENV_VARS" + +# Run before script +SCRIPT=$(cat $TRAINING_PARAMS_PATH | yq '.BEFORE_SCRIPT') +if [[ "$SCRIPT" != null ]]; then + eval "$SCRIPT" +fi; + +# Exit earlier to leave time for properly saving checkpoint +if [[ $(echo "$TRAINING_SCRIPT_PATH" | tr '[:upper:]' '[:lower:]') == *nemo* ]]; then + PARAMS="" + TRAINING_PARAMS_FROM_CONFIG=$(yq '... comments="" | .MODEL_ARGS | to_entries | .[] | with(select(.value == "true"); .value = "") | [.key + "=" + .value] | join("")' $TRAINING_PARAMS_PATH | tr '\n' ' ') + +else + # If this is a second run (of checkpoint-resume), we might want to use a + # different model configuration than during first time. So if key `MODEL_ARGS_2` + # exists we use it, otherwise we use the same as for the first run. + if [[ $RUN_NUMBER -eq 2 && $(yq 'has("MODEL_ARGS_2")' $TRAINING_PARAMS_PATH) == true ]]; then + export KEY="MODEL_ARGS_2" + else + export KEY="MODEL_ARGS" + fi + + TRAINING_PARAMS_FROM_CONFIG=$(yq '... comments="" | .[env(KEY)] | to_entries | .[] | with(select(.value == "true"); .value = "") | [.key + " " + .value] | join("")' $TRAINING_PARAMS_PATH | tr '\n' ' ') + PARAMS="--exit-duration-in-mins $((($SLURM_JOB_END_TIME - $SLURM_JOB_START_TIME) / 60 - 15))" +fi + +# Extract training params +PARAMS="$PARAMS $TRAINING_PARAMS_FROM_CONFIG" + +# Set PYTHONPATH +export PYTHONPATH="$(pwd):${PYTHONPATH:-}" +export WANDB_API_KEY="${WANDB_API_KEY:-}" + +######## Distributed training settings. ######## +echo "------ARGUMENTS for SLURM ---" +MASTER_ADDR=${MASTER_ADDR:-localhost} +MASTER_PORT=${MASTER_PORT:-6000} +NUM_NODES=${NUM_NODES:-${SLURM_NNODES}} +GPUS_PER_NODE=${GPUS_PER_NODE:-8} +NODE_RANK=${SLURM_NODEID:-${SLURM_NODEID}} +DISTRIBUTED_ARGS=( + --nproc_per_node $GPUS_PER_NODE + --nnodes $NUM_NODES + --master_addr $MASTER_ADDR + --master_port $MASTER_PORT + --node_rank $SLURM_NODEID +) + +# Start training +torchrun ${DISTRIBUTED_ARGS[@]} $TRAINING_SCRIPT_PATH $PARAMS + diff --git a/tests/functional_tests/shell_test_utils/jobwait.sh b/tests/functional_tests/shell_test_utils/jobwait.sh deleted file mode 100644 index dd49fd8cd6..0000000000 --- a/tests/functional_tests/shell_test_utils/jobwait.sh +++ /dev/null @@ -1,25 +0,0 @@ -#! 
/bin/bash - -JOBID=$1 -echo "Job id : $JOBID" - -if [[ $JOBID -eq "" ]]; then - exit 1 -fi - -sleep 10s - -while true; do - export STATE=`sacct -j $JOBID --format State --parsable2 --noheader |& head -n 1` - case "${STATE}" in - PENDING|RUNNING|REQUEUED) - echo "Job is still in $STATE" - sleep 15s - ;; - *) - sleep 30s - echo "Exiting with SLURM job status '${STATE}'" - exit 0 - ;; - esac -done diff --git a/tests/functional_tests/shell_test_utils/notify.sh b/tests/functional_tests/shell_test_utils/notify.sh new file mode 100644 index 0000000000..cbdc0e7030 --- /dev/null +++ b/tests/functional_tests/shell_test_utils/notify.sh @@ -0,0 +1,194 @@ +set -euxo pipefail + +collect_jobs () { + PAGE=1 + PER_PAGE=100 + RESULTS="[]" + + while true; do + # Fetch the paginated results + RESPONSE=$(curl \ + -s \ + --globoff \ + --header "PRIVATE-TOKEN: $RO_API_TOKEN" \ + "https://${GITLAB_ENDPOINT}/api/v4/projects/${CI_PROJECT_ID}/pipelines/${DOWNSTREAM_PIPELINE_ID}/jobs?page=$PAGE&per_page=$PER_PAGE" + ) + # Combine the results + RESULTS=$(jq -s '.[0] + .[1]' <<< "$RESULTS $RESPONSE") + + # Check if there are more pages + if [[ $(jq 'length' <<< "$RESPONSE") -lt $PER_PAGE ]]; then + break + fi + + # Increment the page number + PAGE=$((PAGE + 1)) + done + + echo "$RESULTS" +} + +CI_PIPELINE_ID=${1:-16595865} +ENVIRONMENT=${2} + +CI_PROJECT_ID=${CI_PROJECT_ID:-19378} + +# Fetch Elastic logs +set +x +PIPELINE_JSON=$(curl \ + --fail \ + --silent \ + --header "PRIVATE-TOKEN: ${RO_API_TOKEN}" \ + "https://${GITLAB_ENDPOINT}/api/v4/projects/${CI_PROJECT_ID}/pipelines/${CI_PIPELINE_ID}/bridges?per_page=100" + ) || ret_code=$? +set -x +if [[ ${ret_code:-0} -ne 0 ]]; then + echo CI_PIPELINE_ID=$CI_PIPELINE_ID does not exist + exit 1 +fi + +# Fetch GitLab logs of JET downstream pipeline +DOWNSTREAM_PIPELINE_ID=$(jq --arg environment "$ENVIRONMENT" '.[] |select(.name == "jet-trigger-" + $environment) | .downstream_pipeline.id' <<< "$PIPELINE_JSON") + +PIPELINE_URL=https://${GITLAB_ENDPOINT}/ADLR/megatron-lm/-/pipelines/$CI_PIPELINE_ID +JOB_URL=https://${GITLAB_ENDPOINT}/ADLR/megatron-lm/-/jobs/ + +if [[ $DOWNSTREAM_PIPELINE_ID == null ]]; then + FAILED_JOBS=$(curl \ + --fail \ + --silent \ + --header "PRIVATE-TOKEN: ${RO_API_TOKEN}" \ + "https://${GITLAB_ENDPOINT}/api/v4/projects/${CI_PROJECT_ID}/pipelines/${CI_PIPELINE_ID}/jobs?per_page=100" \ + | jq --arg JOB_URL "$JOB_URL" '[.[] | select(.status == "failed") | ("<" + $JOB_URL + (.id | tostring) + "|" + .name + ">")] | join("\n• Job: ")' | tr -d '"') + curl \ + -X POST \ + -H "Content-type: application/json" \ + --data ' + { + "blocks": [ + { + "type": "section", + "text": { + "type": "mrkdwn", + "text": "<'$PIPELINE_URL'|Report of '$DATE' ('$CONTEXT')>:\n" + } + }, + { + "type": "section", + "text": { + "type": "mrkdwn", + "text": "\n• Job: '"$FAILED_JOBS"'" + } + }, + ] + + }' \ + $WEBHOOK_URL + +else + set +x + JOBS=$(echo "$(collect_jobs)" | jq '[.[] | {id, name, status}]') + echo $JOBS + set -x + + FAILED_JOBS=$(echo "$JOBS" \ + | jq --arg GITLAB_ENDPOINT "$GITLAB_ENDPOINT" '[ + .[] + | select(.status != "success") + | { + name, + id, + "url": ("https://" + $GITLAB_ENDPOINT + "/dl/jet/ci/-/jobs/" + (.id | tostring)), + } + ]' + ) + set -x + + for row in $(echo "${FAILED_JOBS}" | jq -r '.[] | @base64'); do + _jq() { + echo ${row} | base64 --decode | jq -r ${1} + } + JOB_ID=$(_jq '.id') + FULL_LOG=$(curl \ + --location \ + --header "PRIVATE-TOKEN: ${RO_API_TOKEN}" \ + "https://${GITLAB_ENDPOINT}/api/v4/projects/${CI_PROJECT_ID}/jobs/${JOB_ID}/trace") + + if [[ 
"$FULL_LOG" == *exception* ]]; then + LAST_EXCEPTION_POS=$(echo "$FULL_LOG" | grep -o -b 'exception' | tail -1 | cut -d: -f1) + SHORT_LOG=${FULL_LOG:$LAST_EXCEPTION_POS-500:499} + else + SHORT_LOG=${FULL_LOG: -1000} + fi + + FAILED_JOBS=$(echo "$FAILED_JOBS" \ + | jq \ + --argjson JOB_ID "$JOB_ID" \ + --arg SLURM_FAILURE "$SHORT_LOG" ' + .[] |= ((select(.id==$JOB_ID) += { + "slurm_failure_reason": $SLURM_FAILURE})) + ') + done + + NUM_FAILED=$(echo "$FAILED_JOBS" | jq 'length') + NUM_TOTAL=$(echo "$JOBS" | jq 'length') + + if [[ $NUM_FAILED -eq 0 ]]; then + BLOCKS='[ + { + "type": "section", + "text": { + "type": "mrkdwn", + "text": ":doge3d: <'$PIPELINE_URL'|Report of '$DATE' ('$CONTEXT')>: All '$NUM_TOTAL' passed" + } + } + ]' + else + BLOCKS=$(echo "$FAILED_JOBS" \ + | jq --arg DATE "$DATE" --arg CONTEXT "$CONTEXT" --arg URL "$PIPELINE_URL" --arg NUM_FAILED "$NUM_FAILED" --arg NUM_TOTAL "$NUM_TOTAL" ' + [ + { + "type": "section", + "text": { + "type": "mrkdwn", + "text": (":doctorge: <" + $URL + "|Report of " + $DATE + " (" + $CONTEXT + ")>: " + $NUM_FAILED + " of " + $NUM_TOTAL + " failed") + } + } + ] + [ + .[] + | { + "type": "section", + "text": { + "type": "mrkdwn", + "text": ( + "• Job: <" +.url + "|" + .name + ">" + + "\n SLURM failure reason: \n```" + .slurm_failure_reason + "```" + + ) + } + } + ] + [ + { + "type": "section", + "text": { + "type": "mrkdwn", + "text": ("===============================================") + } + } + ]' + ) + fi + + for row in $(echo "${BLOCKS}" | jq -r '.[] | @base64'); do + _jq() { + echo ${row} | base64 --decode + } + + curl \ + -X POST \ + -H "Content-type: application/json" \ + --data '{"blocks": '["$(_jq)"]'}' \ + $WEBHOOK_URL + done + +fi \ No newline at end of file diff --git a/tests/functional_tests/shell_test_utils/notify_unit_tests.sh b/tests/functional_tests/shell_test_utils/notify_unit_tests.sh new file mode 100644 index 0000000000..86cb29b772 --- /dev/null +++ b/tests/functional_tests/shell_test_utils/notify_unit_tests.sh @@ -0,0 +1,186 @@ +set -euxo pipefail + +collect_jobs () { + PAGE=1 + PER_PAGE=100 + RESULTS="[]" + + while true; do + # Fetch the paginated results + RESPONSE=$(curl \ + -s \ + --globoff \ + --header "PRIVATE-TOKEN: $RO_API_TOKEN" \ + "https://${GITLAB_ENDPOINT}/api/v4/projects/${CI_PROJECT_ID}/pipelines/${DOWNSTREAM_PIPELINE_ID}/jobs?page=$PAGE&per_page=$PER_PAGE" + ) + # Combine the results + RESULTS=$(jq -s '.[0] + .[1]' <<< "$RESULTS $RESPONSE") + + # Check if there are more pages + if [[ $(jq 'length' <<< "$RESPONSE") -lt $PER_PAGE ]]; then + break + fi + + # Increment the page number + PAGE=$((PAGE + 1)) + done + + echo "$RESULTS" +} + +CI_PIPELINE_ID=${1:-16595865} +CI_PROJECT_ID=${CI_PROJECT_ID:-19378} +PIPELINE_URL=https://${GITLAB_ENDPOINT}/ADLR/megatron-lm/-/pipelines/$CI_PIPELINE_ID +JOB_URL=https://${GITLAB_ENDPOINT}/ADLR/megatron-lm/-/jobs/ +CONTEXT="unit-tests-extended" + +# Fetch Elastic logs +set +x +PIPELINE_JSON=$(curl \ + --fail \ + --silent \ + --header "PRIVATE-TOKEN: ${RO_API_TOKEN}" \ + "https://${GITLAB_ENDPOINT}/api/v4/projects/${CI_PROJECT_ID}/pipelines/${CI_PIPELINE_ID}/jobs" + ) || ret_code=$? 
+set -x +if [[ ${ret_code:-0} -ne 0 ]]; then + echo CI_PIPELINE_ID=$CI_PIPELINE_ID does not exist + exit 1 +fi + +UNIT_TESTS_JOBS=$(echo -E $PIPELINE_JSON | jq '[.[] | select(.name | startswith("test:unit_tests_"))]') + +if [[ $UNIT_TESTS_JOBS == null ]]; then + FAILED_JOBS=$(curl \ + --fail \ + --silent \ + --header "PRIVATE-TOKEN: ${RO_API_TOKEN}" \ + "https://${GITLAB_ENDPOINT}/api/v4/projects/${CI_PROJECT_ID}/pipelines/${CI_PIPELINE_ID}/jobs?per_page=100" \ + | jq --arg JOB_URL "$JOB_URL" '[.[] | select(.status == "failed") | ("<" + $JOB_URL + (.id | tostring) + "|" + .name + ">")] | join("\n• Job: ")' | tr -d '"') + curl \ + -X POST \ + -H "Content-type: application/json" \ + --data ' + { + "blocks": [ + { + "type": "section", + "text": { + "type": "mrkdwn", + "text": "<'$PIPELINE_URL'|Report of '$DATE' ('$CONTEXT')>:\n" + } + }, + { + "type": "section", + "text": { + "type": "mrkdwn", + "text": "\n• Job: '"$FAILED_JOBS"'" + } + }, + ] + + }' \ + $WEBHOOK_URL + +else + FAILED_JOBS=$(echo -E "$UNIT_TESTS_JOBS" \ + | jq --arg GITLAB_ENDPOINT "$GITLAB_ENDPOINT" --arg JOB_URL "$JOB_URL" '[ + .[] + | select(.status != "success") + | { + name, + id, + "url": ($JOB_URL + (.id | tostring)), + } + ]' + ) + set -x + + for row in $(echo "${FAILED_JOBS}" | jq -r '.[] | @base64'); do + _jq() { + echo ${row} | base64 --decode | jq -r ${1} + } + JOB_ID=$(_jq '.id') + FULL_LOG=$(curl \ + --location \ + --header "PRIVATE-TOKEN: ${RO_API_TOKEN}" \ + "https://${GITLAB_ENDPOINT}/api/v4/projects/${CI_PROJECT_ID}/jobs/${JOB_ID}/trace") + + if [[ "$FULL_LOG" == *exception* ]]; then + LAST_EXCEPTION_POS=$(echo "$FULL_LOG" | grep -o -b 'exception' | tail -1 | cut -d: -f1) + SHORT_LOG=${FULL_LOG:$LAST_EXCEPTION_POS-500:499} + else + SHORT_LOG=${FULL_LOG: -1000} + fi + + FAILED_JOBS=$(echo "$FAILED_JOBS" \ + | jq \ + --argjson JOB_ID "$JOB_ID" \ + --arg SLURM_FAILURE "$SHORT_LOG" ' + .[] |= ((select(.id==$JOB_ID) += { + "slurm_failure_reason": $SLURM_FAILURE})) + ') + done + + NUM_FAILED=$(echo "$FAILED_JOBS" | jq 'length') + NUM_TOTAL=$(echo "$UNIT_TESTS_JOBS" | jq 'length') + + if [[ $NUM_FAILED -eq 0 ]]; then + BLOCKS='[ + { + "type": "section", + "text": { + "type": "mrkdwn", + "text": ":doge3d: <'$PIPELINE_URL'|Report of '$DATE' ('$CONTEXT')>: All '$NUM_TOTAL' passed" + } + } + ]' + else + BLOCKS=$(echo "$FAILED_JOBS" \ + | jq --arg DATE "$DATE" --arg CONTEXT "$CONTEXT" --arg URL "$PIPELINE_URL" --arg NUM_FAILED "$NUM_FAILED" --arg NUM_TOTAL "$NUM_TOTAL" ' + [ + { + "type": "section", + "text": { + "type": "mrkdwn", + "text": (":doctorge: <" + $URL + "|Report of " + $DATE + " (" + $CONTEXT + ")>: " + $NUM_FAILED + " of " + $NUM_TOTAL + " failed") + } + } + ] + [ + .[] + | { + "type": "section", + "text": { + "type": "mrkdwn", + "text": ( + "• Job: <" +.url + "|" + .name + ">" + + "\n SLURM failure reason: \n```" + .slurm_failure_reason + "```" + + ) + } + } + ] + [ + { + "type": "section", + "text": { + "type": "mrkdwn", + "text": ("===============================================") + } + } + ]' + ) + fi + + for row in $(echo "${BLOCKS}" | jq -r '.[] | @base64'); do + _jq() { + echo ${row} | base64 --decode + } + + curl \ + -X POST \ + -H "Content-type: application/json" \ + --data '{"blocks": '["$(_jq)"]'}' \ + $WEBHOOK_URL + done + +fi \ No newline at end of file diff --git a/tests/functional_tests/shell_test_utils/run_ci_test.sh b/tests/functional_tests/shell_test_utils/run_ci_test.sh new file mode 100644 index 0000000000..bb03676bc9 --- /dev/null +++ 
b/tests/functional_tests/shell_test_utils/run_ci_test.sh @@ -0,0 +1,92 @@ +#!/bin/bash + +set -exo pipefail + +echo "------ARGUMENTS LIST --------" +for ARGUMENT in "$@"; do + echo $ARGUMENT + KEY=$(echo $ARGUMENT | cut -f1 -d=) + + KEY_LENGTH=${#KEY} + VALUE=$(eval echo ${ARGUMENT:$KEY_LENGTH+1}) + export "$KEY"="$VALUE" + echo "$KEY=$VALUE" +done +echo "---------------------------------" + +# Check that mandatory vars are set +MANDATORY_VARS=( + "TRAINING_SCRIPT_PATH" + "TRAINING_PARAMS_PATH" + "GOLDEN_VALUES_PATH" + "OUTPUT_PATH" + "TENSORBOARD_PATH" + "CHECKPOINT_PATH" + "DATA_PATH" + "DATA_CACHE_PATH" +) +for mandatory_var in "${MANDATORY_VARS[@]}"; do + if [[ -z "${!mandatory_var}" ]]; then + echo 'Providing $'$mandatory_var' is mandatory.' + exit 1 + fi +done + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +ROOT_DIR=$(realpath $SCRIPT_DIR/../../../) + +# Extract settings from params file +TEST_TYPE=$(cat $TRAINING_PARAMS_PATH \ + | yq '.TEST_TYPE') +NVTE_ALLOW_NONDETERMINISTIC_ALGO=$(cat $TRAINING_PARAMS_PATH \ + | yq '.ENV_VARS.NVTE_ALLOW_NONDETERMINISTIC_ALGO') +SKIP_PYTEST=$(cat $TRAINING_PARAMS_PATH \ + | yq '.ENV_VARS.SKIP_PYTEST') +N_REPEATS=$(cat $TRAINING_PARAMS_PATH \ + | yq '.ENV_VARS.N_REPEATS //1') + +for i in $(seq 1 $N_REPEATS); +do + if [[ $i -gt 1 ]]; then + rm -rf $CHECKPOINT_PATH/* + fi + + # Training + export RUN_NUMBER=1 + bash $ROOT_DIR/tests/functional_tests/shell_test_utils/_run_training.sh + + # Maybe checkpoint resume training + if [[ "$TEST_TYPE" == "ckpt-resume" ]]; then + rm -rf $CHECKPOINT_PATH/iter_0000100; + echo 50 > $CHECKPOINT_PATH/latest_checkpointed_iteration.txt; + export RUN_NUMBER=2 + bash $ROOT_DIR/tests/functional_tests/shell_test_utils/_run_training.sh + fi + + # Save run results + export PYTHONPATH=$ROOT_DIR + python3 $ROOT_DIR/tests/functional_tests/python_test_utils/get_test_results_from_tensorboard_logs.py \ + --logs-dir $TENSORBOARD_PATH \ + --output-path ${OUTPUT_PATH}/$(basename $GOLDEN_VALUES_PATH) + + # Maybe run tests + if [[ ${SKIP_PYTEST:-0} != 1 ]]; then + export NVTE_ALLOW_NONDETERMINISTIC_ALGO + export LOGS_DIR=$TENSORBOARD_PATH + + if [[ "$TEST_TYPE" == "ckpt-resume" ]]; then + echo "Running pytest 1st vs 2nd run comparison" + pytest -s $ROOT_DIR/tests/functional_tests/python_test_utils/test_resume_checkpoint_pipeline.py + + elif [[ "$TEST_TYPE" == "regular" ]]; then + echo "Running pytest checks against golden values" + export EXPECTED_METRICS_FILE=$GOLDEN_VALUES_PATH + pytest -s $ROOT_DIR/tests/functional_tests/python_test_utils/test_ci_pipeline.py + + else + echo "Test type $TEST_TYPE not yet implemented." 
+ fi + fi +done + + diff --git a/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_dgx_a100_1N8G/golden_values_dev.json b/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_dgx_a100_1N8G/golden_values_dev.json new file mode 100644 index 0000000000..0f6772f012 --- /dev/null +++ b/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_dgx_a100_1N8G/golden_values_dev.json @@ -0,0 +1,52 @@ +{ "lm loss": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 10.49569, + 10.48173, + 10.48047, + 10.45353, + 10.44394, + 10.35611, + 10.13779, + 10.04017, + 9.86834, + 9.67307 + ] + }, + "num-zeros": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 2254.0, + 2585.0, + 2101.0, + 2157.0, + 2241.0, + 2475.0, + 2890.0, + 3199.0, + 3524.0, + 3090.0 + ] + }, + "iteration-time": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 13.65829, + 1.27589, + 1.2782, + 1.32374, + 1.26543, + 1.26423, + 1.26203, + 1.54723, + 1.27297, + 1.26491 + ] + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_dgx_a100_1N8G/golden_values_lts.json b/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_dgx_a100_1N8G/golden_values_lts.json new file mode 100644 index 0000000000..26ee3ea257 --- /dev/null +++ b/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_dgx_a100_1N8G/golden_values_lts.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.49574, 10.48174, 10.4804, 10.45344, 10.44396, 10.35607, 10.13786, 10.04016, 9.86838, 9.67302]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [2182.0, 2462.0, 2158.0, 2112.0, 2291.0, 2485.0, 2953.0, 3287.0, 3440.0, 3059.0]}, "iteration_timing_avg": 0.8110379411764704} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_dgx_a100_1N8G/model_config.yaml new file mode 100644 index 0000000000..704fd1ce5a --- /dev/null +++ b/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_dgx_a100_1N8G/model_config.yaml @@ -0,0 +1,46 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_FLASH_ATTN: 0 + NVTE_FUSED_ATTN: 0 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: Tree + CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 +MODEL_ARGS: + --num-layers: 24 + --hidden-size: 1024 + --num-attention-heads: 16 + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --log-memory-to-tensorboard: true + --tensorboard-dir: ${TENSORBOARD_PATH} + --micro-batch-size: 4 + --global-batch-size: 128 + --seq-length: 512 + --max-position-embeddings: 512 + --train-iters: 50 + --timing-log-level: 2 + --lr-decay-iters: 990000 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --data-path: ${DATA_PATH}/my-bert_00_text_sentence + --vocab-file: ${DATA_PATH}/vocab.txt + --split: 949,50,1 + --distributed-backend: nccl + --lr: 0.0001 + --min-lr: 0.00001 + --lr-warmup-fraction: 0.01 + --log-interval: 1 + --save-interval: 10000 + --eval-interval: 1000 + --eval-iters: 10 + --tensor-model-parallel-size: 2 + --pipeline-model-parallel-size: 2 + --deterministic-mode: true + --no-gradient-accumulation-fusion: true + --data-cache-path: ${DATA_CACHE_PATH} + --bf16: true + --ckpt-format: torch +TEST_TYPE: regular diff --git 
a/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_local_spec_dgx_a100_1N8G/golden_values_dev.json b/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_local_spec_dgx_a100_1N8G/golden_values_dev.json new file mode 100644 index 0000000000..1950cd0d08 --- /dev/null +++ b/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_local_spec_dgx_a100_1N8G/golden_values_dev.json @@ -0,0 +1,70 @@ +{ + "lm loss": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 10.49566, + 10.48166, + 10.48045, + 10.45348, + 10.44412, + 10.3561, + 10.13792, + 10.04026, + 9.86832, + 9.67306 + ] + }, + "num-zeros": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 2183.0, + 2469.0, + 2115.0, + 2126.0, + 2281.0, + 2389.0, + 3013.0, + 3255.0, + 3491.0, + 3062.0 + ] + }, + "mem-allocated-bytes": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 1767237120.0, + 1767237120.0, + 1767237120.0, + 1767237120.0, + 1767237120.0, + 1767237120.0, + 1767237120.0, + 1767237120.0, + 1767237120.0, + 1767237120.0 + ] + }, + "iteration-time": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 14.75035, + 1.17988, + 1.18643, + 1.18301, + 1.19116, + 1.19494, + 1.54654, + 1.19342, + 1.1823, + 1.18039 + ] + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_local_spec_dgx_a100_1N8G/golden_values_lts.json b/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_local_spec_dgx_a100_1N8G/golden_values_lts.json new file mode 100644 index 0000000000..1950cd0d08 --- /dev/null +++ b/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_local_spec_dgx_a100_1N8G/golden_values_lts.json @@ -0,0 +1,70 @@ +{ + "lm loss": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 10.49566, + 10.48166, + 10.48045, + 10.45348, + 10.44412, + 10.3561, + 10.13792, + 10.04026, + 9.86832, + 9.67306 + ] + }, + "num-zeros": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 2183.0, + 2469.0, + 2115.0, + 2126.0, + 2281.0, + 2389.0, + 3013.0, + 3255.0, + 3491.0, + 3062.0 + ] + }, + "mem-allocated-bytes": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 1767237120.0, + 1767237120.0, + 1767237120.0, + 1767237120.0, + 1767237120.0, + 1767237120.0, + 1767237120.0, + 1767237120.0, + 1767237120.0, + 1767237120.0 + ] + }, + "iteration-time": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 14.75035, + 1.17988, + 1.18643, + 1.18301, + 1.19116, + 1.19494, + 1.54654, + 1.19342, + 1.1823, + 1.18039 + ] + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_local_spec_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_local_spec_dgx_a100_1N8G/model_config.yaml new file mode 100644 index 0000000000..eaf288d30d --- /dev/null +++ b/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_local_spec_dgx_a100_1N8G/model_config.yaml @@ -0,0 +1,47 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_FLASH_ATTN: 0 + NVTE_FUSED_ATTN: 0 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: Tree + CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 +MODEL_ARGS: + --num-layers: 24 + --hidden-size: 1024 + --num-attention-heads: 16 + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --log-memory-to-tensorboard: 
true + --tensorboard-dir: ${TENSORBOARD_PATH} + --micro-batch-size: 4 + --global-batch-size: 128 + --seq-length: 512 + --max-position-embeddings: 512 + --train-iters: 50 + --timing-log-level: 2 + --lr-decay-iters: 990000 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --data-path: ${DATA_PATH}/my-bert_00_text_sentence + --vocab-file: ${DATA_PATH}/vocab.txt + --split: 949,50,1 + --distributed-backend: nccl + --lr: 0.0001 + --min-lr: 0.00001 + --lr-warmup-fraction: 0.01 + --log-interval: 1 + --save-interval: 10000 + --eval-interval: 1000 + --eval-iters: 10 + --tensor-model-parallel-size: 2 + --pipeline-model-parallel-size: 2 + --spec: local + --deterministic-mode: true + --no-gradient-accumulation-fusion: true + --data-cache-path: ${DATA_CACHE_PATH} + --bf16: true + --ckpt-format: torch +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_resume_torch_dist_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_resume_torch_dist_dgx_a100_1N8G/model_config.yaml new file mode 100644 index 0000000000..7072374fab --- /dev/null +++ b/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_resume_torch_dist_dgx_a100_1N8G/model_config.yaml @@ -0,0 +1,48 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_FLASH_ATTN: 0 + NVTE_FUSED_ATTN: 0 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: Tree + CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 +MODEL_ARGS: + --num-layers: 24 + --hidden-size: 1024 + --num-attention-heads: 16 + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --log-memory-to-tensorboard: true + --tensorboard-dir: ${TENSORBOARD_PATH} + --micro-batch-size: 4 + --global-batch-size: 128 + --seq-length: 512 + --max-position-embeddings: 512 + --train-iters: 100 + --timing-log-level: 2 + --lr-decay-iters: 990000 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --data-path: ${DATA_PATH}/my-bert_00_text_sentence + --vocab-file: ${DATA_PATH}/vocab.txt + --split: 949,50,1 + --distributed-backend: nccl + --lr: 0.0001 + --min-lr: 0.00001 + --lr-warmup-fraction: 0.01 + --log-interval: 1 + --save-interval: 50 + --eval-interval: 1000 + --eval-iters: 10 + --tensor-model-parallel-size: 2 + --pipeline-model-parallel-size: 2 + --deterministic-mode: true + --use-checkpoint-args: true + --use-checkpoint-opt_param-scheduler: true + --no-gradient-accumulation-fusion: true + --data-cache-path: ${DATA_CACHE_PATH} + --bf16: true + --ckpt-format: torch +TEST_TYPE: ckpt-resume diff --git a/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_resume_torch_dist_local_spec_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_resume_torch_dist_local_spec_dgx_a100_1N8G/model_config.yaml new file mode 100644 index 0000000000..f3afb10fd5 --- /dev/null +++ b/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_resume_torch_dist_local_spec_dgx_a100_1N8G/model_config.yaml @@ -0,0 +1,49 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_FLASH_ATTN: 0 + NVTE_FUSED_ATTN: 0 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: Tree + CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 +MODEL_ARGS: + --num-layers: 24 + --hidden-size: 1024 + --num-attention-heads: 16 + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --log-memory-to-tensorboard: true + --tensorboard-dir: ${TENSORBOARD_PATH} + 
--micro-batch-size: 4 + --global-batch-size: 128 + --seq-length: 512 + --max-position-embeddings: 512 + --train-iters: 100 + --timing-log-level: 2 + --lr-decay-iters: 990000 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --data-path: ${DATA_PATH}/my-bert_00_text_sentence + --vocab-file: ${DATA_PATH}/vocab.txt + --split: 949,50,1 + --distributed-backend: nccl + --lr: 0.0001 + --min-lr: 0.00001 + --lr-warmup-fraction: 0.01 + --log-interval: 1 + --save-interval: 50 + --eval-interval: 1000 + --eval-iters: 10 + --tensor-model-parallel-size: 2 + --pipeline-model-parallel-size: 2 + --spec: local + --deterministic-mode: true + --use-checkpoint-args: true + --use-checkpoint-opt_param-scheduler: true + --no-gradient-accumulation-fusion: true + --data-cache-path: ${DATA_CACHE_PATH} + --bf16: true + --ckpt-format: torch +TEST_TYPE: ckpt-resume diff --git a/tests/functional_tests/test_cases/bert/bert_mr_tp1_pp4_vp2_dgx_a100_1N8G/golden_values_dev.json b/tests/functional_tests/test_cases/bert/bert_mr_tp1_pp4_vp2_dgx_a100_1N8G/golden_values_dev.json new file mode 100644 index 0000000000..83fd267942 --- /dev/null +++ b/tests/functional_tests/test_cases/bert/bert_mr_tp1_pp4_vp2_dgx_a100_1N8G/golden_values_dev.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.54308, 10.53881, 10.55633, 10.53805, 10.52589, 10.49568, 10.45958, 10.32846, 10.17264, 9.96952]}, "num-zeros": {"start_step": 0, "end_step": 34, "step_interval": 5, "values": [22584.0, 20590.0, 27442.0, 22852.0, 22567.0, 20740.0, 23315.0]}, "iteration_timing_avg": 0.7692817647058824} diff --git a/tests/functional_tests/test_cases/bert/bert_mr_tp1_pp4_vp2_dgx_a100_1N8G/golden_values_lts.json b/tests/functional_tests/test_cases/bert/bert_mr_tp1_pp4_vp2_dgx_a100_1N8G/golden_values_lts.json new file mode 100644 index 0000000000..83fd267942 --- /dev/null +++ b/tests/functional_tests/test_cases/bert/bert_mr_tp1_pp4_vp2_dgx_a100_1N8G/golden_values_lts.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.54308, 10.53881, 10.55633, 10.53805, 10.52589, 10.49568, 10.45958, 10.32846, 10.17264, 9.96952]}, "num-zeros": {"start_step": 0, "end_step": 34, "step_interval": 5, "values": [22584.0, 20590.0, 27442.0, 22852.0, 22567.0, 20740.0, 23315.0]}, "iteration_timing_avg": 0.7692817647058824} diff --git a/tests/functional_tests/test_cases/bert/bert_mr_tp1_pp4_vp2_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/bert/bert_mr_tp1_pp4_vp2_dgx_a100_1N8G/model_config.yaml new file mode 100644 index 0000000000..1e8f604797 --- /dev/null +++ b/tests/functional_tests/test_cases/bert/bert_mr_tp1_pp4_vp2_dgx_a100_1N8G/model_config.yaml @@ -0,0 +1,50 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_FLASH_ATTN: 0 + NVTE_FUSED_ATTN: 0 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: Tree + CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 +MODEL_ARGS: + --num-layers: 24 + --hidden-size: 1024 + --num-attention-heads: 16 + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --log-memory-to-tensorboard: true + --tensorboard-dir: ${TENSORBOARD_PATH} + --micro-batch-size: 4 + --global-batch-size: 128 + --seq-length: 512 + --max-position-embeddings: 512 + --train-iters: 50 + --timing-log-level: 2 + --lr-decay-iters: 990000 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --data-path: ${DATA_PATH}/my-bert_00_text_sentence + --vocab-file: ${DATA_PATH}/vocab.txt + 
--split: 949,50,1 + --distributed-backend: nccl + --lr: 0.0001 + --min-lr: 0.00001 + --lr-warmup-fraction: 0.01 + --log-interval: 1 + --save-interval: 10000 + --eval-interval: 1000 + --eval-iters: 10 + --tensor-model-parallel-size: 1 + --pipeline-model-parallel-size: 4 + --num-layers-per-virtual-pipeline-stage: 2 + --use-legacy-models: true + --transformer-impl: local + --deterministic-mode: true + --no-gradient-accumulation-fusion: true + --data-cache-path: ${DATA_CACHE_PATH} + --fp16: true + --apply-query-key-layer-scaling: true + --ckpt-format: torch +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/bert/bert_mr_tp1_pp4_vp2_resume_torch_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/bert/bert_mr_tp1_pp4_vp2_resume_torch_dgx_a100_1N8G/model_config.yaml new file mode 100644 index 0000000000..66ab6cabfd --- /dev/null +++ b/tests/functional_tests/test_cases/bert/bert_mr_tp1_pp4_vp2_resume_torch_dgx_a100_1N8G/model_config.yaml @@ -0,0 +1,52 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_FLASH_ATTN: 0 + NVTE_FUSED_ATTN: 0 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: Tree + CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 +MODEL_ARGS: + --num-layers: 24 + --hidden-size: 1024 + --num-attention-heads: 16 + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --log-memory-to-tensorboard: true + --tensorboard-dir: ${TENSORBOARD_PATH} + --micro-batch-size: 4 + --global-batch-size: 128 + --seq-length: 512 + --max-position-embeddings: 512 + --train-iters: 100 + --timing-log-level: 2 + --lr-decay-iters: 990000 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --data-path: ${DATA_PATH}/my-bert_00_text_sentence + --vocab-file: ${DATA_PATH}/vocab.txt + --split: 949,50,1 + --distributed-backend: nccl + --lr: 0.0001 + --min-lr: 0.00001 + --lr-warmup-fraction: 0.01 + --log-interval: 1 + --save-interval: 50 + --eval-interval: 1000 + --eval-iters: 10 + --tensor-model-parallel-size: 1 + --pipeline-model-parallel-size: 4 + --num-layers-per-virtual-pipeline-stage: 2 + --use-legacy-models: true + --transformer-impl: local + --deterministic-mode: true + --use-checkpoint-args: true + --use-checkpoint-opt_param-scheduler: true + --no-gradient-accumulation-fusion: true + --data-cache-path: ${DATA_CACHE_PATH} + --fp16: true + --apply-query-key-layer-scaling: true + --ckpt-format: torch +TEST_TYPE: ckpt-resume diff --git a/tests/functional_tests/test_cases/bert/bert_mr_tp2_pp2_dgx_a100_1N8G/golden_values_dev.json b/tests/functional_tests/test_cases/bert/bert_mr_tp2_pp2_dgx_a100_1N8G/golden_values_dev.json new file mode 100644 index 0000000000..5e5b762761 --- /dev/null +++ b/tests/functional_tests/test_cases/bert/bert_mr_tp2_pp2_dgx_a100_1N8G/golden_values_dev.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.43755, 10.43587, 10.44704, 10.44395, 10.44965, 10.44295, 10.32757, 10.23341, 10.09049, 9.93294]}, "num-zeros": {"start_step": 0, "end_step": 34, "step_interval": 5, "values": [27979.0, 20991.0, 29735.0, 24779.0, 26808.0, 33075.0, 24387.0]}, "iteration_timing_avg": 0.7523635294117648} diff --git a/tests/functional_tests/test_cases/bert/bert_mr_tp2_pp2_dgx_a100_1N8G/golden_values_lts.json b/tests/functional_tests/test_cases/bert/bert_mr_tp2_pp2_dgx_a100_1N8G/golden_values_lts.json new file mode 100644 index 0000000000..5e5b762761 --- /dev/null +++ 
b/tests/functional_tests/test_cases/bert/bert_mr_tp2_pp2_dgx_a100_1N8G/golden_values_lts.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.43755, 10.43587, 10.44704, 10.44395, 10.44965, 10.44295, 10.32757, 10.23341, 10.09049, 9.93294]}, "num-zeros": {"start_step": 0, "end_step": 34, "step_interval": 5, "values": [27979.0, 20991.0, 29735.0, 24779.0, 26808.0, 33075.0, 24387.0]}, "iteration_timing_avg": 0.7523635294117648} diff --git a/tests/functional_tests/test_cases/bert/bert_mr_tp2_pp2_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/bert/bert_mr_tp2_pp2_dgx_a100_1N8G/model_config.yaml new file mode 100644 index 0000000000..94d2f2feca --- /dev/null +++ b/tests/functional_tests/test_cases/bert/bert_mr_tp2_pp2_dgx_a100_1N8G/model_config.yaml @@ -0,0 +1,49 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_FLASH_ATTN: 0 + NVTE_FUSED_ATTN: 0 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: Tree + CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 +MODEL_ARGS: + --num-layers: 24 + --hidden-size: 1024 + --num-attention-heads: 16 + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --log-memory-to-tensorboard: true + --tensorboard-dir: ${TENSORBOARD_PATH} + --micro-batch-size: 4 + --global-batch-size: 128 + --seq-length: 512 + --max-position-embeddings: 512 + --train-iters: 50 + --timing-log-level: 2 + --lr-decay-iters: 990000 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --data-path: ${DATA_PATH}/my-bert_00_text_sentence + --vocab-file: ${DATA_PATH}/vocab.txt + --split: 949,50,1 + --distributed-backend: nccl + --lr: 0.0001 + --min-lr: 0.00001 + --lr-warmup-fraction: 0.01 + --log-interval: 1 + --save-interval: 10000 + --eval-interval: 1000 + --eval-iters: 10 + --tensor-model-parallel-size: 2 + --pipeline-model-parallel-size: 2 + --use-legacy-models: true + --transformer-impl: local + --deterministic-mode: true + --no-gradient-accumulation-fusion: true + --data-cache-path: ${DATA_CACHE_PATH} + --fp16: true + --apply-query-key-layer-scaling: true + --ckpt-format: torch +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/bert/bert_mr_tp2_pp2_resume_torch_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/bert/bert_mr_tp2_pp2_resume_torch_dgx_a100_1N8G/model_config.yaml new file mode 100644 index 0000000000..2f6d24e945 --- /dev/null +++ b/tests/functional_tests/test_cases/bert/bert_mr_tp2_pp2_resume_torch_dgx_a100_1N8G/model_config.yaml @@ -0,0 +1,51 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_FLASH_ATTN: 0 + NVTE_FUSED_ATTN: 0 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: Tree + CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 +MODEL_ARGS: + --num-layers: 24 + --hidden-size: 1024 + --num-attention-heads: 16 + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --log-memory-to-tensorboard: true + --tensorboard-dir: ${TENSORBOARD_PATH} + --micro-batch-size: 4 + --global-batch-size: 128 + --seq-length: 512 + --max-position-embeddings: 512 + --train-iters: 100 + --timing-log-level: 2 + --lr-decay-iters: 990000 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --data-path: ${DATA_PATH}/my-bert_00_text_sentence + --vocab-file: ${DATA_PATH}/vocab.txt + --split: 949,50,1 + --distributed-backend: nccl + --lr: 0.0001 + --min-lr: 0.00001 + --lr-warmup-fraction: 0.01 + --log-interval: 1 + 
--save-interval: 50 + --eval-interval: 1000 + --eval-iters: 10 + --tensor-model-parallel-size: 2 + --pipeline-model-parallel-size: 2 + --use-legacy-models: true + --transformer-impl: local + --deterministic-mode: true + --use-checkpoint-args: true + --use-checkpoint-opt_param-scheduler: true + --no-gradient-accumulation-fusion: true + --data-cache-path: ${DATA_CACHE_PATH} + --fp16: true + --apply-query-key-layer-scaling: true + --ckpt-format: torch +TEST_TYPE: ckpt-resume diff --git a/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp1_pp2/golden_values_dev.json b/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp1_pp2/golden_values_dev.json new file mode 100644 index 0000000000..bfc68cb542 --- /dev/null +++ b/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp1_pp2/golden_values_dev.json @@ -0,0 +1,53 @@ +{ + "lm loss": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 10.49411, + 10.4825, + 10.49242, + 10.47802, + 10.46608, + 10.35193, + 10.17693, + 10.07728, + 9.88753, + 9.68034 + ] + }, + "num-zeros": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 1931.0, + 2555.0, + 2017.0, + 2135.0, + 2440.0, + 2464.0, + 3070.0, + 3006.0, + 2932.0, + 2303.0 + ] + }, + "iteration-time": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 10.94975, + 0.67196, + 0.67378, + 0.66862, + 0.69618, + 0.66936, + 0.67757, + 0.67189, + 0.67519, + 0.67762 + ] + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp1_pp2/golden_values_lts.json b/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp1_pp2/golden_values_lts.json new file mode 100644 index 0000000000..25faec6b8c --- /dev/null +++ b/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp1_pp2/golden_values_lts.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.49405, 10.48276, 10.49249, 10.47813, 10.46623, 10.35183, 10.17697, 10.07728, 9.8875, 9.68029]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [2018.0, 2636.0, 2067.0, 2225.0, 2555.0, 2554.0, 2969.0, 2935.0, 2967.0, 2287.0]}, "iteration_timing_avg": 0.5847132352941178} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp1_pp2/model_config.yaml b/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp1_pp2/model_config.yaml new file mode 100644 index 0000000000..cb94c9c91b --- /dev/null +++ b/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp1_pp2/model_config.yaml @@ -0,0 +1,46 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_FLASH_ATTN: 0 + NVTE_FUSED_ATTN: 0 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: Tree + CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 +MODEL_ARGS: + --num-layers: 24 + --hidden-size: 1024 + --num-attention-heads: 16 + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --log-memory-to-tensorboard: true + --tensorboard-dir: ${TENSORBOARD_PATH} + --micro-batch-size: 4 + --global-batch-size: 128 + --seq-length: 512 + --max-position-embeddings: 512 + --train-iters: 50 + --timing-log-level: 2 + --lr-decay-iters: 990000 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --data-path: ${DATA_PATH}/my-bert_00_text_sentence + --vocab-file: 
${DATA_PATH}/vocab.txt + --split: 949,50,1 + --distributed-backend: nccl + --lr: 0.0001 + --min-lr: 0.00001 + --lr-warmup-fraction: 0.01 + --log-interval: 1 + --save-interval: 10000 + --eval-interval: 1000 + --eval-iters: 10 + --tensor-model-parallel-size: 1 + --pipeline-model-parallel-size: 2 + --deterministic-mode: true + --no-gradient-accumulation-fusion: true + --data-cache-path: ${DATA_CACHE_PATH} + --bf16: true + --ckpt-format: torch +TEST_TYPE: regular \ No newline at end of file diff --git a/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp1_pp4_vp2/golden_values_dev.json b/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp1_pp4_vp2/golden_values_dev.json new file mode 100644 index 0000000000..915df96674 --- /dev/null +++ b/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp1_pp4_vp2/golden_values_dev.json @@ -0,0 +1,53 @@ +{ + "lm loss": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 10.46796, + 10.45723, + 10.44911, + 10.44107, + 10.41739, + 10.34626, + 10.11387, + 10.0439, + 9.86702, + 9.679 + ] + }, + "num-zeros": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 2404.0, + 2610.0, + 2173.0, + 2312.0, + 2371.0, + 2652.0, + 3089.0, + 3200.0, + 3497.0, + 3075.0 + ] + }, + "iteration-time": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 15.80389, + 0.94155, + 0.88518, + 1.22442, + 0.86955, + 0.85166, + 1.02329, + 1.07525, + 0.90283, + 0.88308 + ] + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp1_pp4_vp2/golden_values_lts.json b/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp1_pp4_vp2/golden_values_lts.json new file mode 100644 index 0000000000..6b516a3457 --- /dev/null +++ b/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp1_pp4_vp2/golden_values_lts.json @@ -0,0 +1,53 @@ +{ + "lm loss": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 10.4681, + 10.45734, + 10.4491, + 10.44121, + 10.41764, + 10.34626, + 10.11384, + 10.04383, + 9.86686, + 9.67906 + ] + }, + "num-zeros": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 2373.0, + 2593.0, + 2187.0, + 2325.0, + 2407.0, + 2627.0, + 3036.0, + 3109.0, + 3568.0, + 3019.0 + ] + }, + "iteration-time": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 22.86543, + 0.84168, + 0.92727, + 0.84734, + 0.93196, + 0.86308, + 0.86633, + 0.86112, + 0.87598, + 1.02461 + ] + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp1_pp4_vp2/model_config.yaml b/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp1_pp4_vp2/model_config.yaml new file mode 100644 index 0000000000..3dd071d3de --- /dev/null +++ b/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp1_pp4_vp2/model_config.yaml @@ -0,0 +1,47 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_FLASH_ATTN: 0 + NVTE_FUSED_ATTN: 0 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: Tree + CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 +MODEL_ARGS: + --num-layers: 24 + --hidden-size: 1024 + --num-attention-heads: 16 + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --log-memory-to-tensorboard: true + --tensorboard-dir: ${TENSORBOARD_PATH} + 
--micro-batch-size: 4 + --global-batch-size: 128 + --seq-length: 512 + --max-position-embeddings: 512 + --train-iters: 50 + --timing-log-level: 2 + --lr-decay-iters: 990000 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --data-path: ${DATA_PATH}/my-bert_00_text_sentence + --vocab-file: ${DATA_PATH}/vocab.txt + --split: 949,50,1 + --distributed-backend: nccl + --lr: 0.0001 + --min-lr: 0.00001 + --lr-warmup-fraction: 0.01 + --log-interval: 1 + --save-interval: 10000 + --eval-interval: 1000 + --eval-iters: 10 + --tensor-model-parallel-size: 1 + --pipeline-model-parallel-size: 4 + --num-layers-per-virtual-pipeline-stage: 2 + --deterministic-mode: true + --no-gradient-accumulation-fusion: true + --data-cache-path: ${DATA_CACHE_PATH} + --bf16: true + --ckpt-format: torch +TEST_TYPE: regular \ No newline at end of file diff --git a/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp4_pp1/golden_values_dev.json b/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp4_pp1/golden_values_dev.json new file mode 100644 index 0000000000..65e3ca244f --- /dev/null +++ b/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp4_pp1/golden_values_dev.json @@ -0,0 +1,53 @@ +{ + "lm loss": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 10.42085, + 10.42901, + 10.43576, + 10.40804, + 10.38463, + 10.32426, + 10.13148, + 10.04317, + 9.86257, + 9.65771 + ] + }, + "num-zeros": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 3252.0, + 2595.0, + 3240.0, + 3429.0, + 3463.0, + 3509.0, + 4065.0, + 4114.0, + 4651.0, + 4253.0 + ] + }, + "iteration-time": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 10.83012, + 2.26196, + 2.22779, + 2.22677, + 2.23847, + 2.24307, + 2.23859, + 2.23544, + 2.2414, + 2.25107 + ] + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp4_pp1/golden_values_lts.json b/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp4_pp1/golden_values_lts.json new file mode 100644 index 0000000000..4c2193349d --- /dev/null +++ b/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp4_pp1/golden_values_lts.json @@ -0,0 +1,53 @@ +{ + "lm loss": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 10.4209, + 10.42905, + 10.43557, + 10.40806, + 10.38457, + 10.32414, + 10.13167, + 10.04335, + 9.86262, + 9.65771 + ] + }, + "num-zeros": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 2249.0, + 3640.0, + 3249.0, + 2318.0, + 3512.0, + 3601.0, + 4111.0, + 3175.0, + 4713.0, + 3320.0 + ] + }, + "iteration-time": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 12.51144, + 2.1285, + 2.28886, + 2.24273, + 2.20818, + 2.20231, + 2.18786, + 2.17554, + 2.213, + 2.18811 + ] + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp4_pp1/model_config.yaml b/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp4_pp1/model_config.yaml new file mode 100644 index 0000000000..6d39266da3 --- /dev/null +++ b/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp4_pp1/model_config.yaml @@ -0,0 +1,46 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_FLASH_ATTN: 0 + NVTE_FUSED_ATTN: 0 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: Tree + CUBLAS_WORKSPACE_CONFIG: :4096:8 + 
N_REPEATS: 5 +MODEL_ARGS: + --num-layers: 24 + --hidden-size: 1024 + --num-attention-heads: 16 + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --log-memory-to-tensorboard: true + --tensorboard-dir: ${TENSORBOARD_PATH} + --micro-batch-size: 4 + --global-batch-size: 128 + --seq-length: 512 + --max-position-embeddings: 512 + --train-iters: 50 + --timing-log-level: 2 + --lr-decay-iters: 990000 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --data-path: ${DATA_PATH}/my-bert_00_text_sentence + --vocab-file: ${DATA_PATH}/vocab.txt + --split: 949,50,1 + --distributed-backend: nccl + --lr: 0.0001 + --min-lr: 0.00001 + --lr-warmup-fraction: 0.01 + --log-interval: 1 + --save-interval: 10000 + --eval-interval: 1000 + --eval-iters: 10 + --tensor-model-parallel-size: 4 + --pipeline-model-parallel-size: 1 + --deterministic-mode: true + --no-gradient-accumulation-fusion: true + --data-cache-path: ${DATA_CACHE_PATH} + --bf16: true + --ckpt-format: torch +TEST_TYPE: regular \ No newline at end of file diff --git a/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_tp1_pp2/golden_values_dev.json b/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_tp1_pp2/golden_values_dev.json new file mode 100644 index 0000000000..428150fc39 --- /dev/null +++ b/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_tp1_pp2/golden_values_dev.json @@ -0,0 +1,50 @@ +{ + "lm loss": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 10.49101, + 10.49526, + 10.48682, + 10.48817, + 10.49415, + 10.4724, + 10.42265, + 10.29901, + 10.1572, + 9.97594 + ] + }, + "iteration-time": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 12.56945, + 0.58599, + 0.58451, + 0.68178, + 0.6056, + 0.609, + 0.59965, + 0.60618, + 0.60152, + 0.59945 + ] + }, + "num-zeros": { + "start_step": 0, + "end_step": 34, + "step_interval": 5, + "values": [ + 17032.0, + 16918.0, + 19957.0, + 18761.0, + 25689.0, + 19897.0, + 22224.0 + ] + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_tp1_pp2/golden_values_lts.json b/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_tp1_pp2/golden_values_lts.json new file mode 100644 index 0000000000..ab9cc2b4d9 --- /dev/null +++ b/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_tp1_pp2/golden_values_lts.json @@ -0,0 +1,50 @@ +{ + "lm loss": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 10.50096, + 10.48594, + 10.4936, + 10.48501, + 10.50417, + 10.4773, + 10.42154, + 10.29716, + 10.15831, + 9.96751 + ] + }, + "iteration-time": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 12.85743, + 0.58922, + 0.54928, + 0.54147, + 0.56305, + 0.56895, + 0.56282, + 0.56247, + 0.56751, + 0.69574 + ] + }, + "num-zeros": { + "start_step": 0, + "end_step": 34, + "step_interval": 5, + "values": [ + 16595.0, + 18537.0, + 19509.0, + 18532.0, + 26712.0, + 20164.0, + 20981.0 + ] + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_tp1_pp2/model_config.yaml b/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_tp1_pp2/model_config.yaml new file mode 100644 index 0000000000..989988f7cd --- /dev/null +++ b/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_tp1_pp2/model_config.yaml @@ -0,0 +1,49 @@ +ENV_VARS: + 
CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_FLASH_ATTN: 0 + NVTE_FUSED_ATTN: 0 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: Tree + CUBLAS_WORKSPACE_CONFIG: :4096:8 + NVTE_APPLY_QK_LAYER_SCALING: 1 + N_REPEATS: 5 +MODEL_ARGS: + --num-layers: 24 + --hidden-size: 1024 + --num-attention-heads: 16 + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --log-memory-to-tensorboard: true + --tensorboard-dir: ${TENSORBOARD_PATH} + --micro-batch-size: 4 + --global-batch-size: 128 + --seq-length: 512 + --max-position-embeddings: 512 + --train-iters: 50 + --timing-log-level: 2 + --lr-decay-iters: 990000 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --data-path: ${DATA_PATH}/my-bert_00_text_sentence + --vocab-file: ${DATA_PATH}/vocab.txt + --split: 949,50,1 + --distributed-backend: nccl + --lr: 0.0001 + --min-lr: 0.00001 + --lr-warmup-fraction: 0.01 + --log-interval: 1 + --save-interval: 10000 + --eval-interval: 1000 + --eval-iters: 10 + --tensor-model-parallel-size: 1 + --pipeline-model-parallel-size: 2 + --use-legacy-models: true + --deterministic-mode: true + --no-gradient-accumulation-fusion: true + --data-cache-path: ${DATA_CACHE_PATH} + --fp16: true + --apply-query-key-layer-scaling: true + --ckpt-format: torch +TEST_TYPE: regular \ No newline at end of file diff --git a/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_tp4_pp1/golden_values_dev.json b/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_tp4_pp1/golden_values_dev.json new file mode 100644 index 0000000000..9cd1672cfd --- /dev/null +++ b/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_tp4_pp1/golden_values_dev.json @@ -0,0 +1,50 @@ +{ + "lm loss": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 10.49734, + 10.49243, + 10.49325, + 10.50311, + 10.48985, + 10.4721, + 10.41217, + 10.2805, + 10.14052, + 9.94191 + ] + }, + "iteration-time": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 8.58282, + 2.06311, + 2.05789, + 2.24493, + 2.05273, + 2.05118, + 2.05666, + 2.04533, + 2.05152, + 2.04761 + ] + }, + "num-zeros": { + "start_step": 0, + "end_step": 34, + "step_interval": 5, + "values": [ + 26081.0, + 18799.0, + 24479.0, + 23782.0, + 21056.0, + 19877.0, + 19774.0 + ] + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_tp4_pp1/golden_values_lts.json b/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_tp4_pp1/golden_values_lts.json new file mode 100644 index 0000000000..a09f1d9a20 --- /dev/null +++ b/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_tp4_pp1/golden_values_lts.json @@ -0,0 +1,50 @@ +{ + "lm loss": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 10.48685, + 10.49276, + 10.48837, + 10.51348, + 10.49396, + 10.4755, + 10.41921, + 10.28044, + 10.14256, + 9.94738 + ] + }, + "iteration-time": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 10.8221, + 1.96114, + 1.9401, + 2.22227, + 1.94508, + 1.94212, + 1.93958, + 1.94562, + 1.9442, + 1.94606 + ] + }, + "num-zeros": { + "start_step": 0, + "end_step": 34, + "step_interval": 5, + "values": [ + 26876.0, + 19339.0, + 24146.0, + 23625.0, + 21440.0, + 17865.0, + 19282.0 + ] + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_tp4_pp1/model_config.yaml 
b/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_tp4_pp1/model_config.yaml new file mode 100644 index 0000000000..edcf75a772 --- /dev/null +++ b/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_tp4_pp1/model_config.yaml @@ -0,0 +1,49 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_FLASH_ATTN: 0 + NVTE_FUSED_ATTN: 0 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: Tree + CUBLAS_WORKSPACE_CONFIG: :4096:8 + NVTE_APPLY_QK_LAYER_SCALING: 1 + N_REPEATS: 5 +MODEL_ARGS: + --num-layers: 24 + --hidden-size: 1024 + --num-attention-heads: 16 + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --log-memory-to-tensorboard: true + --tensorboard-dir: ${TENSORBOARD_PATH} + --micro-batch-size: 4 + --global-batch-size: 128 + --seq-length: 512 + --max-position-embeddings: 512 + --train-iters: 50 + --timing-log-level: 2 + --lr-decay-iters: 990000 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --data-path: ${DATA_PATH}/my-bert_00_text_sentence + --vocab-file: ${DATA_PATH}/vocab.txt + --split: 949,50,1 + --distributed-backend: nccl + --lr: 0.0001 + --min-lr: 0.00001 + --lr-warmup-fraction: 0.01 + --log-interval: 1 + --save-interval: 10000 + --eval-interval: 1000 + --eval-iters: 10 + --tensor-model-parallel-size: 4 + --pipeline-model-parallel-size: 1 + --use-legacy-models: true + --deterministic-mode: true + --no-gradient-accumulation-fusion: true + --data-cache-path: ${DATA_CACHE_PATH} + --fp16: true + --apply-query-key-layer-scaling: true + --ckpt-format: torch +TEST_TYPE: regular \ No newline at end of file diff --git a/tests/functional_tests/test_cases/bert/bert_release/golden_values_0.8.0.json b/tests/functional_tests/test_cases/bert/bert_release/golden_values_0.8.0.json new file mode 100644 index 0000000000..cd37089428 --- /dev/null +++ b/tests/functional_tests/test_cases/bert/bert_release/golden_values_0.8.0.json @@ -0,0 +1,6590 @@ +{ + "lm loss": { + "start_step": 0, + "end_step": 16335, + "step_interval": 5, + "values": [ + 10.53793, + 10.53833, + 10.57328, + 10.53546, + 10.07398, + 9.7437, + 9.42134, + 9.37734, + 9.23363, + 9.19234, + 8.97735, + 8.9212, + 8.71322, + 8.6598, + 8.60404, + 8.35312, + 8.22921, + 8.17413, + 7.70251, + 7.94843, + 7.75401, + 7.6155, + 7.57677, + 7.57115, + 7.46261, + 7.3348, + 7.34965, + 7.21065, + 7.2967, + 7.51623, + 7.50848, + 7.13886, + 7.26099, + 7.22096, + 7.33946, + 7.29352, + 7.13829, + 7.33535, + 7.46038, + 7.35064, + 7.16396, + 7.3037, + 7.1074, + 7.22845, + 7.0236, + 7.38542, + 7.13949, + 7.35053, + 7.19933, + 7.16134, + 7.49269, + 7.24922, + 7.12929, + 7.10281, + 7.04489, + 7.23503, + 7.05831, + 7.2197, + 7.43084, + 7.22903, + 7.13581, + 6.87717, + 6.99137, + 6.74988, + 7.0204, + 7.00762, + 7.15195, + 7.0732, + 7.04017, + 6.91983, + 7.26792, + 7.03561, + 6.89552, + 7.00603, + 7.08591, + 7.13913, + 6.68255, + 7.00998, + 7.14783, + 7.03557, + 6.80588, + 7.0735, + 7.04492, + 6.89815, + 6.7917, + 7.02153, + 6.91982, + 7.09829, + 7.02664, + 6.9825, + 6.87097, + 6.7737, + 7.15663, + 6.84695, + 6.63555, + 6.78703, + 7.23335, + 6.78468, + 6.839, + 7.1042, + 6.97448, + 7.06354, + 6.94179, + 6.87885, + 6.75294, + 6.72927, + 7.07929, + 6.83135, + 6.9368, + 6.89887, + 6.86077, + 6.86416, + 6.91727, + 6.83948, + 6.91308, + 6.95168, + 6.79076, + 6.6855, + 6.78904, + 6.69888, + 7.00146, + 6.86774, + 6.88572, + 6.80512, + 6.90702, + 6.72501, + 6.86568, + 7.0434, + 6.54832, + 6.81509, + 6.91147, + 6.86305, + 6.9005, + 6.81867, + 6.82176, + 
6.64392, + 6.5638, + 6.77185, + 6.81198, + 6.79084, + 6.93628, + 6.82454, + 6.80167, + 6.76513, + 6.57557, + 6.43356, + 6.69509, + 6.80516, + 6.65939, + 6.92698, + 6.8058, + 6.72331, + 6.78141, + 6.75542, + 6.79796, + 6.6264, + 6.86748, + 6.36556, + 6.78603, + 7.00148, + 6.77036, + 6.91134, + 6.71107, + 6.77084, + 6.8175, + 6.45329, + 6.51056, + 7.04084, + 6.70346, + 6.71543, + 6.88176, + 6.88362, + 6.64275, + 6.36647, + 6.49632, + 6.56393, + 6.51217, + 6.75527, + 6.80634, + 6.46915, + 6.8323, + 6.54895, + 6.74257, + 6.49547, + 6.80514, + 6.62616, + 6.69978, + 6.58011, + 6.30268, + 6.76174, + 6.24135, + 6.63064, + 6.67607, + 6.82092, + 6.66534, + 6.57511, + 6.58103, + 6.76152, + 6.65552, + 6.45148, + 6.77848, + 6.61225, + 6.43268, + 6.7872, + 6.68052, + 6.97383, + 6.83668, + 6.11858, + 6.50668, + 6.36788, + 6.86786, + 6.70669, + 6.78096, + 6.33542, + 6.67341, + 6.75006, + 6.60192, + 6.57628, + 6.54004, + 6.71131, + 6.57678, + 6.74634, + 6.45335, + 6.72892, + 6.90587, + 6.5513, + 6.71344, + 6.74165, + 6.72742, + 6.74569, + 6.33972, + 6.52666, + 6.36364, + 6.65061, + 6.71181, + 6.86922, + 6.69166, + 6.8349, + 6.79604, + 6.38846, + 6.7216, + 6.75765, + 6.1974, + 6.45594, + 6.53824, + 6.93955, + 6.70867, + 6.55834, + 6.53449, + 6.8526, + 6.4796, + 6.48663, + 6.86959, + 6.27279, + 6.84281, + 6.39654, + 6.66493, + 6.56859, + 6.46318, + 6.75265, + 6.59639, + 6.65157, + 6.52565, + 6.23494, + 6.54594, + 6.43118, + 6.44598, + 6.36322, + 6.54569, + 6.46544, + 6.60581, + 6.58219, + 6.63418, + 6.30714, + 6.50061, + 6.44069, + 6.49446, + 6.67531, + 6.64179, + 6.40956, + 6.65959, + 6.66559, + 6.45583, + 6.45205, + 6.56506, + 6.5485, + 6.46778, + 6.51845, + 6.73219, + 6.5964, + 6.09757, + 6.49973, + 6.50196, + 6.49873, + 6.67664, + 6.47666, + 6.34272, + 6.25304, + 6.3851, + 6.60383, + 6.33063, + 6.32831, + 6.40469, + 6.61802, + 6.62854, + 6.73167, + 6.51272, + 6.54725, + 6.59096, + 6.52632, + 6.81511, + 6.5014, + 6.31227, + 6.33856, + 6.6418, + 6.39458, + 6.44231, + 6.38421, + 6.31583, + 6.58783, + 6.30739, + 6.21895, + 6.28344, + 6.55022, + 6.3775, + 6.75864, + 6.55435, + 6.94564, + 6.31112, + 6.71671, + 6.25305, + 6.29523, + 6.4124, + 6.56301, + 6.7562, + 6.49733, + 6.63249, + 6.29465, + 6.27924, + 6.68726, + 6.30938, + 6.38028, + 6.57888, + 6.42417, + 6.38214, + 6.12301, + 6.49907, + 6.25454, + 6.33313, + 6.35794, + 6.50602, + 6.02649, + 6.61622, + 6.34758, + 6.35316, + 6.37007, + 6.31706, + 6.23337, + 6.38233, + 6.402, + 6.5168, + 6.42076, + 6.35078, + 6.32276, + 6.43155, + 6.2052, + 6.3692, + 6.51592, + 6.29469, + 6.42076, + 6.60076, + 6.61081, + 6.40174, + 6.29924, + 6.74568, + 6.39252, + 6.33087, + 6.24725, + 6.32582, + 6.71362, + 6.50464, + 6.29898, + 6.58622, + 6.20531, + 6.37231, + 6.47688, + 6.06606, + 6.4361, + 6.43802, + 5.93011, + 6.50386, + 6.34479, + 6.2994, + 6.57209, + 6.25778, + 6.45508, + 6.39037, + 6.45798, + 6.36904, + 6.3742, + 6.34459, + 6.40159, + 6.35231, + 6.21572, + 6.41328, + 6.65358, + 6.50605, + 6.30743, + 6.02136, + 6.42199, + 6.44523, + 6.53604, + 6.37327, + 6.27059, + 6.56258, + 6.34048, + 6.38827, + 5.99745, + 6.26555, + 6.45509, + 6.6419, + 6.17585, + 6.07765, + 6.32005, + 5.9988, + 6.3088, + 6.32593, + 6.28967, + 6.49087, + 6.57397, + 6.75413, + 6.16988, + 6.26637, + 6.50306, + 6.63417, + 6.55743, + 6.4403, + 6.57198, + 6.30406, + 6.2777, + 6.30065, + 6.2156, + 6.27963, + 5.94078, + 6.21481, + 6.64228, + 6.30421, + 6.55175, + 6.41225, + 6.18714, + 6.53382, + 5.99607, + 6.10913, + 6.2521, + 6.2201, + 6.31349, + 6.51799, + 6.45944, + 6.33556, + 6.56389, + 6.43665, + 
6.36721, + 6.34374, + 6.15574, + 6.47752, + 6.38969, + 6.47163, + 6.53956, + 6.51249, + 6.39771, + 6.04294, + 6.58281, + 6.31275, + 6.42086, + 6.14868, + 6.21364, + 6.19408, + 6.41132, + 6.45343, + 6.19411, + 6.18659, + 6.56525, + 6.40467, + 6.28638, + 6.33442, + 6.6218, + 6.43731, + 6.36122, + 6.25071, + 6.12011, + 6.40226, + 5.99376, + 6.60549, + 6.16224, + 6.56538, + 6.38555, + 6.43746, + 6.43002, + 6.62869, + 6.15875, + 6.34685, + 6.3523, + 6.49109, + 6.37212, + 6.44384, + 6.10934, + 6.39318, + 6.42245, + 6.14934, + 6.46085, + 6.32821, + 6.60509, + 6.46596, + 6.39857, + 5.87817, + 6.24183, + 6.44909, + 6.33179, + 6.4368, + 6.24726, + 6.40252, + 6.131, + 6.50046, + 6.3391, + 6.34118, + 6.46806, + 6.31596, + 6.16235, + 6.54313, + 6.42882, + 6.37647, + 6.51876, + 6.16584, + 6.47311, + 6.21822, + 6.32196, + 6.07977, + 6.44668, + 6.39247, + 6.25631, + 6.47592, + 6.29171, + 6.38129, + 6.55715, + 6.28978, + 6.26295, + 6.4926, + 6.18279, + 6.58878, + 6.10062, + 6.17452, + 6.10584, + 6.18107, + 6.4517, + 6.46322, + 6.18413, + 6.04441, + 6.15884, + 6.2331, + 6.16856, + 6.18516, + 6.56784, + 6.25482, + 6.38822, + 6.03013, + 6.03972, + 6.41785, + 6.30254, + 6.36035, + 6.02451, + 6.50559, + 6.40899, + 6.18496, + 6.34395, + 6.52951, + 6.25829, + 6.51237, + 6.28479, + 6.14295, + 6.52767, + 6.07687, + 6.40724, + 6.39342, + 6.28972, + 6.2584, + 6.32533, + 6.43399, + 6.36631, + 6.16643, + 6.33093, + 6.45457, + 6.25883, + 6.34143, + 6.2437, + 6.23937, + 6.16769, + 6.07649, + 6.12008, + 6.40524, + 6.32947, + 6.39147, + 6.28194, + 6.12545, + 6.35343, + 6.33975, + 6.53219, + 6.41075, + 6.21738, + 6.37557, + 6.51013, + 6.1613, + 6.14545, + 6.33928, + 6.4156, + 6.34552, + 6.18562, + 6.31044, + 6.535, + 6.2967, + 6.34847, + 6.38755, + 6.09215, + 6.15779, + 6.09988, + 6.3951, + 6.11293, + 6.15412, + 6.34488, + 6.02805, + 6.37669, + 6.08256, + 6.29337, + 6.11569, + 6.3343, + 6.23769, + 6.33333, + 6.19854, + 6.13166, + 6.53816, + 6.14203, + 6.22576, + 6.31578, + 6.18142, + 6.24817, + 6.54147, + 6.26769, + 6.50317, + 6.35394, + 6.00299, + 6.1815, + 6.22899, + 6.25878, + 6.44192, + 6.44892, + 6.39553, + 5.98413, + 6.43795, + 6.37013, + 6.06328, + 6.58424, + 6.35392, + 6.30076, + 6.4262, + 6.08959, + 6.37101, + 6.25673, + 5.98083, + 6.42341, + 6.22051, + 6.31869, + 5.99465, + 6.20636, + 6.29428, + 6.28203, + 6.15005, + 6.03871, + 6.18434, + 6.53488, + 6.36443, + 6.07942, + 6.30651, + 6.06713, + 6.26565, + 6.40616, + 6.741, + 6.24939, + 6.13291, + 6.09875, + 6.31759, + 5.93891, + 6.2543, + 6.00153, + 6.54021, + 6.40471, + 6.22258, + 6.2507, + 6.12092, + 6.1711, + 6.03053, + 6.46355, + 6.29811, + 6.27215, + 6.08401, + 6.22164, + 6.39539, + 6.47017, + 6.11386, + 6.45237, + 6.04349, + 6.30801, + 6.3468, + 6.18748, + 6.42659, + 5.99932, + 6.12072, + 6.22595, + 6.33846, + 6.56846, + 6.08395, + 6.37881, + 6.59243, + 6.15607, + 6.2082, + 6.21438, + 6.27514, + 5.84324, + 6.40712, + 6.19796, + 6.33034, + 6.18061, + 6.41243, + 6.21666, + 6.15695, + 5.96279, + 6.30155, + 6.15897, + 6.21676, + 6.0512, + 6.08294, + 6.0621, + 6.09995, + 6.13439, + 6.40333, + 6.33143, + 5.96941, + 6.13624, + 6.43448, + 6.23377, + 6.40988, + 6.22927, + 5.99602, + 6.41574, + 6.17216, + 6.32381, + 6.12876, + 5.96916, + 5.99431, + 6.17928, + 6.01173, + 6.20852, + 6.3407, + 6.39336, + 6.09081, + 6.35499, + 6.24335, + 6.31461, + 6.15029, + 6.30659, + 6.26253, + 6.39301, + 6.2042, + 6.37907, + 5.97963, + 6.38598, + 6.27523, + 6.03397, + 6.552, + 6.27548, + 6.28337, + 6.21724, + 6.20224, + 6.07868, + 6.073, + 6.30956, + 6.21111, + 6.12205, + 6.45981, + 
6.1036, + 6.15625, + 6.18828, + 6.40387, + 6.34025, + 6.2894, + 6.39874, + 6.18994, + 6.12809, + 6.30166, + 6.20345, + 6.35857, + 6.12282, + 6.3579, + 6.42851, + 6.2104, + 6.13, + 6.32673, + 5.99126, + 6.53213, + 6.39713, + 6.22232, + 6.36209, + 6.37234, + 6.06583, + 5.96905, + 6.07293, + 5.89625, + 6.16057, + 6.04981, + 6.10996, + 6.48529, + 6.08862, + 6.29631, + 6.25923, + 6.16974, + 6.27645, + 6.34773, + 6.14065, + 6.39893, + 6.20423, + 6.44389, + 6.14672, + 6.09501, + 6.23888, + 6.14447, + 6.30253, + 6.38443, + 6.40943, + 6.34193, + 6.26095, + 6.06244, + 6.42097, + 6.1041, + 6.38684, + 6.37667, + 6.12186, + 5.99692, + 6.19204, + 6.1919, + 6.50044, + 6.3115, + 6.05882, + 5.86439, + 6.45141, + 5.88432, + 6.23995, + 6.11292, + 6.20951, + 5.90822, + 6.19528, + 5.81616, + 6.2398, + 6.34606, + 6.36593, + 6.09603, + 6.33785, + 6.42073, + 5.92349, + 6.37215, + 6.39677, + 6.36358, + 6.22775, + 5.98277, + 6.35036, + 6.21034, + 5.97164, + 6.09301, + 6.12039, + 6.46194, + 6.2046, + 5.96427, + 6.29253, + 6.10433, + 6.08377, + 6.3307, + 6.4867, + 6.31023, + 6.09359, + 6.22142, + 6.05327, + 6.15394, + 6.23608, + 6.03966, + 5.8949, + 6.2167, + 6.26209, + 5.93462, + 6.07415, + 6.09805, + 6.29827, + 6.3569, + 6.21374, + 6.25305, + 6.44093, + 6.31724, + 5.94012, + 6.06901, + 6.44223, + 6.15413, + 6.30072, + 6.16676, + 6.16942, + 5.98695, + 6.23098, + 6.05042, + 6.28081, + 6.09711, + 6.37741, + 6.06699, + 6.05882, + 6.17689, + 6.22381, + 6.32849, + 6.24238, + 6.31961, + 5.93739, + 6.2644, + 5.98268, + 6.16066, + 5.98254, + 6.23034, + 6.13085, + 6.00423, + 5.90725, + 6.16344, + 6.04893, + 6.19732, + 6.05768, + 6.04611, + 6.21645, + 6.14967, + 6.24572, + 6.01439, + 6.30176, + 5.80022, + 6.47263, + 6.18387, + 6.25577, + 6.24843, + 5.91143, + 5.96473, + 6.14371, + 6.11824, + 5.84433, + 6.0589, + 6.22986, + 6.33661, + 5.88936, + 6.4773, + 6.1532, + 6.24312, + 5.5371, + 5.94914, + 6.09041, + 6.13193, + 5.7848, + 6.08348, + 6.14052, + 6.0647, + 6.26865, + 6.25012, + 6.25113, + 6.30421, + 6.3171, + 6.45796, + 6.27366, + 6.14312, + 6.49744, + 6.16217, + 6.23036, + 5.86772, + 6.02907, + 6.19862, + 6.26842, + 6.35715, + 6.10501, + 5.91702, + 6.03526, + 6.15697, + 6.03631, + 6.07692, + 6.24646, + 6.14011, + 6.05932, + 6.15876, + 6.05441, + 5.99278, + 6.12618, + 6.39054, + 6.14162, + 6.10958, + 6.45082, + 6.30386, + 6.0778, + 5.93397, + 5.90111, + 6.06705, + 6.14443, + 6.31779, + 5.74064, + 6.10349, + 5.97327, + 6.09052, + 6.25249, + 6.07548, + 6.07552, + 5.98058, + 5.99296, + 6.05499, + 5.86394, + 5.86196, + 5.83776, + 5.83957, + 6.2593, + 5.83799, + 6.1191, + 6.08244, + 6.22337, + 6.09661, + 6.0732, + 5.98194, + 6.35632, + 5.77603, + 5.84978, + 6.18573, + 5.89755, + 6.14481, + 6.15262, + 5.94744, + 5.90468, + 6.14408, + 6.02246, + 6.12202, + 5.92749, + 6.19453, + 6.06292, + 6.05398, + 5.78895, + 6.07653, + 5.87674, + 6.10413, + 6.20621, + 6.02689, + 6.15198, + 6.22689, + 5.85123, + 6.07978, + 5.97042, + 5.81312, + 6.10418, + 6.21739, + 6.1917, + 6.24606, + 5.95878, + 5.82133, + 5.92305, + 5.85724, + 6.05554, + 6.18299, + 6.15499, + 5.83163, + 6.46447, + 6.15277, + 6.04714, + 6.07566, + 6.14775, + 6.07494, + 5.95285, + 5.96777, + 5.99285, + 6.25656, + 5.90819, + 5.84823, + 5.9248, + 6.12159, + 6.05189, + 6.25358, + 5.98047, + 5.91779, + 6.07089, + 6.10884, + 6.05018, + 5.91499, + 5.84059, + 6.00829, + 6.01661, + 6.08329, + 5.8952, + 6.01278, + 5.67961, + 5.83088, + 6.13372, + 6.0899, + 6.15196, + 6.18286, + 6.14409, + 5.7606, + 6.08712, + 6.10897, + 5.99769, + 5.93637, + 5.87955, + 5.95937, + 6.29087, + 5.87092, + 
5.78197, + 6.14667, + 6.05809, + 6.16481, + 5.94991, + 5.75291, + 5.8592, + 6.19805, + 5.9858, + 6.1639, + 6.09678, + 6.02787, + 5.81271, + 6.09139, + 6.32533, + 5.96413, + 6.16299, + 6.00276, + 6.19657, + 6.02726, + 6.05171, + 5.84633, + 5.77209, + 5.96961, + 5.9849, + 6.02932, + 6.0537, + 6.08561, + 5.89283, + 6.19435, + 6.06464, + 6.2568, + 5.80293, + 6.02946, + 5.7978, + 6.10829, + 5.84662, + 5.77951, + 5.7912, + 6.04755, + 5.90745, + 5.93444, + 6.17925, + 5.82008, + 5.96972, + 5.71202, + 6.00809, + 5.80207, + 5.97974, + 5.88935, + 6.33257, + 6.14508, + 5.86721, + 5.86794, + 6.01291, + 5.74821, + 5.91841, + 5.82207, + 5.83811, + 5.54737, + 5.80353, + 5.72796, + 6.0506, + 6.03371, + 5.80528, + 5.93526, + 6.11032, + 6.03443, + 5.9479, + 5.84056, + 5.86626, + 5.88418, + 6.0262, + 5.86155, + 6.06552, + 5.88192, + 5.8404, + 5.92057, + 5.83942, + 6.01708, + 5.96875, + 5.79609, + 5.88157, + 5.78996, + 6.01264, + 6.04324, + 5.8411, + 5.83899, + 5.94632, + 6.03382, + 5.8096, + 5.6814, + 5.61011, + 5.82258, + 6.0532, + 6.26449, + 5.90097, + 6.03606, + 5.59388, + 5.84266, + 5.97485, + 5.95277, + 6.24308, + 5.91125, + 6.12072, + 5.96379, + 5.86492, + 5.99428, + 5.83884, + 5.82211, + 5.70013, + 6.0971, + 6.03164, + 5.78511, + 5.90645, + 5.66368, + 5.73694, + 6.13804, + 6.1053, + 5.96152, + 6.11842, + 5.99783, + 6.00233, + 5.63439, + 5.85923, + 5.93705, + 5.58148, + 5.94662, + 5.76007, + 5.84042, + 5.74787, + 5.88519, + 5.97658, + 5.7215, + 5.87309, + 6.00525, + 5.93322, + 5.81608, + 5.74541, + 5.8454, + 5.93668, + 5.85126, + 5.7304, + 5.84281, + 6.01029, + 5.98761, + 5.73332, + 5.84772, + 5.72475, + 5.54015, + 5.99439, + 6.09163, + 5.84615, + 5.70075, + 5.81065, + 6.0266, + 5.76754, + 5.72074, + 6.09481, + 5.72303, + 5.56257, + 5.85745, + 5.69924, + 5.82868, + 5.78828, + 5.67483, + 5.496, + 5.73639, + 5.72971, + 5.76467, + 5.66526, + 5.65788, + 5.92271, + 5.62234, + 5.31858, + 5.64535, + 5.99382, + 5.651, + 5.76309, + 5.79016, + 5.95155, + 5.68025, + 5.53956, + 5.92439, + 5.78876, + 5.79481, + 5.81312, + 5.69195, + 5.7748, + 5.70214, + 5.90134, + 5.75172, + 5.8835, + 5.57238, + 5.60218, + 5.45807, + 5.53449, + 5.58066, + 5.6957, + 5.64536, + 5.68633, + 5.81438, + 5.40124, + 5.83671, + 5.96217, + 6.00974, + 5.58393, + 5.53247, + 5.78327, + 5.88263, + 5.84458, + 5.78983, + 5.58777, + 5.74236, + 5.75036, + 5.52226, + 5.49968, + 5.67871, + 6.00464, + 5.641, + 5.65137, + 5.55635, + 5.61197, + 5.44461, + 5.63676, + 5.85305, + 5.6634, + 5.70227, + 5.63678, + 5.87241, + 5.9005, + 6.00072, + 5.71109, + 5.85047, + 5.8183, + 5.5811, + 5.28681, + 5.53006, + 6.04771, + 5.50425, + 5.67854, + 5.51973, + 5.84652, + 5.86275, + 5.91333, + 5.60112, + 5.80213, + 5.60584, + 5.40794, + 5.63212, + 5.47845, + 5.80563, + 5.64168, + 5.89571, + 5.89592, + 5.88066, + 5.62191, + 5.64817, + 5.49271, + 5.80496, + 5.63366, + 5.49444, + 5.81441, + 5.86738, + 5.77686, + 5.81384, + 5.73914, + 5.77844, + 5.41317, + 5.57368, + 5.85532, + 5.57311, + 5.72023, + 5.66576, + 5.31334, + 5.78508, + 5.93047, + 5.85842, + 5.94373, + 5.67211, + 5.54567, + 5.49603, + 5.57147, + 5.33313, + 5.55491, + 5.33363, + 5.72239, + 5.662, + 5.45219, + 5.5106, + 5.53594, + 5.82025, + 5.77807, + 5.2408, + 5.59296, + 5.62683, + 5.69741, + 5.73427, + 5.49788, + 5.66272, + 5.57567, + 5.74357, + 5.52734, + 5.50491, + 5.57587, + 5.96142, + 5.49539, + 5.71266, + 5.70483, + 5.23033, + 5.44142, + 5.59221, + 5.61425, + 5.36935, + 5.57102, + 5.73355, + 5.58329, + 5.76048, + 5.78104, + 5.51218, + 5.54391, + 5.89282, + 5.71522, + 5.56901, + 5.45096, + 5.36384, + 
5.78966, + 5.79038, + 5.52832, + 5.47669, + 5.65642, + 5.59188, + 5.56174, + 5.52253, + 5.50719, + 5.29606, + 5.75425, + 5.68504, + 5.46854, + 5.67471, + 5.72898, + 5.90051, + 5.5793, + 5.6441, + 5.7178, + 5.8198, + 5.57355, + 5.61022, + 5.66798, + 5.19177, + 5.91541, + 5.40464, + 5.39557, + 5.50319, + 5.66164, + 5.7401, + 5.55738, + 5.72171, + 5.61542, + 5.6533, + 5.50204, + 5.5001, + 5.6838, + 5.74351, + 5.23517, + 5.27947, + 5.7736, + 5.74565, + 5.61515, + 5.51495, + 5.34017, + 5.55685, + 5.78903, + 5.57942, + 5.85997, + 5.24422, + 5.33002, + 5.52458, + 5.6809, + 5.7238, + 5.45601, + 5.57291, + 5.51181, + 5.56948, + 5.32142, + 5.35315, + 5.47335, + 5.58987, + 5.56781, + 5.33109, + 5.47933, + 5.60359, + 5.33716, + 5.70209, + 5.57574, + 5.15947, + 5.40233, + 5.14065, + 5.39899, + 5.68815, + 5.05608, + 5.26242, + 5.46771, + 5.10152, + 5.704, + 5.29233, + 5.33947, + 5.25637, + 5.67878, + 5.55052, + 5.51558, + 5.46657, + 5.1927, + 5.63042, + 5.54801, + 5.61803, + 5.59148, + 5.59111, + 5.53997, + 5.71475, + 5.751, + 5.50991, + 5.54956, + 5.26494, + 5.25531, + 5.62038, + 5.40946, + 5.45863, + 5.08687, + 5.5366, + 5.60898, + 5.30272, + 5.6928, + 5.55462, + 5.6038, + 5.35577, + 5.4286, + 5.77712, + 5.12033, + 5.44462, + 5.41782, + 5.32479, + 5.21973, + 5.45154, + 5.20559, + 5.6674, + 5.21263, + 5.42332, + 5.54029, + 5.68911, + 5.21107, + 5.5421, + 5.28456, + 5.22619, + 5.07375, + 5.77718, + 5.52267, + 5.27374, + 5.39799, + 5.42136, + 5.29616, + 5.37187, + 5.18627, + 5.41708, + 5.56821, + 5.51711, + 5.26606, + 5.44275, + 5.27222, + 5.48044, + 5.42999, + 5.36919, + 5.82357, + 5.48711, + 5.23278, + 5.33405, + 5.24011, + 5.39905, + 5.4392, + 5.36185, + 5.42562, + 5.43673, + 5.2401, + 5.44366, + 5.55005, + 5.18979, + 5.56064, + 5.27104, + 5.37792, + 5.72462, + 5.31993, + 5.43134, + 5.26772, + 5.47394, + 5.37205, + 5.27303, + 5.29492, + 5.32969, + 5.514, + 5.41325, + 5.24781, + 5.50394, + 5.43094, + 5.21885, + 5.697, + 5.49622, + 5.3313, + 5.37993, + 5.31966, + 5.38266, + 5.40369, + 5.27459, + 5.26548, + 5.47746, + 5.32108, + 5.4704, + 5.3552, + 5.68324, + 5.56886, + 5.59513, + 5.26185, + 5.19901, + 5.47215, + 5.46836, + 4.99488, + 5.4407, + 5.34759, + 5.79016, + 5.42391, + 5.31161, + 5.51834, + 5.37018, + 5.33223, + 5.62554, + 5.1873, + 5.26472, + 5.22393, + 5.01926, + 5.41349, + 5.23932, + 5.41591, + 5.23388, + 5.46969, + 5.59588, + 5.63601, + 5.51309, + 5.25855, + 5.47349, + 5.54422, + 5.54735, + 5.30105, + 5.1544, + 5.38647, + 5.18654, + 5.45893, + 5.42539, + 5.46495, + 5.30878, + 5.16631, + 5.61421, + 5.32415, + 5.5367, + 5.46586, + 5.4395, + 5.40487, + 5.10759, + 5.43359, + 5.5656, + 5.35044, + 5.2805, + 5.52335, + 5.3629, + 5.62948, + 5.25984, + 5.40786, + 5.22698, + 5.44817, + 5.20858, + 5.3904, + 5.67465, + 5.50158, + 5.25219, + 5.40554, + 5.42222, + 5.12741, + 5.58132, + 5.23858, + 5.472, + 5.53455, + 5.09749, + 5.32636, + 5.66949, + 5.47415, + 5.83646, + 5.15267, + 5.65019, + 5.39714, + 5.2346, + 5.39145, + 5.21172, + 5.38191, + 5.29957, + 5.4159, + 5.23551, + 5.46337, + 5.10637, + 5.49482, + 5.51147, + 5.22539, + 5.48015, + 5.36735, + 5.41412, + 5.31927, + 5.6195, + 5.4469, + 5.04296, + 5.01706, + 5.42501, + 5.57975, + 5.18865, + 5.30631, + 5.23734, + 5.14166, + 5.29754, + 4.74249, + 5.33519, + 5.17675, + 4.96699, + 5.02152, + 5.48829, + 5.37785, + 5.52028, + 5.2346, + 5.21928, + 5.42326, + 5.21575, + 5.34642, + 5.50497, + 5.34291, + 5.44243, + 5.26401, + 5.48028, + 5.29042, + 4.97953, + 5.21126, + 5.40469, + 5.093, + 5.33717, + 5.18471, + 5.20772, + 5.23414, + 5.00452, + 4.85325, + 
5.4221, + 5.34867, + 5.44642, + 5.41004, + 5.01, + 5.10068, + 5.3912, + 5.30883, + 5.02749, + 5.25628, + 4.84244, + 5.53958, + 5.06558, + 5.18397, + 5.16718, + 5.43679, + 5.41454, + 5.2013, + 5.17036, + 5.61725, + 5.21891, + 5.18433, + 5.27505, + 5.08694, + 5.04475, + 5.00165, + 4.89636, + 5.10688, + 4.87777, + 5.12496, + 5.12076, + 5.28615, + 5.37844, + 5.31216, + 5.16521, + 5.26539, + 5.04044, + 5.22532, + 5.06384, + 4.87431, + 5.27989, + 5.39772, + 5.26121, + 5.10267, + 5.04472, + 5.30136, + 5.12835, + 5.32223, + 5.30201, + 5.47047, + 5.08983, + 5.09329, + 5.22051, + 5.18219, + 5.26414, + 4.85314, + 4.80557, + 5.11929, + 4.97588, + 5.10509, + 5.12232, + 5.1768, + 5.21992, + 5.18914, + 5.40696, + 4.9601, + 5.13121, + 5.039, + 5.08148, + 5.00974, + 4.95523, + 5.22023, + 5.18992, + 5.23818, + 5.43358, + 5.25654, + 5.1727, + 5.38586, + 5.33956, + 5.15538, + 5.31171, + 5.03377, + 5.15866, + 5.1277, + 5.05149, + 5.22973, + 5.31626, + 4.79504, + 5.08908, + 5.21996, + 4.99717, + 5.11511, + 5.09157, + 5.18415, + 5.35206, + 4.483, + 5.11497, + 5.18612, + 5.09318, + 5.3488, + 5.19722, + 4.92825, + 4.76935, + 4.97035, + 4.93379, + 5.11701, + 5.18488, + 4.99943, + 5.11904, + 4.78261, + 5.29948, + 5.12962, + 5.26287, + 5.32794, + 5.23089, + 5.07579, + 5.21165, + 5.15483, + 4.94098, + 5.14296, + 4.70642, + 5.02005, + 4.9152, + 5.27068, + 5.31659, + 5.29478, + 5.17467, + 5.48285, + 5.17564, + 4.97944, + 5.11965, + 4.77649, + 5.43721, + 5.06011, + 5.12371, + 4.96652, + 5.11622, + 5.20294, + 5.20476, + 4.83474, + 4.99933, + 5.23165, + 4.80956, + 5.16499, + 5.40001, + 5.15955, + 5.10155, + 5.4379, + 4.92316, + 5.29426, + 4.83243, + 4.96744, + 5.04034, + 4.96892, + 5.42396, + 5.02501, + 4.91994, + 5.06529, + 5.23294, + 4.98085, + 5.0054, + 5.12737, + 4.99702, + 4.85744, + 4.64251, + 4.97963, + 5.30969, + 5.13006, + 4.84322, + 5.23145, + 5.0589, + 5.02944, + 5.1554, + 5.14248, + 5.29471, + 5.11387, + 5.01216, + 4.90647, + 4.93221, + 5.35247, + 5.39206, + 4.90045, + 5.27059, + 5.22647, + 5.11795, + 5.06723, + 4.96303, + 5.24919, + 5.29575, + 5.04291, + 5.20157, + 5.44766, + 5.09375, + 5.00037, + 5.18376, + 5.07238, + 5.05871, + 5.04124, + 4.98874, + 4.80654, + 5.15762, + 5.35158, + 5.13558, + 5.04201, + 5.21272, + 4.84443, + 5.09973, + 5.26597, + 5.26834, + 5.10139, + 5.36117, + 5.11024, + 5.31294, + 4.97496, + 4.7405, + 5.25625, + 4.9144, + 5.21628, + 5.06403, + 4.79898, + 4.89406, + 5.19256, + 5.24569, + 4.88062, + 5.01205, + 4.90107, + 5.14932, + 4.86965, + 4.99126, + 4.91607, + 4.86337, + 5.09162, + 4.9213, + 4.99198, + 4.81591, + 5.04119, + 5.08007, + 4.91372, + 4.88984, + 5.15553, + 5.44333, + 5.21246, + 5.00124, + 5.15027, + 4.82246, + 4.97428, + 4.94423, + 4.567, + 5.30908, + 4.99444, + 4.69225, + 4.80792, + 4.76228, + 4.91197, + 5.27037, + 4.83068, + 4.66668, + 4.93349, + 4.96998, + 4.88633, + 5.12723, + 4.93398, + 4.73109, + 5.27862, + 5.08144, + 4.8117, + 5.03094, + 4.85073, + 5.19184, + 5.38803, + 5.12819, + 4.97051, + 5.22417, + 5.01635, + 5.0717, + 5.19179, + 5.09407, + 5.09324, + 5.07832, + 5.26847, + 5.28364, + 5.1167, + 5.0541, + 4.58195, + 4.98147, + 4.96462, + 5.09185, + 5.15236, + 5.06825, + 5.01385, + 4.97451, + 5.09335, + 5.04342, + 5.08338, + 4.90682, + 5.17985, + 5.16023, + 5.08981, + 4.98628, + 4.89905, + 4.72349, + 4.79049, + 5.01912, + 4.71261, + 4.73899, + 5.31541, + 5.17609, + 4.88201, + 5.12856, + 4.91881, + 5.10478, + 4.78821, + 4.91988, + 4.55291, + 5.28126, + 5.38192, + 4.90148, + 4.91535, + 4.86343, + 4.51877, + 4.82147, + 5.19334, + 4.99626, + 5.1268, + 4.90126, + 4.97496, 
+ 4.6243, + 5.06909, + 4.78466, + 4.94887, + 4.41497, + 5.12551, + 4.89441, + 5.01441, + 4.9732, + 4.80138, + 4.87926, + 4.86248, + 4.78461, + 4.4913, + 4.93864, + 5.09337, + 5.02533, + 4.96463, + 4.91174, + 4.90578, + 5.02837, + 5.0042, + 5.18834, + 5.16745, + 4.94125, + 4.78142, + 5.08765, + 5.162, + 4.99523, + 4.72421, + 5.06853, + 5.15604, + 4.70324, + 5.14308, + 5.26969, + 5.01419, + 4.89412, + 4.66994, + 4.56827, + 4.82008, + 4.88612, + 4.99335, + 5.00443, + 5.00444, + 4.76957, + 5.23505, + 4.73968, + 5.14181, + 4.91469, + 5.23114, + 5.33121, + 4.81551, + 4.90884, + 4.9496, + 5.10944, + 4.47681, + 4.67398, + 4.8943, + 4.84807, + 5.11156, + 4.88003, + 5.00481, + 4.9316, + 5.34696, + 4.76706, + 4.66782, + 4.91814, + 5.01827, + 4.93052, + 4.7207, + 4.63041, + 4.76303, + 4.84309, + 4.69046, + 5.03413, + 5.03258, + 4.59029, + 5.05744, + 4.90873, + 5.21043, + 4.81666, + 5.0944, + 5.14665, + 4.78434, + 5.15583, + 4.9822, + 4.85239, + 5.05721, + 5.0517, + 4.78335, + 4.85769, + 4.99127, + 5.0996, + 4.9464, + 4.80083, + 4.62979, + 4.96829, + 4.8878, + 4.96983, + 4.61779, + 5.05413, + 4.79733, + 5.06758, + 4.85831, + 5.00424, + 4.79188, + 4.69064, + 5.03358, + 5.19736, + 4.92724, + 4.83414, + 4.78382, + 4.77864, + 5.132, + 5.23577, + 5.05201, + 4.72849, + 4.82143, + 4.63096, + 4.87687, + 4.48367, + 4.97165, + 4.85723, + 5.18116, + 4.99292, + 4.97902, + 5.17941, + 4.77471, + 4.71585, + 5.35185, + 4.68413, + 4.98282, + 4.67711, + 5.03022, + 4.93753, + 4.71009, + 4.88578, + 5.17075, + 5.02417, + 4.75791, + 4.95128, + 5.35481, + 4.56358, + 4.80616, + 4.70277, + 4.97661, + 4.83534, + 4.75097, + 4.87225, + 4.97889, + 4.5431, + 4.59369, + 5.12614, + 4.63494, + 4.97415, + 4.79503, + 5.15621, + 4.67314, + 4.70713, + 4.90119, + 4.92401, + 4.64504, + 5.11849, + 4.97763, + 5.1621, + 4.65454, + 4.6877, + 5.1589, + 5.01839, + 4.81071, + 5.24575, + 4.9913, + 4.80177, + 5.18696, + 4.87271, + 4.97809, + 4.88067, + 4.9305, + 4.81187, + 4.4605, + 4.92943, + 5.23168, + 4.94083, + 4.69259, + 4.76095, + 4.74441, + 4.81102, + 4.94293, + 4.90204, + 4.53579, + 4.91026, + 4.63342, + 4.90098, + 5.04656, + 4.89438, + 4.89704, + 4.9667, + 4.94035, + 4.64381, + 4.76133, + 4.49628, + 4.60273, + 4.87816, + 4.86968, + 5.03411, + 4.71504, + 4.18378, + 5.06436, + 4.47125, + 4.80177, + 5.02795, + 4.95047, + 4.74993, + 4.84984, + 4.99234, + 4.57989, + 4.80215, + 4.72603, + 4.96978, + 4.96059, + 4.83065, + 4.78615, + 4.85814, + 4.69989, + 4.56412, + 4.70496, + 4.85209, + 4.80944, + 4.791, + 4.8028, + 4.65022, + 4.90279, + 4.8498, + 4.68366, + 4.82477, + 4.96829, + 5.114, + 5.11631, + 4.94083, + 4.67494, + 5.05614, + 4.61798, + 4.68506, + 4.58312, + 4.89027, + 4.71545, + 4.92529, + 4.77487, + 4.3764, + 4.97832, + 4.81992, + 4.81131, + 4.91933, + 4.72543, + 4.5749, + 4.85909, + 4.98992, + 4.62782, + 5.00526, + 4.77509, + 4.54296, + 4.93964, + 4.65526, + 4.74844, + 4.98197, + 4.93855, + 4.73361, + 4.40623, + 4.84044, + 4.68303, + 4.5449, + 4.74978, + 4.73286, + 4.63082, + 5.10716, + 5.11458, + 5.04425, + 5.11559, + 4.88711, + 4.78152, + 4.92955, + 4.79275, + 4.92607, + 4.43538, + 4.72603, + 4.67828, + 4.76623, + 4.8814, + 4.96701, + 5.2285, + 4.83771, + 4.63808, + 4.58013, + 4.96567, + 5.07546, + 5.02061, + 4.51382, + 4.67226, + 4.6261, + 5.19041, + 4.9004, + 4.81254, + 4.92005, + 4.63456, + 4.82491, + 4.8335, + 4.78664, + 4.41905, + 4.87111, + 4.8236, + 4.36369, + 4.50181, + 4.99971, + 4.54458, + 4.40778, + 4.37317, + 4.84384, + 4.89916, + 4.83623, + 4.96574, + 4.72721, + 4.93398, + 4.90094, + 4.87484, + 4.69947, + 4.46603, + 
4.83921, + 5.13761, + 4.68306, + 4.49873, + 4.85083, + 4.93194, + 4.80737, + 4.9269, + 4.81604, + 4.56751, + 4.76934, + 4.97913, + 5.07645, + 4.61252, + 4.62552, + 4.79322, + 4.92026, + 4.65237, + 4.71413, + 4.6462, + 5.07187, + 4.36671, + 4.67012, + 5.09229, + 4.79901, + 4.6969, + 4.92218, + 4.69102, + 4.97988, + 4.75608, + 4.93425, + 4.3048, + 4.85624, + 4.65828, + 4.76871, + 5.08266, + 4.55283, + 4.58891, + 4.65472, + 4.81356, + 4.8506, + 4.57807, + 4.39672, + 5.14019, + 4.34043, + 4.68014, + 4.94118, + 4.444, + 4.90963, + 4.67061, + 5.12985, + 4.61707, + 4.58806, + 4.68679, + 4.96487, + 4.76082, + 4.39427, + 4.63108, + 4.55283, + 4.75749, + 4.49963, + 4.40536, + 4.98277, + 4.79013, + 4.6621, + 4.61666, + 4.83047, + 4.80454, + 4.66187, + 4.68888, + 4.86322, + 4.91509, + 4.53975, + 4.67541, + 4.73188, + 4.88715, + 4.57492, + 4.7416, + 4.51026, + 4.87815, + 4.64985, + 4.6465, + 4.78482, + 4.7504, + 4.57867, + 4.53992, + 4.8434, + 4.77999, + 4.48138, + 4.63586, + 4.55482, + 4.57308, + 4.57164, + 4.64359, + 4.75031, + 4.89821, + 4.65596, + 4.62546, + 4.68994, + 4.91806, + 4.49626, + 4.86053, + 4.71938, + 4.37908, + 4.65407, + 4.73407, + 4.57251, + 4.4987, + 4.76839, + 4.8754, + 4.79227, + 4.53006, + 4.54724, + 4.47674, + 4.42248, + 4.80017, + 4.73179, + 4.79641, + 4.79088, + 4.6273, + 4.66027, + 4.80137, + 4.48846, + 4.84206, + 4.40344, + 5.0109, + 4.62057, + 4.71667, + 4.9149, + 4.68968, + 4.25696, + 4.49662, + 4.80345, + 4.66772, + 4.86094, + 5.02861, + 4.55318, + 4.43461, + 4.78399, + 4.78803, + 4.75466, + 4.82244, + 4.53552, + 4.6763, + 4.88463, + 4.64964, + 4.73164, + 4.81068, + 5.19057, + 4.50818, + 4.5406, + 4.94924, + 4.57704, + 4.58163, + 4.80786, + 4.98468, + 4.58419, + 4.66698, + 4.65373, + 4.92446, + 4.74359, + 4.50878, + 4.89068, + 4.63939, + 4.61131, + 4.98252, + 4.59273, + 4.79158, + 4.53856, + 4.93761, + 4.61306, + 4.42088, + 4.63097, + 4.6103, + 4.59015, + 4.58752, + 4.62203, + 4.87797, + 4.72938, + 4.43258, + 4.60739, + 4.68735, + 4.42201, + 4.42015, + 4.74505, + 4.64322, + 4.91427, + 4.53722, + 4.70557, + 4.62932, + 4.66876, + 4.82749, + 4.71134, + 4.80566, + 4.52442, + 4.6009, + 4.64384, + 4.79434, + 4.74472, + 4.45022, + 4.77569, + 4.68638, + 4.4187, + 4.85921, + 4.87999, + 4.79189, + 4.37663, + 4.64966, + 4.29849, + 4.76478, + 4.68621, + 4.55806, + 4.53001, + 4.47709, + 4.78342, + 4.58067, + 4.50417, + 4.34648, + 4.52445, + 4.80306, + 4.51902, + 4.75548, + 4.64674, + 4.39946, + 4.71706, + 4.63076, + 4.62203, + 4.71245, + 4.82305, + 4.52816, + 4.71965, + 4.75728, + 4.50563, + 5.02663, + 4.79956, + 4.65917, + 4.5779, + 4.47024, + 4.83687, + 4.45878, + 4.60851, + 4.62461, + 4.89863, + 4.91485, + 4.72872, + 4.54498, + 4.9651, + 4.3266, + 4.64575, + 4.74564, + 4.81184, + 4.65392, + 4.59487, + 4.75213, + 4.66301, + 4.46364, + 4.5547, + 4.58862, + 4.44177, + 4.70497, + 4.51295, + 4.49054, + 4.69194, + 4.37789, + 4.66219, + 4.79966, + 4.55419, + 4.33516, + 4.20753, + 4.88029, + 5.06925, + 4.44313, + 4.32421, + 4.58562, + 4.62403, + 4.68836, + 4.33875, + 4.59315, + 4.87061, + 4.71288, + 4.39329, + 4.38261, + 4.44289, + 4.46501, + 4.58984, + 4.4295, + 4.76357, + 4.65818, + 4.29182, + 4.71164, + 4.65288, + 4.4973, + 4.78969, + 4.37633, + 4.35127, + 4.307, + 4.52359, + 4.82105, + 4.53729, + 4.76207, + 4.42362, + 4.40303, + 4.4377, + 4.86301, + 4.90302, + 4.692, + 4.57753, + 4.70418, + 4.50144, + 4.85641, + 4.55561, + 4.31637, + 4.35236, + 4.30115, + 4.79165, + 4.90526, + 4.86331, + 4.66247, + 4.54139, + 4.68041, + 4.58016, + 4.27833, + 4.5759, + 4.67343, + 4.27369, + 4.67216, + 
4.65717, + 4.67139, + 4.54835, + 4.39216, + 4.50057, + 4.56748, + 4.60155, + 4.80153, + 4.11793, + 4.47047, + 4.18955, + 4.33829, + 4.66226, + 4.44477, + 4.62824, + 4.30975, + 4.42812, + 4.71616, + 4.73539, + 4.30571, + 4.09786, + 4.67863, + 4.48796, + 4.55961, + 4.67433, + 4.72275, + 4.19958, + 4.47261, + 4.58471, + 4.30993, + 4.96653, + 4.40258, + 4.44839, + 4.32347, + 4.51009, + 4.26612, + 4.43606, + 4.70357, + 4.66502, + 4.42429, + 4.2093, + 4.79596, + 4.15997, + 4.91028, + 4.17702, + 4.20549, + 4.44555, + 4.32572, + 4.61908, + 4.15513, + 4.79776, + 4.50623, + 4.38259, + 4.42717, + 4.57026, + 4.36837, + 4.86207, + 4.64917, + 4.61132, + 4.50166, + 4.58746, + 4.66519, + 4.30949, + 4.40413, + 4.76713, + 4.52146, + 4.78904, + 4.4571, + 4.50096, + 4.56644, + 4.73034, + 4.78384, + 4.61916, + 4.73353, + 4.57054, + 4.39329, + 4.7341, + 4.35901, + 4.70845, + 4.65756, + 4.66067, + 4.51914, + 4.64305, + 4.52182, + 4.66556, + 4.4135, + 4.41948, + 4.24224, + 4.2263, + 4.4588, + 4.47769, + 4.31695, + 4.73466, + 4.44606, + 4.73487, + 3.9312, + 4.85601, + 4.63095, + 4.26169, + 4.42984, + 4.48301, + 4.42146, + 4.55999, + 4.47162, + 4.74291, + 4.6523, + 4.68257, + 4.29395, + 4.49655, + 4.85343, + 4.4064, + 4.56434, + 4.47784, + 4.91544, + 4.67268, + 4.42724, + 4.98248, + 4.25848, + 4.66936, + 4.76909, + 4.25358, + 4.49284, + 4.65497, + 4.44305, + 4.17465, + 4.72947, + 4.03942, + 4.68037, + 4.45605, + 4.77292, + 4.48504, + 4.63545, + 4.55736, + 4.14487, + 4.44325, + 4.71957, + 4.37663, + 4.56119, + 4.35405, + 4.46848, + 4.27411, + 4.23502, + 4.25284, + 4.37734, + 4.60687, + 4.14061, + 4.51885, + 4.26807, + 4.6728, + 4.66543, + 4.68522, + 4.052, + 4.23172, + 4.37141, + 4.23223, + 4.70984, + 4.28569, + 4.53202, + 4.69518, + 4.51001, + 4.622, + 4.61422, + 4.27405, + 4.70186, + 4.53139, + 4.61653, + 4.52805, + 4.45494, + 4.64947, + 4.36956, + 4.60318, + 4.57024, + 4.54094, + 4.48008, + 4.63427, + 4.72048, + 4.38163, + 4.48795, + 4.58948, + 4.43165, + 4.42964, + 4.36689, + 4.29122, + 4.46294, + 4.25289, + 4.2381, + 4.5669, + 4.65292, + 4.72824, + 4.5424, + 4.5074, + 4.41069, + 4.34589, + 4.66087, + 4.3667, + 4.12599, + 4.46192, + 4.6647, + 4.39198, + 4.30146, + 4.44691, + 4.0823, + 4.37265, + 4.44928, + 4.55266, + 4.32833, + 4.56199, + 4.5511, + 4.61409, + 4.52698, + 4.58919, + 4.40964, + 4.62931, + 4.65034, + 4.72942, + 4.58582, + 4.75097, + 4.45131, + 4.62278, + 4.30087, + 4.20944, + 4.72759, + 4.64991, + 4.276, + 4.61855, + 4.34225, + 4.31856, + 4.43884, + 4.20519, + 4.62112, + 4.41565, + 4.29785, + 4.24867, + 4.48361, + 4.78776, + 4.68757, + 4.53799, + 4.21952, + 4.28089, + 4.51176, + 4.25543, + 4.61468, + 4.38846, + 4.21651, + 4.40214, + 4.89177, + 4.34657, + 4.47874, + 4.22253, + 4.37631, + 4.24356, + 4.01877, + 4.47286, + 4.38093, + 4.22209, + 4.62499, + 4.38607, + 4.66667, + 4.71728, + 4.40116, + 4.45076, + 4.50306, + 4.60412, + 4.72615, + 4.47617, + 4.56085, + 4.81438, + 4.23634, + 4.3366, + 4.46868, + 4.78242, + 4.53482, + 4.23392, + 4.61119, + 4.4743, + 4.13638, + 4.10941, + 4.80199, + 4.33583, + 4.40042, + 4.74981, + 4.40471, + 4.5992, + 4.44396, + 4.29101, + 4.59187, + 4.36723, + 4.45177, + 4.55756, + 4.36824, + 4.54848, + 4.31046, + 4.69068, + 4.60546, + 4.29302, + 3.78524, + 4.64622, + 4.52625, + 4.36206, + 4.0618, + 4.61758, + 4.43272, + 4.02894, + 4.47178, + 4.32032, + 4.63518, + 4.32917, + 4.5668, + 4.35877, + 4.72676, + 5.00534, + 4.58696, + 4.2586, + 4.60091, + 4.34239, + 4.36907, + 4.86409, + 4.29057, + 4.38333, + 4.30863, + 4.39333, + 4.59365, + 4.40166, + 4.07245, + 4.60984, + 4.61895, 
+ 4.00926, + 4.6481, + 4.53555, + 4.2329, + 4.45218, + 4.32422, + 4.56335, + 4.18252, + 4.00789, + 4.36448, + 4.56634, + 4.55995, + 4.24424, + 4.49537, + 4.4365, + 4.32871, + 4.51815, + 4.58975, + 4.35395, + 4.44043, + 4.39594, + 4.31501, + 4.24702, + 4.59454, + 4.32586, + 4.79668, + 4.24409, + 4.53054, + 4.44084, + 4.55064, + 3.97967, + 4.37847, + 4.36902, + 4.62033, + 4.41077, + 4.54702, + 4.66114, + 4.58558, + 4.73869, + 4.6505, + 4.28815, + 4.62306, + 4.61922, + 4.62194, + 4.47024, + 4.38572, + 4.23153, + 4.4582, + 4.39949, + 4.51669, + 4.54652, + 4.44432, + 4.07713, + 4.89498, + 4.40956, + 4.5585, + 4.45401, + 4.64648, + 4.34599, + 4.38254, + 4.2725, + 4.71591, + 3.87683, + 4.37337, + 4.47734, + 4.45168, + 4.08619, + 4.23965, + 4.39212, + 4.5313, + 4.33085, + 4.23232, + 4.45552, + 4.48156, + 4.36242, + 4.43116, + 4.19682, + 4.29684, + 4.38084, + 4.62292, + 4.45856, + 4.44504, + 4.36544, + 4.63477, + 4.2519, + 4.2906, + 4.01187, + 4.71216, + 4.30352, + 4.29585, + 4.25058, + 4.46083, + 4.66354, + 4.71122, + 4.60744, + 4.12529, + 3.94824, + 4.48864, + 4.2015, + 4.2891, + 4.62722, + 4.5061, + 4.37218, + 4.45055, + 4.00527, + 4.45265, + 4.43356, + 4.2977, + 4.55992, + 4.6705, + 4.18849, + 4.54513, + 4.4587, + 3.99098, + 4.21912, + 4.2775, + 4.42525, + 4.31546, + 4.25047, + 4.28106, + 4.68477, + 4.20129, + 4.5783, + 4.4996, + 4.62058, + 4.35665, + 4.56785, + 4.28635, + 4.20255, + 4.7094, + 4.28498, + 4.29269, + 4.71604, + 4.29835, + 4.19412, + 4.70592, + 4.73931, + 4.3699, + 4.25445, + 4.23463, + 4.89396, + 4.72456, + 4.47222, + 4.47906, + 4.4803, + 4.22133, + 4.74637, + 4.07069, + 4.33534, + 4.72215, + 4.5711, + 4.30587, + 4.15091, + 4.16803, + 4.27706, + 4.29576, + 4.53465, + 4.48614, + 4.37501, + 4.04455, + 4.30444, + 4.2725, + 4.21472, + 4.40963, + 4.35502, + 4.31452, + 4.29067, + 4.65515, + 4.05838, + 4.53869, + 4.05647, + 4.42281, + 4.47959, + 4.24617, + 4.33588, + 4.05389, + 4.31867, + 4.49374, + 4.11889, + 4.35429, + 4.28919, + 4.52904, + 4.37941, + 4.4773, + 4.26081, + 3.991, + 4.45552, + 4.17192, + 4.36896, + 4.18408, + 3.96995, + 4.23564, + 4.43569, + 4.4537, + 4.05621, + 4.1512, + 4.43451 + ] + }, + "mem-allocated-bytes": { + "start_step": 0, + "end_step": 16335, + "step_interval": 5, + "values": [ + 151624192.0, + 151624704.0, + 152017920.0, + 231819776.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233392640.0, + 232344064.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233392640.0, + 232344064.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233392640.0, + 232344064.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233392640.0, + 232344064.0, + 233392640.0, + 232344064.0, + 233392640.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 232344064.0, + 232344064.0, + 233392640.0, + 233392640.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, 
+ 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233392640.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 232344064.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 232344064.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 232344064.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233392640.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233916928.0, + 233392640.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 232344064.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233392640.0, + 232344064.0, + 232344064.0, + 233916928.0, + 233392640.0, + 233392640.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233916928.0, + 232344064.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233916928.0, + 233392640.0, + 232344064.0, + 232344064.0, + 233392640.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233916928.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233392640.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233392640.0, + 232344064.0, + 233392640.0, + 233392640.0, + 232344064.0, + 232344064.0, + 233392640.0, + 232344064.0, + 232344064.0, + 233916928.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233392640.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233916928.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 234965504.0, + 232344064.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233392640.0, + 232344064.0, + 232344064.0, + 233392640.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, 
+ 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233392640.0, + 233916928.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233916928.0, + 232344064.0, + 232344064.0, + 233392640.0, + 232344064.0, + 233392640.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233392640.0, + 232344064.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233916928.0, + 233392640.0, + 232344064.0, + 233392640.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233916928.0, + 232344064.0, + 233392640.0, + 233392640.0, + 232344064.0, + 231295488.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233392640.0, + 232344064.0, + 232344064.0, + 233392640.0, + 233392640.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233916928.0, + 233916928.0, + 233392640.0, + 232344064.0, + 234965504.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233392640.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 232344064.0, + 232344064.0, + 233392640.0, + 232344064.0, + 233392640.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 232344064.0, + 232344064.0, + 233392640.0, + 233916928.0, + 233916928.0, + 232344064.0, + 233392640.0, + 234965504.0, + 232344064.0, + 233916928.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233392640.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233392640.0, + 233916928.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233392640.0, + 232344064.0, + 232344064.0, + 233392640.0, + 232344064.0, + 233392640.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233392640.0, + 233392640.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233392640.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233392640.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233392640.0, + 233392640.0, + 232344064.0, + 232344064.0, + 233916928.0, + 233392640.0, + 232344064.0, + 233392640.0, + 233916928.0, + 233392640.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233916928.0, + 233392640.0, + 233392640.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233392640.0, + 233392640.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233392640.0, + 232344064.0, + 233392640.0, + 233392640.0, + 232344064.0, + 232344064.0, + 233392640.0, + 233392640.0, + 232344064.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233392640.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233392640.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233916928.0, + 233392640.0, + 232344064.0, + 233916928.0, + 233392640.0, + 232344064.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233392640.0, + 232344064.0, + 233392640.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233916928.0, + 232344064.0, + 232344064.0, + 233392640.0, + 232344064.0, + 232344064.0, + 232344064.0, + 234965504.0, + 232344064.0, + 232344064.0, + 231295488.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233916928.0, 
+ 233392640.0, + 233392640.0, + 232344064.0, + 232344064.0, + 233392640.0, + 232344064.0, + 233392640.0, + 232344064.0, + 232344064.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233392640.0, + 232344064.0, + 232344064.0, + 233392640.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233392640.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233392640.0, + 233392640.0, + 232344064.0, + 232344064.0, + 233392640.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233392640.0, + 233392640.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233392640.0, + 232344064.0, + 232344064.0, + 233392640.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233392640.0, + 233392640.0, + 232344064.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233392640.0, + 232344064.0, + 232868352.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233392640.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233916928.0, + 233392640.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233392640.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233392640.0, + 232344064.0, + 233392640.0, + 233916928.0, + 233392640.0, + 232344064.0, + 232344064.0, + 233916928.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233916928.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233392640.0, + 232344064.0, + 233392640.0, + 232344064.0, + 234965504.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233916928.0, + 232344064.0, + 233916928.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233392640.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233916928.0, + 232344064.0, + 232344064.0, + 233916928.0, + 232344064.0, + 233916928.0, + 232344064.0, + 233916928.0, + 232344064.0, + 234965504.0, + 232344064.0, + 233916928.0, + 233916928.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233392640.0, + 233916928.0, + 232344064.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233392640.0, + 232344064.0, + 232344064.0, + 233916928.0, + 233916928.0, + 232344064.0, + 233916928.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233916928.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233916928.0, + 233916928.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233916928.0, + 232344064.0, + 233916928.0, + 234965504.0, + 233916928.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233392640.0, + 232344064.0, + 233392640.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233392640.0, + 233916928.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 234965504.0, + 232344064.0, + 233392640.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233916928.0, + 232868352.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233392640.0, + 232344064.0, + 232344064.0, + 233392640.0, + 233916928.0, + 233916928.0, + 233916928.0, + 233916928.0, 
+ 232344064.0, + 232344064.0, + 232344064.0, + 233916928.0, + 232344064.0, + 233916928.0, + 233916928.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233392640.0, + 233916928.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233916928.0, + 233916928.0, + 233916928.0, + 233392640.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233916928.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233916928.0, + 233392640.0, + 232344064.0, + 232344064.0, + 233916928.0, + 233392640.0, + 232344064.0, + 233916928.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233392640.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233392640.0, + 234965504.0, + 232344064.0, + 233392640.0, + 232344064.0, + 233392640.0, + 232344064.0, + 233392640.0, + 233392640.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233392640.0, + 232344064.0, + 232344064.0, + 233392640.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233392640.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233392640.0, + 233392640.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233392640.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233392640.0, + 232344064.0, + 233916928.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233916928.0, + 233392640.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233916928.0, + 232344064.0, + 232344064.0, + 233392640.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233916928.0, + 233392640.0, + 232344064.0, + 232344064.0, + 233392640.0, + 232344064.0, + 233392640.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233916928.0, + 232344064.0, + 232344064.0, + 233392640.0, + 233392640.0, + 232344064.0, + 232344064.0, + 233916928.0, + 233916928.0, + 232344064.0, + 233916928.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233916928.0, + 232344064.0, + 232344064.0, + 233916928.0, + 233916928.0, + 232344064.0, + 233392640.0, + 233916928.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233916928.0, + 232344064.0, + 233392640.0, + 232344064.0, + 232344064.0, + 232868352.0, + 232344064.0, + 233392640.0, + 232344064.0, + 232344064.0, + 233916928.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233392640.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233916928.0, + 233916928.0, + 232344064.0, + 233392640.0, + 233392640.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233916928.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233916928.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233392640.0, + 232344064.0, + 233392640.0, + 233916928.0, + 233392640.0, + 232344064.0, + 233392640.0, + 233916928.0, + 232344064.0, + 233916928.0, + 232344064.0, + 231295488.0, + 233916928.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233916928.0, + 233916928.0, + 232344064.0, + 232344064.0, + 233392640.0, + 232344064.0, + 232344064.0, + 233392640.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233916928.0, + 232344064.0, 
+ 233916928.0, + 233392640.0, + 233392640.0, + 233916928.0, + 233916928.0, + 233392640.0, + 232344064.0, + 233916928.0, + 232344064.0, + 233916928.0, + 232344064.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233392640.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233392640.0, + 233916928.0, + 232344064.0, + 233916928.0, + 232344064.0, + 232344064.0, + 233916928.0, + 233392640.0, + 232344064.0, + 233392640.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233916928.0, + 232344064.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233916928.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233392640.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233392640.0, + 232344064.0, + 232344064.0, + 233916928.0, + 232344064.0, + 233916928.0, + 233916928.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233392640.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233916928.0, + 233392640.0, + 233916928.0, + 233392640.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233392640.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233916928.0, + 232344064.0, + 233392640.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233916928.0, + 232344064.0, + 233916928.0, + 233392640.0, + 232344064.0, + 232344064.0, + 233916928.0, + 232344064.0, + 232344064.0, + 233916928.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 234965504.0, + 232344064.0, + 233916928.0, + 232344064.0, + 232344064.0, + 233916928.0, + 233916928.0, + 232344064.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233392640.0, + 233392640.0, + 234965504.0, + 232344064.0, + 233392640.0, + 232344064.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233392640.0, + 232344064.0, + 233392640.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 234965504.0, + 233392640.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233392640.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233392640.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233392640.0, + 233916928.0, + 232344064.0, + 233392640.0, + 232344064.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 232344064.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 232344064.0, + 232344064.0, + 233916928.0, + 233392640.0, + 233392640.0, + 233916928.0, + 232344064.0, + 233916928.0, + 233392640.0, + 233392640.0, + 234965504.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233916928.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233916928.0, + 232344064.0, + 233916928.0, + 233916928.0, + 234965504.0, + 232344064.0, + 233916928.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233916928.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233916928.0, + 233916928.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233392640.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233916928.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233392640.0, 
+ 233916928.0, + 232344064.0, + 233392640.0, + 233392640.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233392640.0, + 232344064.0, + 232344064.0, + 233916928.0, + 232344064.0, + 233392640.0, + 232344064.0, + 233916928.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233916928.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233392640.0, + 232344064.0, + 233916928.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233392640.0, + 233916928.0, + 233916928.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233392640.0, + 232344064.0, + 232344064.0, + 233392640.0, + 232344064.0, + 232868352.0, + 232344064.0, + 232344064.0, + 233392640.0, + 233916928.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233392640.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233392640.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233916928.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233916928.0, + 233392640.0, + 233916928.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233392640.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233916928.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233916928.0, + 233392640.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233916928.0, + 233916928.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 234965504.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233392640.0, + 232344064.0, + 232344064.0, + 233916928.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233392640.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233392640.0, + 232344064.0, + 232344064.0, + 233392640.0, + 233916928.0, + 233392640.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233392640.0, + 232344064.0, + 233916928.0, + 232344064.0, + 232344064.0, + 233392640.0, + 232344064.0, + 233392640.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233392640.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233916928.0, + 233916928.0, + 232344064.0, + 232344064.0, + 233392640.0, + 233916928.0, + 232344064.0, + 232344064.0, + 233916928.0, + 233916928.0, + 233916928.0, + 232344064.0, + 232344064.0, + 233392640.0, + 233916928.0, + 233392640.0, + 233916928.0, + 232344064.0, + 232344064.0, + 233392640.0, + 233916928.0, + 232344064.0, + 232344064.0, + 233392640.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233392640.0, + 233392640.0, + 232344064.0, + 232344064.0, + 233916928.0, + 232344064.0, + 232344064.0, + 233916928.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233916928.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233392640.0, + 232344064.0, + 233916928.0, + 232344064.0, + 232344064.0, + 233392640.0, + 233392640.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233392640.0, + 232344064.0, + 233392640.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233916928.0, + 232344064.0, + 232344064.0, + 233916928.0, + 232344064.0, + 233392640.0, + 232344064.0, + 233392640.0, + 233916928.0, + 232344064.0, + 232344064.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233392640.0, + 233916928.0, + 232344064.0, + 233392640.0, + 232344064.0, + 233916928.0, + 232868352.0, + 232344064.0, + 232344064.0, 
+ 233916928.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233916928.0, + 232344064.0, + 233916928.0, + 232344064.0, + 233916928.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233392640.0, + 232344064.0, + 232344064.0, + 233392640.0, + 232344064.0, + 233392640.0, + 233916928.0, + 233916928.0, + 232344064.0, + 233916928.0, + 232344064.0, + 233916928.0, + 233916928.0, + 233916928.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233916928.0, + 232344064.0, + 232344064.0, + 233916928.0, + 233392640.0, + 232344064.0, + 233916928.0, + 233916928.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233392640.0, + 232344064.0, + 232344064.0, + 233392640.0, + 232344064.0, + 233916928.0, + 233916928.0, + 233392640.0, + 233916928.0, + 233916928.0, + 233392640.0, + 232344064.0, + 233392640.0, + 232344064.0, + 233916928.0, + 232344064.0, + 232344064.0, + 233392640.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233916928.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233916928.0, + 233916928.0, + 233916928.0, + 232344064.0, + 233916928.0, + 233916928.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233916928.0, + 233392640.0, + 232344064.0, + 232344064.0, + 233916928.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233916928.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233916928.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233916928.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233392640.0, + 232344064.0, + 233392640.0, + 232344064.0, + 233916928.0, + 233916928.0, + 232344064.0, + 232344064.0, + 233392640.0, + 232344064.0, + 233916928.0, + 233916928.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233916928.0, + 233392640.0, + 233392640.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233392640.0, + 232344064.0, + 233392640.0, + 232344064.0, + 233916928.0, + 232344064.0, + 232344064.0, + 233392640.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233392640.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233916928.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233392640.0, + 232344064.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233916928.0, + 232344064.0, + 233916928.0, + 232344064.0, + 232344064.0, + 233392640.0, + 233916928.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233916928.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233392640.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233392640.0, + 232344064.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233916928.0, + 232344064.0, + 232344064.0, + 233392640.0, + 232344064.0, + 232344064.0, + 233392640.0, + 232344064.0, + 233392640.0, + 232344064.0, + 233916928.0, + 232344064.0, + 233916928.0, + 232344064.0, + 232868352.0, + 232344064.0, + 233916928.0, + 233392640.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233916928.0, + 233916928.0, + 232344064.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233392640.0, + 232344064.0, + 232344064.0, + 232868352.0, + 233916928.0, + 232344064.0, + 232868352.0, + 233916928.0, + 232344064.0, + 233916928.0, + 233916928.0, + 232344064.0, + 232344064.0, + 233392640.0, + 232344064.0, + 232344064.0, 
+ 232344064.0, + 233392640.0, + 233916928.0, + 233916928.0, + 233916928.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233392640.0, + 232344064.0, + 232344064.0, + 233392640.0, + 232344064.0, + 232344064.0, + 233392640.0, + 232344064.0, + 233916928.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233392640.0, + 232344064.0, + 233392640.0, + 232344064.0, + 232344064.0, + 233392640.0, + 233916928.0, + 232344064.0, + 232344064.0, + 233916928.0, + 233392640.0, + 233916928.0, + 232344064.0, + 232344064.0, + 233392640.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233392640.0, + 232344064.0, + 232344064.0, + 233916928.0, + 232344064.0, + 233916928.0, + 233392640.0, + 232344064.0, + 233392640.0, + 232344064.0, + 232344064.0, + 233392640.0, + 233392640.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233392640.0, + 232344064.0, + 233392640.0, + 232344064.0, + 233916928.0, + 233392640.0, + 232344064.0, + 232344064.0, + 233392640.0, + 232344064.0, + 232868352.0, + 232344064.0, + 233392640.0, + 233916928.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233916928.0, + 232344064.0, + 232344064.0, + 233392640.0, + 232344064.0, + 232344064.0, + 233392640.0, + 232344064.0, + 232344064.0, + 233916928.0, + 233916928.0, + 233916928.0, + 233392640.0, + 232344064.0, + 232344064.0, + 233392640.0, + 232344064.0, + 233916928.0, + 232344064.0, + 233392640.0, + 232344064.0, + 232344064.0, + 233916928.0, + 233392640.0, + 233392640.0, + 233916928.0, + 232344064.0, + 233392640.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233916928.0, + 233916928.0, + 232344064.0, + 233916928.0, + 233916928.0, + 233916928.0, + 233392640.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233916928.0, + 232344064.0, + 233916928.0, + 233916928.0, + 232344064.0, + 233916928.0, + 232344064.0, + 232344064.0, + 233392640.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233916928.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233916928.0, + 232344064.0, + 233916928.0, + 233916928.0, + 232344064.0, + 233392640.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233916928.0, + 233392640.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233916928.0, + 232344064.0, + 233392640.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233916928.0, + 232344064.0, + 232344064.0, + 233916928.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233392640.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233392640.0, + 232344064.0, + 232344064.0, + 233392640.0, + 233916928.0, + 233392640.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233916928.0, + 232344064.0, + 233916928.0, + 233392640.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233392640.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233916928.0, + 232344064.0, + 233392640.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233392640.0, + 232344064.0, + 233916928.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, 
+ 233916928.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233916928.0, + 233916928.0, + 233392640.0, + 232344064.0, + 232344064.0, + 233916928.0, + 232344064.0, + 232344064.0, + 233916928.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233916928.0, + 233392640.0, + 232344064.0, + 233916928.0, + 233392640.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233916928.0, + 232344064.0, + 232344064.0, + 233916928.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233916928.0, + 233916928.0, + 232344064.0, + 233916928.0, + 233916928.0, + 233392640.0, + 232344064.0, + 233392640.0, + 233916928.0, + 233916928.0, + 232344064.0, + 232344064.0, + 233392640.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233392640.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233916928.0, + 233392640.0, + 232344064.0, + 233916928.0, + 232344064.0, + 232344064.0, + 233916928.0, + 233916928.0, + 233392640.0, + 232344064.0, + 232344064.0, + 233392640.0, + 233916928.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 234965504.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233916928.0, + 232344064.0, + 233916928.0, + 232344064.0, + 233916928.0, + 233392640.0, + 232344064.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233392640.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233392640.0, + 233392640.0, + 232344064.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233392640.0, + 232344064.0, + 233392640.0, + 233392640.0, + 232344064.0, + 232344064.0, + 233392640.0, + 232344064.0, + 233392640.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233392640.0, + 232344064.0, + 232344064.0, + 233392640.0, + 232344064.0, + 232344064.0, + 233392640.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233392640.0, + 232344064.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233392640.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233916928.0, + 233916928.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233392640.0, + 232344064.0, + 234965504.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233392640.0, + 232344064.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233392640.0, + 234965504.0, + 233392640.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233392640.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233392640.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233916928.0, + 232344064.0, 
+ 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233392640.0, + 232344064.0, + 232868352.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233392640.0, + 232344064.0, + 232344064.0, + 233392640.0, + 232344064.0, + 232344064.0, + 232868352.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233916928.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233392640.0, + 233916928.0, + 232344064.0, + 233392640.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233392640.0, + 232344064.0, + 232344064.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233392640.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233392640.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233392640.0, + 234965504.0, + 233392640.0, + 233916928.0, + 233392640.0, + 234965504.0, + 232344064.0, + 233392640.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233392640.0, + 234965504.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233916928.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233392640.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233916928.0, + 232344064.0, + 233392640.0, + 232344064.0, + 232344064.0, + 233916928.0, + 233392640.0, + 232868352.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233392640.0, + 233916928.0, + 233916928.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233916928.0, + 231295488.0, + 232344064.0, + 233392640.0, + 233916928.0, + 232344064.0, + 232344064.0, + 233392640.0, + 232868352.0, + 232344064.0, + 233392640.0, + 232344064.0, + 232344064.0, + 233392640.0, + 232344064.0, + 232344064.0, + 233392640.0, + 232344064.0, + 232344064.0, + 231295488.0, + 232344064.0, + 233916928.0, + 233916928.0, + 233392640.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233392640.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233392640.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233916928.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233916928.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233916928.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233916928.0, + 233392640.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233392640.0, + 233392640.0, + 232344064.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233392640.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233392640.0, + 232344064.0, + 232344064.0, + 233392640.0, + 232344064.0, + 233392640.0, + 232344064.0, + 232344064.0, + 233392640.0, + 233916928.0, + 234965504.0, + 233392640.0, + 233392640.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233392640.0, 
+ 233392640.0, + 232344064.0, + 232344064.0, + 233392640.0, + 233916928.0, + 233392640.0, + 232344064.0, + 231295488.0, + 232344064.0, + 232344064.0, + 233916928.0, + 233392640.0, + 233916928.0, + 233392640.0, + 233392640.0, + 233392640.0, + 232344064.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233916928.0, + 233392640.0, + 233916928.0, + 232344064.0, + 233392640.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 232344064.0, + 234965504.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233916928.0, + 232344064.0, + 233392640.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233916928.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 232344064.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233916928.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233392640.0, + 232344064.0, + 232344064.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233392640.0, + 233916928.0, + 233392640.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233916928.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233392640.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233392640.0, + 232344064.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233392640.0, + 232344064.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233392640.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233392640.0, + 232344064.0, + 232344064.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233392640.0, 
+ 232344064.0, + 232344064.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233392640.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 232344064.0, + 232344064.0, + 233392640.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 232344064.0, + 234965504.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233392640.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 232344064.0, + 232344064.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233392640.0, + 233916928.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233392640.0, + 232344064.0, + 233392640.0, + 232344064.0, + 234965504.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 232344064.0, + 232344064.0, + 233392640.0, + 232344064.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233392640.0, + 233916928.0, + 233392640.0, + 233916928.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 232344064.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233392640.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233916928.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233392640.0, + 232344064.0, + 233916928.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233916928.0, + 232344064.0, + 232344064.0, + 233392640.0, + 233916928.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233392640.0, + 233392640.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233916928.0, + 232344064.0, + 233916928.0, + 232344064.0, + 232344064.0, 
+ 232344064.0, + 232344064.0, + 232344064.0, + 233392640.0, + 232344064.0, + 232344064.0, + 233392640.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 234965504.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233916928.0, + 232344064.0, + 233392640.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233916928.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233392640.0, + 232344064.0, + 232344064.0, + 233916928.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233916928.0, + 232344064.0, + 233392640.0, + 232344064.0, + 233392640.0, + 232344064.0, + 232344064.0, + 233392640.0, + 233392640.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232868352.0, + 232344064.0, + 232344064.0, + 233916928.0, + 232344064.0, + 232344064.0, + 231295488.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233916928.0, + 233916928.0, + 233392640.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233392640.0, + 233392640.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233916928.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233392640.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233392640.0, + 233392640.0, + 232344064.0, + 232344064.0, + 233392640.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233392640.0, + 232344064.0, + 232344064.0, + 233392640.0, + 232344064.0, + 233916928.0, + 232344064.0, + 232344064.0, + 233916928.0, + 233916928.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233916928.0, + 232344064.0, + 233916928.0, + 233392640.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233392640.0, + 233392640.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233392640.0, + 232344064.0, + 232344064.0, + 234965504.0, + 233392640.0, + 232344064.0, + 232344064.0, + 233392640.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233392640.0, + 232344064.0, + 233916928.0, + 232344064.0, + 233916928.0, + 233392640.0, + 232344064.0, + 233916928.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233916928.0, + 232344064.0, + 233392640.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233392640.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233392640.0, + 232344064.0, + 232344064.0, + 233916928.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233392640.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233916928.0, + 232344064.0, + 233392640.0, + 232344064.0, + 232344064.0, + 233392640.0, + 232344064.0, + 232344064.0, + 233916928.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, 
+ 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233916928.0, + 233392640.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233392640.0, + 233916928.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233392640.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233392640.0, + 233916928.0, + 232344064.0, + 233392640.0, + 232868352.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233392640.0, + 232344064.0, + 233392640.0, + 232344064.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233916928.0, + 232344064.0, + 233392640.0, + 232344064.0, + 233392640.0, + 232344064.0, + 233392640.0, + 232344064.0, + 232344064.0, + 232344064.0, + 233392640.0, + 232344064.0, + 232344064.0, + 233392640.0, + 233916928.0, + 232344064.0, + 233392640.0, + 232344064.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233916928.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 233392640.0, + 232344064.0, + 233392640.0 + ] + }, + "iteration-time": { + "start_step": 0, + "end_step": 163, + "step_interval": 5, + "values": [ + 0.95312, + 0.38289, + 0.45849, + 0.52211, + 0.39902, + 0.40484, + 0.46371, + 0.42504, + 0.61644, + 0.40232, + 0.37125, + 0.43733, + 0.65037, + 0.41577, + 0.42127, + 0.40125, + 0.42634, + 0.40008, + 0.42375, + 0.52799, + 0.41603, + 0.41023, + 0.52821, + 0.50114, + 0.58024, + 0.63016, + 0.45667, + 0.40373, + 0.41419, + 0.44541, + 0.43878, + 0.43471, + 0.50943 + ] + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/bert/bert_release/model_config.yaml b/tests/functional_tests/test_cases/bert/bert_release/model_config.yaml new file mode 100644 index 0000000000..5c92fbf7da --- /dev/null +++ b/tests/functional_tests/test_cases/bert/bert_release/model_config.yaml @@ -0,0 +1,54 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: '1' + NVTE_ALLOW_NONDETERMINISTIC_ALGO: '1' + NVTE_FLASH_ATTN: '0' + NVTE_FUSED_ATTN: '0' + +TEST_TYPE: 'release' + +MODEL_ARGS: + # Bert model args + --num-layers: 24 + --hidden-size: 1024 + --num-attention-heads: 16 + --seq-length: 512 + --max-position-embeddings: 512 + + # Training args + --micro-batch-size: 4 + --global-batch-size: 32 + --train-iters: 20000 + --weight-decay: 1e-2 + --clip-grad: 1.0 + --fp16: true + --lr: 0.0001 + --lr-decay-style: linear + --min-lr: 1.0e-5 + --lr-warmup-fraction: .01 + --bert-no-binary-head: true + + # Model parallel + --tensor-model-parallel-size: 8 + --pipeline-model-parallel-size: 8 + + # Data args + --data-path: ${DATA_BLEND} + --vocab-file: ${DATA_PATH}/vocab.txt + --split: 949,50,1 + --data-cache-path: ${DATA_CACHE_PATH} + + # EVAL_AND_LOGGING_ARGS + --log-interval: 100 + --save-interval: 2000 + --eval-interval: 1000 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --eval-iters: 10 + --tensorboard-dir: ${TENSORBOARD_PATH} + --log-timers-to-tensorboard: true + --log-memory-to-tensorboard: true + --log-num-zeros-in-grad: true + --log-params-norm: true + --log-validation-ppl-to-tensorboard: true + --wandb-project: megatron-core-release-runs 
+ --wandb-exp-name: ${WANDB_EXPERIMENT} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt-nemo/gpt3-nemo_126m_mr_mbs1_gbs8_mcore_te_tp2_pp4_vp3_seq_par_overlap_p2p_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt-nemo/gpt3-nemo_126m_mr_mbs1_gbs8_mcore_te_tp2_pp4_vp3_seq_par_overlap_p2p_dgx_a100_1N8G/model_config.yaml new file mode 100644 index 0000000000..89c71f6291 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt-nemo/gpt3-nemo_126m_mr_mbs1_gbs8_mcore_te_tp2_pp4_vp3_seq_par_overlap_p2p_dgx_a100_1N8G/model_config.yaml @@ -0,0 +1,36 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + SKIP_PYTEST: 1 + N_REPEATS: 1 +MODEL_ARGS: + trainer.num_nodes: 1 + trainer.devices: 8 + trainer.max_steps: 50 + trainer.val_check_interval: 50 + trainer.limit_val_batches: 50 + trainer.max_epochs: 'null' + trainer.precision: bf16 + model.num_layers: 12 + model.hidden_size: 768 + model.num_attention_heads: 12 + model.micro_batch_size: 1 + model.global_batch_size: 8 + model.tensor_model_parallel_size: 2 + model.pipeline_model_parallel_size: 4 + model.virtual_pipeline_model_parallel_size: 3 + model.encoder_seq_length: 2048 + model.max_position_embeddings: 2048 + model.ffn_hidden_size: 3072 + model.mcore_gpt: 'True' + model.apply_query_key_layer_scaling: 'True' + model.megatron_amp_O2: 'True' + model.data.data_prefix: '[]' + model.data.data_impl: mock + model.data.splits_string: '[99990,8,2]' + model.optim.name: distributed_fused_adam + model.optim.weight_decay: 0.1 + exp_manager.create_checkpoint_callback: 'False' + model.sequence_parallel: 'True' + model.overlap_p2p_comm: 'True' + model.batch_p2p_comm: 'False' +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/gpt-nemo/gpt3-nemo_126m_mr_mbs4_gbs64_mcore_te_tp1_pp1_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt-nemo/gpt3-nemo_126m_mr_mbs4_gbs64_mcore_te_tp1_pp1_dgx_a100_1N8G/model_config.yaml new file mode 100644 index 0000000000..d7e926e96e --- /dev/null +++ b/tests/functional_tests/test_cases/gpt-nemo/gpt3-nemo_126m_mr_mbs4_gbs64_mcore_te_tp1_pp1_dgx_a100_1N8G/model_config.yaml @@ -0,0 +1,33 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + SKIP_PYTEST: 1 + N_REPEATS: 1 +MODEL_ARGS: + trainer.num_nodes: 1 + trainer.devices: 8 + trainer.max_steps: 50 + trainer.val_check_interval: 50 + trainer.limit_val_batches: 50 + trainer.max_epochs: 'null' + trainer.precision: bf16 + model.num_layers: 12 + model.hidden_size: 768 + model.num_attention_heads: 12 + model.micro_batch_size: 4 + model.global_batch_size: 64 + model.tensor_model_parallel_size: 1 + model.pipeline_model_parallel_size: 1 + model.virtual_pipeline_model_parallel_size: 'null' + model.encoder_seq_length: 2048 + model.max_position_embeddings: 2048 + model.ffn_hidden_size: 3072 + model.mcore_gpt: 'True' + model.apply_query_key_layer_scaling: 'True' + model.megatron_amp_O2: 'True' + model.data.data_prefix: '[]' + model.data.data_impl: mock + model.data.splits_string: '[99990,8,2]' + model.optim.name: distributed_fused_adam + model.optim.weight_decay: 0.1 + exp_manager.create_checkpoint_callback: 'False' +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/gpt/gpt3_15b_8t_release/golden_values_0.8.0.json b/tests/functional_tests/test_cases/gpt/gpt3_15b_8t_release/golden_values_0.8.0.json new file mode 100644 index 0000000000..de1f0fc4c9 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_15b_8t_release/golden_values_0.8.0.json @@ -0,0 +1,1199 @@ +{ + "lm loss": { + "start_step": 0, + 
"end_step": 2924, + "step_interval": 5, + "values": [ + 12.98403, + 12.91905, + 12.86639, + 11.80178, + 10.36046, + 10.02508, + 9.62221, + 9.4955, + 9.14872, + 8.94894, + 8.83409, + 8.72075, + 8.62175, + 8.4803, + 8.3141, + 8.31485, + 8.21301, + 8.05619, + 8.03993, + 7.89079, + 7.75619, + 7.69641, + 7.57577, + 7.59624, + 7.48417, + 7.27241, + 7.32754, + 7.17152, + 7.13675, + 7.13916, + 7.0296, + 6.98413, + 6.86775, + 6.84081, + 6.94393, + 6.78266, + 6.70487, + 6.66921, + 6.67557, + 6.69083, + 6.62926, + 6.57314, + 6.54207, + 6.48718, + 6.56656, + 6.52225, + 6.39211, + 6.43077, + 6.4313, + 6.38146, + 6.38012, + 6.25064, + 6.26353, + 6.22999, + 6.24913, + 6.26542, + 6.18599, + 6.19121, + 6.12336, + 6.15534, + 6.13545, + 6.14558, + 6.03815, + 6.03552, + 5.98914, + 5.95498, + 6.05819, + 5.92126, + 5.98038, + 5.90334, + 5.91262, + 5.89738, + 5.84066, + 5.80738, + 5.80602, + 5.72881, + 5.8061, + 5.74937, + 5.73758, + 5.75618, + 5.7316, + 5.74263, + 5.67045, + 5.63838, + 5.6232, + 5.63786, + 5.5965, + 5.65082, + 5.57064, + 5.53708, + 5.55975, + 5.56886, + 5.58339, + 5.50802, + 5.45239, + 5.46833, + 5.47828, + 5.46339, + 5.45622, + 5.41625, + 5.43573, + 5.40692, + 5.41341, + 5.42214, + 5.33807, + 5.34711, + 5.37209, + 5.35972, + 5.35578, + 5.32397, + 5.30983, + 5.33378, + 5.27146, + 5.30895, + 5.333, + 5.24425, + 5.31699, + 5.19989, + 5.17072, + 5.28175, + 5.18568, + 5.16216, + 5.16152, + 5.17291, + 5.19225, + 5.22522, + 5.18483, + 5.12269, + 5.11527, + 5.14034, + 5.13279, + 5.12626, + 5.08066, + 5.03365, + 5.08431, + 5.04733, + 5.01305, + 5.00476, + 5.02491, + 4.98779, + 4.98514, + 4.86199, + 4.87843, + 4.90509, + 4.8462, + 4.87811, + 4.88625, + 4.78769, + 4.79964, + 4.8037, + 4.80904, + 4.78916, + 4.71706, + 4.74322, + 4.72538, + 4.72356, + 4.71707, + 4.59276, + 4.62852, + 4.61932, + 4.62474, + 4.60913, + 4.61314, + 4.58065, + 4.59596, + 4.51722, + 4.54072, + 4.51915, + 4.5058, + 4.50754, + 4.48612, + 4.42434, + 4.5281, + 4.42243, + 4.42119, + 4.40814, + 4.38947, + 4.43578, + 4.41079, + 4.34424, + 4.4458, + 4.38832, + 4.37063, + 4.33551, + 4.30543, + 4.34502, + 4.32366, + 4.28705, + 4.33382, + 4.24342, + 4.27102, + 4.21196, + 4.2094, + 4.26323, + 4.2211, + 4.19478, + 4.2264, + 4.25528, + 4.1844, + 4.21439, + 4.17958, + 4.15965, + 4.20032, + 4.19108, + 4.16656, + 4.11609, + 4.10448, + 4.10847, + 4.06067, + 4.13422, + 4.09094, + 4.13758, + 4.10255, + 4.05368, + 4.09669, + 4.02159, + 4.06341, + 4.04922, + 4.0341, + 4.04917, + 4.05269, + 4.03212, + 3.96123, + 4.0125, + 4.03331, + 4.07618, + 4.01799, + 3.98262, + 3.97674, + 3.99244, + 3.96663, + 3.95716, + 3.97524, + 3.98075, + 3.84107, + 3.93674, + 3.94907, + 3.89852, + 3.96144, + 3.91439, + 3.88467, + 3.93694, + 3.89926, + 3.87537, + 3.82985, + 3.89558, + 3.83219, + 3.82415, + 3.86387, + 3.87259, + 3.85311, + 3.85602, + 3.84239, + 3.82888, + 3.84089, + 3.80756, + 3.83549, + 3.80762, + 3.79835, + 3.7783, + 3.77396, + 3.78777, + 3.78436, + 3.76241, + 3.70647, + 3.76628, + 3.80323, + 3.81618, + 3.73526, + 3.80323, + 3.73948, + 3.71244, + 3.75242, + 3.79684, + 3.72411, + 3.68427, + 3.72174, + 3.70343, + 3.75025, + 3.6977, + 3.66065, + 3.71761, + 3.68864, + 3.68118, + 3.66005, + 3.67648, + 3.66823, + 3.68612, + 3.69209, + 3.66626, + 3.69118, + 3.65966, + 3.617, + 3.62539, + 3.65815, + 3.60098, + 3.64213, + 3.56802, + 3.63929, + 3.62702, + 3.60266, + 3.57597, + 3.64716, + 3.62137, + 3.61376, + 3.6213, + 3.61249, + 3.55488, + 3.59665, + 3.57476, + 3.55501, + 3.56539, + 3.6084, + 3.58844, + 3.60825, + 3.60013, + 3.51477, + 3.5232, + 3.55779, + 3.50929, + 
3.60958, + 3.57917, + 3.48286, + 3.47633, + 3.48853, + 3.57624, + 3.46667, + 3.5186, + 3.52609, + 3.45463, + 3.52258, + 3.50758, + 3.47706, + 3.43532, + 3.46913, + 3.45331, + 3.55574, + 3.47274, + 3.50296, + 3.49048, + 3.45181, + 3.50516, + 3.47354, + 3.48291, + 3.45316, + 3.46022, + 3.4687, + 3.47465, + 3.40249, + 3.44108, + 3.41925, + 3.43972, + 3.46996, + 3.39189, + 3.39564, + 3.39032, + 3.41347, + 3.45305, + 3.4397, + 3.40188, + 3.41963, + 3.41077, + 3.393, + 3.37584, + 3.44314, + 3.35556, + 3.38315, + 3.36762, + 3.46275, + 3.36062, + 3.42604, + 3.3417, + 3.31891, + 3.3759, + 3.34508, + 3.34173, + 3.37406, + 3.34535, + 3.34497, + 3.32886, + 3.28686, + 3.36797, + 3.29887, + 3.32538, + 3.37052, + 3.34514, + 3.3546, + 3.29153, + 3.30181, + 3.36724, + 3.26415, + 3.32624, + 3.36198, + 3.34542, + 3.29475, + 3.31116, + 3.27022, + 3.30327, + 3.30326, + 3.25067, + 3.28979, + 3.26245, + 3.30043, + 3.31216, + 3.24633, + 3.2676, + 3.30406, + 3.2327, + 3.27332, + 3.25166, + 3.26097, + 3.22124, + 3.25568, + 3.26761, + 3.26833, + 3.26281, + 3.30591, + 3.24213, + 3.24061, + 3.24286, + 3.22774, + 3.25028, + 3.18913, + 3.25822, + 3.1822, + 3.17925, + 3.18922, + 3.24945, + 3.19828, + 3.17282, + 3.20145, + 3.23939, + 3.27525, + 3.27783, + 3.25473, + 3.24593, + 3.19433, + 3.19204, + 3.17389, + 3.22167, + 3.19708, + 3.17916, + 3.22465, + 3.18648, + 3.17492, + 3.21295, + 3.20901, + 3.21699, + 3.21743, + 3.15615, + 3.13348, + 3.15566, + 3.12028, + 3.2289, + 3.1873, + 3.17874, + 3.11699, + 3.13456, + 3.19976, + 3.16119, + 3.14575, + 3.09448, + 3.12586, + 3.13487, + 3.14319, + 3.11977, + 3.10171, + 3.17339, + 3.14112, + 3.15304, + 3.14225, + 3.12857, + 3.15438, + 3.09987, + 3.09702, + 3.11459, + 3.08699, + 3.0833, + 3.09299, + 3.15723, + 3.11388, + 3.13932, + 3.10038, + 3.13188, + 3.13259, + 3.11938, + 3.08561, + 3.04368, + 3.1147, + 3.08933, + 3.14307, + 3.08731, + 3.13677, + 3.08017, + 3.06886, + 3.07081, + 3.07784, + 3.06735, + 3.06241, + 3.05711, + 3.15474, + 3.17411, + 3.0933, + 3.09073, + 3.08262, + 3.0181, + 3.08743, + 2.99959, + 3.03228, + 3.03871, + 3.09454, + 3.11336, + 3.04832, + 3.04739, + 3.02767, + 2.95159, + 3.07803, + 3.00463, + 3.04212, + 3.01239, + 3.02106, + 3.06591, + 3.02159, + 3.00528, + 3.04621, + 3.01085, + 2.98911, + 3.00693, + 3.05469, + 3.02043, + 3.02014, + 3.02013, + 3.07027, + 3.02857, + 3.00833, + 3.02054, + 2.99549, + 2.99681, + 3.01604, + 2.96746, + 3.01247, + 3.00166, + 3.05515, + 3.0751, + 3.02145, + 3.09756, + 3.03393, + 3.15062, + 3.0338, + 3.05434, + 2.95537, + 2.96026, + 3.00947, + 2.96684, + 2.9767, + 2.93125, + 2.936, + 2.95276, + 2.97053, + 2.95618, + 2.96532, + 2.96022, + 2.96507, + 3.03753, + 3.02243, + 2.96328, + 3.01834, + 2.95557, + 3.00232, + 3.01729, + 2.9955, + 2.94597, + 2.94341, + 2.92035, + 2.9421, + 3.01453, + 2.91331, + 2.92921, + 2.98194, + 2.89057, + 2.96294, + 2.95374, + 2.99872, + 2.9698, + 2.94731 + ] + }, + "mem-allocated-bytes": { + "start_step": 0, + "end_step": 2924, + "step_interval": 5, + "values": [ + 12697244672.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 
12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 
12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 
12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0, + 12697245696.0 + ] + }, + "iteration-time": { + "start_step": 0, + "end_step": 29, + "step_interval": 5, + "values": [ + 3.59643, + 3.46816, + 3.44454, + 3.42413, + 3.41615, + 3.41152 + ] + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_15b_8t_release/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_15b_8t_release/model_config.yaml new file mode 100644 index 0000000000..bf88792152 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_15b_8t_release/model_config.yaml @@ -0,0 +1,100 @@ +ENV_VARS: + NCCL_IB_SL: 1 + NCCL_IB_TIMEOUT: 19 + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_FWD_LAYERNORM_SM_MARGIN: 16 + NVTE_BWD_LAYERNORM_SM_MARGIN: 16 + NCCL_P2P_NET_CHUNKSIZE: 2097152 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 1 + +TEST_TYPE: "release" + +MODEL_ARGS: + # Distributed args + --distributed-timeout-minutes: 60 + --tensor-model-parallel-size: 8 + --pipeline-model-parallel-size: 1 + --use-distributed-optimizer: true + --overlap-grad-reduce: true + --overlap-param-gather: true + + # Training args + --use-mcore-models: true + --sequence-parallel: true + --disable-bias-linear: true + --micro-batch-size: 4 + --rampup-batch-size: "384 384 97656250" + --global-batch-size: 1152 + --train-samples: 19531250 + 
--manual-gc: true + + # Transformer Engine args + --transformer-impl: transformer_engine + + # Data args + --data-cache-path: ${DATA_CACHE_PATH} + --tokenizer-type: GPTSentencePieceTokenizer + --tokenizer-model: ${DATA_PATH}/utils/nemotron_2_256k.model + --data-path: $DATA_BLEND + --split: 99,1,0 + --no-mmap-bin-files: true + --num-workers: 6 + + # Add network size args + --apply-layernorm-1p: true + --untie-embeddings-and-output-weights: true + --no-position-embedding: true + --use-rotary-position-embeddings: true + --rotary-percent: 0.5 + --squared-relu: true + --num-layers: 32 + --hidden-size: 6144 + --num-attention-heads: 48 + --group-query-attention: true + --num-query-groups: 8 + --seq-length: 4096 + --max-position-embeddings: 4096 + + # Add regularization args + --attention-dropout: 0.0 + --hidden-dropout: 0.0 + --clip-grad: 1.0 + --weight-decay: 0.1 + + # Add learning rate args + --lr-decay-samples: 1949218748 + --lr-warmup-samples: 3906252 + --lr: 4.5e-4 + --min-lr: 4.5e-5 + --decoupled-lr: 5.0e-4 + --decoupled-min-lr: 4.5e-5 + --lr-decay-style: cosine + --adam-beta1: 0.9 + --adam-beta2: 0.95 + + # Add validation args + --eval-iters: 32 + --eval-interval: 2000 + + # Add checkpointing args + --load: ${OUTPUT_PATH}/checkpoints + --save: ${OUTPUT_PATH}/checkpoints + --save-interval: 500 + + # Add initialization args + --init-method-std: 0.0134 + + # Add logging args + --log-timers-to-tensorboard: true + --log-memory-to-tensorboard: true + --log-num-zeros-in-grad: true + --log-params-norm: true + --log-validation-ppl-to-tensorboard: true + --log-throughput: true + --log-interval: 100 + --tensorboard-dir: ${OUTPUT_PATH}/tensorboard + --wandb-project: megatron-core-release-runs + --wandb-exp-name: ${WANDB_EXPERIMENT} + + # Add mixed precision args + --bf16: true \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_15b_8t_release_sm/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_15b_8t_release_sm/model_config.yaml new file mode 100644 index 0000000000..9453db100c --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_15b_8t_release_sm/model_config.yaml @@ -0,0 +1,100 @@ +ENV_VARS: + NCCL_IB_SL: 1 + NCCL_IB_TIMEOUT: 19 + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_FWD_LAYERNORM_SM_MARGIN: 16 + NVTE_BWD_LAYERNORM_SM_MARGIN: 16 + NCCL_P2P_NET_CHUNKSIZE: 2097152 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 1 + +TEST_TYPE: "release" + +MODEL_ARGS: + # Distributed args + --distributed-timeout-minutes: 60 + --tensor-model-parallel-size: 8 + --pipeline-model-parallel-size: 1 + --use-distributed-optimizer: true + --overlap-grad-reduce: true + --overlap-param-gather: true + + # Training args + --use-mcore-models: true + --sequence-parallel: true + --disable-bias-linear: true + --micro-batch-size: 4 + --rampup-batch-size: "384 384 97656250" + --global-batch-size: 1152 + --train-samples: 4882812 + --manual-gc: true + + # Transformer Engine args + --transformer-impl: transformer_engine + + # Data args + --data-cache-path: ${DATA_CACHE_PATH} + --tokenizer-type: GPTSentencePieceTokenizer + --tokenizer-model: ${DATA_PATH}/utils/nemotron_2_256k.model + --data-path: $DATA_BLEND + --split: 99,1,0 + --no-mmap-bin-files: true + --num-workers: 6 + + # Add network size args + --apply-layernorm-1p: true + --untie-embeddings-and-output-weights: true + --no-position-embedding: true + --use-rotary-position-embeddings: true + --rotary-percent: 0.5 + --squared-relu: true + --num-layers: 32 + --hidden-size: 6144 + --num-attention-heads: 48 + --group-query-attention: true + 
--num-query-groups: 8 + --seq-length: 4096 + --max-position-embeddings: 4096 + + # Add regularization args + --attention-dropout: 0.0 + --hidden-dropout: 0.0 + --clip-grad: 1.0 + --weight-decay: 0.1 + + # Add learning rate args + --lr-decay-samples: 1949218748 + --lr-warmup-samples: 3906252 + --lr: 4.5e-4 + --min-lr: 4.5e-5 + --decoupled-lr: 5.0e-4 + --decoupled-min-lr: 4.5e-5 + --lr-decay-style: cosine + --adam-beta1: 0.9 + --adam-beta2: 0.95 + + # Add validation args + --eval-iters: 32 + --eval-interval: 2000 + + # Add checkpointing args + --load: ${OUTPUT_PATH}/checkpoints + --save: ${OUTPUT_PATH}/checkpoints + --save-interval: 500 + + # Add initialization args + --init-method-std: 0.0134 + + # Add logging args + --log-timers-to-tensorboard: true + --log-memory-to-tensorboard: true + --log-num-zeros-in-grad: true + --log-params-norm: true + --log-validation-ppl-to-tensorboard: true + --log-throughput: true + --log-interval: 100 + --tensorboard-dir: ${OUTPUT_PATH}/tensorboard + --wandb-project: megatron-core-release-runs + --wandb-exp-name: ${WANDB_EXPERIMENT} + + # Add mixed precision args + --bf16: true \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_dev.json b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_dev.json new file mode 100644 index 0000000000..ce02aad6c4 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_dev.json @@ -0,0 +1,53 @@ +{ + "lm loss": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 10.84013, + 10.8726, + 10.85028, + 10.7965, + 10.68165, + 10.60635, + 10.12791, + 10.22204, + 10.13807, + 9.82329 + ] + }, + "num-zeros": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 1715.0, + 1828.0, + 1929.0, + 2000.0, + 1947.0, + 1769.0, + 1649.0, + 2052.0, + 2353.0, + 2301.0 + ] + }, + "iteration-time": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 5.42717, + 0.09122, + 0.08825, + 0.08981, + 0.08828, + 0.08996, + 0.08919, + 0.0901, + 0.08957, + 0.08977 + ] + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_lts.json b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_lts.json new file mode 100644 index 0000000000..b5847f72a2 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_dist_optimizer_overlap_grad_reduce_param_gather/golden_values_lts.json @@ -0,0 +1,53 @@ +{ + "lm loss": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 10.84013, + 10.8726, + 10.85028, + 10.79652, + 10.68163, + 10.60637, + 10.12795, + 10.22205, + 10.13809, + 9.82324 + ] + }, + "num-zeros": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 1715.0, + 1828.0, + 1915.0, + 1898.0, + 1954.0, + 1773.0, + 1701.0, + 2089.0, + 2262.0, + 2284.0 + ] + }, + "iteration-time": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 12.57806, + 0.09197, + 0.09095, + 0.09076, + 0.09095, + 0.09051, + 0.09095, + 0.09036, + 0.09029, + 
0.09061 + ] + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_dist_optimizer_overlap_grad_reduce_param_gather/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_dist_optimizer_overlap_grad_reduce_param_gather/model_config.yaml new file mode 100644 index 0000000000..459270a1b2 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_dist_optimizer_overlap_grad_reduce_param_gather/model_config.yaml @@ -0,0 +1,53 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: Tree + CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 +MODEL_ARGS: + --num-layers: 12 + --hidden-size: 512 + --num-attention-heads: 8 + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --tensorboard-dir: ${TENSORBOARD_PATH} + --micro-batch-size: 4 + --global-batch-size: 32 + --seq-length: 1024 + --max-position-embeddings: 1024 + --train-iters: 50 + --timing-log-level: 2 + --lr-decay-iters: 320000 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --vocab-file: ${DATA_PATH}/bpe/vocab.json + --merge-file: ${DATA_PATH}/bpe/merges.txt + --split: 949,50,1 + --distributed-backend: nccl + --lr: 0.00015 + --lr-decay-style: cosine + --min-lr: 1.0e-5 + --weight-decay: 1e-2 + --clip-grad: 1.0 + --lr-warmup-fraction: .01 + --log-interval: 1 + --save-interval: 10000 + --eval-interval: 1000 + --eval-iters: 10 + --transformer-impl: local + --tensor-model-parallel-size: 1 + --pipeline-model-parallel-size: 1 + --use-distributed-optimizer: true + --overlap-grad-reduce: true + --overlap-param-gather: true + --deterministic-mode: true + --no-gradient-accumulation-fusion: true + --use-mcore-models: true + --ckpt-format: torch_dist + --data-cache-path: ${DATA_CACHE_PATH} + --bf16: true + --apply-query-key-layer-scaling: true +TEST_TYPE: regular \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/model_config.yaml new file mode 100644 index 0000000000..dcb80dc007 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather/model_config.yaml @@ -0,0 +1,54 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: Tree + CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 +MODEL_ARGS: + --num-layers: 12 + --hidden-size: 512 + --num-attention-heads: 8 + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --tensorboard-dir: ${TENSORBOARD_PATH} + --micro-batch-size: 4 + --global-batch-size: 32 + --seq-length: 1024 + --max-position-embeddings: 1024 + --train-iters: 100 + --timing-log-level: 2 + --lr-decay-iters: 320000 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --vocab-file: ${DATA_PATH}/bpe/vocab.json + --merge-file: ${DATA_PATH}/bpe/merges.txt + --split: 949,50,1 + --distributed-backend: 
nccl + --lr: 0.00015 + --lr-decay-style: cosine + --min-lr: 1.0e-5 + --weight-decay: 1e-2 + --clip-grad: 1.0 + --lr-warmup-fraction: .01 + --log-interval: 1 + --save-interval: 50 + --eval-interval: 1000 + --eval-iters: 10 + --transformer-impl: local + --tensor-model-parallel-size: 1 + --pipeline-model-parallel-size: 1 + --use-distributed-optimizer: true + --overlap-grad-reduce: true + --overlap-param-gather: true + --deterministic-mode: true + --no-gradient-accumulation-fusion: true + --use-checkpoint-opt_param-scheduler: true + --use-mcore-models: true + --ckpt-format: torch_dist + --data-cache-path: ${DATA_CACHE_PATH} + --fp16: true + --apply-query-key-layer-scaling: true +TEST_TYPE: ckpt-resume \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2/golden_values_dev.json b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2/golden_values_dev.json new file mode 100644 index 0000000000..9895a353ac --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2/golden_values_dev.json @@ -0,0 +1,53 @@ +{ + "lm loss": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 10.83373, + 10.86683, + 10.89023, + 10.81051, + 10.68459, + 10.60979, + 10.08992, + 10.21481, + 10.14018, + 9.80603 + ] + }, + "num-zeros": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 1488.0, + 1854.0, + 1854.0, + 1884.0, + 1794.0, + 1784.0, + 1569.0, + 1942.0, + 2263.0, + 2147.0 + ] + }, + "iteration-time": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 13.39475, + 0.14158, + 0.14256, + 0.14166, + 0.14243, + 0.14232, + 0.143, + 0.14113, + 0.14164, + 0.14069 + ] + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2/golden_values_lts.json b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2/golden_values_lts.json new file mode 100644 index 0000000000..9895a353ac --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2/golden_values_lts.json @@ -0,0 +1,53 @@ +{ + "lm loss": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 10.83373, + 10.86683, + 10.89023, + 10.81051, + 10.68459, + 10.60979, + 10.08992, + 10.21481, + 10.14018, + 9.80603 + ] + }, + "num-zeros": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 1488.0, + 1854.0, + 1854.0, + 1884.0, + 1794.0, + 1784.0, + 1569.0, + 1942.0, + 2263.0, + 2147.0 + ] + }, + "iteration-time": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 13.39475, + 0.14158, + 0.14256, + 0.14166, + 0.14243, + 0.14232, + 0.143, + 0.14113, + 0.14164, + 0.14069 + ] + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2/model_config.yaml new file mode 100644 index 0000000000..d94f5277d4 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2/model_config.yaml @@ -0,0 +1,50 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: Tree + CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 +MODEL_ARGS: + --num-layers: 12 + --hidden-size: 512 + --num-attention-heads: 8 + 
--log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --tensorboard-dir: ${TENSORBOARD_PATH} + --micro-batch-size: 4 + --global-batch-size: 32 + --seq-length: 1024 + --max-position-embeddings: 1024 + --train-iters: 50 + --timing-log-level: 2 + --lr-decay-iters: 320000 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --vocab-file: ${DATA_PATH}/bpe/vocab.json + --merge-file: ${DATA_PATH}/bpe/merges.txt + --split: 949,50,1 + --distributed-backend: nccl + --lr: 0.00015 + --lr-decay-style: cosine + --min-lr: 1.0e-5 + --weight-decay: 1e-2 + --clip-grad: 1.0 + --lr-warmup-fraction: .01 + --log-interval: 1 + --save-interval: 10000 + --eval-interval: 1000 + --eval-iters: 10 + --transformer-impl: local + --tensor-model-parallel-size: 1 + --pipeline-model-parallel-size: 2 + --deterministic-mode: true + --no-gradient-accumulation-fusion: true + --use-mcore-models: true + --ckpt-format: torch_dist + --data-cache-path: ${DATA_CACHE_PATH} + --bf16: true + --apply-query-key-layer-scaling: true +TEST_TYPE: regular \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_fp16/golden_values_dev.json b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_fp16/golden_values_dev.json new file mode 100644 index 0000000000..418a8d65de --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_fp16/golden_values_dev.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.83369, 10.86796, 10.8992, 10.86517, 10.85506, 10.82693, 10.6268, 10.61756, 10.53014, 10.24593]}, "num-zeros": {"start_step": 0, "end_step": 33, "step_interval": 5, "values": [2173.0, 2276.0, 2414.0, 2449.0, 2193.0, 1934.0, 2524.0]}, "iteration_timing_avg": 0.11905411764705882} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_fp16/golden_values_lts.json b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_fp16/golden_values_lts.json new file mode 100644 index 0000000000..418a8d65de --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_fp16/golden_values_lts.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.83369, 10.86796, 10.8992, 10.86517, 10.85506, 10.82693, 10.6268, 10.61756, 10.53014, 10.24593]}, "num-zeros": {"start_step": 0, "end_step": 33, "step_interval": 5, "values": [2173.0, 2276.0, 2414.0, 2449.0, 2193.0, 1934.0, 2524.0]}, "iteration_timing_avg": 0.11905411764705882} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_fp16/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_fp16/model_config.yaml new file mode 100644 index 0000000000..9f210d838f --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_fp16/model_config.yaml @@ -0,0 +1,50 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: Tree + CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 +MODEL_ARGS: + --num-layers: 12 + --hidden-size: 512 + --num-attention-heads: 8 + --log-params-norm: true + --log-num-zeros-in-grad: true + 
--log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --tensorboard-dir: ${TENSORBOARD_PATH} + --micro-batch-size: 4 + --global-batch-size: 32 + --seq-length: 1024 + --max-position-embeddings: 1024 + --train-iters: 50 + --timing-log-level: 2 + --lr-decay-iters: 320000 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --vocab-file: ${DATA_PATH}/bpe/vocab.json + --merge-file: ${DATA_PATH}/bpe/merges.txt + --split: 949,50,1 + --distributed-backend: nccl + --lr: 0.00015 + --lr-decay-style: cosine + --min-lr: 1.0e-5 + --weight-decay: 1e-2 + --clip-grad: 1.0 + --lr-warmup-fraction: .01 + --log-interval: 1 + --save-interval: 10000 + --eval-interval: 1000 + --eval-iters: 10 + --transformer-impl: local + --tensor-model-parallel-size: 1 + --pipeline-model-parallel-size: 2 + --deterministic-mode: true + --no-gradient-accumulation-fusion: true + --use-mcore-models: true + --ckpt-format: torch_dist + --data-cache-path: ${DATA_CACHE_PATH} + --fp16: true + --apply-query-key-layer-scaling: true +TEST_TYPE: regular \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_resume_torch_dist/golden_values_dev.json b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_resume_torch_dist/golden_values_dev.json new file mode 100644 index 0000000000..fa1ca531db --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_resume_torch_dist/golden_values_dev.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [10.83377, 10.86686, 10.89018, 10.81039, 10.68443, 10.60957, 10.08966, 10.21453, 10.13998, 9.80584, 9.83013, 9.60653, 9.67621, 9.68788, 9.59862, 9.07653, 9.47156, 9.06787, 9.32985, 9.51568]}, "num-zeros": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [1566.0, 1800.0, 1833.0, 1834.0, 1824.0, 1641.0, 1539.0, 1880.0, 2289.0, 2267.0, 2472.0, 2970.0, 3076.0, 3074.0, 3018.0, 2972.0, 3783.0, 2794.0, 2743.0, 3289.0]}, "iteration_timing_avg": 0.12010238805970147} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_resume_torch_dist/golden_values_lts.json b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_resume_torch_dist/golden_values_lts.json new file mode 100644 index 0000000000..fa1ca531db --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_resume_torch_dist/golden_values_lts.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [10.83377, 10.86686, 10.89018, 10.81039, 10.68443, 10.60957, 10.08966, 10.21453, 10.13998, 9.80584, 9.83013, 9.60653, 9.67621, 9.68788, 9.59862, 9.07653, 9.47156, 9.06787, 9.32985, 9.51568]}, "num-zeros": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [1566.0, 1800.0, 1833.0, 1834.0, 1824.0, 1641.0, 1539.0, 1880.0, 2289.0, 2267.0, 2472.0, 2970.0, 3076.0, 3074.0, 3018.0, 2972.0, 3783.0, 2794.0, 2743.0, 3289.0]}, "iteration_timing_avg": 0.12010238805970147} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_resume_torch_dist/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_resume_torch_dist/model_config.yaml new file mode 100644 index 0000000000..b943bfec0f 
--- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_resume_torch_dist/model_config.yaml @@ -0,0 +1,51 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: Tree + CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 +MODEL_ARGS: + --num-layers: 12 + --hidden-size: 512 + --num-attention-heads: 8 + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --tensorboard-dir: ${TENSORBOARD_PATH} + --micro-batch-size: 4 + --global-batch-size: 32 + --seq-length: 1024 + --max-position-embeddings: 1024 + --train-iters: 100 + --timing-log-level: 2 + --lr-decay-iters: 320000 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --vocab-file: ${DATA_PATH}/bpe/vocab.json + --merge-file: ${DATA_PATH}/bpe/merges.txt + --split: 949,50,1 + --distributed-backend: nccl + --lr: 0.00015 + --lr-decay-style: cosine + --min-lr: 1.0e-5 + --weight-decay: 1e-2 + --clip-grad: 1.0 + --lr-warmup-fraction: .01 + --log-interval: 1 + --save-interval: 50 + --eval-interval: 1000 + --eval-iters: 10 + --transformer-impl: local + --tensor-model-parallel-size: 1 + --pipeline-model-parallel-size: 2 + --deterministic-mode: true + --no-gradient-accumulation-fusion: true + --use-checkpoint-opt_param-scheduler: true + --use-mcore-models: true + --ckpt-format: torch_dist + --data-cache-path: ${DATA_CACHE_PATH} + --fp16: true + --apply-query-key-layer-scaling: true +TEST_TYPE: ckpt-resume \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4/golden_values_dev.json b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4/golden_values_dev.json new file mode 100644 index 0000000000..4924720d79 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4/golden_values_dev.json @@ -0,0 +1,53 @@ +{ + "lm loss": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 10.79206, + 10.86691, + 10.89065, + 10.78186, + 10.65978, + 10.58022, + 10.08207, + 10.19156, + 10.13495, + 9.81167 + ] + }, + "num-zeros": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 1626.0, + 1866.0, + 1959.0, + 1816.0, + 1890.0, + 1654.0, + 1537.0, + 1965.0, + 2436.0, + 2405.0 + ] + }, + "iteration-time": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 21.9348, + 0.1633, + 0.16334, + 0.16269, + 0.16133, + 0.16064, + 0.16007, + 0.15926, + 0.1592, + 0.15982 + ] + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4/golden_values_lts.json b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4/golden_values_lts.json new file mode 100644 index 0000000000..4924720d79 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4/golden_values_lts.json @@ -0,0 +1,53 @@ +{ + "lm loss": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 10.79206, + 10.86691, + 10.89065, + 10.78186, + 10.65978, + 10.58022, + 10.08207, + 10.19156, + 10.13495, + 9.81167 + ] + }, + "num-zeros": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 1626.0, + 1866.0, + 1959.0, + 1816.0, + 1890.0, + 1654.0, + 1537.0, + 1965.0, + 2436.0, + 2405.0 + ] + }, + 
"iteration-time": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 21.9348, + 0.1633, + 0.16334, + 0.16269, + 0.16133, + 0.16064, + 0.16007, + 0.15926, + 0.1592, + 0.15982 + ] + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4/model_config.yaml new file mode 100644 index 0000000000..108cb6b1a4 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4/model_config.yaml @@ -0,0 +1,50 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: Tree + CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 +MODEL_ARGS: + --num-layers: 12 + --hidden-size: 512 + --num-attention-heads: 8 + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --tensorboard-dir: ${TENSORBOARD_PATH} + --micro-batch-size: 4 + --global-batch-size: 32 + --seq-length: 1024 + --max-position-embeddings: 1024 + --train-iters: 50 + --timing-log-level: 2 + --lr-decay-iters: 320000 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --vocab-file: ${DATA_PATH}/bpe/vocab.json + --merge-file: ${DATA_PATH}/bpe/merges.txt + --split: 949,50,1 + --distributed-backend: nccl + --lr: 0.00015 + --lr-decay-style: cosine + --min-lr: 1.0e-5 + --weight-decay: 1e-2 + --clip-grad: 1.0 + --lr-warmup-fraction: .01 + --log-interval: 1 + --save-interval: 10000 + --eval-interval: 1000 + --eval-iters: 10 + --transformer-impl: local + --tensor-model-parallel-size: 1 + --pipeline-model-parallel-size: 4 + --deterministic-mode: true + --no-gradient-accumulation-fusion: true + --use-mcore-models: true + --ckpt-format: torch_dist + --data-cache-path: ${DATA_CACHE_PATH} + --bf16: true + --apply-query-key-layer-scaling: true +TEST_TYPE: regular \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4_resume_torch_dist/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4_resume_torch_dist/model_config.yaml new file mode 100644 index 0000000000..1c2a42eaaa --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4_resume_torch_dist/model_config.yaml @@ -0,0 +1,51 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: Tree + CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 +MODEL_ARGS: + --num-layers: 12 + --hidden-size: 512 + --num-attention-heads: 8 + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --tensorboard-dir: ${TENSORBOARD_PATH} + --micro-batch-size: 4 + --global-batch-size: 32 + --seq-length: 1024 + --max-position-embeddings: 1024 + --train-iters: 100 + --timing-log-level: 2 + --lr-decay-iters: 320000 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --vocab-file: ${DATA_PATH}/bpe/vocab.json + --merge-file: ${DATA_PATH}/bpe/merges.txt + --split: 949,50,1 + --distributed-backend: nccl + --lr: 0.00015 + --lr-decay-style: cosine + --min-lr: 1.0e-5 + --weight-decay: 1e-2 + --clip-grad: 1.0 + --lr-warmup-fraction: .01 + --log-interval: 1 + --save-interval: 50 + 
--eval-interval: 1000 + --eval-iters: 10 + --transformer-impl: local + --tensor-model-parallel-size: 1 + --pipeline-model-parallel-size: 4 + --deterministic-mode: true + --no-gradient-accumulation-fusion: true + --use-checkpoint-opt_param-scheduler: true + --use-mcore-models: true + --ckpt-format: torch_dist + --data-cache-path: ${DATA_CACHE_PATH} + --fp16: true + --apply-query-key-layer-scaling: true +TEST_TYPE: ckpt-resume \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_resume_torch_dist_te_4experts2parallel/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_resume_torch_dist_te_4experts2parallel/model_config.yaml new file mode 100644 index 0000000000..cb0214f264 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_resume_torch_dist_te_4experts2parallel/model_config.yaml @@ -0,0 +1,55 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: Tree + CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 +MODEL_ARGS: + --num-layers: 12 + --hidden-size: 512 + --num-attention-heads: 8 + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --tensorboard-dir: ${TENSORBOARD_PATH} + --micro-batch-size: 4 + --global-batch-size: 32 + --seq-length: 1024 + --max-position-embeddings: 1024 + --train-iters: 100 + --timing-log-level: 2 + --lr-decay-iters: 320000 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --vocab-file: ${DATA_PATH}/bpe/vocab.json + --merge-file: ${DATA_PATH}/bpe/merges.txt + --split: 949,50,1 + --distributed-backend: nccl + --lr: 0.00015 + --lr-decay-style: cosine + --min-lr: 1.0e-5 + --weight-decay: 1e-2 + --clip-grad: 1.0 + --lr-warmup-fraction: .01 + --log-interval: 1 + --save-interval: 50 + --eval-interval: 1000 + --eval-iters: 10 + --transformer-impl: local + --tensor-model-parallel-size: 2 + --pipeline-model-parallel-size: 2 + --expert-model-parallel-size: 2 + --sequence-parallel: true + --num-experts: 4 + --moe-router-load-balancing-type: sinkhorn + --moe-router-topk: 1 + --deterministic-mode: true + --no-gradient-accumulation-fusion: true + --use-checkpoint-opt_param-scheduler: true + --use-mcore-models: true + --ckpt-format: torch_dist + --data-cache-path: ${DATA_CACHE_PATH} + --bf16: true +TEST_TYPE: ckpt-resume \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_te_4experts2parallel/golden_values_dev.json b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_te_4experts2parallel/golden_values_dev.json new file mode 100644 index 0000000000..a9e79fc380 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_te_4experts2parallel/golden_values_dev.json @@ -0,0 +1,53 @@ +{ + "lm loss": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 10.81962, + 10.8674, + 10.8579, + 10.80754, + 10.71119, + 10.63665, + 10.16221, + 10.27928, + 10.18799, + 9.89003 + ] + }, + "num-zeros": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 12597.0, + 15988.0, + 16507.0, + 15995.0, + 14088.0, + 14994.0, + 12887.0, + 15815.0, + 17017.0, + 17439.0 + ] + }, + "iteration-time": { + "start_step": 0, + "end_step": 50, + 
"step_interval": 5, + "values": [ + 16.34149, + 0.66962, + 0.66905, + 0.66791, + 0.67695, + 0.66977, + 0.67438, + 0.67368, + 0.6714, + 0.67874 + ] + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_te_4experts2parallel/golden_values_lts.json b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_te_4experts2parallel/golden_values_lts.json new file mode 100644 index 0000000000..58284659fa --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_te_4experts2parallel/golden_values_lts.json @@ -0,0 +1,53 @@ +{ + "lm loss": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 10.81962, + 10.8674, + 10.8579, + 10.80754, + 10.71119, + 10.63665, + 10.16221, + 10.27928, + 10.18787, + 9.88951 + ] + }, + "num-zeros": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 12597.0, + 15988.0, + 16507.0, + 15995.0, + 14088.0, + 14994.0, + 12887.0, + 15815.0, + 17049.0, + 17592.0 + ] + }, + "iteration-time": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 25.19848, + 0.70611, + 0.70356, + 0.70548, + 0.70285, + 0.70488, + 0.70589, + 0.70459, + 0.70261, + 0.71213 + ] + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_te_4experts2parallel/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_te_4experts2parallel/model_config.yaml new file mode 100644 index 0000000000..97d3d8c5f0 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_te_4experts2parallel/model_config.yaml @@ -0,0 +1,54 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: Tree + CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 +MODEL_ARGS: + --num-layers: 12 + --hidden-size: 512 + --num-attention-heads: 8 + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --tensorboard-dir: ${TENSORBOARD_PATH} + --micro-batch-size: 4 + --global-batch-size: 32 + --seq-length: 1024 + --max-position-embeddings: 1024 + --train-iters: 50 + --timing-log-level: 2 + --lr-decay-iters: 320000 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --vocab-file: ${DATA_PATH}/bpe/vocab.json + --merge-file: ${DATA_PATH}/bpe/merges.txt + --split: 949,50,1 + --distributed-backend: nccl + --lr: 0.00015 + --lr-decay-style: cosine + --min-lr: 1.0e-5 + --weight-decay: 1e-2 + --clip-grad: 1.0 + --lr-warmup-fraction: .01 + --log-interval: 1 + --save-interval: 10000 + --eval-interval: 1000 + --eval-iters: 10 + --transformer-impl: local + --tensor-model-parallel-size: 2 + --pipeline-model-parallel-size: 2 + --expert-model-parallel-size: 2 + --sequence-parallel: true + --num-experts: 4 + --moe-router-load-balancing-type: sinkhorn + --moe-router-topk: 1 + --deterministic-mode: true + --no-gradient-accumulation-fusion: true + --use-mcore-models: true + --ckpt-format: torch_dist + --data-cache-path: ${DATA_CACHE_PATH} + --bf16: true +TEST_TYPE: regular \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_resume_torch_dist_te_2experts/model_config.yaml 
b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_resume_torch_dist_te_2experts/model_config.yaml new file mode 100644 index 0000000000..1a15825731 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_resume_torch_dist_te_2experts/model_config.yaml @@ -0,0 +1,55 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: Tree + CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 +MODEL_ARGS: + --num-layers: 12 + --hidden-size: 512 + --num-attention-heads: 8 + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --tensorboard-dir: ${TENSORBOARD_PATH} + --micro-batch-size: 4 + --global-batch-size: 32 + --seq-length: 1024 + --max-position-embeddings: 1024 + --train-iters: 100 + --timing-log-level: 2 + --lr-decay-iters: 320000 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --vocab-file: ${DATA_PATH}/bpe/vocab.json + --merge-file: ${DATA_PATH}/bpe/merges.txt + --split: 949,50,1 + --distributed-backend: nccl + --lr: 0.00015 + --lr-decay-style: cosine + --min-lr: 1.0e-5 + --weight-decay: 1e-2 + --clip-grad: 1.0 + --lr-warmup-fraction: .01 + --log-interval: 1 + --save-interval: 50 + --eval-interval: 1000 + --eval-iters: 10 + --transformer-impl: local + --tensor-model-parallel-size: 2 + --pipeline-model-parallel-size: 2 + --num-experts: 2 + --sequence-parallel: true + --moe-router-load-balancing-type: sinkhorn + --moe-router-topk: 1 + --deterministic-mode: true + --no-gradient-accumulation-fusion: true + --use-checkpoint-opt_param-scheduler: true + --use-mcore-models: true + --ckpt-format: torch_dist + --data-cache-path: ${DATA_CACHE_PATH} + --fp16: true + --apply-query-key-layer-scaling: true +TEST_TYPE: ckpt-resume \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_te_2experts/golden_values_dev.json b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_te_2experts/golden_values_dev.json new file mode 100644 index 0000000000..a675a63d5e --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_te_2experts/golden_values_dev.json @@ -0,0 +1,53 @@ +{ + "lm loss": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 10.79574, + 10.84041, + 10.81392, + 10.7652, + 10.65759, + 10.56196, + 10.08853, + 10.21342, + 10.11653, + 9.83431 + ] + }, + "num-zeros": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 2977.0, + 3533.0, + 3432.0, + 3418.0, + 3277.0, + 3305.0, + 2851.0, + 3325.0, + 3684.0, + 3712.0 + ] + }, + "iteration-time": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 25.64274, + 0.6941, + 0.69152, + 0.69181, + 0.69128, + 0.68614, + 0.68462, + 0.6845, + 0.68711, + 0.68237 + ] + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_te_2experts/golden_values_lts.json b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_te_2experts/golden_values_lts.json new file mode 100644 index 0000000000..a675a63d5e --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_te_2experts/golden_values_lts.json @@ -0,0 +1,53 @@ +{ + "lm loss": { + "start_step": 0, + 
"end_step": 50, + "step_interval": 5, + "values": [ + 10.79574, + 10.84041, + 10.81392, + 10.7652, + 10.65759, + 10.56196, + 10.08853, + 10.21342, + 10.11653, + 9.83431 + ] + }, + "num-zeros": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 2977.0, + 3533.0, + 3432.0, + 3418.0, + 3277.0, + 3305.0, + 2851.0, + 3325.0, + 3684.0, + 3712.0 + ] + }, + "iteration-time": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 25.64274, + 0.6941, + 0.69152, + 0.69181, + 0.69128, + 0.68614, + 0.68462, + 0.6845, + 0.68711, + 0.68237 + ] + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_te_2experts/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_te_2experts/model_config.yaml new file mode 100644 index 0000000000..c6728722e2 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_te_2experts/model_config.yaml @@ -0,0 +1,54 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: Tree + CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 +MODEL_ARGS: + --num-layers: 12 + --hidden-size: 512 + --num-attention-heads: 8 + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --tensorboard-dir: ${TENSORBOARD_PATH} + --micro-batch-size: 4 + --global-batch-size: 32 + --seq-length: 1024 + --max-position-embeddings: 1024 + --train-iters: 50 + --timing-log-level: 2 + --lr-decay-iters: 320000 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --vocab-file: ${DATA_PATH}/bpe/vocab.json + --merge-file: ${DATA_PATH}/bpe/merges.txt + --split: 949,50,1 + --distributed-backend: nccl + --lr: 0.00015 + --lr-decay-style: cosine + --min-lr: 1.0e-5 + --weight-decay: 1e-2 + --clip-grad: 1.0 + --lr-warmup-fraction: .01 + --log-interval: 1 + --save-interval: 10000 + --eval-interval: 1000 + --eval-iters: 10 + --transformer-impl: local + --tensor-model-parallel-size: 2 + --pipeline-model-parallel-size: 2 + --num-experts: 2 + --sequence-parallel: true + --moe-router-load-balancing-type: sinkhorn + --moe-router-topk: 1 + --deterministic-mode: true + --no-gradient-accumulation-fusion: true + --use-mcore-models: true + --ckpt-format: torch_dist + --data-cache-path: ${DATA_CACHE_PATH} + --bf16: true + --apply-query-key-layer-scaling: true +TEST_TYPE: regular \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1/golden_values_dev.json b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1/golden_values_dev.json new file mode 100644 index 0000000000..4172a17a7a --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1/golden_values_dev.json @@ -0,0 +1,53 @@ +{ + "lm loss": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 10.86122, + 10.88647, + 10.87773, + 10.83111, + 10.7165, + 10.60619, + 10.13147, + 10.22767, + 10.15929, + 9.83482 + ] + }, + "num-zeros": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 1694.0, + 2148.0, + 2169.0, + 2103.0, + 1991.0, + 1900.0, + 1707.0, + 2189.0, + 2557.0, + 2606.0 + ] + }, + "iteration-time": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 9.61991, + 0.29135, + 
0.28852, + 0.28971, + 0.29221, + 0.28994, + 0.28976, + 0.28887, + 0.28975, + 0.2869 + ] + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1/golden_values_lts.json b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1/golden_values_lts.json new file mode 100644 index 0000000000..dc8076a2f2 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1/golden_values_lts.json @@ -0,0 +1,53 @@ +{ + "lm loss": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 10.86122, + 10.88647, + 10.87773, + 10.83111, + 10.7165, + 10.60623, + 10.13146, + 10.2277, + 10.15933, + 9.8348 + ] + }, + "num-zeros": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 1694.0, + 2148.0, + 2169.0, + 2103.0, + 1991.0, + 1869.0, + 1760.0, + 2214.0, + 2529.0, + 2587.0 + ] + }, + "iteration-time": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 11.72537, + 0.29824, + 0.29549, + 0.29574, + 0.29514, + 0.29533, + 0.29415, + 0.30722, + 0.29731, + 0.29867 + ] + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1/model_config.yaml new file mode 100644 index 0000000000..37cc4615a5 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1/model_config.yaml @@ -0,0 +1,50 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: Tree + CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 +MODEL_ARGS: + --num-layers: 12 + --hidden-size: 512 + --num-attention-heads: 8 + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --tensorboard-dir: ${TENSORBOARD_PATH} + --micro-batch-size: 4 + --global-batch-size: 32 + --seq-length: 1024 + --max-position-embeddings: 1024 + --train-iters: 50 + --timing-log-level: 2 + --lr-decay-iters: 320000 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --vocab-file: ${DATA_PATH}/bpe/vocab.json + --merge-file: ${DATA_PATH}/bpe/merges.txt + --split: 949,50,1 + --distributed-backend: nccl + --lr: 0.00015 + --lr-decay-style: cosine + --min-lr: 1.0e-5 + --weight-decay: 1e-2 + --clip-grad: 1.0 + --lr-warmup-fraction: .01 + --log-interval: 1 + --save-interval: 10000 + --eval-interval: 1000 + --eval-iters: 10 + --transformer-impl: local + --tensor-model-parallel-size: 4 + --pipeline-model-parallel-size: 1 + --deterministic-mode: true + --no-gradient-accumulation-fusion: true + --use-mcore-models: true + --ckpt-format: torch_dist + --data-cache-path: ${DATA_CACHE_PATH} + --bf16: true + --apply-query-key-layer-scaling: true +TEST_TYPE: regular \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch/model_config.yaml new file mode 100644 index 0000000000..528b691a28 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch/model_config.yaml @@ -0,0 +1,50 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + 
NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: Tree + CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 +MODEL_ARGS: + --num-layers: 12 + --hidden-size: 512 + --num-attention-heads: 8 + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --tensorboard-dir: ${TENSORBOARD_PATH} + --micro-batch-size: 4 + --global-batch-size: 32 + --seq-length: 1024 + --max-position-embeddings: 1024 + --train-iters: 100 + --timing-log-level: 2 + --lr-decay-iters: 320000 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --vocab-file: ${DATA_PATH}/bpe/vocab.json + --merge-file: ${DATA_PATH}/bpe/merges.txt + --split: 949,50,1 + --distributed-backend: nccl + --lr: 0.00015 + --lr-decay-style: cosine + --min-lr: 1.0e-5 + --weight-decay: 1e-2 + --clip-grad: 1.0 + --lr-warmup-fraction: .01 + --log-interval: 1 + --save-interval: 50 + --eval-interval: 1000 + --eval-iters: 10 + --transformer-impl: local + --tensor-model-parallel-size: 4 + --pipeline-model-parallel-size: 1 + --deterministic-mode: true + --no-gradient-accumulation-fusion: true + --use-checkpoint-opt_param-scheduler: true + --ckpt-format: torch + --data-cache-path: ${DATA_CACHE_PATH} + --fp16: true + --apply-query-key-layer-scaling: true +TEST_TYPE: ckpt-resume \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch_dist/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch_dist/model_config.yaml new file mode 100644 index 0000000000..4f5e8d93b7 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch_dist/model_config.yaml @@ -0,0 +1,51 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: Tree + CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 +MODEL_ARGS: + --num-layers: 12 + --hidden-size: 512 + --num-attention-heads: 8 + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --tensorboard-dir: ${TENSORBOARD_PATH} + --micro-batch-size: 4 + --global-batch-size: 32 + --seq-length: 1024 + --max-position-embeddings: 1024 + --train-iters: 100 + --timing-log-level: 2 + --lr-decay-iters: 320000 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --vocab-file: ${DATA_PATH}/bpe/vocab.json + --merge-file: ${DATA_PATH}/bpe/merges.txt + --split: 949,50,1 + --distributed-backend: nccl + --lr: 0.00015 + --lr-decay-style: cosine + --min-lr: 1.0e-5 + --weight-decay: 1e-2 + --clip-grad: 1.0 + --lr-warmup-fraction: .01 + --log-interval: 1 + --save-interval: 50 + --eval-interval: 1000 + --eval-iters: 10 + --transformer-impl: local + --tensor-model-parallel-size: 4 + --pipeline-model-parallel-size: 1 + --deterministic-mode: true + --no-gradient-accumulation-fusion: true + --use-checkpoint-opt_param-scheduler: true + --use-mcore-models: true + --ckpt-format: torch_dist + --data-cache-path: ${DATA_CACHE_PATH} + --fp16: true + --apply-query-key-layer-scaling: true +TEST_TYPE: ckpt-resume \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp1_dist_optimizer_overlap_grad_reduce/golden_values_dev.json 
b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp1_dist_optimizer_overlap_grad_reduce/golden_values_dev.json new file mode 100644 index 0000000000..9fe4f01d80 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp1_dist_optimizer_overlap_grad_reduce/golden_values_dev.json @@ -0,0 +1,50 @@ +{ + "lm loss": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 10.87346, + 10.89625, + 10.88939, + 10.88681, + 10.8893, + 10.84863, + 10.6962, + 10.63919, + 10.53931, + 10.31119 + ] + }, + "iteration-time": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 4.95266, + 0.07818, + 0.07961, + 0.07716, + 0.08368, + 0.08327, + 0.08409, + 0.08371, + 0.08372, + 0.08387 + ] + }, + "num-zeros": { + "start_step": 0, + "end_step": 32, + "step_interval": 5, + "values": [ + 1300.0, + 1287.0, + 1565.0, + 1441.0, + 1419.0, + 1295.0, + 1177.0 + ] + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp1_dist_optimizer_overlap_grad_reduce/golden_values_lts.json b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp1_dist_optimizer_overlap_grad_reduce/golden_values_lts.json new file mode 100644 index 0000000000..69ca350fdd --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp1_dist_optimizer_overlap_grad_reduce/golden_values_lts.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.87346, 10.89625, 10.88939, 10.88681, 10.8893, 10.84864, 10.6962, 10.63918, 10.5393, 10.31119]}, "num-zeros": {"start_step": 0, "end_step": 32, "step_interval": 5, "values": [1298.0, 1352.0, 1590.0, 1403.0, 1435.0, 1266.0, 1195.0]}, "iteration_timing_avg": 0.07655911764705883} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp1_dist_optimizer_overlap_grad_reduce/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp1_dist_optimizer_overlap_grad_reduce/model_config.yaml new file mode 100644 index 0000000000..64d504bf29 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp1_dist_optimizer_overlap_grad_reduce/model_config.yaml @@ -0,0 +1,52 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: Tree + CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 +MODEL_ARGS: + --num-layers: 12 + --hidden-size: 512 + --num-attention-heads: 8 + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --tensorboard-dir: ${TENSORBOARD_PATH} + --micro-batch-size: 4 + --global-batch-size: 32 + --seq-length: 1024 + --max-position-embeddings: 1024 + --train-iters: 50 + --timing-log-level: 2 + --lr-decay-iters: 320000 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --vocab-file: ${DATA_PATH}/bpe/vocab.json + --merge-file: ${DATA_PATH}/bpe/merges.txt + --split: 949,50,1 + --distributed-backend: nccl + --lr: 0.00015 + --lr-decay-style: cosine + --min-lr: 1.0e-5 + --weight-decay: 1e-2 + --clip-grad: 1.0 + --lr-warmup-fraction: .01 + --log-interval: 1 + --save-interval: 10000 + --eval-interval: 1000 + --eval-iters: 10 + --transformer-impl: local + --tensor-model-parallel-size: 1 + --pipeline-model-parallel-size: 1 + 
--use-distributed-optimizer: true + --overlap-grad-reduce: true + --deterministic-mode: true + --no-gradient-accumulation-fusion: true + --ckpt-format: torch + --use-legacy-models: true + --data-cache-path: ${DATA_CACHE_PATH} + --fp16: true + --apply-query-key-layer-scaling: true +TEST_TYPE: regular \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp1_overlap_grad_reduce/golden_values_dev.json b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp1_overlap_grad_reduce/golden_values_dev.json new file mode 100644 index 0000000000..bad34329da --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp1_overlap_grad_reduce/golden_values_dev.json @@ -0,0 +1,50 @@ +{ + "lm loss": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 10.87346, + 10.89625, + 10.88939, + 10.88681, + 10.88931, + 10.84864, + 10.6962, + 10.63918, + 10.5393, + 10.31119 + ] + }, + "iteration-time": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 5.32064, + 0.08204, + 0.08233, + 0.08176, + 0.09748, + 0.0966, + 0.09648, + 0.09617, + 0.09604, + 0.09646 + ] + }, + "num-zeros": { + "start_step": 0, + "end_step": 32, + "step_interval": 5, + "values": [ + 1112.0, + 1124.0, + 1229.0, + 1665.0, + 1269.0, + 1219.0, + 1572.0 + ] + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp1_overlap_grad_reduce/golden_values_lts.json b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp1_overlap_grad_reduce/golden_values_lts.json new file mode 100644 index 0000000000..96b8036e95 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp1_overlap_grad_reduce/golden_values_lts.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.87346, 10.89625, 10.88939, 10.88681, 10.88931, 10.84864, 10.6962, 10.63918, 10.53931, 10.31119]}, "num-zeros": {"start_step": 0, "end_step": 32, "step_interval": 5, "values": [1131.0, 1173.0, 1218.0, 1783.0, 1278.0, 1244.0, 1555.0]}, "iteration_timing_avg": 0.07975499999999999} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp1_overlap_grad_reduce/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp1_overlap_grad_reduce/model_config.yaml new file mode 100644 index 0000000000..190e5777f2 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp1_overlap_grad_reduce/model_config.yaml @@ -0,0 +1,51 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: Tree + CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 +MODEL_ARGS: + --num-layers: 12 + --hidden-size: 512 + --num-attention-heads: 8 + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --tensorboard-dir: ${TENSORBOARD_PATH} + --micro-batch-size: 4 + --global-batch-size: 32 + --seq-length: 1024 + --max-position-embeddings: 1024 + --train-iters: 50 + --timing-log-level: 2 + --lr-decay-iters: 320000 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --vocab-file: ${DATA_PATH}/bpe/vocab.json + --merge-file: ${DATA_PATH}/bpe/merges.txt + --split: 949,50,1 + --distributed-backend: nccl 
+ --lr: 0.00015 + --lr-decay-style: cosine + --min-lr: 1.0e-5 + --weight-decay: 1e-2 + --clip-grad: 1.0 + --lr-warmup-fraction: .01 + --log-interval: 1 + --save-interval: 10000 + --eval-interval: 1000 + --eval-iters: 10 + --transformer-impl: local + --tensor-model-parallel-size: 1 + --pipeline-model-parallel-size: 1 + --overlap-grad-reduce: true + --deterministic-mode: true + --no-gradient-accumulation-fusion: true + --ckpt-format: torch + --use-legacy-models: true + --data-cache-path: ${DATA_CACHE_PATH} + --fp16: true + --apply-query-key-layer-scaling: true +TEST_TYPE: regular \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp2/golden_values_dev.json b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp2/golden_values_dev.json new file mode 100644 index 0000000000..6c6d8e79fc --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp2/golden_values_dev.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.84009, 10.89314, 10.908, 10.87524, 10.86367, 10.83848, 10.64647, 10.62126, 10.53743, 10.24831]}, "num-zeros": {"start_step": 0, "end_step": 33, "step_interval": 5, "values": [2044.0, 2242.0, 2368.0, 2598.0, 2188.0, 1850.0, 2436.0]}, "iteration_timing_avg": 0.10581941176470588} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp2/golden_values_lts.json b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp2/golden_values_lts.json new file mode 100644 index 0000000000..6c6d8e79fc --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp2/golden_values_lts.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.84009, 10.89314, 10.908, 10.87524, 10.86367, 10.83848, 10.64647, 10.62126, 10.53743, 10.24831]}, "num-zeros": {"start_step": 0, "end_step": 33, "step_interval": 5, "values": [2044.0, 2242.0, 2368.0, 2598.0, 2188.0, 1850.0, 2436.0]}, "iteration_timing_avg": 0.10581941176470588} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp2/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp2/model_config.yaml new file mode 100644 index 0000000000..99d0ac8f6b --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp2/model_config.yaml @@ -0,0 +1,50 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: Tree + CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 +MODEL_ARGS: + --num-layers: 12 + --hidden-size: 512 + --num-attention-heads: 8 + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --tensorboard-dir: ${TENSORBOARD_PATH} + --micro-batch-size: 4 + --global-batch-size: 32 + --seq-length: 1024 + --max-position-embeddings: 1024 + --train-iters: 50 + --timing-log-level: 2 + --lr-decay-iters: 320000 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --vocab-file: ${DATA_PATH}/bpe/vocab.json + --merge-file: ${DATA_PATH}/bpe/merges.txt + --split: 949,50,1 + --distributed-backend: nccl + --lr: 0.00015 + --lr-decay-style: cosine + --min-lr: 1.0e-5 + --weight-decay: 1e-2 + --clip-grad: 1.0 + --lr-warmup-fraction: .01 + 
--log-interval: 1 + --save-interval: 10000 + --eval-interval: 1000 + --eval-iters: 10 + --transformer-impl: local + --tensor-model-parallel-size: 1 + --pipeline-model-parallel-size: 2 + --deterministic-mode: true + --no-gradient-accumulation-fusion: true + --ckpt-format: torch + --use-legacy-models: true + --data-cache-path: ${DATA_CACHE_PATH} + --fp16: true + --apply-query-key-layer-scaling: true +TEST_TYPE: regular \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp2_resume_torch/golden_values_dev.json b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp2_resume_torch/golden_values_dev.json new file mode 100644 index 0000000000..d4a5cfb78e --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp2_resume_torch/golden_values_dev.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [10.84009, 10.89314, 10.908, 10.87524, 10.86367, 10.83848, 10.64647, 10.62126, 10.53743, 10.24831, 10.20828, 9.96658, 9.97022, 9.92437, 9.79137, 9.26612, 9.61914, 9.19057, 9.46177, 9.62185]}, "num-zeros": {"start_step": 0, "end_step": 83, "step_interval": 5, "values": [2044.0, 2242.0, 2368.0, 2598.0, 2188.0, 1850.0, 2436.0, 2732.0, 2678.0, 2452.0, 2879.0, 2572.0, 3456.0, 3237.0, 2990.0, 3067.0, 3173.0]}, "iteration_timing_avg": 0.10533134328358208} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp2_resume_torch/golden_values_lts.json b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp2_resume_torch/golden_values_lts.json new file mode 100644 index 0000000000..d4a5cfb78e --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp2_resume_torch/golden_values_lts.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [10.84009, 10.89314, 10.908, 10.87524, 10.86367, 10.83848, 10.64647, 10.62126, 10.53743, 10.24831, 10.20828, 9.96658, 9.97022, 9.92437, 9.79137, 9.26612, 9.61914, 9.19057, 9.46177, 9.62185]}, "num-zeros": {"start_step": 0, "end_step": 83, "step_interval": 5, "values": [2044.0, 2242.0, 2368.0, 2598.0, 2188.0, 1850.0, 2436.0, 2732.0, 2678.0, 2452.0, 2879.0, 2572.0, 3456.0, 3237.0, 2990.0, 3067.0, 3173.0]}, "iteration_timing_avg": 0.10533134328358208} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp2_resume_torch/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp2_resume_torch/model_config.yaml new file mode 100644 index 0000000000..6242b2ebbc --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp2_resume_torch/model_config.yaml @@ -0,0 +1,51 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: Tree + CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 +MODEL_ARGS: + --num-layers: 12 + --hidden-size: 512 + --num-attention-heads: 8 + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --tensorboard-dir: ${TENSORBOARD_PATH} + --micro-batch-size: 4 + --global-batch-size: 32 + --seq-length: 1024 + --max-position-embeddings: 1024 + --train-iters: 100 + --timing-log-level: 2 + --lr-decay-iters: 320000 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --data-path: 
${DATA_PATH}/my-gpt3_00_text_document + --vocab-file: ${DATA_PATH}/bpe/vocab.json + --merge-file: ${DATA_PATH}/bpe/merges.txt + --split: 949,50,1 + --distributed-backend: nccl + --lr: 0.00015 + --lr-decay-style: cosine + --min-lr: 1.0e-5 + --weight-decay: 1e-2 + --clip-grad: 1.0 + --lr-warmup-fraction: .01 + --log-interval: 1 + --save-interval: 50 + --eval-interval: 1000 + --eval-iters: 10 + --transformer-impl: local + --tensor-model-parallel-size: 1 + --pipeline-model-parallel-size: 2 + --deterministic-mode: true + --no-gradient-accumulation-fusion: true + --use-checkpoint-opt_param-scheduler: true + --ckpt-format: torch + --use-legacy-models: true + --data-cache-path: ${DATA_CACHE_PATH} + --fp16: true + --apply-query-key-layer-scaling: true +TEST_TYPE: ckpt-resume \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4/golden_values_dev.json b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4/golden_values_dev.json new file mode 100644 index 0000000000..0f5ad40c1c --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4/golden_values_dev.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.81248, 10.87098, 10.90003, 10.85021, 10.84909, 10.81546, 10.61697, 10.61018, 10.52451, 10.23087]}, "num-zeros": {"start_step": 0, "end_step": 33, "step_interval": 5, "values": [2427.0, 2538.0, 2652.0, 2303.0, 2378.0, 2744.0, 2530.0]}, "iteration_timing_avg": 0.1367805882352941} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4/golden_values_lts.json b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4/golden_values_lts.json new file mode 100644 index 0000000000..0f5ad40c1c --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4/golden_values_lts.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.81248, 10.87098, 10.90003, 10.85021, 10.84909, 10.81546, 10.61697, 10.61018, 10.52451, 10.23087]}, "num-zeros": {"start_step": 0, "end_step": 33, "step_interval": 5, "values": [2427.0, 2538.0, 2652.0, 2303.0, 2378.0, 2744.0, 2530.0]}, "iteration_timing_avg": 0.1367805882352941} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4/model_config.yaml new file mode 100644 index 0000000000..81727e052d --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4/model_config.yaml @@ -0,0 +1,50 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: Tree + CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 +MODEL_ARGS: + --num-layers: 12 + --hidden-size: 512 + --num-attention-heads: 8 + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --tensorboard-dir: ${TENSORBOARD_PATH} + --micro-batch-size: 4 + --global-batch-size: 32 + --seq-length: 1024 + --max-position-embeddings: 1024 + --train-iters: 50 + --timing-log-level: 2 + --lr-decay-iters: 320000 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --vocab-file: ${DATA_PATH}/bpe/vocab.json + --merge-file: 
${DATA_PATH}/bpe/merges.txt + --split: 949,50,1 + --distributed-backend: nccl + --lr: 0.00015 + --lr-decay-style: cosine + --min-lr: 1.0e-5 + --weight-decay: 1e-2 + --clip-grad: 1.0 + --lr-warmup-fraction: .01 + --log-interval: 1 + --save-interval: 10000 + --eval-interval: 1000 + --eval-iters: 10 + --transformer-impl: local + --tensor-model-parallel-size: 1 + --pipeline-model-parallel-size: 4 + --deterministic-mode: true + --no-gradient-accumulation-fusion: true + --ckpt-format: torch + --use-legacy-models: true + --data-cache-path: ${DATA_CACHE_PATH} + --fp16: true + --apply-query-key-layer-scaling: true +TEST_TYPE: regular \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_overlap_grad_reduce/golden_values_dev.json b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_overlap_grad_reduce/golden_values_dev.json new file mode 100644 index 0000000000..b9816fbf8b --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_overlap_grad_reduce/golden_values_dev.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.81248, 10.87098, 10.90003, 10.85021, 10.84909, 10.81546, 10.61697, 10.61018, 10.52451, 10.23087]}, "num-zeros": {"start_step": 0, "end_step": 33, "step_interval": 5, "values": [2427.0, 2538.0, 2652.0, 2303.0, 2378.0, 2744.0, 2530.0]}, "iteration_timing_avg": 0.13371323529411766} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_overlap_grad_reduce/golden_values_lts.json b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_overlap_grad_reduce/golden_values_lts.json new file mode 100644 index 0000000000..b9816fbf8b --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_overlap_grad_reduce/golden_values_lts.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.81248, 10.87098, 10.90003, 10.85021, 10.84909, 10.81546, 10.61697, 10.61018, 10.52451, 10.23087]}, "num-zeros": {"start_step": 0, "end_step": 33, "step_interval": 5, "values": [2427.0, 2538.0, 2652.0, 2303.0, 2378.0, 2744.0, 2530.0]}, "iteration_timing_avg": 0.13371323529411766} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_overlap_grad_reduce/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_overlap_grad_reduce/model_config.yaml new file mode 100644 index 0000000000..525d0f2c90 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_overlap_grad_reduce/model_config.yaml @@ -0,0 +1,51 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: Tree + CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 +MODEL_ARGS: + --num-layers: 12 + --hidden-size: 512 + --num-attention-heads: 8 + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --tensorboard-dir: ${TENSORBOARD_PATH} + --micro-batch-size: 4 + --global-batch-size: 32 + --seq-length: 1024 + --max-position-embeddings: 1024 + --train-iters: 50 + --timing-log-level: 2 + --lr-decay-iters: 320000 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --vocab-file: ${DATA_PATH}/bpe/vocab.json + 
--merge-file: ${DATA_PATH}/bpe/merges.txt + --split: 949,50,1 + --distributed-backend: nccl + --lr: 0.00015 + --lr-decay-style: cosine + --min-lr: 1.0e-5 + --weight-decay: 1e-2 + --clip-grad: 1.0 + --lr-warmup-fraction: .01 + --log-interval: 1 + --save-interval: 10000 + --eval-interval: 1000 + --eval-iters: 10 + --transformer-impl: local + --tensor-model-parallel-size: 1 + --pipeline-model-parallel-size: 4 + --overlap-grad-reduce: true + --deterministic-mode: true + --no-gradient-accumulation-fusion: true + --ckpt-format: torch + --use-legacy-models: true + --data-cache-path: ${DATA_CACHE_PATH} + --fp16: true + --apply-query-key-layer-scaling: true +TEST_TYPE: regular \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_resume_torch/golden_values_dev.json b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_resume_torch/golden_values_dev.json new file mode 100644 index 0000000000..4cf16ef911 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_resume_torch/golden_values_dev.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [10.81248, 10.87098, 10.90003, 10.85021, 10.84909, 10.81546, 10.61697, 10.61018, 10.52451, 10.23087, 10.19557, 9.94382, 9.95175, 9.90538, 9.79357, 9.25904, 9.61568, 9.19187, 9.46047, 9.6229]}, "num-zeros": {"start_step": 0, "end_step": 83, "step_interval": 5, "values": [2427.0, 2538.0, 2652.0, 2303.0, 2378.0, 2744.0, 2530.0, 3566.0, 3139.0, 3236.0, 3208.0, 3413.0, 3913.0, 3194.0, 3581.0, 3625.0, 4695.0]}, "iteration_timing_avg": 0.1320626865671642} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_resume_torch/golden_values_lts.json b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_resume_torch/golden_values_lts.json new file mode 100644 index 0000000000..4cf16ef911 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_resume_torch/golden_values_lts.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [10.81248, 10.87098, 10.90003, 10.85021, 10.84909, 10.81546, 10.61697, 10.61018, 10.52451, 10.23087, 10.19557, 9.94382, 9.95175, 9.90538, 9.79357, 9.25904, 9.61568, 9.19187, 9.46047, 9.6229]}, "num-zeros": {"start_step": 0, "end_step": 83, "step_interval": 5, "values": [2427.0, 2538.0, 2652.0, 2303.0, 2378.0, 2744.0, 2530.0, 3566.0, 3139.0, 3236.0, 3208.0, 3413.0, 3913.0, 3194.0, 3581.0, 3625.0, 4695.0]}, "iteration_timing_avg": 0.1320626865671642} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_resume_torch/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_resume_torch/model_config.yaml new file mode 100644 index 0000000000..516e1dd517 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_resume_torch/model_config.yaml @@ -0,0 +1,51 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: Tree + CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 +MODEL_ARGS: + --num-layers: 12 + --hidden-size: 512 + --num-attention-heads: 8 + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --tensorboard-dir: ${TENSORBOARD_PATH} + 
--micro-batch-size: 4 + --global-batch-size: 32 + --seq-length: 1024 + --max-position-embeddings: 1024 + --train-iters: 100 + --timing-log-level: 2 + --lr-decay-iters: 320000 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --vocab-file: ${DATA_PATH}/bpe/vocab.json + --merge-file: ${DATA_PATH}/bpe/merges.txt + --split: 949,50,1 + --distributed-backend: nccl + --lr: 0.00015 + --lr-decay-style: cosine + --min-lr: 1.0e-5 + --weight-decay: 1e-2 + --clip-grad: 1.0 + --lr-warmup-fraction: .01 + --log-interval: 1 + --save-interval: 50 + --eval-interval: 1000 + --eval-iters: 10 + --transformer-impl: local + --tensor-model-parallel-size: 1 + --pipeline-model-parallel-size: 4 + --deterministic-mode: true + --no-gradient-accumulation-fusion: true + --use-checkpoint-opt_param-scheduler: true + --ckpt-format: torch + --use-legacy-models: true + --data-cache-path: ${DATA_CACHE_PATH} + --fp16: true + --apply-query-key-layer-scaling: true +TEST_TYPE: ckpt-resume \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_vp1_overlap_grad_reduce/golden_values_dev.json b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_vp1_overlap_grad_reduce/golden_values_dev.json new file mode 100644 index 0000000000..302a1524b4 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_vp1_overlap_grad_reduce/golden_values_dev.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.79311, 10.85248, 10.87281, 10.83016, 10.82949, 10.78726, 10.565, 10.57088, 10.4836, 10.19521]}, "num-zeros": {"start_step": 0, "end_step": 34, "step_interval": 5, "values": [2450.0, 2765.0, 2163.0, 2585.0, 2634.0, 2585.0, 2987.0]}, "iteration_timing_avg": 0.1333435294117647} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_vp1_overlap_grad_reduce/golden_values_lts.json b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_vp1_overlap_grad_reduce/golden_values_lts.json new file mode 100644 index 0000000000..302a1524b4 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_vp1_overlap_grad_reduce/golden_values_lts.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.79311, 10.85248, 10.87281, 10.83016, 10.82949, 10.78726, 10.565, 10.57088, 10.4836, 10.19521]}, "num-zeros": {"start_step": 0, "end_step": 34, "step_interval": 5, "values": [2450.0, 2765.0, 2163.0, 2585.0, 2634.0, 2585.0, 2987.0]}, "iteration_timing_avg": 0.1333435294117647} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_vp1_overlap_grad_reduce/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_vp1_overlap_grad_reduce/model_config.yaml new file mode 100644 index 0000000000..10fc8c2f23 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_vp1_overlap_grad_reduce/model_config.yaml @@ -0,0 +1,52 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: Tree + CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 +MODEL_ARGS: + --num-layers: 12 + --hidden-size: 512 + --num-attention-heads: 8 + --log-params-norm: true + --log-num-zeros-in-grad: true + 
--log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --tensorboard-dir: ${TENSORBOARD_PATH} + --micro-batch-size: 4 + --global-batch-size: 32 + --seq-length: 1024 + --max-position-embeddings: 1024 + --train-iters: 50 + --timing-log-level: 2 + --lr-decay-iters: 320000 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --vocab-file: ${DATA_PATH}/bpe/vocab.json + --merge-file: ${DATA_PATH}/bpe/merges.txt + --split: 949,50,1 + --distributed-backend: nccl + --lr: 0.00015 + --lr-decay-style: cosine + --min-lr: 1.0e-5 + --weight-decay: 1e-2 + --clip-grad: 1.0 + --lr-warmup-fraction: .01 + --log-interval: 1 + --save-interval: 10000 + --eval-interval: 1000 + --eval-iters: 10 + --transformer-impl: local + --tensor-model-parallel-size: 1 + --pipeline-model-parallel-size: 4 + --num-layers-per-virtual-pipeline-stage: 1 + --overlap-grad-reduce: true + --deterministic-mode: true + --no-gradient-accumulation-fusion: true + --ckpt-format: torch + --use-legacy-models: true + --data-cache-path: ${DATA_CACHE_PATH} + --fp16: true + --apply-query-key-layer-scaling: true +TEST_TYPE: regular \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_4experts/golden_values_dev.json b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_4experts/golden_values_dev.json new file mode 100644 index 0000000000..114dfb1e2a --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_4experts/golden_values_dev.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.80264, 10.85778, 10.86259, 10.83903, 10.82934, 10.81016, 10.60251, 10.61471, 10.54092, 10.27186]}, "num-zeros": {"start_step": 0, "end_step": 34, "step_interval": 5, "values": [8571.0, 7897.0, 7748.0, 9008.0, 9165.0, 8986.0, 9155.0]}, "iteration_timing_avg": 0.3671870588235294} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_4experts/golden_values_lts.json b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_4experts/golden_values_lts.json new file mode 100644 index 0000000000..114dfb1e2a --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_4experts/golden_values_lts.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.80264, 10.85778, 10.86259, 10.83903, 10.82934, 10.81016, 10.60251, 10.61471, 10.54092, 10.27186]}, "num-zeros": {"start_step": 0, "end_step": 34, "step_interval": 5, "values": [8571.0, 7897.0, 7748.0, 9008.0, 9165.0, 8986.0, 9155.0]}, "iteration_timing_avg": 0.3671870588235294} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_4experts/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_4experts/model_config.yaml new file mode 100644 index 0000000000..ba219d4445 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_4experts/model_config.yaml @@ -0,0 +1,54 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: Tree + CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 +MODEL_ARGS: + --num-layers: 12 + --hidden-size: 512 + --num-attention-heads: 8 + --log-params-norm: true + --log-num-zeros-in-grad: true + 
--log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --tensorboard-dir: ${TENSORBOARD_PATH} + --micro-batch-size: 4 + --global-batch-size: 32 + --seq-length: 1024 + --max-position-embeddings: 1024 + --train-iters: 50 + --timing-log-level: 2 + --lr-decay-iters: 320000 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --vocab-file: ${DATA_PATH}/bpe/vocab.json + --merge-file: ${DATA_PATH}/bpe/merges.txt + --split: 949,50,1 + --distributed-backend: nccl + --lr: 0.00015 + --lr-decay-style: cosine + --min-lr: 1.0e-5 + --weight-decay: 1e-2 + --clip-grad: 1.0 + --lr-warmup-fraction: .01 + --log-interval: 1 + --save-interval: 10000 + --eval-interval: 1000 + --eval-iters: 10 + --transformer-impl: local + --tensor-model-parallel-size: 2 + --pipeline-model-parallel-size: 2 + --sequence-parallel: true + --num-experts: 4 + --moe-router-load-balancing-type: sinkhorn + --moe-router-topk: 1 + --deterministic-mode: true + --no-gradient-accumulation-fusion: true + --ckpt-format: torch + --use-legacy-models: true + --data-cache-path: ${DATA_CACHE_PATH} + --fp16: true + --apply-query-key-layer-scaling: true +TEST_TYPE: regular \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_overlap_grad_reduce/golden_values_dev.json b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_overlap_grad_reduce/golden_values_dev.json new file mode 100644 index 0000000000..b807a2e979 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_overlap_grad_reduce/golden_values_dev.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.85929, 10.89211, 10.87639, 10.86988, 10.88179, 10.83898, 10.66589, 10.62691, 10.52461, 10.25708]}, "num-zeros": {"start_step": 0, "end_step": 33, "step_interval": 5, "values": [2458.0, 2527.0, 2467.0, 2148.0, 2250.0, 2467.0, 2528.0]}, "iteration_timing_avg": 0.1660379411764706} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_overlap_grad_reduce/golden_values_lts.json b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_overlap_grad_reduce/golden_values_lts.json new file mode 100644 index 0000000000..b807a2e979 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_overlap_grad_reduce/golden_values_lts.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.85929, 10.89211, 10.87639, 10.86988, 10.88179, 10.83898, 10.66589, 10.62691, 10.52461, 10.25708]}, "num-zeros": {"start_step": 0, "end_step": 33, "step_interval": 5, "values": [2458.0, 2527.0, 2467.0, 2148.0, 2250.0, 2467.0, 2528.0]}, "iteration_timing_avg": 0.1660379411764706} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_overlap_grad_reduce/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_overlap_grad_reduce/model_config.yaml new file mode 100644 index 0000000000..c547f47970 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_overlap_grad_reduce/model_config.yaml @@ -0,0 +1,51 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: Tree + CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 +MODEL_ARGS: 
+ --num-layers: 12 + --hidden-size: 512 + --num-attention-heads: 8 + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --tensorboard-dir: ${TENSORBOARD_PATH} + --micro-batch-size: 4 + --global-batch-size: 32 + --seq-length: 1024 + --max-position-embeddings: 1024 + --train-iters: 50 + --timing-log-level: 2 + --lr-decay-iters: 320000 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --vocab-file: ${DATA_PATH}/bpe/vocab.json + --merge-file: ${DATA_PATH}/bpe/merges.txt + --split: 949,50,1 + --distributed-backend: nccl + --lr: 0.00015 + --lr-decay-style: cosine + --min-lr: 1.0e-5 + --weight-decay: 1e-2 + --clip-grad: 1.0 + --lr-warmup-fraction: .01 + --log-interval: 1 + --save-interval: 10000 + --eval-interval: 1000 + --eval-iters: 10 + --transformer-impl: local + --tensor-model-parallel-size: 2 + --pipeline-model-parallel-size: 2 + --overlap-grad-reduce: true + --deterministic-mode: true + --no-gradient-accumulation-fusion: true + --ckpt-format: torch + --use-legacy-models: true + --data-cache-path: ${DATA_CACHE_PATH} + --fp16: true + --apply-query-key-layer-scaling: true +TEST_TYPE: regular \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_resume_torch_4experts/golden_values_dev.json b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_resume_torch_4experts/golden_values_dev.json new file mode 100644 index 0000000000..546ccfca5e --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_resume_torch_4experts/golden_values_dev.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [10.80264, 10.85778, 10.86259, 10.83903, 10.82934, 10.81016, 10.60251, 10.61471, 10.54092, 10.27186, 10.24338, 10.02058, 10.03017, 9.99471, 9.84885, 9.34867, 9.67263, 9.2457, 9.53365, 9.67548]}, "num-zeros": {"start_step": 0, "end_step": 84, "step_interval": 5, "values": [8571.0, 7897.0, 7748.0, 9008.0, 9165.0, 8986.0, 9155.0, 7960.0, 7684.0, 9743.0, 8727.0, 9382.0, 10992.0, 11177.0, 11270.0, 13404.0, 11533.0]}, "iteration_timing_avg": 0.3735462686567164} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_resume_torch_4experts/golden_values_lts.json b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_resume_torch_4experts/golden_values_lts.json new file mode 100644 index 0000000000..546ccfca5e --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_resume_torch_4experts/golden_values_lts.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [10.80264, 10.85778, 10.86259, 10.83903, 10.82934, 10.81016, 10.60251, 10.61471, 10.54092, 10.27186, 10.24338, 10.02058, 10.03017, 9.99471, 9.84885, 9.34867, 9.67263, 9.2457, 9.53365, 9.67548]}, "num-zeros": {"start_step": 0, "end_step": 84, "step_interval": 5, "values": [8571.0, 7897.0, 7748.0, 9008.0, 9165.0, 8986.0, 9155.0, 7960.0, 7684.0, 9743.0, 8727.0, 9382.0, 10992.0, 11177.0, 11270.0, 13404.0, 11533.0]}, "iteration_timing_avg": 0.3735462686567164} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_resume_torch_4experts/model_config.yaml 
b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_resume_torch_4experts/model_config.yaml new file mode 100644 index 0000000000..72c98e80be --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_resume_torch_4experts/model_config.yaml @@ -0,0 +1,55 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: Tree + CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 +MODEL_ARGS: + --num-layers: 12 + --hidden-size: 512 + --num-attention-heads: 8 + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --tensorboard-dir: ${TENSORBOARD_PATH} + --micro-batch-size: 4 + --global-batch-size: 32 + --seq-length: 1024 + --max-position-embeddings: 1024 + --train-iters: 100 + --timing-log-level: 2 + --lr-decay-iters: 320000 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --vocab-file: ${DATA_PATH}/bpe/vocab.json + --merge-file: ${DATA_PATH}/bpe/merges.txt + --split: 949,50,1 + --distributed-backend: nccl + --lr: 0.00015 + --lr-decay-style: cosine + --min-lr: 1.0e-5 + --weight-decay: 1e-2 + --clip-grad: 1.0 + --lr-warmup-fraction: .01 + --log-interval: 1 + --save-interval: 50 + --eval-interval: 1000 + --eval-iters: 10 + --transformer-impl: local + --tensor-model-parallel-size: 2 + --pipeline-model-parallel-size: 2 + --sequence-parallel: true + --num-experts: 4 + --moe-router-load-balancing-type: sinkhorn + --moe-router-topk: 1 + --deterministic-mode: true + --no-gradient-accumulation-fusion: true + --use-checkpoint-opt_param-scheduler: true + --ckpt-format: torch + --use-legacy-models: true + --data-cache-path: ${DATA_CACHE_PATH} + --fp16: true + --apply-query-key-layer-scaling: true +TEST_TYPE: ckpt-resume \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_resume_torch_overlap_grad_reduce/golden_values_dev.json b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_resume_torch_overlap_grad_reduce/golden_values_dev.json new file mode 100644 index 0000000000..c0a53bdb6c --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_resume_torch_overlap_grad_reduce/golden_values_dev.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [10.85929, 10.89211, 10.87639, 10.86988, 10.88179, 10.83898, 10.66589, 10.62691, 10.52461, 10.25708, 10.19741, 9.9562, 9.96369, 9.91398, 9.79604, 9.2686, 9.61975, 9.19501, 9.47332, 9.62216]}, "num-zeros": {"start_step": 0, "end_step": 83, "step_interval": 5, "values": [2458.0, 2527.0, 2467.0, 2148.0, 2250.0, 2467.0, 2528.0, 3656.0, 3275.0, 3203.0, 3297.0, 3364.0, 3789.0, 3277.0, 3660.0, 3733.0, 4815.0]}, "iteration_timing_avg": 0.1628459701492537} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_resume_torch_overlap_grad_reduce/golden_values_lts.json b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_resume_torch_overlap_grad_reduce/golden_values_lts.json new file mode 100644 index 0000000000..c0a53bdb6c --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_resume_torch_overlap_grad_reduce/golden_values_lts.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [10.85929, 
10.89211, 10.87639, 10.86988, 10.88179, 10.83898, 10.66589, 10.62691, 10.52461, 10.25708, 10.19741, 9.9562, 9.96369, 9.91398, 9.79604, 9.2686, 9.61975, 9.19501, 9.47332, 9.62216]}, "num-zeros": {"start_step": 0, "end_step": 83, "step_interval": 5, "values": [2458.0, 2527.0, 2467.0, 2148.0, 2250.0, 2467.0, 2528.0, 3656.0, 3275.0, 3203.0, 3297.0, 3364.0, 3789.0, 3277.0, 3660.0, 3733.0, 4815.0]}, "iteration_timing_avg": 0.1628459701492537} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_resume_torch_overlap_grad_reduce/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_resume_torch_overlap_grad_reduce/model_config.yaml new file mode 100644 index 0000000000..03ddd8a7ca --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_resume_torch_overlap_grad_reduce/model_config.yaml @@ -0,0 +1,52 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: Tree + CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 +MODEL_ARGS: + --num-layers: 12 + --hidden-size: 512 + --num-attention-heads: 8 + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --tensorboard-dir: ${TENSORBOARD_PATH} + --micro-batch-size: 4 + --global-batch-size: 32 + --seq-length: 1024 + --max-position-embeddings: 1024 + --train-iters: 100 + --timing-log-level: 2 + --lr-decay-iters: 320000 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --vocab-file: ${DATA_PATH}/bpe/vocab.json + --merge-file: ${DATA_PATH}/bpe/merges.txt + --split: 949,50,1 + --distributed-backend: nccl + --lr: 0.00015 + --lr-decay-style: cosine + --min-lr: 1.0e-5 + --weight-decay: 1e-2 + --clip-grad: 1.0 + --lr-warmup-fraction: .01 + --log-interval: 1 + --save-interval: 50 + --eval-interval: 1000 + --eval-iters: 10 + --transformer-impl: local + --tensor-model-parallel-size: 2 + --pipeline-model-parallel-size: 2 + --overlap-grad-reduce: true + --deterministic-mode: true + --no-gradient-accumulation-fusion: true + --use-checkpoint-opt_param-scheduler: true + --ckpt-format: torch + --use-legacy-models: true + --data-cache-path: ${DATA_CACHE_PATH} + --fp16: true + --apply-query-key-layer-scaling: true +TEST_TYPE: ckpt-resume \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1/golden_values_dev.json b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1/golden_values_dev.json new file mode 100644 index 0000000000..18457f230d --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1/golden_values_dev.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.86312, 10.87712, 10.87347, 10.88278, 10.89457, 10.84427, 10.69023, 10.62687, 10.53974, 10.26525]}, "num-zeros": {"start_step": 0, "end_step": 33, "step_interval": 5, "values": [2244.0, 2273.0, 2447.0, 2031.0, 2134.0, 2491.0, 2380.0]}, "iteration_timing_avg": 0.23144205882352942} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1/golden_values_lts.json b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1/golden_values_lts.json new file mode 100644 index 0000000000..18457f230d --- /dev/null +++ 
b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1/golden_values_lts.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.86312, 10.87712, 10.87347, 10.88278, 10.89457, 10.84427, 10.69023, 10.62687, 10.53974, 10.26525]}, "num-zeros": {"start_step": 0, "end_step": 33, "step_interval": 5, "values": [2244.0, 2273.0, 2447.0, 2031.0, 2134.0, 2491.0, 2380.0]}, "iteration_timing_avg": 0.23144205882352942} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1/model_config.yaml new file mode 100644 index 0000000000..84128fa780 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1/model_config.yaml @@ -0,0 +1,50 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: Tree + CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 +MODEL_ARGS: + --num-layers: 12 + --hidden-size: 512 + --num-attention-heads: 8 + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --tensorboard-dir: ${TENSORBOARD_PATH} + --micro-batch-size: 4 + --global-batch-size: 32 + --seq-length: 1024 + --max-position-embeddings: 1024 + --train-iters: 50 + --timing-log-level: 2 + --lr-decay-iters: 320000 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --vocab-file: ${DATA_PATH}/bpe/vocab.json + --merge-file: ${DATA_PATH}/bpe/merges.txt + --split: 949,50,1 + --distributed-backend: nccl + --lr: 0.00015 + --lr-decay-style: cosine + --min-lr: 1.0e-5 + --weight-decay: 1e-2 + --clip-grad: 1.0 + --lr-warmup-fraction: .01 + --log-interval: 1 + --save-interval: 10000 + --eval-interval: 1000 + --eval-iters: 10 + --transformer-impl: local + --tensor-model-parallel-size: 4 + --pipeline-model-parallel-size: 1 + --deterministic-mode: true + --no-gradient-accumulation-fusion: true + --ckpt-format: torch + --use-legacy-models: true + --data-cache-path: ${DATA_CACHE_PATH} + --fp16: true + --apply-query-key-layer-scaling: true +TEST_TYPE: regular \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1_overlap_grad_reduce/golden_values_dev.json b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1_overlap_grad_reduce/golden_values_dev.json new file mode 100644 index 0000000000..7b39f86c32 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1_overlap_grad_reduce/golden_values_dev.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.86312, 10.87712, 10.87347, 10.88278, 10.89457, 10.84427, 10.69023, 10.62687, 10.53974, 10.26525]}, "num-zeros": {"start_step": 0, "end_step": 33, "step_interval": 5, "values": [2244.0, 2273.0, 2447.0, 2031.0, 2134.0, 2491.0, 2380.0]}, "iteration_timing_avg": 0.23131970588235293} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1_overlap_grad_reduce/golden_values_lts.json b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1_overlap_grad_reduce/golden_values_lts.json new file mode 100644 index 0000000000..7b39f86c32 --- /dev/null +++ 
b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1_overlap_grad_reduce/golden_values_lts.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.86312, 10.87712, 10.87347, 10.88278, 10.89457, 10.84427, 10.69023, 10.62687, 10.53974, 10.26525]}, "num-zeros": {"start_step": 0, "end_step": 33, "step_interval": 5, "values": [2244.0, 2273.0, 2447.0, 2031.0, 2134.0, 2491.0, 2380.0]}, "iteration_timing_avg": 0.23131970588235293} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1_overlap_grad_reduce/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1_overlap_grad_reduce/model_config.yaml new file mode 100644 index 0000000000..b664115f27 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1_overlap_grad_reduce/model_config.yaml @@ -0,0 +1,51 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: Tree + CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 +MODEL_ARGS: + --num-layers: 12 + --hidden-size: 512 + --num-attention-heads: 8 + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --tensorboard-dir: ${TENSORBOARD_PATH} + --micro-batch-size: 4 + --global-batch-size: 32 + --seq-length: 1024 + --max-position-embeddings: 1024 + --train-iters: 50 + --timing-log-level: 2 + --lr-decay-iters: 320000 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --vocab-file: ${DATA_PATH}/bpe/vocab.json + --merge-file: ${DATA_PATH}/bpe/merges.txt + --split: 949,50,1 + --distributed-backend: nccl + --lr: 0.00015 + --lr-decay-style: cosine + --min-lr: 1.0e-5 + --weight-decay: 1e-2 + --clip-grad: 1.0 + --lr-warmup-fraction: .01 + --log-interval: 1 + --save-interval: 10000 + --eval-interval: 1000 + --eval-iters: 10 + --transformer-impl: local + --tensor-model-parallel-size: 4 + --pipeline-model-parallel-size: 1 + --overlap-grad-reduce: true + --deterministic-mode: true + --no-gradient-accumulation-fusion: true + --ckpt-format: torch + --use-legacy-models: true + --data-cache-path: ${DATA_CACHE_PATH} + --fp16: true + --apply-query-key-layer-scaling: true +TEST_TYPE: regular \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1_resume_torch/golden_values_dev.json b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1_resume_torch/golden_values_dev.json new file mode 100644 index 0000000000..47198f9ec6 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1_resume_torch/golden_values_dev.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [10.86312, 10.87712, 10.87347, 10.88278, 10.89457, 10.84427, 10.69023, 10.62687, 10.53974, 10.26525, 10.21403, 9.9801, 9.96977, 9.93973, 9.81158, 9.28667, 9.63194, 9.19732, 9.48341, 9.62985]}, "num-zeros": {"start_step": 0, "end_step": 83, "step_interval": 5, "values": [2244.0, 2273.0, 2447.0, 2031.0, 2134.0, 2491.0, 2380.0, 3451.0, 3205.0, 2940.0, 3143.0, 3310.0, 3884.0, 3232.0, 3491.0, 3751.0, 5022.0]}, "iteration_timing_avg": 0.22914074626865674} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1_resume_torch/golden_values_lts.json 
b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1_resume_torch/golden_values_lts.json new file mode 100644 index 0000000000..47198f9ec6 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1_resume_torch/golden_values_lts.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [10.86312, 10.87712, 10.87347, 10.88278, 10.89457, 10.84427, 10.69023, 10.62687, 10.53974, 10.26525, 10.21403, 9.9801, 9.96977, 9.93973, 9.81158, 9.28667, 9.63194, 9.19732, 9.48341, 9.62985]}, "num-zeros": {"start_step": 0, "end_step": 83, "step_interval": 5, "values": [2244.0, 2273.0, 2447.0, 2031.0, 2134.0, 2491.0, 2380.0, 3451.0, 3205.0, 2940.0, 3143.0, 3310.0, 3884.0, 3232.0, 3491.0, 3751.0, 5022.0]}, "iteration_timing_avg": 0.22914074626865674} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1_resume_torch/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1_resume_torch/model_config.yaml new file mode 100644 index 0000000000..0ec5d88ad9 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1_resume_torch/model_config.yaml @@ -0,0 +1,51 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: Tree + CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 +MODEL_ARGS: + --num-layers: 12 + --hidden-size: 512 + --num-attention-heads: 8 + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --tensorboard-dir: ${TENSORBOARD_PATH} + --micro-batch-size: 4 + --global-batch-size: 32 + --seq-length: 1024 + --max-position-embeddings: 1024 + --train-iters: 100 + --timing-log-level: 2 + --lr-decay-iters: 320000 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --vocab-file: ${DATA_PATH}/bpe/vocab.json + --merge-file: ${DATA_PATH}/bpe/merges.txt + --split: 949,50,1 + --distributed-backend: nccl + --lr: 0.00015 + --lr-decay-style: cosine + --min-lr: 1.0e-5 + --weight-decay: 1e-2 + --clip-grad: 1.0 + --lr-warmup-fraction: .01 + --log-interval: 1 + --save-interval: 50 + --eval-interval: 1000 + --eval-iters: 10 + --transformer-impl: local + --tensor-model-parallel-size: 4 + --pipeline-model-parallel-size: 1 + --deterministic-mode: true + --no-gradient-accumulation-fusion: true + --use-checkpoint-opt_param-scheduler: true + --ckpt-format: torch + --use-legacy-models: true + --data-cache-path: ${DATA_CACHE_PATH} + --fp16: true + --apply-query-key-layer-scaling: true +TEST_TYPE: ckpt-resume \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_nondet_tp1_pp1_fp8_no_model_parallel/golden_values_dev.json b/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_nondet_tp1_pp1_fp8_no_model_parallel/golden_values_dev.json new file mode 100644 index 0000000000..7335b2067c --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_nondet_tp1_pp1_fp8_no_model_parallel/golden_values_dev.json @@ -0,0 +1 @@ +{"forward-backward-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [18.28053, 0.49505, 0.49249, 0.4863, 0.49126, 0.48294, 0.48297, 0.49211, 0.49244, 0.48476, 0.49685, 0.48221, 0.48444, 0.48262, 0.4868, 0.4822, 0.48935, 0.49261, 0.49648, 0.48319, 0.48763, 
0.48829, 0.48803, 0.48167, 0.48323, 0.48629, 0.48421, 0.48466, 0.48642, 0.48171, 0.5845, 0.48341, 0.47926, 0.48909, 0.49939, 0.50358, 0.4812, 0.48449, 0.48356, 0.48264, 0.48384, 0.48252, 0.4847, 0.48316, 0.48125, 0.48107, 0.57559, 0.48254, 0.48595, 0.48176, 0.48343, 0.48901, 0.48231, 0.48126, 0.48705, 0.48449, 0.48313, 0.48504, 0.49265, 0.49529, 0.48979, 0.48846, 0.48904, 0.48991, 0.49197, 0.48869, 0.48889, 0.49026, 0.49051, 0.48812, 0.4895, 0.4888, 0.49274, 0.49157, 0.49398, 0.68596, 0.48574, 0.48994, 0.48496, 0.496, 0.48608, 0.49521, 0.48726, 0.49274, 0.48836, 0.49429, 0.49013, 0.49126, 0.48792, 0.49147, 0.49169, 0.48964, 0.49008, 0.49378, 0.49365, 0.49165, 0.49075, 0.57694, 0.48973, 0.48945, 0.48773, 0.49186, 0.48699, 0.49202, 0.48785, 0.48984, 0.48807, 0.4924, 0.48739, 0.48901, 0.48669, 0.48864, 0.48892, 0.48906, 0.48729, 0.48907, 0.4886, 0.49334, 0.48702, 0.57734, 0.70083, 0.49192, 0.48993, 0.48756, 0.48839, 0.49692, 0.49292, 0.48647, 0.49172, 0.4875, 0.49397, 0.48663, 0.49145, 0.48815, 0.49401, 0.48878, 0.49212, 0.48753, 0.49235, 0.48811, 0.49451, 0.48865, 0.58524, 0.49262, 0.49011, 0.48923, 0.48823, 0.49108, 0.4881, 0.49074, 0.49805, 0.49124, 0.48831, 0.49161, 0.48613, 0.49324, 0.48948, 0.49372, 0.48427, 0.49263, 0.48691, 0.49317, 0.49667, 0.4969, 0.57482, 0.61619, 0.48773, 0.48884, 0.49076, 0.49017, 0.48952, 0.49239, 0.49075, 0.48963, 0.4911, 0.48939, 0.48983, 0.49046, 0.49409, 0.48869, 0.49044, 0.4872, 0.49356, 0.48711, 0.49475, 0.49335, 0.49242, 0.48938, 0.48799, 0.49308, 0.48649, 0.49513, 0.57985, 0.49149, 0.49028, 0.4911, 0.49172, 0.48942, 0.49435, 0.48938, 0.47502, 0.48947, 0.48882, 0.48685, 0.48977, 0.4839, 0.49208, 0.49183, 0.4899, 0.49107, 0.48954, 0.48936, 0.49081, 0.48809, 0.49012, 0.49118, 0.49592, 0.49005, 0.49234, 0.48935, 0.49702, 0.4881, 0.49255, 0.4923, 0.49215, 0.49408, 0.4896, 0.49166, 0.49036, 0.57641, 0.49203, 0.4866, 0.49827, 0.49306, 0.48826, 0.49197, 0.50213, 0.49344, 0.48736, 0.49635, 0.57884, 0.49438, 0.49181, 0.49665, 0.49267, 0.48679, 0.48884, 0.48977, 0.49284, 0.48791, 0.49204, 0.49178, 0.49595, 0.4931, 0.49191, 0.48826, 0.49306, 0.48701, 0.48992, 0.48579, 0.49069, 0.48562, 0.49508, 0.48592, 0.49748, 0.4852, 0.49001, 0.48851, 0.48928, 0.48685, 0.4898, 0.49343, 0.48889, 0.49276, 0.4874, 0.50472, 0.49085, 0.59958, 0.49141, 0.49279, 0.49191, 0.48975, 0.4895, 0.49082, 0.48927, 0.4914, 0.48634, 0.48671, 0.48679, 0.49495, 0.48847, 0.49036, 0.48784, 0.49319, 0.4893, 0.49337, 0.58198, 0.58629, 0.4953, 0.49089, 0.48763, 0.49392, 0.48743, 0.49484, 0.48893, 0.49356, 0.48948, 0.49182, 0.48987, 0.49043, 0.49529, 0.49039, 0.4921, 0.49072, 0.59678, 0.49229, 0.49187, 0.4928, 0.49741, 0.49468, 0.48644, 0.49313, 0.49332, 0.48749, 0.49394, 0.48779, 0.49346, 0.48849, 0.49244, 0.48985, 0.49183, 0.49358, 0.48865, 0.49267, 0.4914, 0.49166, 0.48871, 0.49327, 0.49077, 0.49024, 0.49629, 0.48853, 0.57947, 0.49147, 0.48886, 0.50383, 0.48817, 0.49188, 0.4873, 0.49974, 0.49014, 0.4908, 0.4922, 0.49589, 0.49266, 0.48782, 0.49383, 0.48872, 0.49176, 0.49069, 0.49264, 0.49042, 0.4914, 0.4912, 0.48803, 0.49078, 0.49007, 0.48811, 0.49406, 0.48945, 0.48976, 0.49052, 0.49238, 0.48839, 0.48749, 0.48884, 0.49154, 0.48706, 0.48761, 0.49108, 0.49077, 0.49131, 0.49425, 0.48822, 0.49246, 0.49172, 0.49273, 0.57851, 0.49276, 0.49599, 0.48901, 0.49655, 0.49128, 0.48808, 0.49162, 0.49012, 0.49189, 0.50308, 0.49552, 0.48646]}, "forward-compute-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [9.21276, 0.28687, 0.28815, 0.2833, 0.28439, 0.27844, 0.27842, 0.28317, 0.28459, 
0.28018, 0.29052, 0.27923, 0.27964, 0.27881, 0.28284, 0.27894, 0.2858, 0.28599, 0.29109, 0.28083, 0.28444, 0.28303, 0.2848, 0.27728, 0.28052, 0.2809, 0.27929, 0.2805, 0.28333, 0.27803, 0.3776, 0.27848, 0.27391, 0.28208, 0.29927, 0.30354, 0.28082, 0.28432, 0.28327, 0.28318, 0.28355, 0.28207, 0.28438, 0.28242, 0.28127, 0.28045, 0.37514, 0.2813, 0.28253, 0.28106, 0.28235, 0.28881, 0.28182, 0.28128, 0.28489, 0.28348, 0.2813, 0.28279, 0.29008, 0.29295, 0.28746, 0.2869, 0.28708, 0.28818, 0.28744, 0.28543, 0.28582, 0.28782, 0.28724, 0.28631, 0.28595, 0.28734, 0.2881, 0.28983, 0.2918, 0.48123, 0.28384, 0.28784, 0.28341, 0.28813, 0.28363, 0.29108, 0.2853, 0.28861, 0.28671, 0.29218, 0.28714, 0.29008, 0.28661, 0.29, 0.28895, 0.28724, 0.289, 0.29102, 0.28959, 0.28779, 0.28919, 0.37298, 0.28802, 0.28671, 0.28631, 0.29013, 0.28597, 0.29054, 0.28653, 0.28662, 0.28618, 0.28937, 0.285, 0.28745, 0.28473, 0.2862, 0.28623, 0.28613, 0.28465, 0.28674, 0.2875, 0.2909, 0.28626, 0.37409, 0.49531, 0.29025, 0.28653, 0.28605, 0.284, 0.29546, 0.29024, 0.28506, 0.29074, 0.28487, 0.29199, 0.28427, 0.28721, 0.28569, 0.28978, 0.28671, 0.29019, 0.2858, 0.29107, 0.28549, 0.28872, 0.28587, 0.38328, 0.28744, 0.28899, 0.28716, 0.28682, 0.28652, 0.28709, 0.28668, 0.29569, 0.28914, 0.28688, 0.28981, 0.28508, 0.29181, 0.28828, 0.29083, 0.28368, 0.28892, 0.28472, 0.2903, 0.29275, 0.29136, 0.3738, 0.41333, 0.28566, 0.28691, 0.28887, 0.2879, 0.28701, 0.2905, 0.28746, 0.28816, 0.28899, 0.28753, 0.2884, 0.28928, 0.29105, 0.28699, 0.28797, 0.28497, 0.29203, 0.28489, 0.28827, 0.29119, 0.29128, 0.28793, 0.28557, 0.29143, 0.28602, 0.29322, 0.37776, 0.28815, 0.28911, 0.28768, 0.28978, 0.2868, 0.2925, 0.28589, 0.27191, 0.28653, 0.28666, 0.28333, 0.28729, 0.28057, 0.28965, 0.2861, 0.28679, 0.28928, 0.28452, 0.28737, 0.28913, 0.28511, 0.28745, 0.28832, 0.29349, 0.28729, 0.28924, 0.28804, 0.29076, 0.28598, 0.29056, 0.28869, 0.28825, 0.29164, 0.28711, 0.28995, 0.2878, 0.37312, 0.28833, 0.28482, 0.29549, 0.28742, 0.28591, 0.28649, 0.29968, 0.29157, 0.2854, 0.29423, 0.37624, 0.29269, 0.28871, 0.29189, 0.28756, 0.28409, 0.28672, 0.28672, 0.29028, 0.28554, 0.29097, 0.28867, 0.29335, 0.29036, 0.28781, 0.28622, 0.28846, 0.28532, 0.28399, 0.28365, 0.28792, 0.28385, 0.29346, 0.28436, 0.29447, 0.28249, 0.28597, 0.28637, 0.28537, 0.28417, 0.28799, 0.28802, 0.28653, 0.29059, 0.28295, 0.30255, 0.28676, 0.39524, 0.28938, 0.28909, 0.28993, 0.28689, 0.2868, 0.28486, 0.2869, 0.28468, 0.28373, 0.28395, 0.28399, 0.29311, 0.28649, 0.28867, 0.2844, 0.29111, 0.28595, 0.29083, 0.37422, 0.38481, 0.2917, 0.28795, 0.28411, 0.29214, 0.28545, 0.29182, 0.28619, 0.29032, 0.28643, 0.28955, 0.287, 0.28693, 0.29048, 0.28673, 0.28964, 0.28608, 0.39417, 0.28909, 0.28926, 0.28892, 0.29626, 0.29035, 0.28418, 0.29096, 0.28911, 0.2861, 0.29247, 0.28616, 0.28914, 0.28625, 0.28976, 0.28808, 0.28866, 0.29068, 0.28692, 0.29086, 0.28868, 0.29004, 0.28595, 0.29148, 0.28842, 0.2886, 0.29171, 0.28773, 0.3764, 0.28898, 0.28636, 0.29892, 0.28549, 0.28973, 0.28465, 0.29697, 0.28725, 0.28663, 0.2894, 0.294, 0.29116, 0.28622, 0.29179, 0.28632, 0.29035, 0.28768, 0.28989, 0.28709, 0.2891, 0.28817, 0.28602, 0.28837, 0.28768, 0.28625, 0.28964, 0.28715, 0.287, 0.28748, 0.29025, 0.28485, 0.28473, 0.2867, 0.28777, 0.28402, 0.28515, 0.28793, 0.28644, 0.2893, 0.28758, 0.28612, 0.28687, 0.29012, 0.2871, 0.37328, 0.28876, 0.29273, 0.28732, 0.29333, 0.28722, 0.28605, 0.2878, 0.28786, 0.28733, 0.29635, 0.29189, 0.28435]}, "backward-compute-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, 
"values": [3.24795, 0.21194, 0.21471, 0.20869, 0.21204, 0.20759, 0.20377, 0.2107, 0.20945, 0.20618, 0.21705, 0.20521, 0.20785, 0.20627, 0.20635, 0.2064, 0.20649, 0.21053, 0.21523, 0.20491, 0.20938, 0.20895, 0.21121, 0.20684, 0.20811, 0.20914, 0.20848, 0.20944, 0.21029, 0.2088, 0.20823, 0.20765, 0.20786, 0.21144, 0.20746, 0.20856, 0.20791, 0.20961, 0.20962, 0.20803, 0.20624, 0.20748, 0.20646, 0.20637, 0.20506, 0.20636, 0.20873, 0.20709, 0.21021, 0.20645, 0.20725, 0.21067, 0.20689, 0.20484, 0.21018, 0.20758, 0.20809, 0.20663, 0.21735, 0.22092, 0.2181, 0.21664, 0.21604, 0.21705, 0.21811, 0.2175, 0.21613, 0.21894, 0.2186, 0.21706, 0.21821, 0.21776, 0.22265, 0.21862, 0.2187, 0.21766, 0.21611, 0.217, 0.21459, 0.22041, 0.21715, 0.2188, 0.21633, 0.21946, 0.21474, 0.21906, 0.21831, 0.21662, 0.21778, 0.21777, 0.21604, 0.21593, 0.21431, 0.21926, 0.2178, 0.21741, 0.21712, 0.22133, 0.2158, 0.21733, 0.21522, 0.21854, 0.21582, 0.21924, 0.21532, 0.21807, 0.216, 0.22003, 0.21598, 0.21559, 0.21655, 0.21799, 0.21734, 0.21749, 0.21785, 0.21759, 0.21855, 0.21936, 0.21602, 0.21592, 0.21786, 0.22091, 0.21874, 0.21753, 0.21923, 0.22306, 0.22024, 0.21591, 0.22007, 0.2187, 0.222, 0.2157, 0.22232, 0.21719, 0.22251, 0.21763, 0.22074, 0.21731, 0.21953, 0.21712, 0.22337, 0.22066, 0.22071, 0.21949, 0.21972, 0.21565, 0.21695, 0.22019, 0.21716, 0.219, 0.22553, 0.21923, 0.21738, 0.2203, 0.21678, 0.22028, 0.21797, 0.22029, 0.21479, 0.22065, 0.21605, 0.22109, 0.22372, 0.22023, 0.2184, 0.21646, 0.21673, 0.21835, 0.21624, 0.21877, 0.21593, 0.21993, 0.21906, 0.21748, 0.21846, 0.21846, 0.21773, 0.21782, 0.22154, 0.21764, 0.2193, 0.2172, 0.21983, 0.21556, 0.22293, 0.22107, 0.22132, 0.21857, 0.21717, 0.22128, 0.21593, 0.22043, 0.22094, 0.22038, 0.21956, 0.21936, 0.21966, 0.21754, 0.22141, 0.21803, 0.21648, 0.21739, 0.21902, 0.21686, 0.21805, 0.21493, 0.22077, 0.22186, 0.21962, 0.22048, 0.22052, 0.21855, 0.21913, 0.21681, 0.21996, 0.22012, 0.22218, 0.22009, 0.21986, 0.21939, 0.22266, 0.2163, 0.21865, 0.22182, 0.2197, 0.22192, 0.21676, 0.22102, 0.21734, 0.22013, 0.21984, 0.21564, 0.22434, 0.22271, 0.21673, 0.22212, 0.22818, 0.22064, 0.21733, 0.22214, 0.21857, 0.2223, 0.22007, 0.22387, 0.22019, 0.21548, 0.21818, 0.21601, 0.22079, 0.21586, 0.22149, 0.2206, 0.2192, 0.22065, 0.22097, 0.21714, 0.22179, 0.21621, 0.21994, 0.21491, 0.21991, 0.21504, 0.2197, 0.21388, 0.2201, 0.21487, 0.21828, 0.21636, 0.2175, 0.2155, 0.21587, 0.22018, 0.2151, 0.21983, 0.21588, 0.22793, 0.21875, 0.21694, 0.21987, 0.21989, 0.2186, 0.21826, 0.21718, 0.21971, 0.21741, 0.22031, 0.21565, 0.21643, 0.21559, 0.22115, 0.21694, 0.21849, 0.2154, 0.2201, 0.2167, 0.21944, 0.22561, 0.21402, 0.22049, 0.21782, 0.21537, 0.22116, 0.2162, 0.21949, 0.21494, 0.21795, 0.21647, 0.2181, 0.21867, 0.21751, 0.22266, 0.21692, 0.21888, 0.218, 0.22288, 0.21842, 0.21856, 0.21818, 0.22158, 0.22161, 0.21476, 0.21952, 0.21926, 0.21497, 0.21832, 0.21576, 0.21887, 0.2162, 0.21752, 0.21687, 0.21921, 0.22035, 0.21626, 0.22133, 0.21774, 0.22037, 0.21522, 0.22047, 0.21579, 0.21844, 0.22391, 0.21642, 0.21898, 0.21906, 0.21598, 0.22975, 0.21527, 0.21717, 0.21546, 0.22404, 0.21811, 0.21888, 0.2205, 0.22021, 0.22075, 0.21565, 0.21932, 0.21653, 0.21917, 0.21911, 0.22008, 0.21787, 0.21844, 0.21948, 0.21617, 0.21938, 0.21829, 0.21659, 0.2228, 0.21857, 0.21702, 0.21841, 0.21741, 0.21545, 0.21539, 0.21773, 0.21824, 0.21609, 0.21521, 0.21832, 0.21767, 0.21765, 0.21961, 0.21554, 0.21864, 0.21727, 0.21996, 0.21834, 0.21793, 0.22003, 0.21486, 0.22016, 0.21713, 0.21621, 0.21798, 0.21593, 0.21822, 0.22518, 
0.21883, 0.21389]}, "batch-generator-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.60577, 0.00374, 0.00393, 0.00334, 0.0036, 0.00342, 0.00344, 0.00397, 0.00331, 0.00323, 0.00356, 0.00332, 0.00341, 0.00356, 0.00347, 0.00308, 0.00337, 0.00327, 0.00342, 0.00359, 0.00317, 0.00312, 0.00326, 0.00315, 0.00321, 0.00318, 0.00314, 0.00309, 0.00313, 0.0031, 0.00327, 0.00314, 0.00303, 0.00338, 0.00311, 0.00306, 0.00302, 0.00321, 0.00306, 0.0032, 0.00305, 0.00309, 0.00302, 0.00328, 0.00297, 0.00295, 0.00322, 0.00301, 0.00307, 0.00325, 0.00287, 0.00312, 0.00289, 0.00302, 0.00308, 0.00307, 0.00308, 0.0035, 0.00327, 0.0032, 0.00318, 0.00312, 0.00322, 0.00336, 0.00333, 0.00345, 0.00311, 0.00326, 0.00307, 0.00318, 0.00309, 0.00331, 0.0031, 0.00327, 0.00333, 0.0033, 0.00321, 0.00328, 0.00317, 0.00325, 0.00309, 0.0033, 0.00326, 0.00323, 0.00321, 0.00319, 0.00318, 0.00329, 0.00315, 0.00331, 0.00368, 0.00361, 0.00377, 0.00374, 0.00383, 0.00345, 0.00348, 0.00347, 0.00339, 0.0035, 0.00312, 0.00344, 0.00325, 0.00318, 0.00318, 0.00323, 0.00328, 0.00331, 0.00329, 0.00318, 0.00327, 0.0032, 0.00317, 0.00314, 0.00313, 0.00316, 0.00327, 0.00348, 0.00319, 0.00309, 0.00338, 0.00315, 0.00347, 0.00335, 0.00315, 0.00314, 0.00339, 0.00316, 0.00323, 0.00311, 0.00331, 0.00317, 0.00311, 0.00316, 0.00317, 0.00314, 0.00323, 0.00319, 0.00311, 0.00328, 0.00326, 0.00315, 0.00319, 0.0035, 0.00303, 0.00311, 0.00331, 0.00334, 0.00314, 0.00323, 0.00345, 0.00325, 0.00319, 0.00322, 0.00331, 0.00339, 0.00342, 0.00343, 0.00335, 0.00349, 0.00338, 0.00342, 0.00327, 0.00325, 0.00331, 0.00327, 0.00328, 0.00325, 0.00321, 0.00326, 0.00324, 0.00346, 0.00329, 0.00347, 0.00325, 0.00327, 0.00322, 0.0032, 0.00311, 0.00307, 0.00322, 0.00303, 0.00312, 0.00323, 0.00329, 0.00312, 0.00323, 0.00323, 0.00307, 0.00315, 0.00324, 0.00314, 0.00308, 0.00308, 0.00313, 0.00322, 0.00318, 0.0032, 0.0032, 0.00322, 0.02747, 0.00304, 0.0031, 0.00322, 0.00309, 0.00303, 0.00319, 0.00304, 0.00319, 0.00315, 0.00305, 0.00324, 0.00328, 0.00297, 0.0033, 0.00302, 0.00329, 0.00319, 0.00309, 0.00319, 0.00324, 0.00336, 0.00317, 0.00324, 0.00322, 0.00343, 0.00323, 0.00314, 0.00337, 0.00333, 0.00319, 0.00305, 0.00351, 0.00342, 0.00323, 0.00333, 0.00325, 0.00329, 0.00309, 0.00337, 0.00313, 0.00331, 0.00309, 0.00329, 0.00319, 0.00325, 0.00323, 0.00324, 0.00332, 0.0034, 0.0033, 0.00322, 0.00318, 0.00319, 0.00329, 0.00315, 0.00329, 0.00325, 0.00333, 0.00322, 0.00337, 0.00313, 0.00313, 0.00327, 0.00332, 0.00313, 0.00307, 0.00312, 0.00306, 0.00322, 0.00309, 0.0033, 0.00323, 0.00341, 0.00326, 0.0035, 0.00329, 0.00341, 0.00333, 0.00334, 0.00347, 0.00314, 0.00336, 0.00336, 0.00329, 0.0032, 0.00322, 0.00331, 0.00337, 0.00336, 0.00312, 0.00321, 0.00407, 0.00319, 0.00353, 0.00339, 0.00344, 0.00327, 0.00338, 0.00335, 0.00325, 0.00334, 0.00318, 0.00329, 0.00329, 0.00323, 0.00318, 0.00325, 0.00322, 0.00317, 0.00327, 0.00307, 0.00322, 0.00305, 0.00323, 0.00318, 0.00328, 0.00317, 0.00326, 0.00313, 0.00312, 0.00317, 0.00319, 0.00322, 0.00326, 0.00311, 0.00318, 0.00349, 0.00314, 0.00329, 0.00324, 0.00339, 0.0031, 0.00326, 0.00308, 0.00316, 0.0031, 0.0034, 0.00318, 0.00327, 0.00321, 0.00313, 0.00335, 0.00311, 0.00333, 0.00329, 0.0031, 0.00325, 0.00325, 0.00326, 0.0033, 0.00323, 0.00315, 0.00321, 0.00322, 0.003, 0.00355, 0.00301, 0.00302, 0.00319, 0.00323, 0.0032, 0.00321, 0.0031, 0.00344, 0.00317, 0.0033, 0.00322, 0.00317, 0.00318, 0.00314, 0.00328, 0.0033, 0.0033, 0.0031, 0.00321, 0.0033, 0.00315, 0.00323, 0.00342, 0.00315, 0.00321, 0.00324, 0.00312, 0.00341, 0.00323, 
0.00333, 0.00335, 0.00334, 0.00324, 0.00319, 0.00335, 0.00319, 0.0032, 0.00317, 0.0033, 0.00322, 0.00334, 0.0034, 0.00306]}, "layernorm-grads-all-reduce-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [3e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05]}, "embedding-grads-all-reduce-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [4e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 3e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 3e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 
2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05]}, "all-grads-sync-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [6.03213, 0.0015, 0.00156, 0.00153, 0.00152, 0.00153, 0.00156, 0.00153, 0.00152, 0.00153, 0.00155, 0.00152, 0.00157, 0.00153, 0.00155, 0.00153, 0.00153, 0.00151, 0.00155, 0.00153, 0.00154, 0.00152, 0.00154, 0.00153, 0.00155, 0.00154, 0.00154, 0.00154, 0.00154, 0.00153, 0.00156, 0.00152, 0.00152, 0.00153, 0.00156, 0.00153, 0.00153, 0.00155, 0.00153, 0.00152, 0.00154, 0.00155, 0.00155, 0.00152, 0.00152, 0.00153, 0.00154, 0.00153, 0.00154, 0.00152, 0.00154, 0.00154, 0.00155, 0.00153, 0.00156, 0.00154, 0.00156, 0.00153, 0.00156, 0.00151, 0.00154, 0.00153, 0.00156, 0.00151, 0.00156, 0.00155, 0.00155, 0.00152, 0.00155, 0.00152, 0.00154, 0.00153, 0.00156, 0.00153, 0.00154, 0.00154, 0.00156, 0.00154, 0.00155, 0.00155, 0.00155, 0.00153, 0.00154, 0.00152, 0.00155, 0.00154, 0.00156, 0.00153, 0.00153, 0.00153, 0.00155, 0.00154, 0.00155, 0.00153, 0.00154, 0.00153, 0.00155, 0.00153, 0.00154, 0.00152, 0.00155, 0.00152, 0.00155, 0.00154, 0.00155, 0.00154, 0.00155, 0.00153, 0.00154, 0.00152, 0.00155, 0.00153, 0.00153, 0.00154, 0.00154, 0.00151, 0.00155, 0.00153, 0.00156, 0.00153, 
0.00155, 0.00154, 0.00156, 0.00156, 0.00155, 0.00154, 0.00155, 0.00153, 0.00152, 0.00153, 0.00155, 0.00154, 0.00155, 0.00154, 0.00154, 0.00154, 0.00155, 0.00151, 0.00152, 0.00153, 0.00153, 0.00151, 0.00153, 0.00154, 0.00156, 0.00155, 0.00157, 0.00154, 0.00156, 0.00154, 0.00155, 0.00151, 0.00154, 0.00153, 0.00154, 0.00153, 0.00156, 0.00155, 0.00155, 0.00152, 0.00157, 0.00153, 0.00154, 0.00154, 0.00155, 0.00154, 0.00151, 0.00154, 0.00155, 0.00152, 0.00155, 0.00152, 0.00156, 0.00153, 0.00153, 0.00155, 0.00154, 0.00153, 0.00154, 0.00152, 0.00154, 0.00155, 0.00154, 0.00152, 0.00157, 0.00154, 0.00154, 0.00152, 0.00155, 0.00152, 0.00157, 0.00152, 0.00154, 0.00153, 0.00156, 0.00153, 0.00156, 0.00154, 0.00156, 0.00153, 0.00154, 0.00153, 0.00157, 0.00155, 0.00154, 0.00156, 0.00154, 0.00153, 0.00151, 0.00156, 0.00156, 0.00155, 0.00155, 0.00154, 0.00155, 0.00154, 0.00155, 0.00152, 0.00154, 0.00154, 0.00154, 0.00156, 0.00157, 0.00154, 0.00155, 0.00155, 0.00153, 0.00153, 0.00154, 0.00155, 0.00155, 0.00155, 0.00155, 0.00154, 0.00154, 0.00154, 0.00154, 0.00153, 0.00154, 0.00154, 0.00154, 0.00154, 0.00155, 0.00154, 0.00156, 0.00156, 0.00154, 0.00155, 0.00153, 0.00155, 0.00152, 0.00156, 0.00154, 0.00156, 0.00156, 0.00152, 0.00154, 0.00153, 0.00153, 0.00155, 0.00154, 0.00157, 0.00154, 0.00153, 0.00157, 0.00155, 0.00156, 0.00155, 0.00157, 0.00155, 0.00155, 0.00153, 0.00156, 0.00158, 0.00155, 0.00155, 0.00157, 0.00153, 0.00155, 0.00154, 0.00155, 0.00153, 0.00155, 0.00155, 0.00154, 0.00151, 0.00154, 0.00156, 0.00156, 0.00155, 0.00155, 0.00155, 0.00155, 0.00153, 0.00155, 0.00156, 0.00154, 0.00155, 0.00153, 0.00155, 0.00155, 0.00153, 0.00154, 0.00154, 0.00156, 0.00156, 0.00155, 0.00155, 0.00154, 0.00153, 0.00155, 0.00155, 0.00155, 0.00154, 0.00153, 0.00154, 0.00154, 0.00155, 0.00156, 0.00156, 0.00156, 0.00156, 0.00156, 0.00156, 0.00155, 0.00155, 0.00154, 0.00156, 0.00154, 0.00156, 0.00155, 0.00154, 0.00156, 0.00154, 0.00153, 0.00155, 0.00152, 0.00156, 0.00151, 0.00155, 0.00154, 0.00155, 0.00155, 0.00156, 0.00153, 0.00155, 0.00154, 0.00156, 0.00154, 0.00154, 0.00154, 0.00155, 0.00155, 0.00155, 0.00153, 0.00155, 0.00154, 0.00154, 0.00155, 0.00156, 0.00153, 0.00153, 0.00154, 0.00155, 0.00153, 0.00154, 0.00155, 0.00154, 0.00154, 0.00155, 0.00155, 0.00155, 0.00153, 0.00155, 0.00154, 0.00157, 0.00156, 0.00153, 0.00157, 0.00157, 0.00156, 0.00157, 0.00154, 0.00155, 0.00157, 0.00155, 0.00155, 0.00153, 0.00153, 0.00152, 0.00154, 0.00155, 0.00155, 0.00154, 0.00153, 0.00155, 0.00154, 0.00155, 0.00155, 0.00155]}, "optimizer-copy-to-main-grad-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.00038, 0.00024, 0.00024, 0.00015, 0.00015, 0.00016, 0.00015, 0.00016, 0.00015, 0.00013, 0.00013, 0.00015, 0.00015, 0.00013, 0.00015, 0.00013, 0.00015, 0.00013, 0.00015, 0.00015, 0.00013, 0.00015, 0.00013, 0.00015, 0.00013, 0.00014, 0.00013, 0.00013, 0.00015, 0.00013, 0.00015, 0.00013, 0.00013, 0.00013, 0.00013, 0.00013, 0.00013, 0.00013, 0.00014, 0.00013, 0.00013, 0.00013, 0.00013, 0.00013, 0.00013, 0.00013, 0.00016, 0.00013, 0.00013, 0.00013, 0.00015, 0.00013, 0.00013, 0.00013, 0.00013, 0.00013, 0.00013, 0.00025, 0.00018, 0.00018, 0.00019, 0.00018, 0.0003, 0.00018, 0.00019, 0.00018, 0.00018, 0.00018, 0.00018, 0.00018, 0.00019, 0.00018, 0.00019, 0.00019, 0.00018, 0.00019, 0.00019, 0.00019, 0.00018, 0.00019, 0.00019, 0.00019, 0.00021, 0.00018, 0.00021, 0.00018, 0.00019, 0.00019, 0.00018, 0.00018, 0.00018, 0.0002, 0.00019, 0.00018, 0.00018, 0.00018, 0.00019, 0.00021, 0.00018, 0.00019, 0.00019, 0.00018, 0.00018, 
0.00021, 0.00018, 0.00019, 0.00019, 0.00019, 0.00018, 0.00021, 0.00018, 0.00018, 0.00018, 0.0002, 0.00018, 0.00021, 0.00019, 0.00018, 0.00018, 0.0002, 0.00023, 0.00018, 0.00018, 0.0002, 0.00018, 0.00018, 0.00018, 0.00018, 0.00018, 0.00021, 0.00018, 0.00018, 0.00019, 0.00021, 0.00018, 0.00018, 0.00021, 0.00018, 0.0002, 0.00018, 0.00018, 0.00021, 0.00018, 0.00019, 0.00019, 0.00018, 0.00018, 0.00018, 0.00018, 0.00019, 0.00018, 0.00018, 0.00018, 0.00019, 0.00018, 0.0002, 0.00021, 0.00019, 0.00018, 0.00021, 0.00021, 0.00018, 0.00019, 0.00019, 0.00018, 0.00019, 0.00018, 0.00018, 0.00018, 0.00021, 0.00018, 0.00018, 0.00018, 0.00019, 0.00018, 0.00018, 0.00021, 0.00018, 0.00019, 0.00021, 0.00018, 0.00018, 0.00021, 0.00018, 0.00021, 0.00018, 0.00018, 0.00019, 0.00021, 0.00021, 0.00021, 0.00021, 0.00018, 0.00018, 0.00019, 0.00019, 0.00018, 0.0002, 0.00021, 0.00021, 0.0002, 0.00018, 0.00019, 0.00019, 0.00018, 0.00018, 0.00018, 0.00021, 0.00019, 0.00018, 0.00019, 0.00018, 0.00018, 0.00018, 0.00018, 0.00021, 0.00018, 0.00021, 0.00018, 0.00018, 0.00018, 0.00021, 0.00018, 0.00018, 0.00021, 0.00018, 0.00018, 0.00019, 0.00018, 0.00018, 0.00018, 0.00019, 0.00018, 0.00018, 0.00018, 0.00021, 0.00018, 0.00018, 0.00018, 0.00018, 0.00018, 0.00018, 0.00018, 0.00021, 0.00018, 0.00018, 0.00018, 0.00018, 0.00019, 0.00018, 0.00021, 0.00018, 0.00018, 0.00021, 0.00018, 0.00018, 0.00018, 0.00018, 0.00018, 0.00018, 0.0002, 0.00018, 0.00018, 0.00018, 0.00018, 0.00018, 0.00021, 0.00021, 0.00018, 0.0002, 0.00018, 0.00018, 0.00018, 0.00021, 0.00019, 0.00018, 0.00018, 0.00018, 0.00018, 0.00019, 0.00019, 0.00018, 0.00018, 0.00018, 0.00018, 0.0002, 0.00018, 0.00018, 0.00019, 0.00018, 0.00019, 0.00019, 0.00018, 0.00018, 0.00018, 0.00018, 0.00021, 0.00018, 0.00018, 0.00021, 0.00019, 0.00018, 0.00018, 0.00018, 0.00021, 0.00018, 0.00018, 0.00018, 0.00018, 0.00021, 0.00018, 0.00021, 0.00018, 0.00021, 0.00018, 0.00018, 0.00018, 0.00019, 0.00018, 0.00018, 0.00018, 0.00018, 0.00019, 0.00018, 0.00018, 0.00018, 0.00018, 0.00018, 0.00018, 0.00018, 0.00018, 0.00021, 0.00018, 0.00019, 0.00018, 0.00018, 0.00018, 0.00018, 0.00018, 0.00018, 0.00018, 0.00018, 0.00018, 0.00021, 0.00019, 0.00018, 0.00018, 0.00019, 0.00021, 0.00019, 0.00021, 0.00018, 0.00018, 0.00018, 0.00018, 0.00018, 0.00018, 0.00018, 0.00018, 0.00021, 0.00018, 0.00018, 0.00018, 0.00018, 0.00018, 0.00019, 0.00018, 0.00018, 0.00018, 0.00018, 0.00018, 0.00021, 0.00018, 0.00018, 0.00021, 0.00019, 0.00019, 0.00019, 0.00021, 0.00023, 0.00018, 0.00021, 0.00019, 0.00018, 0.00021, 0.00019, 0.00019, 0.00019, 0.00019, 0.00018, 0.00019, 0.00019, 0.00018, 0.00019, 0.00018, 0.00019, 0.00018, 0.00022, 0.00021, 0.00018]}, "optimizer-clip-main-grad-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.62631, 0.00104, 0.00106, 0.00093, 0.00092, 0.00096, 0.00095, 0.00096, 0.00092, 0.00091, 0.0009, 0.00091, 0.00101, 0.00091, 0.00091, 0.0009, 0.0009, 0.0009, 0.00093, 0.00094, 0.0009, 0.00115, 0.0009, 0.00092, 0.00091, 0.00098, 0.00089, 0.00091, 0.00091, 0.0009, 0.00094, 0.0009, 0.00095, 0.00091, 0.00091, 0.0009, 0.0009, 0.00091, 0.00091, 0.00091, 0.00091, 0.00091, 0.00091, 0.00091, 0.00092, 0.0009, 0.00093, 0.00093, 0.00091, 0.00091, 0.00101, 0.00091, 0.0009, 0.0009, 0.0009, 0.00091, 0.00091, 0.00107, 0.00099, 0.001, 0.00101, 0.001, 0.00179, 0.001, 0.001, 0.00101, 0.0011, 0.00101, 0.001, 0.00101, 0.00101, 0.00101, 0.001, 0.001, 0.00101, 0.00109, 0.00106, 0.001, 0.001, 0.00102, 0.00101, 0.00102, 0.00109, 0.00101, 0.00104, 0.001, 0.00099, 0.00103, 0.00102, 0.001, 0.001, 0.00113, 
0.00082, 0.00079, 0.0008, 0.001, 0.00102, 0.00105, 0.001, 0.001, 0.001, 0.00102, 0.00079, 0.00105, 0.00079, 0.00106, 0.0008, 0.00079, 0.00099, 0.00087, 0.00101, 0.0008, 0.00099, 0.00086, 0.00101, 0.00083, 0.00081, 0.001, 0.0008, 0.001, 0.00085, 0.00081, 0.001, 0.00079, 0.001, 0.00101, 0.001, 0.00079, 0.001, 0.00106, 0.001, 0.001, 0.00103, 0.00104, 0.00079, 0.00101, 0.00084, 0.00079, 0.0008, 0.0008, 0.00109, 0.00105, 0.00099, 0.0008, 0.00101, 0.00101, 0.00102, 0.00102, 0.0008, 0.00079, 0.00111, 0.00101, 0.00099, 0.0008, 0.001, 0.00108, 0.00107, 0.00103, 0.00103, 0.00084, 0.00105, 0.001, 0.00101, 0.001, 0.00101, 0.00101, 0.001, 0.00101, 0.00101, 0.00114, 0.00099, 0.0008, 0.00079, 0.00101, 0.001, 0.001, 0.00105, 0.00101, 0.001, 0.00113, 0.00101, 0.001, 0.00106, 0.001, 0.00101, 0.00101, 0.00101, 0.00101, 0.00106, 0.00105, 0.00107, 0.00106, 0.00102, 0.001, 0.00104, 0.00101, 0.00105, 0.001, 0.00104, 0.00105, 0.00104, 0.00103, 0.001, 0.001, 0.001, 0.00109, 0.00101, 0.00104, 0.001, 0.00108, 0.00108, 0.001, 0.00101, 0.001, 0.00103, 0.00106, 0.00102, 0.00106, 0.00102, 0.00099, 0.00101, 0.00105, 0.00104, 0.00101, 0.00105, 0.00102, 0.00103, 0.00102, 0.001, 0.001, 0.00104, 0.001, 0.00101, 0.00101, 0.001, 0.00105, 0.00101, 0.00107, 0.00102, 0.001, 0.00101, 0.00101, 0.00101, 0.00108, 0.00101, 0.001, 0.00106, 0.00101, 0.001, 0.001, 0.00105, 0.00101, 0.00116, 0.00112, 0.00101, 0.001, 0.00103, 0.00101, 0.00103, 0.00101, 0.00105, 0.00103, 0.00102, 0.001, 0.00101, 0.001, 0.00108, 0.00108, 0.00101, 0.00106, 0.00109, 0.00106, 0.00102, 0.00104, 0.001, 0.001, 0.00099, 0.00101, 0.00101, 0.001, 0.001, 0.001, 0.00102, 0.00105, 0.001, 0.00103, 0.00103, 0.001, 0.00101, 0.001, 0.00107, 0.00101, 0.001, 0.001, 0.00102, 0.001, 0.00111, 0.001, 0.00102, 0.00104, 0.00099, 0.001, 0.00101, 0.00101, 0.00105, 0.00101, 0.001, 0.00101, 0.00107, 0.00113, 0.00103, 0.00105, 0.00102, 0.00105, 0.00101, 0.00101, 0.00102, 0.001, 0.00101, 0.00103, 0.001, 0.00102, 0.00108, 0.00103, 0.00103, 0.00101, 0.00104, 0.001, 0.00103, 0.00101, 0.00107, 0.00106, 0.00099, 0.00103, 0.00102, 0.00101, 0.00102, 0.001, 0.00101, 0.00101, 0.00102, 0.001, 0.00101, 0.0011, 0.00101, 0.001, 0.00101, 0.001, 0.00108, 0.001, 0.0011, 0.00108, 0.00101, 0.001, 0.00102, 0.00102, 0.00101, 0.001, 0.00102, 0.00108, 0.00101, 0.00103, 0.001, 0.00101, 0.00101, 0.001, 0.00109, 0.001, 0.001, 0.00105, 0.00101, 0.00105, 0.001, 0.00102, 0.0011, 0.00103, 0.00103, 0.00102, 0.00106, 0.00104, 0.00104, 0.00107, 0.00101, 0.001, 0.00111, 0.00102, 0.00101, 0.00103, 0.00101, 0.00102, 0.001, 0.00102, 0.00103, 0.00101, 0.00101, 0.0011, 0.001, 0.00105, 0.00106, 0.00101]}, "optimizer-count-zeros-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.00488, 0.00438, 0.00439, 0.00461, 0.00443, 0.0046, 0.00465, 0.00446, 0.00441, 0.00439, 0.00443, 0.0044, 0.00516, 0.00445, 0.0044, 0.0044, 0.00439, 0.0044, 0.0044, 0.00441, 0.00443, 0.00441, 0.00443, 0.00439, 0.00443, 0.0051, 0.0044, 0.00439, 0.00443, 0.00441, 0.0044, 0.00438, 0.00442, 0.00442, 0.00442, 0.00442, 0.00443, 0.0044, 0.00442, 0.00439, 0.0045, 0.00441, 0.00439, 0.00439, 0.0044, 0.00441, 0.00438, 0.00441, 0.00441, 0.0044, 0.00485, 0.00441, 0.00442, 0.00439, 0.0044, 0.00438, 0.00445, 0.00462, 0.00437, 0.00439, 0.0044, 0.00439, 0.0044, 0.00442, 0.00439, 0.00441, 0.00442, 0.00439, 0.00439, 0.00439, 0.00442, 0.0044, 0.00439, 0.00441, 0.00438, 0.00523, 0.00508, 0.00442, 0.00437, 0.00496, 0.00442, 0.00437, 0.00556, 0.00439, 0.00438, 0.00443, 0.00439, 0.0044, 0.00439, 0.00442, 0.00441, 0.0052, 0.00441, 0.00441, 0.00438, 
0.00444, 0.00441, 0.0044, 0.00441, 0.00439, 0.00443, 0.00439, 0.00438, 0.00443, 0.0044, 0.00439, 0.00442, 0.00443, 0.00439, 0.00439, 0.00441, 0.00441, 0.0044, 0.00544, 0.00439, 0.0044, 0.0044, 0.00442, 0.00441, 0.00438, 0.00439, 0.00441, 0.00442, 0.00439, 0.00438, 0.00441, 0.00442, 0.0044, 0.0044, 0.00441, 0.00436, 0.0044, 0.00438, 0.00442, 0.00442, 0.00442, 0.00444, 0.00442, 0.00441, 0.0044, 0.00439, 0.00439, 0.00439, 0.00441, 0.00441, 0.00443, 0.00439, 0.00439, 0.00439, 0.00439, 0.00438, 0.0044, 0.00439, 0.00441, 0.00441, 0.00481, 0.00443, 0.0044, 0.0044, 0.00442, 0.0044, 0.00439, 0.0044, 0.00438, 0.00454, 0.0044, 0.00439, 0.0044, 0.00439, 0.0044, 0.0044, 0.00438, 0.00441, 0.00437, 0.00439, 0.0044, 0.00441, 0.00438, 0.00441, 0.00439, 0.00441, 0.00442, 0.0044, 0.00439, 0.00438, 0.00441, 0.00439, 0.00441, 0.0044, 0.0044, 0.0044, 0.00439, 0.0044, 0.00442, 0.00467, 0.00439, 0.0044, 0.0044, 0.00442, 0.00441, 0.00442, 0.0044, 0.00442, 0.00442, 0.00441, 0.00509, 0.00443, 0.0044, 0.00442, 0.00438, 0.00487, 0.00531, 0.00442, 0.00442, 0.00442, 0.00442, 0.00441, 0.00439, 0.00441, 0.0044, 0.00439, 0.0044, 0.00441, 0.00439, 0.00439, 0.0044, 0.0044, 0.00439, 0.00443, 0.00441, 0.00454, 0.00439, 0.00441, 0.0044, 0.00441, 0.00439, 0.00441, 0.00442, 0.0044, 0.00441, 0.00438, 0.0044, 0.00439, 0.0044, 0.0044, 0.00442, 0.0044, 0.0044, 0.0044, 0.00438, 0.0044, 0.0044, 0.0044, 0.0044, 0.0044, 0.00441, 0.00441, 0.0044, 0.00442, 0.0044, 0.00439, 0.00439, 0.00439, 0.00439, 0.00439, 0.0044, 0.00442, 0.00441, 0.00439, 0.00443, 0.00439, 0.0044, 0.0044, 0.00439, 0.0044, 0.0044, 0.00441, 0.0044, 0.00438, 0.00441, 0.00442, 0.0044, 0.00439, 0.00443, 0.00534, 0.00438, 0.00442, 0.0044, 0.0044, 0.00441, 0.00495, 0.00439, 0.00441, 0.00438, 0.00441, 0.00441, 0.0044, 0.00437, 0.00441, 0.00439, 0.0044, 0.00442, 0.0044, 0.00442, 0.00439, 0.00437, 0.00441, 0.0044, 0.00439, 0.0044, 0.00457, 0.00441, 0.00441, 0.00442, 0.00441, 0.00443, 0.00439, 0.00443, 0.00439, 0.00439, 0.00439, 0.00441, 0.00486, 0.00439, 0.00441, 0.00441, 0.00453, 0.0044, 0.00437, 0.00441, 0.0044, 0.00442, 0.0044, 0.00442, 0.00441, 0.00441, 0.00439, 0.00439, 0.00441, 0.00438, 0.0044, 0.00442, 0.00443, 0.0044, 0.0044, 0.00442, 0.00441, 0.00439, 0.00442, 0.00441, 0.0044, 0.00439, 0.00438, 0.00439, 0.00442, 0.00439, 0.00441, 0.00439, 0.0044, 0.00441, 0.0044, 0.00442, 0.00443, 0.0044, 0.00438, 0.0044, 0.00439, 0.00444, 0.00439, 0.00442, 0.0044, 0.00439, 0.00441, 0.00439, 0.00442, 0.00439, 0.00438, 0.00439, 0.00438, 0.0044, 0.00442, 0.0044, 0.00438, 0.00442, 0.00443, 0.0044, 0.0044, 0.00439, 0.00441, 0.00439, 0.0044, 0.00444, 0.00455, 0.00442, 0.00443, 0.00441, 0.00442, 0.00442, 0.00443, 0.0044]}, "optimizer-inner-step-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.00313, 0.00096, 0.00097, 0.00093, 0.00094, 0.00094, 0.00094, 0.00093, 0.00093, 0.00093, 0.00094, 0.00093, 0.00094, 0.00092, 0.00093, 0.00092, 0.00094, 0.00092, 0.00093, 0.00093, 0.00093, 0.00092, 0.00092, 0.00092, 0.00092, 0.00094, 0.00092, 0.00093, 0.00093, 0.00092, 0.00092, 0.00092, 0.00092, 0.00093, 0.00092, 0.00092, 0.00092, 0.00099, 0.00092, 0.00093, 0.00094, 0.00093, 0.00092, 0.00092, 0.00092, 0.00092, 0.00093, 0.00093, 0.00093, 0.00093, 0.00094, 0.00092, 0.00092, 0.00092, 0.00092, 0.00092, 0.00092, 0.00096, 0.00093, 0.00093, 0.00092, 0.00093, 0.00092, 0.00092, 0.00092, 0.00094, 0.00093, 0.00092, 0.00093, 0.00092, 0.00093, 0.00093, 0.00092, 0.00093, 0.00092, 0.00097, 0.00095, 0.00092, 0.00093, 0.00093, 0.00092, 0.00099, 0.00095, 0.00093, 0.00094, 0.00093, 0.00092, 
0.00093, 0.00092, 0.00093, 0.00094, 0.00095, 0.00093, 0.00093, 0.00093, 0.00092, 0.00092, 0.00093, 0.00093, 0.00092, 0.00093, 0.00093, 0.00093, 0.00094, 0.00095, 0.00092, 0.00093, 0.00093, 0.00093, 0.00093, 0.00093, 0.00093, 0.00093, 0.00093, 0.00092, 0.00093, 0.00092, 0.00092, 0.00093, 0.00092, 0.00094, 0.00093, 0.00093, 0.00092, 0.00092, 0.00093, 0.00092, 0.00093, 0.00093, 0.00093, 0.00092, 0.00092, 0.00093, 0.00093, 0.00092, 0.00092, 0.00094, 0.00093, 0.00092, 0.00093, 0.00094, 0.00094, 0.00092, 0.00093, 0.00092, 0.00093, 0.00092, 0.00093, 0.00092, 0.00093, 0.00093, 0.00092, 0.00092, 0.00093, 0.00092, 0.00093, 0.00094, 0.00093, 0.00092, 0.00094, 0.00093, 0.00093, 0.00092, 0.00093, 0.00095, 0.00093, 0.00092, 0.00092, 0.00093, 0.00094, 0.00092, 0.00092, 0.00093, 0.00093, 0.00093, 0.00092, 0.00094, 0.00094, 0.00092, 0.00094, 0.00092, 0.00093, 0.00093, 0.00092, 0.00093, 0.00092, 0.00093, 0.00092, 0.00092, 0.00094, 0.00093, 0.00093, 0.00092, 0.00092, 0.00093, 0.00092, 0.00092, 0.00093, 0.00094, 0.00093, 0.00094, 0.00093, 0.00093, 0.00093, 0.00093, 0.00095, 0.00092, 0.00093, 0.00093, 0.00093, 0.00094, 0.00093, 0.00092, 0.00094, 0.00093, 0.00095, 0.00094, 0.00094, 0.00092, 0.00093, 0.00093, 0.00094, 0.00093, 0.00093, 0.00094, 0.00092, 0.00093, 0.00092, 0.00093, 0.00093, 0.00093, 0.00093, 0.00097, 0.00093, 0.00092, 0.00094, 0.00092, 0.00092, 0.00093, 0.00093, 0.00093, 0.00093, 0.00093, 0.00094, 0.00093, 0.00093, 0.00092, 0.00093, 0.00093, 0.00093, 0.00094, 0.00093, 0.00093, 0.00093, 0.00094, 0.00093, 0.00094, 0.00094, 0.00093, 0.00092, 0.00093, 0.00093, 0.00093, 0.00093, 0.00093, 0.00093, 0.00093, 0.00093, 0.00093, 0.00093, 0.00093, 0.00093, 0.00093, 0.00094, 0.00093, 0.00094, 0.00092, 0.00094, 0.00093, 0.00092, 0.00092, 0.00093, 0.00093, 0.00093, 0.00093, 0.00093, 0.00093, 0.00093, 0.00092, 0.00092, 0.00095, 0.00093, 0.00093, 0.00093, 0.00093, 0.00092, 0.00094, 0.00093, 0.00092, 0.00094, 0.00093, 0.00093, 0.00093, 0.00092, 0.00094, 0.00093, 0.00093, 0.00092, 0.00093, 0.00094, 0.00093, 0.00093, 0.00092, 0.00093, 0.00093, 0.00093, 0.00093, 0.00093, 0.00094, 0.00092, 0.00092, 0.00093, 0.00094, 0.00093, 0.00094, 0.00093, 0.00093, 0.00093, 0.00093, 0.00093, 0.00093, 0.00093, 0.00093, 0.00093, 0.00093, 0.00093, 0.00093, 0.00092, 0.00093, 0.00093, 0.00093, 0.00093, 0.00093, 0.00093, 0.00093, 0.00094, 0.00093, 0.00092, 0.00094, 0.00093, 0.00093, 0.00093, 0.00093, 0.00093, 0.00093, 0.00092, 0.00092, 0.00092, 0.00092, 0.00093, 0.00093, 0.00093, 0.00093, 0.00092, 0.00093, 0.00093, 0.00094, 0.00094, 0.00093, 0.00093, 0.00093, 0.00094, 0.00092, 0.00093, 0.00093, 0.00094, 0.00093, 0.00093, 0.00093, 0.00093, 0.00093, 0.00094, 0.00094, 0.00093, 0.00094, 0.00095, 0.00093, 0.00093, 0.00093, 0.00093, 0.00093, 0.00093, 0.00096, 0.00093, 0.00093, 0.00093, 0.00093, 0.00094, 0.00094, 0.00094]}, "optimizer-copy-main-to-model-params-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.0012, 0.001, 0.00119, 0.00096, 0.00096, 0.00096, 0.00097, 0.00096, 0.00096, 0.00096, 0.00095, 0.00096, 0.00097, 0.00095, 0.00096, 0.00096, 0.00096, 0.00096, 0.00096, 0.00096, 0.00097, 0.00096, 0.00096, 0.00095, 0.00096, 0.00097, 0.00096, 0.00095, 0.00096, 0.00096, 0.00096, 0.00096, 0.00096, 0.00095, 0.00095, 0.00095, 0.00096, 0.00104, 0.00096, 0.00095, 0.00097, 0.00095, 0.00096, 0.00096, 0.00096, 0.00096, 0.00096, 0.00095, 0.00096, 0.00096, 0.00097, 0.00096, 0.00096, 0.00095, 0.00096, 0.00095, 0.00096, 0.001, 0.00099, 0.00099, 0.00099, 0.00098, 0.00099, 0.00099, 0.00098, 0.00098, 0.00098, 0.00099, 0.00099, 0.00099, 
0.00099, 0.00099, 0.00099, 0.00099, 0.00099, 0.00101, 0.001, 0.00099, 0.001, 0.00098, 0.00099, 0.00099, 0.001, 0.00099, 0.001, 0.00099, 0.00098, 0.00098, 0.00099, 0.00099, 0.00098, 0.00103, 0.00099, 0.00099, 0.00099, 0.001, 0.001, 0.001, 0.00099, 0.00099, 0.001, 0.00099, 0.00099, 0.00101, 0.00099, 0.00099, 0.00099, 0.00099, 0.001, 0.00103, 0.00099, 0.00099, 0.00099, 0.001, 0.00099, 0.001, 0.00099, 0.00099, 0.00099, 0.001, 0.00101, 0.00099, 0.00099, 0.00099, 0.00099, 0.00099, 0.00099, 0.00099, 0.00098, 0.001, 0.00099, 0.00099, 0.00099, 0.001, 0.00099, 0.00099, 0.00099, 0.00099, 0.00099, 0.00099, 0.00099, 0.001, 0.00099, 0.00099, 0.00099, 0.00098, 0.00099, 0.00099, 0.00099, 0.00099, 0.00098, 0.00098, 0.00099, 0.00098, 0.00098, 0.001, 0.001, 0.001, 0.00099, 0.001, 0.001, 0.00099, 0.00099, 0.00099, 0.00101, 0.00099, 0.00099, 0.00099, 0.00099, 0.00101, 0.001, 0.00099, 0.00099, 0.00098, 0.00099, 0.00099, 0.001, 0.00099, 0.00098, 0.00099, 0.00099, 0.00099, 0.001, 0.00099, 0.00102, 0.00099, 0.00099, 0.00098, 0.001, 0.00099, 0.00099, 0.001, 0.00099, 0.00098, 0.00099, 0.00099, 0.00099, 0.00099, 0.00101, 0.001, 0.00101, 0.00099, 0.00099, 0.00099, 0.00099, 0.00101, 0.00099, 0.00101, 0.00099, 0.00099, 0.00099, 0.00101, 0.00099, 0.00099, 0.00099, 0.001, 0.00098, 0.001, 0.00099, 0.001, 0.00099, 0.00101, 0.00099, 0.00099, 0.001, 0.00099, 0.001, 0.00101, 0.00099, 0.001, 0.00098, 0.00099, 0.00105, 0.00099, 0.00099, 0.00101, 0.00099, 0.00099, 0.00099, 0.00099, 0.00099, 0.001, 0.00099, 0.00102, 0.00098, 0.00098, 0.00099, 0.001, 0.00099, 0.001, 0.001, 0.001, 0.00098, 0.00101, 0.00099, 0.001, 0.00098, 0.00099, 0.001, 0.00099, 0.001, 0.00099, 0.00099, 0.00099, 0.00099, 0.00099, 0.00101, 0.001, 0.00098, 0.00101, 0.00099, 0.00098, 0.00099, 0.00103, 0.00098, 0.00099, 0.00099, 0.001, 0.00098, 0.001, 0.00099, 0.00099, 0.00099, 0.00099, 0.00099, 0.00106, 0.00099, 0.00099, 0.00099, 0.00099, 0.00101, 0.001, 0.00099, 0.00099, 0.00099, 0.00099, 0.00101, 0.001, 0.00099, 0.001, 0.001, 0.001, 0.00098, 0.00099, 0.001, 0.00099, 0.00099, 0.00099, 0.00099, 0.001, 0.00099, 0.001, 0.00099, 0.001, 0.00099, 0.00099, 0.00099, 0.00099, 0.001, 0.00099, 0.001, 0.00099, 0.001, 0.00099, 0.00099, 0.00099, 0.00099, 0.00099, 0.00098, 0.00099, 0.00099, 0.001, 0.00099, 0.001, 0.00099, 0.001, 0.00101, 0.00099, 0.00099, 0.00099, 0.00099, 0.00099, 0.001, 0.001, 0.00101, 0.00099, 0.00099, 0.00099, 0.001, 0.001, 0.001, 0.001, 0.00099, 0.00099, 0.00099, 0.00099, 0.00099, 0.001, 0.001, 0.00101, 0.001, 0.00099, 0.00099, 0.00099, 0.001, 0.00099, 0.00099, 0.00099, 0.00099, 0.00098, 0.00099, 0.001, 0.00099, 0.00099, 0.00099, 0.001, 0.00099, 0.00099, 0.001, 0.001, 0.00099, 0.00099, 0.00099, 0.00099, 0.001, 0.00099, 0.001, 0.001, 0.00099, 0.00099, 0.00099, 0.001, 0.001, 0.001, 0.001, 0.00099, 0.00099, 0.001, 0.00101, 0.00099]}, "optimizer-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.63786, 0.00795, 0.00821, 0.00789, 0.00772, 0.00795, 0.00797, 0.00777, 0.00768, 0.00764, 0.00767, 0.00766, 0.0086, 0.00767, 0.00766, 0.00763, 0.00766, 0.00763, 0.00768, 0.0077, 0.00769, 0.0079, 0.00766, 0.00765, 0.00767, 0.00848, 0.00762, 0.00762, 0.0077, 0.00763, 0.0077, 0.0076, 0.00769, 0.00767, 0.00763, 0.00763, 0.00766, 0.0078, 0.00766, 0.00762, 0.00777, 0.00763, 0.00763, 0.00761, 0.00765, 0.00763, 0.00767, 0.00766, 0.00766, 0.00764, 0.00825, 0.00763, 0.00764, 0.00762, 0.00762, 0.00761, 0.00768, 0.00821, 0.00776, 0.00779, 0.00781, 0.00778, 0.00875, 0.00781, 0.00783, 0.00782, 0.00792, 0.00779, 0.00782, 0.00781, 0.00783, 0.00781, 0.0078, 0.00782, 
0.0078, 0.00884, 0.00896, 0.00783, 0.00778, 0.00843, 0.00783, 0.00789, 0.00911, 0.0078, 0.00787, 0.00783, 0.00779, 0.00784, 0.00781, 0.00784, 0.00782, 0.00886, 0.00764, 0.00763, 0.00759, 0.00785, 0.00785, 0.0079, 0.00781, 0.0078, 0.00787, 0.00782, 0.00759, 0.00793, 0.00762, 0.00785, 0.00763, 0.00765, 0.00781, 0.00773, 0.00784, 0.00762, 0.0078, 0.00885, 0.00779, 0.00767, 0.00763, 0.00782, 0.00761, 0.0078, 0.00773, 0.00766, 0.00783, 0.00758, 0.00778, 0.00785, 0.00781, 0.00759, 0.00779, 0.00791, 0.00776, 0.0078, 0.00782, 0.0079, 0.00761, 0.00781, 0.00773, 0.0076, 0.00764, 0.0076, 0.0079, 0.00789, 0.00777, 0.00763, 0.00782, 0.00784, 0.00781, 0.00782, 0.00757, 0.0076, 0.00788, 0.0078, 0.00778, 0.00762, 0.0078, 0.00834, 0.00794, 0.00785, 0.00783, 0.00773, 0.0079, 0.0078, 0.00783, 0.0078, 0.00801, 0.00782, 0.0078, 0.0078, 0.00781, 0.00801, 0.00781, 0.00758, 0.0076, 0.00778, 0.00779, 0.0078, 0.00791, 0.00781, 0.00781, 0.00797, 0.00782, 0.00782, 0.0079, 0.0078, 0.00784, 0.00783, 0.00781, 0.00782, 0.00788, 0.0079, 0.00791, 0.0079, 0.00782, 0.00781, 0.00814, 0.0078, 0.00785, 0.00782, 0.00793, 0.00792, 0.008, 0.00785, 0.00786, 0.00784, 0.00782, 0.00866, 0.00784, 0.00789, 0.00784, 0.00787, 0.00839, 0.0088, 0.00783, 0.00783, 0.00785, 0.00793, 0.00785, 0.0079, 0.00785, 0.0078, 0.00782, 0.00791, 0.00786, 0.00781, 0.0079, 0.00782, 0.00783, 0.00783, 0.00783, 0.00782, 0.00798, 0.00781, 0.00795, 0.00782, 0.00782, 0.00791, 0.00782, 0.00789, 0.00781, 0.00782, 0.00779, 0.00782, 0.00781, 0.00795, 0.00784, 0.00781, 0.00787, 0.00782, 0.00781, 0.0078, 0.00791, 0.00784, 0.00796, 0.00798, 0.00782, 0.00782, 0.00785, 0.00784, 0.00818, 0.00781, 0.00787, 0.00783, 0.00781, 0.0078, 0.00782, 0.00781, 0.00794, 0.00793, 0.0078, 0.00794, 0.00789, 0.00786, 0.00784, 0.0079, 0.00782, 0.00783, 0.00781, 0.00784, 0.00779, 0.00782, 0.00783, 0.00781, 0.00781, 0.00789, 0.00881, 0.00824, 0.00789, 0.00781, 0.00781, 0.0078, 0.0085, 0.00783, 0.00782, 0.00779, 0.00783, 0.0078, 0.00797, 0.00779, 0.00784, 0.00789, 0.00782, 0.00783, 0.00779, 0.00782, 0.00789, 0.00779, 0.00783, 0.00781, 0.00786, 0.00799, 0.00801, 0.0079, 0.00782, 0.00791, 0.00782, 0.00785, 0.00781, 0.00784, 0.00782, 0.00783, 0.00779, 0.00783, 0.0084, 0.00783, 0.00791, 0.00782, 0.00798, 0.00782, 0.0078, 0.00782, 0.00787, 0.00792, 0.0078, 0.00787, 0.00784, 0.00783, 0.00784, 0.00779, 0.00783, 0.00781, 0.00782, 0.00783, 0.00786, 0.00794, 0.00785, 0.00783, 0.00782, 0.00781, 0.00795, 0.00782, 0.00795, 0.00789, 0.00781, 0.00783, 0.00785, 0.00782, 0.00782, 0.0078, 0.00782, 0.00794, 0.00782, 0.00786, 0.00785, 0.00783, 0.0078, 0.00783, 0.0079, 0.00784, 0.00781, 0.00787, 0.00781, 0.0079, 0.00782, 0.00782, 0.00796, 0.00784, 0.00782, 0.00783, 0.00789, 0.00792, 0.00787, 0.00791, 0.00781, 0.00783, 0.00802, 0.00784, 0.00783, 0.00785, 0.00783, 0.00782, 0.00781, 0.00788, 0.00802, 0.00787, 0.00787, 0.00793, 0.00784, 0.00793, 0.00797, 0.00783]}, "learning-rate": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 
2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05]}, "learning-rate vs samples": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 
4e-05, 4e-05, 4e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05]}, "batch-size": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 
128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0]}, "batch-size vs samples": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 
128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0]}, "lm loss": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [10.88345, 10.90291, 10.88739, 10.83435, 10.68106, 10.65239, 10.43882, 10.15796, 9.94566, 9.85031, 9.59624, 9.85805, 9.88827, 9.63311, 9.79091, 9.51415, 9.46112, 9.65226, 9.38851, 9.33535, 9.24597, 9.15002, 9.1791, 9.00048, 9.19456, 9.06645, 9.16089, 9.17249, 9.30644, 8.99568, 8.93903, 9.04853, 9.05134, 8.65891, 8.72191, 8.75857, 8.68509, 8.7367, 8.66155, 8.76648, 8.66383, 8.85312, 8.83506, 8.49989, 8.39023, 8.43268, 8.49362, 8.38495, 8.4346, 8.58278, 8.36836, 8.19768, 8.22999, 8.22623, 8.27021, 7.91926, 8.10177, 7.89448, 8.24737, 8.23304, 8.007, 7.96876, 7.92354, 7.74219, 7.74672, 7.64691, 7.51972, 7.90702, 7.70393, 7.45184, 7.74158, 7.77006, 7.54684, 7.30265, 7.45642, 7.33883, 7.46797, 7.22942, 7.63514, 7.28131, 7.35335, 7.21286, 7.21895, 7.42346, 7.17843, 7.28509, 7.00192, 7.0089, 7.04286, 7.14056, 6.82835, 6.99014, 7.09279, 7.00447, 6.88003, 6.761, 6.99471, 7.0633, 6.70925, 6.5917, 6.73258, 6.74964, 6.73779, 6.74258, 6.66376, 6.41582, 6.64124, 6.62873, 6.45047, 6.63243, 6.75424, 6.61807, 6.73736, 6.70363, 6.63926, 6.51953, 6.61425, 6.42312, 6.67885, 6.26757, 6.26882, 6.32005, 6.41287, 6.37101, 6.46896, 6.31397, 6.36148, 6.25486, 6.22526, 6.42692, 6.35485, 6.35029, 6.19105, 6.18567, 6.26859, 6.415, 6.23334, 6.18337, 6.21035, 6.14535, 6.09626, 6.10387, 6.28772, 6.43606, 6.29503, 6.335, 6.13464, 6.21503, 6.02829, 6.06095, 5.9935, 6.28273, 6.22023, 5.99847, 5.81393, 6.16265, 5.87946, 6.14445, 5.82485, 6.19248, 6.18157, 6.12584, 5.97074, 6.14877, 5.98325, 6.23524, 5.93942, 5.83892, 5.82229, 5.72934, 6.05496, 6.0434, 6.11051, 5.93954, 6.09171, 6.01241, 6.04004, 6.0322, 5.99651, 5.89061, 6.00653, 5.67122, 5.75784, 5.94696, 5.9005, 5.91468, 5.82189, 5.89471, 5.77842, 5.61622, 5.78054, 5.69253, 5.90048, 5.66647, 5.77352, 5.78152, 5.97131, 5.71328, 5.92696, 5.81669, 5.94504, 5.4175, 5.97213, 5.95642, 5.93165, 5.48932, 5.49949, 5.70719, 5.6873, 5.5725, 5.66702, 5.76913, 5.57229, 5.82826, 5.61559, 5.69173, 5.731, 5.73072, 5.62169, 5.71676, 5.78883, 5.80232, 5.67949, 5.77122, 5.47901, 5.79612, 5.73059, 5.53929, 5.69307, 5.7447, 5.6605, 5.44825, 5.66038, 5.60993, 5.60208, 5.50359, 5.67847, 5.72987, 5.52511, 5.65798, 5.63632, 5.4706, 5.64734, 5.55245, 5.58744, 5.44937, 5.20181, 5.63792, 5.72045, 5.87194, 5.56238, 5.74796, 5.79022, 5.38902, 5.44605, 5.54282, 5.55739, 5.49575, 5.64498, 5.33577, 5.45876, 5.42673, 5.5365, 5.42129, 5.62761, 5.71678, 5.48104, 5.60527, 5.5126, 5.25058, 5.49118, 5.43681, 5.48508, 5.28923, 5.46474, 5.45286, 5.6724, 5.35082, 5.46484, 5.40053, 5.54964, 5.16851, 5.10998, 5.5302, 5.59551, 5.43932, 5.53394, 5.2946, 5.37074, 5.47423, 5.2811, 5.46993, 5.28979, 5.57821, 5.48542, 5.37281, 5.45382, 5.27315, 5.53883, 5.2931, 5.25971, 5.35796, 5.33386, 5.5094, 5.38011, 5.51219, 5.30068, 5.34103, 5.49541, 5.54901, 5.50235, 5.43059, 5.39677, 5.52711, 5.19094, 5.45817, 5.34325, 5.56956, 5.41302, 5.43584, 5.37612, 5.25951, 5.25447, 5.49422, 5.5781, 5.35768, 5.3279, 5.19136, 5.4016, 5.39747, 5.20526, 5.61362, 5.29418, 5.39709, 5.44712, 5.30146, 5.34724, 5.36676, 5.28901, 5.361, 5.45905, 5.27649, 5.47318, 5.21725, 5.22023, 5.35122, 5.28396, 5.21834, 5.10071, 5.23602, 5.43096, 5.33142, 5.33017, 5.66246, 5.3004, 5.30692, 5.39386, 5.13475, 5.06957, 5.3365, 5.37793, 5.21244, 5.29887, 5.36995, 5.34675, 5.15473, 5.24757, 5.27856, 5.16172, 5.08869, 5.37568, 5.11393, 5.55309, 
5.15317, 5.32295, 5.06795, 5.13265, 5.17242, 5.01042, 5.01637, 5.20515, 5.17193, 5.18392, 5.30507, 5.25233, 5.31569, 5.14154, 5.24356, 5.12106, 5.31092, 5.36465, 5.24729, 5.09639, 5.1804, 5.29568, 5.10464, 5.27827, 5.10619, 5.10892, 5.03572]}, "lm loss vs samples": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [10.88345, 10.90291, 10.88739, 10.83435, 10.68106, 10.65239, 10.43882, 10.15796, 9.94566, 9.85031, 9.59624, 9.85805, 9.88827, 9.63311, 9.79091, 9.51415, 9.46112, 9.65226, 9.38851, 9.33535, 9.24597, 9.15002, 9.1791, 9.00048, 9.19456, 9.06645, 9.16089, 9.17249, 9.30644, 8.99568, 8.93903, 9.04853, 9.05134, 8.65891, 8.72191, 8.75857, 8.68509, 8.7367, 8.66155, 8.76648, 8.66383, 8.85312, 8.83506, 8.49989, 8.39023, 8.43268, 8.49362, 8.38495, 8.4346, 8.58278, 8.36836, 8.19768, 8.22999, 8.22623, 8.27021, 7.91926, 8.10177, 7.89448, 8.24737, 8.23304, 8.007, 7.96876, 7.92354, 7.74219, 7.74672, 7.64691, 7.51972, 7.90702, 7.70393, 7.45184, 7.74158, 7.77006, 7.54684, 7.30265, 7.45642, 7.33883, 7.46797, 7.22942, 7.63514, 7.28131, 7.35335, 7.21286, 7.21895, 7.42346, 7.17843, 7.28509, 7.00192, 7.0089, 7.04286, 7.14056, 6.82835, 6.99014, 7.09279, 7.00447, 6.88003, 6.761, 6.99471, 7.0633, 6.70925, 6.5917, 6.73258, 6.74964, 6.73779, 6.74258, 6.66376, 6.41582, 6.64124, 6.62873, 6.45047, 6.63243, 6.75424, 6.61807, 6.73736, 6.70363, 6.63926, 6.51953, 6.61425, 6.42312, 6.67885, 6.26757, 6.26882, 6.32005, 6.41287, 6.37101, 6.46896, 6.31397, 6.36148, 6.25486, 6.22526, 6.42692, 6.35485, 6.35029, 6.19105, 6.18567, 6.26859, 6.415, 6.23334, 6.18337, 6.21035, 6.14535, 6.09626, 6.10387, 6.28772, 6.43606, 6.29503, 6.335, 6.13464, 6.21503, 6.02829, 6.06095, 5.9935, 6.28273, 6.22023, 5.99847, 5.81393, 6.16265, 5.87946, 6.14445, 5.82485, 6.19248, 6.18157, 6.12584, 5.97074, 6.14877, 5.98325, 6.23524, 5.93942, 5.83892, 5.82229, 5.72934, 6.05496, 6.0434, 6.11051, 5.93954, 6.09171, 6.01241, 6.04004, 6.0322, 5.99651, 5.89061, 6.00653, 5.67122, 5.75784, 5.94696, 5.9005, 5.91468, 5.82189, 5.89471, 5.77842, 5.61622, 5.78054, 5.69253, 5.90048, 5.66647, 5.77352, 5.78152, 5.97131, 5.71328, 5.92696, 5.81669, 5.94504, 5.4175, 5.97213, 5.95642, 5.93165, 5.48932, 5.49949, 5.70719, 5.6873, 5.5725, 5.66702, 5.76913, 5.57229, 5.82826, 5.61559, 5.69173, 5.731, 5.73072, 5.62169, 5.71676, 5.78883, 5.80232, 5.67949, 5.77122, 5.47901, 5.79612, 5.73059, 5.53929, 5.69307, 5.7447, 5.6605, 5.44825, 5.66038, 5.60993, 5.60208, 5.50359, 5.67847, 5.72987, 5.52511, 5.65798, 5.63632, 5.4706, 5.64734, 5.55245, 5.58744, 5.44937, 5.20181, 5.63792, 5.72045, 5.87194, 5.56238, 5.74796, 5.79022, 5.38902, 5.44605, 5.54282, 5.55739, 5.49575, 5.64498, 5.33577, 5.45876, 5.42673, 5.5365, 5.42129, 5.62761, 5.71678, 5.48104, 5.60527, 5.5126, 5.25058, 5.49118, 5.43681, 5.48508, 5.28923, 5.46474, 5.45286, 5.6724, 5.35082, 5.46484, 5.40053, 5.54964, 5.16851, 5.10998, 5.5302, 5.59551, 5.43932, 5.53394, 5.2946, 5.37074, 5.47423, 5.2811, 5.46993, 5.28979, 5.57821, 5.48542, 5.37281, 5.45382, 5.27315, 5.53883, 5.2931, 5.25971, 5.35796, 5.33386, 5.5094, 5.38011, 5.51219, 5.30068, 5.34103, 5.49541, 5.54901, 5.50235, 5.43059, 5.39677, 5.52711, 5.19094, 5.45817, 5.34325, 5.56956, 5.41302, 5.43584, 5.37612, 5.25951, 5.25447, 5.49422, 5.5781, 5.35768, 5.3279, 5.19136, 5.4016, 5.39747, 5.20526, 5.61362, 5.29418, 5.39709, 5.44712, 5.30146, 5.34724, 5.36676, 5.28901, 5.361, 5.45905, 5.27649, 5.47318, 5.21725, 5.22023, 5.35122, 5.28396, 5.21834, 5.10071, 5.23602, 5.43096, 5.33142, 5.33017, 5.66246, 5.3004, 5.30692, 5.39386, 5.13475, 5.06957, 5.3365, 5.37793, 
5.21244, 5.29887, 5.36995, 5.34675, 5.15473, 5.24757, 5.27856, 5.16172, 5.08869, 5.37568, 5.11393, 5.55309, 5.15317, 5.32295, 5.06795, 5.13265, 5.17242, 5.01042, 5.01637, 5.20515, 5.17193, 5.18392, 5.30507, 5.25233, 5.31569, 5.14154, 5.24356, 5.12106, 5.31092, 5.36465, 5.24729, 5.09639, 5.1804, 5.29568, 5.10464, 5.27827, 5.10619, 5.10892, 5.03572]}, "loss-scale": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]}, "loss-scale vs samples": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 
1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]}, "grad-norm": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [12.43997, 12.4994, 12.67738, 12.01981, 11.40989, 9.15396, 6.91154, 7.19653, 6.10097, 4.66447, 4.20211, 2.8807, 2.37647, 2.34175, 2.05101, 2.19366, 2.12083, 1.89191, 2.18481, 2.06821, 2.11865, 2.16674, 2.00167, 2.19993, 1.94652, 2.02914, 1.87967, 1.849, 1.87625, 2.13926, 2.1644, 1.83737, 1.7865, 2.10617, 2.09168, 2.03916, 1.97963, 1.83822, 1.96495, 1.70803, 2.13244, 1.91303, 1.67031, 1.85063, 1.89388, 1.7393, 1.73696, 1.73834, 1.81384, 1.54681, 1.72306, 1.83162, 1.75476, 1.78654, 1.54973, 1.8348, 1.71396, 1.79871, 1.46752, 1.54685, 1.64797, 1.57656, 1.70218, 1.63082, 1.61792, 1.6742, 1.70617, 1.4063, 1.49439, 1.5398, 1.39435, 1.372, 1.63172, 1.45579, 1.3529, 1.50085, 1.31258, 1.33724, 1.14869, 1.28976, 1.19311, 1.38603, 1.20251, 1.31173, 1.10965, 1.18009, 1.42638, 1.54885, 1.1348, 1.01505, 1.06293, 1.23147, 0.95714, 0.89268, 0.94079, 1.27319, 1.18212, 1.01407, 1.03886, 1.50527, 1.02205, 1.09161, 0.91857, 1.10077, 0.94051, 1.19162, 0.99345, 0.96782, 1.0889, 0.98132, 1.29717, 0.8425, 1.11704, 0.95051, 1.15684, 0.97961, 0.94467, 1.05905, 0.93968, 1.14615, 0.96345, 0.97578, 1.19987, 0.96535, 1.25273, 1.46243, 1.21921, 0.99922, 1.14431, 1.34353, 1.06135, 1.14405, 1.10872, 1.1588, 0.94471, 1.01308, 0.94383, 0.99273, 0.97851, 0.89198, 1.09779, 1.31177, 1.05508, 0.91714, 1.0117, 1.28832, 1.09784, 1.19667, 0.92098, 0.98378, 1.03891, 1.07858, 1.29929, 0.94354, 1.06388, 1.50705, 1.0007, 1.35362, 1.28287, 0.84574, 1.11813, 1.1825, 1.04876, 1.12893, 1.16116, 1.12585, 1.11897, 1.15162, 1.30322, 1.20265, 1.018, 0.99879, 0.90328, 1.21092, 1.0701, 1.06218, 1.10403, 1.0926, 1.05063, 1.07573, 1.20003, 1.25848, 1.34649, 1.12066, 1.50822, 1.14324, 1.4787, 1.1305, 1.14505, 1.16533, 1.14287, 1.24641, 1.38816, 1.42518, 1.1866, 1.45857, 1.17698, 1.2263, 1.01505, 1.21325, 1.36272, 1.305, 1.19874, 1.18217, 1.01807, 1.24602, 1.46217, 1.22746, 1.20492, 1.3465, 1.12878, 1.16877, 1.06974, 1.08696, 1.6092, 1.25397, 1.20201, 1.08861, 1.34872, 1.27688, 1.5104, 1.30437, 1.05297, 1.3032, 1.2672, 1.36045, 1.15533, 1.08165, 1.20493, 1.17126, 1.18099, 1.25764, 1.52555, 1.33265, 1.17044, 1.32121, 1.21081, 1.39328, 1.50488, 1.28381, 1.24675, 1.23603, 1.3193, 1.29405, 1.23259, 1.07163, 1.1052, 1.24045, 1.37927, 1.50839, 1.32285, 1.38782, 1.13484, 1.21127, 2.00278, 1.36691, 1.32213, 1.37434, 1.00254, 1.08214, 1.17335, 1.41525, 1.25392, 1.43316, 1.39572, 1.31067, 1.2846, 1.09515, 1.18724, 1.20128, 1.30643, 1.23357, 1.11402, 1.17568, 1.29277, 1.22678, 1.1362, 1.18826, 1.25873, 1.2814, 1.22295, 
1.02105, 1.29626, 1.3106, 1.38573, 1.28368, 1.04758, 1.13079, 1.06747, 1.51913, 1.45844, 1.11656, 1.1972, 1.22395, 1.4347, 1.41031, 1.11466, 1.5639, 1.36293, 1.24572, 1.4447, 1.25296, 1.14388, 1.12495, 1.31276, 1.35398, 1.2105, 1.44264, 1.16726, 1.19041, 1.35889, 1.20903, 1.15845, 1.12041, 1.06639, 1.2833, 1.21736, 1.18244, 1.41925, 1.21164, 1.17543, 1.27955, 1.27399, 1.23019, 1.33022, 1.24584, 1.546, 1.32952, 1.1706, 1.31643, 1.32431, 1.26323, 1.13097, 1.34316, 1.10348, 1.33974, 1.18037, 1.18919, 1.42354, 1.37144, 1.33382, 1.39443, 1.37347, 1.18285, 1.1776, 1.31269, 1.10901, 1.33507, 1.39353, 1.28869, 1.32106, 1.36384, 1.307, 1.2118, 1.20055, 1.076, 1.20907, 1.28103, 1.2481, 1.49609, 1.25261, 1.22933, 1.23135, 1.40382, 1.47949, 1.50263, 1.27893, 1.27615, 1.34666, 1.30354, 1.1997, 1.51644, 1.42165, 1.35804, 1.19426, 1.23401, 1.36501, 1.05637, 1.11768, 1.22237, 1.39349, 1.3636, 1.33587, 1.44787, 1.23775, 1.25341, 1.15189, 1.07392, 1.29463, 1.16475, 1.13311, 1.32307, 1.04489, 1.17108, 1.24996, 1.21235, 1.90656, 1.20192, 1.24416, 1.32035]}, "grad-norm vs samples": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [12.43997, 12.4994, 12.67738, 12.01981, 11.40989, 9.15396, 6.91154, 7.19653, 6.10097, 4.66447, 4.20211, 2.8807, 2.37647, 2.34175, 2.05101, 2.19366, 2.12083, 1.89191, 2.18481, 2.06821, 2.11865, 2.16674, 2.00167, 2.19993, 1.94652, 2.02914, 1.87967, 1.849, 1.87625, 2.13926, 2.1644, 1.83737, 1.7865, 2.10617, 2.09168, 2.03916, 1.97963, 1.83822, 1.96495, 1.70803, 2.13244, 1.91303, 1.67031, 1.85063, 1.89388, 1.7393, 1.73696, 1.73834, 1.81384, 1.54681, 1.72306, 1.83162, 1.75476, 1.78654, 1.54973, 1.8348, 1.71396, 1.79871, 1.46752, 1.54685, 1.64797, 1.57656, 1.70218, 1.63082, 1.61792, 1.6742, 1.70617, 1.4063, 1.49439, 1.5398, 1.39435, 1.372, 1.63172, 1.45579, 1.3529, 1.50085, 1.31258, 1.33724, 1.14869, 1.28976, 1.19311, 1.38603, 1.20251, 1.31173, 1.10965, 1.18009, 1.42638, 1.54885, 1.1348, 1.01505, 1.06293, 1.23147, 0.95714, 0.89268, 0.94079, 1.27319, 1.18212, 1.01407, 1.03886, 1.50527, 1.02205, 1.09161, 0.91857, 1.10077, 0.94051, 1.19162, 0.99345, 0.96782, 1.0889, 0.98132, 1.29717, 0.8425, 1.11704, 0.95051, 1.15684, 0.97961, 0.94467, 1.05905, 0.93968, 1.14615, 0.96345, 0.97578, 1.19987, 0.96535, 1.25273, 1.46243, 1.21921, 0.99922, 1.14431, 1.34353, 1.06135, 1.14405, 1.10872, 1.1588, 0.94471, 1.01308, 0.94383, 0.99273, 0.97851, 0.89198, 1.09779, 1.31177, 1.05508, 0.91714, 1.0117, 1.28832, 1.09784, 1.19667, 0.92098, 0.98378, 1.03891, 1.07858, 1.29929, 0.94354, 1.06388, 1.50705, 1.0007, 1.35362, 1.28287, 0.84574, 1.11813, 1.1825, 1.04876, 1.12893, 1.16116, 1.12585, 1.11897, 1.15162, 1.30322, 1.20265, 1.018, 0.99879, 0.90328, 1.21092, 1.0701, 1.06218, 1.10403, 1.0926, 1.05063, 1.07573, 1.20003, 1.25848, 1.34649, 1.12066, 1.50822, 1.14324, 1.4787, 1.1305, 1.14505, 1.16533, 1.14287, 1.24641, 1.38816, 1.42518, 1.1866, 1.45857, 1.17698, 1.2263, 1.01505, 1.21325, 1.36272, 1.305, 1.19874, 1.18217, 1.01807, 1.24602, 1.46217, 1.22746, 1.20492, 1.3465, 1.12878, 1.16877, 1.06974, 1.08696, 1.6092, 1.25397, 1.20201, 1.08861, 1.34872, 1.27688, 1.5104, 1.30437, 1.05297, 1.3032, 1.2672, 1.36045, 1.15533, 1.08165, 1.20493, 1.17126, 1.18099, 1.25764, 1.52555, 1.33265, 1.17044, 1.32121, 1.21081, 1.39328, 1.50488, 1.28381, 1.24675, 1.23603, 1.3193, 1.29405, 1.23259, 1.07163, 1.1052, 1.24045, 1.37927, 1.50839, 1.32285, 1.38782, 1.13484, 1.21127, 2.00278, 1.36691, 1.32213, 1.37434, 1.00254, 1.08214, 1.17335, 1.41525, 1.25392, 1.43316, 1.39572, 1.31067, 1.2846, 1.09515, 1.18724, 1.20128, 1.30643, 
1.23357, 1.11402, 1.17568, 1.29277, 1.22678, 1.1362, 1.18826, 1.25873, 1.2814, 1.22295, 1.02105, 1.29626, 1.3106, 1.38573, 1.28368, 1.04758, 1.13079, 1.06747, 1.51913, 1.45844, 1.11656, 1.1972, 1.22395, 1.4347, 1.41031, 1.11466, 1.5639, 1.36293, 1.24572, 1.4447, 1.25296, 1.14388, 1.12495, 1.31276, 1.35398, 1.2105, 1.44264, 1.16726, 1.19041, 1.35889, 1.20903, 1.15845, 1.12041, 1.06639, 1.2833, 1.21736, 1.18244, 1.41925, 1.21164, 1.17543, 1.27955, 1.27399, 1.23019, 1.33022, 1.24584, 1.546, 1.32952, 1.1706, 1.31643, 1.32431, 1.26323, 1.13097, 1.34316, 1.10348, 1.33974, 1.18037, 1.18919, 1.42354, 1.37144, 1.33382, 1.39443, 1.37347, 1.18285, 1.1776, 1.31269, 1.10901, 1.33507, 1.39353, 1.28869, 1.32106, 1.36384, 1.307, 1.2118, 1.20055, 1.076, 1.20907, 1.28103, 1.2481, 1.49609, 1.25261, 1.22933, 1.23135, 1.40382, 1.47949, 1.50263, 1.27893, 1.27615, 1.34666, 1.30354, 1.1997, 1.51644, 1.42165, 1.35804, 1.19426, 1.23401, 1.36501, 1.05637, 1.11768, 1.22237, 1.39349, 1.3636, 1.33587, 1.44787, 1.23775, 1.25341, 1.15189, 1.07392, 1.29463, 1.16475, 1.13311, 1.32307, 1.04489, 1.17108, 1.24996, 1.21235, 1.90656, 1.20192, 1.24416, 1.32035]}, "num-zeros": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [80.0, 89.0, 102.0, 88.0, 78.0, 115.0, 125.0, 114.0, 129.0, 106.0, 125.0, 179.0, 156.0, 184.0, 179.0, 191.0, 171.0, 216.0, 169.0, 200.0, 171.0, 184.0, 206.0, 173.0, 221.0, 181.0, 188.0, 209.0, 187.0, 188.0, 167.0, 165.0, 180.0, 204.0, 152.0, 155.0, 170.0, 179.0, 177.0, 197.0, 184.0, 162.0, 194.0, 184.0, 171.0, 206.0, 198.0, 200.0, 187.0, 238.0, 208.0, 173.0, 201.0, 145.0, 199.0, 194.0, 185.0, 173.0, 266.0, 238.0, 190.0, 195.0, 182.0, 188.0, 199.0, 262.0, 210.0, 233.0, 216.0, 199.0, 257.0, 213.0, 220.0, 243.0, 218.0, 215.0, 229.0, 219.0, 289.0, 212.0, 280.0, 229.0, 196.0, 274.0, 237.0, 246.0, 170.0, 203.0, 205.0, 236.0, 201.0, 203.0, 256.0, 220.0, 191.0, 173.0, 214.0, 225.0, 183.0, 151.0, 195.0, 174.0, 218.0, 189.0, 159.0, 151.0, 154.0, 154.0, 130.0, 202.0, 162.0, 186.0, 166.0, 187.0, 136.0, 145.0, 168.0, 100.0, 161.0, 124.0, 138.0, 163.0, 108.0, 167.0, 129.0, 131.0, 141.0, 148.0, 128.0, 124.0, 137.0, 168.0, 133.0, 114.0, 139.0, 123.0, 161.0, 139.0, 133.0, 152.0, 122.0, 111.0, 135.0, 155.0, 158.0, 101.0, 134.0, 164.0, 136.0, 163.0, 110.0, 153.0, 116.0, 132.0, 120.0, 115.0, 108.0, 85.0, 97.0, 169.0, 112.0, 115.0, 134.0, 105.0, 114.0, 156.0, 115.0, 103.0, 125.0, 113.0, 121.0, 138.0, 114.0, 130.0, 122.0, 118.0, 88.0, 106.0, 113.0, 121.0, 134.0, 131.0, 118.0, 130.0, 93.0, 111.0, 114.0, 111.0, 106.0, 95.0, 105.0, 107.0, 107.0, 87.0, 112.0, 90.0, 116.0, 104.0, 135.0, 140.0, 102.0, 104.0, 142.0, 144.0, 121.0, 87.0, 99.0, 136.0, 115.0, 105.0, 126.0, 112.0, 126.0, 125.0, 115.0, 116.0, 121.0, 145.0, 109.0, 111.0, 103.0, 112.0, 129.0, 115.0, 130.0, 97.0, 119.0, 103.0, 116.0, 135.0, 109.0, 115.0, 109.0, 113.0, 119.0, 116.0, 105.0, 107.0, 105.0, 109.0, 113.0, 115.0, 101.0, 114.0, 109.0, 123.0, 111.0, 117.0, 106.0, 92.0, 103.0, 118.0, 116.0, 130.0, 99.0, 107.0, 121.0, 96.0, 124.0, 112.0, 134.0, 104.0, 115.0, 104.0, 113.0, 107.0, 119.0, 124.0, 116.0, 115.0, 123.0, 139.0, 117.0, 118.0, 110.0, 112.0, 124.0, 112.0, 104.0, 98.0, 108.0, 134.0, 108.0, 126.0, 123.0, 118.0, 120.0, 122.0, 141.0, 105.0, 81.0, 122.0, 131.0, 123.0, 122.0, 101.0, 129.0, 88.0, 131.0, 124.0, 110.0, 124.0, 130.0, 141.0, 109.0, 107.0, 95.0, 104.0, 136.0, 123.0, 121.0, 123.0, 111.0, 117.0, 142.0, 120.0, 111.0, 108.0, 86.0, 121.0, 115.0, 111.0, 125.0, 128.0, 93.0, 126.0, 116.0, 124.0, 94.0, 107.0, 107.0, 128.0, 106.0, 110.0, 128.0, 104.0, 
105.0, 114.0, 118.0, 117.0, 99.0, 123.0, 108.0, 107.0, 126.0, 119.0, 121.0, 121.0, 107.0, 116.0, 116.0, 116.0, 126.0, 145.0, 132.0, 133.0, 125.0, 100.0, 98.0, 129.0, 118.0, 121.0, 105.0, 107.0, 95.0, 113.0, 106.0, 108.0, 94.0, 121.0, 139.0, 118.0, 101.0, 98.0, 111.0, 117.0, 112.0, 129.0, 113.0, 119.0, 103.0, 123.0, 124.0, 107.0, 121.0, 117.0, 126.0, 123.0, 103.0, 113.0, 131.0, 117.0, 128.0, 123.0, 103.0, 149.0, 113.0, 101.0, 122.0, 110.0]}, "num-zeros vs samples": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [80.0, 89.0, 102.0, 88.0, 78.0, 115.0, 125.0, 114.0, 129.0, 106.0, 125.0, 179.0, 156.0, 184.0, 179.0, 191.0, 171.0, 216.0, 169.0, 200.0, 171.0, 184.0, 206.0, 173.0, 221.0, 181.0, 188.0, 209.0, 187.0, 188.0, 167.0, 165.0, 180.0, 204.0, 152.0, 155.0, 170.0, 179.0, 177.0, 197.0, 184.0, 162.0, 194.0, 184.0, 171.0, 206.0, 198.0, 200.0, 187.0, 238.0, 208.0, 173.0, 201.0, 145.0, 199.0, 194.0, 185.0, 173.0, 266.0, 238.0, 190.0, 195.0, 182.0, 188.0, 199.0, 262.0, 210.0, 233.0, 216.0, 199.0, 257.0, 213.0, 220.0, 243.0, 218.0, 215.0, 229.0, 219.0, 289.0, 212.0, 280.0, 229.0, 196.0, 274.0, 237.0, 246.0, 170.0, 203.0, 205.0, 236.0, 201.0, 203.0, 256.0, 220.0, 191.0, 173.0, 214.0, 225.0, 183.0, 151.0, 195.0, 174.0, 218.0, 189.0, 159.0, 151.0, 154.0, 154.0, 130.0, 202.0, 162.0, 186.0, 166.0, 187.0, 136.0, 145.0, 168.0, 100.0, 161.0, 124.0, 138.0, 163.0, 108.0, 167.0, 129.0, 131.0, 141.0, 148.0, 128.0, 124.0, 137.0, 168.0, 133.0, 114.0, 139.0, 123.0, 161.0, 139.0, 133.0, 152.0, 122.0, 111.0, 135.0, 155.0, 158.0, 101.0, 134.0, 164.0, 136.0, 163.0, 110.0, 153.0, 116.0, 132.0, 120.0, 115.0, 108.0, 85.0, 97.0, 169.0, 112.0, 115.0, 134.0, 105.0, 114.0, 156.0, 115.0, 103.0, 125.0, 113.0, 121.0, 138.0, 114.0, 130.0, 122.0, 118.0, 88.0, 106.0, 113.0, 121.0, 134.0, 131.0, 118.0, 130.0, 93.0, 111.0, 114.0, 111.0, 106.0, 95.0, 105.0, 107.0, 107.0, 87.0, 112.0, 90.0, 116.0, 104.0, 135.0, 140.0, 102.0, 104.0, 142.0, 144.0, 121.0, 87.0, 99.0, 136.0, 115.0, 105.0, 126.0, 112.0, 126.0, 125.0, 115.0, 116.0, 121.0, 145.0, 109.0, 111.0, 103.0, 112.0, 129.0, 115.0, 130.0, 97.0, 119.0, 103.0, 116.0, 135.0, 109.0, 115.0, 109.0, 113.0, 119.0, 116.0, 105.0, 107.0, 105.0, 109.0, 113.0, 115.0, 101.0, 114.0, 109.0, 123.0, 111.0, 117.0, 106.0, 92.0, 103.0, 118.0, 116.0, 130.0, 99.0, 107.0, 121.0, 96.0, 124.0, 112.0, 134.0, 104.0, 115.0, 104.0, 113.0, 107.0, 119.0, 124.0, 116.0, 115.0, 123.0, 139.0, 117.0, 118.0, 110.0, 112.0, 124.0, 112.0, 104.0, 98.0, 108.0, 134.0, 108.0, 126.0, 123.0, 118.0, 120.0, 122.0, 141.0, 105.0, 81.0, 122.0, 131.0, 123.0, 122.0, 101.0, 129.0, 88.0, 131.0, 124.0, 110.0, 124.0, 130.0, 141.0, 109.0, 107.0, 95.0, 104.0, 136.0, 123.0, 121.0, 123.0, 111.0, 117.0, 142.0, 120.0, 111.0, 108.0, 86.0, 121.0, 115.0, 111.0, 125.0, 128.0, 93.0, 126.0, 116.0, 124.0, 94.0, 107.0, 107.0, 128.0, 106.0, 110.0, 128.0, 104.0, 105.0, 114.0, 118.0, 117.0, 99.0, 123.0, 108.0, 107.0, 126.0, 119.0, 121.0, 121.0, 107.0, 116.0, 116.0, 116.0, 126.0, 145.0, 132.0, 133.0, 125.0, 100.0, 98.0, 129.0, 118.0, 121.0, 105.0, 107.0, 95.0, 113.0, 106.0, 108.0, 94.0, 121.0, 139.0, 118.0, 101.0, 98.0, 111.0, 117.0, 112.0, 129.0, 113.0, 119.0, 103.0, 123.0, 124.0, 107.0, 121.0, 117.0, 126.0, 123.0, 103.0, 113.0, 131.0, 117.0, 128.0, 123.0, 103.0, 149.0, 113.0, 101.0, 122.0, 110.0]}, "params-norm": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [179.95625, 179.95625, 179.95625, 179.95625, 179.95625, 179.95624, 179.95622, 179.95612, 179.95593, 179.95575, 179.95451, 179.95384, 179.95331, 179.95131, 
179.95029, 179.94963, 179.94899, 179.94896, 179.94923, 179.94928, 179.94922, 179.94897, 179.94885, 179.9491, 179.94991, 179.951, 179.95213, 179.95309, 179.95415, 179.95551, 179.9574, 179.95952, 179.96179, 179.96399, 179.96649, 179.96965, 179.97318, 179.97679, 179.98051, 179.98468, 179.98955, 179.99477, 180.00044, 180.00658, 180.01337, 180.02075, 180.02858, 180.03702, 180.04625, 180.05624, 180.06699, 180.0782, 180.09018, 180.10277, 180.11606, 180.12999, 180.14421, 180.159, 180.17467, 180.19148, 180.20897, 180.22713, 180.24684, 180.26782, 180.2896, 180.31204, 180.33545, 180.35973, 180.38542, 180.41144, 180.43797, 180.46524, 180.4928, 180.52104, 180.54993, 180.57939, 180.60922, 180.63998, 180.67151, 180.70398, 180.73651, 180.76875, 180.80157, 180.83536, 180.86948, 180.90508, 180.9411, 180.97647, 181.01176, 181.04828, 181.08588, 181.12448, 181.16327, 181.20253, 181.24295, 181.28366, 181.32249, 181.35963, 181.39644, 181.43352, 181.47067, 181.50752, 181.54518, 181.58394, 181.62318, 181.66335, 181.7032, 181.74304, 181.78291, 181.82195, 181.86037, 181.89832, 181.93773, 181.97792, 182.01897, 182.05927, 182.09976, 182.14062, 182.18091, 182.22133, 182.26169, 182.30261, 182.34355, 182.38451, 182.4248, 182.46426, 182.50208, 182.53731, 182.57451, 182.61168, 182.64999, 182.68562, 182.72139, 182.75731, 182.79347, 182.83156, 182.87192, 182.91328, 182.95439, 182.99614, 183.03891, 183.07968, 183.12061, 183.16183, 183.20284, 183.24399, 183.28496, 183.325, 183.3662, 183.40788, 183.45087, 183.49307, 183.53464, 183.57661, 183.61989, 183.66231, 183.70183, 183.7419, 183.78094, 183.81953, 183.86018, 183.90375, 183.94774, 183.9931, 184.03831, 184.08267, 184.12688, 184.16986, 184.21062, 184.25189, 184.29411, 184.3373, 184.38132, 184.42554, 184.46965, 184.51401, 184.55882, 184.60381, 184.64806, 184.69025, 184.73256, 184.7748, 184.817, 184.86073, 184.90417, 184.94685, 184.98766, 185.02675, 185.06696, 185.10852, 185.15274, 185.19722, 185.24055, 185.28352, 185.32553, 185.36723, 185.40932, 185.45212, 185.49559, 185.54068, 185.58374, 185.62703, 185.6687, 185.71231, 185.75662, 185.80209, 185.84537, 185.88788, 185.93077, 185.97299, 186.01599, 186.05911, 186.10475, 186.15176, 186.19826, 186.24303, 186.28674, 186.33194, 186.377, 186.42128, 186.46397, 186.50703, 186.55083, 186.59554, 186.63943, 186.68254, 186.72632, 186.77109, 186.81587, 186.86107, 186.90485, 186.94669, 186.9883, 187.03162, 187.07474, 187.11856, 187.16187, 187.20621, 187.25069, 187.29416, 187.33778, 187.38162, 187.42618, 187.47089, 187.51416, 187.56001, 187.60674, 187.6539, 187.70016, 187.74496, 187.7905, 187.83824, 187.88522, 187.93312, 187.98019, 188.02357, 188.06801, 188.11484, 188.1615, 188.21011, 188.26111, 188.31125, 188.35876, 188.4053, 188.45084, 188.49641, 188.54265, 188.58983, 188.64067, 188.69183, 188.74222, 188.79266, 188.84273, 188.89304, 188.94508, 188.99475, 189.04398, 189.09485, 189.14598, 189.1965, 189.24777, 189.29964, 189.35378, 189.40587, 189.45831, 189.50987, 189.56148, 189.61368, 189.66797, 189.71982, 189.77005, 189.81833, 189.86722, 189.91873, 189.97101, 190.02145, 190.07199, 190.12384, 190.17366, 190.22346, 190.27402, 190.3253, 190.37793, 190.43097, 190.48424, 190.53532, 190.58551, 190.63808, 190.69084, 190.74536, 190.79968, 190.85349, 190.90894, 190.96626, 191.02402, 191.08208, 191.13948, 191.19746, 191.25615, 191.31114, 191.36597, 191.4203, 191.47542, 191.53027, 191.58527, 191.63684, 191.68701, 191.73514, 191.78677, 191.83801, 191.8905, 191.94266, 191.99596, 192.05061, 192.1071, 192.16386, 192.21751, 192.27289, 192.32852, 192.37949, 
192.43187, 192.48483, 192.53804, 192.59248, 192.64667, 192.70181, 192.75798, 192.81502, 192.87016, 192.92496, 192.98015, 193.03481, 193.09019, 193.14693, 193.20465, 193.26526, 193.32504, 193.38451, 193.44281, 193.49977, 193.55804, 193.61533, 193.67177, 193.72891, 193.78667, 193.84259, 193.89799, 193.95425, 194.01086, 194.06876, 194.12726, 194.18596, 194.24385, 194.30168, 194.35782, 194.41516, 194.47411, 194.53342, 194.59587, 194.65793, 194.71797, 194.77441, 194.83284, 194.88989, 194.94766, 195.00539, 195.06413, 195.12605, 195.19096, 195.25722, 195.32449, 195.39157, 195.45724, 195.52281, 195.58981, 195.65671, 195.7216, 195.78194, 195.84415, 195.90858]}, "params-norm vs samples": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [179.95625, 179.95625, 179.95625, 179.95625, 179.95625, 179.95624, 179.95622, 179.95612, 179.95593, 179.95575, 179.95451, 179.95384, 179.95331, 179.95131, 179.95029, 179.94963, 179.94899, 179.94896, 179.94923, 179.94928, 179.94922, 179.94897, 179.94885, 179.9491, 179.94991, 179.951, 179.95213, 179.95309, 179.95415, 179.95551, 179.9574, 179.95952, 179.96179, 179.96399, 179.96649, 179.96965, 179.97318, 179.97679, 179.98051, 179.98468, 179.98955, 179.99477, 180.00044, 180.00658, 180.01337, 180.02075, 180.02858, 180.03702, 180.04625, 180.05624, 180.06699, 180.0782, 180.09018, 180.10277, 180.11606, 180.12999, 180.14421, 180.159, 180.17467, 180.19148, 180.20897, 180.22713, 180.24684, 180.26782, 180.2896, 180.31204, 180.33545, 180.35973, 180.38542, 180.41144, 180.43797, 180.46524, 180.4928, 180.52104, 180.54993, 180.57939, 180.60922, 180.63998, 180.67151, 180.70398, 180.73651, 180.76875, 180.80157, 180.83536, 180.86948, 180.90508, 180.9411, 180.97647, 181.01176, 181.04828, 181.08588, 181.12448, 181.16327, 181.20253, 181.24295, 181.28366, 181.32249, 181.35963, 181.39644, 181.43352, 181.47067, 181.50752, 181.54518, 181.58394, 181.62318, 181.66335, 181.7032, 181.74304, 181.78291, 181.82195, 181.86037, 181.89832, 181.93773, 181.97792, 182.01897, 182.05927, 182.09976, 182.14062, 182.18091, 182.22133, 182.26169, 182.30261, 182.34355, 182.38451, 182.4248, 182.46426, 182.50208, 182.53731, 182.57451, 182.61168, 182.64999, 182.68562, 182.72139, 182.75731, 182.79347, 182.83156, 182.87192, 182.91328, 182.95439, 182.99614, 183.03891, 183.07968, 183.12061, 183.16183, 183.20284, 183.24399, 183.28496, 183.325, 183.3662, 183.40788, 183.45087, 183.49307, 183.53464, 183.57661, 183.61989, 183.66231, 183.70183, 183.7419, 183.78094, 183.81953, 183.86018, 183.90375, 183.94774, 183.9931, 184.03831, 184.08267, 184.12688, 184.16986, 184.21062, 184.25189, 184.29411, 184.3373, 184.38132, 184.42554, 184.46965, 184.51401, 184.55882, 184.60381, 184.64806, 184.69025, 184.73256, 184.7748, 184.817, 184.86073, 184.90417, 184.94685, 184.98766, 185.02675, 185.06696, 185.10852, 185.15274, 185.19722, 185.24055, 185.28352, 185.32553, 185.36723, 185.40932, 185.45212, 185.49559, 185.54068, 185.58374, 185.62703, 185.6687, 185.71231, 185.75662, 185.80209, 185.84537, 185.88788, 185.93077, 185.97299, 186.01599, 186.05911, 186.10475, 186.15176, 186.19826, 186.24303, 186.28674, 186.33194, 186.377, 186.42128, 186.46397, 186.50703, 186.55083, 186.59554, 186.63943, 186.68254, 186.72632, 186.77109, 186.81587, 186.86107, 186.90485, 186.94669, 186.9883, 187.03162, 187.07474, 187.11856, 187.16187, 187.20621, 187.25069, 187.29416, 187.33778, 187.38162, 187.42618, 187.47089, 187.51416, 187.56001, 187.60674, 187.6539, 187.70016, 187.74496, 187.7905, 187.83824, 187.88522, 187.93312, 187.98019, 188.02357, 188.06801, 
188.11484, 188.1615, 188.21011, 188.26111, 188.31125, 188.35876, 188.4053, 188.45084, 188.49641, 188.54265, 188.58983, 188.64067, 188.69183, 188.74222, 188.79266, 188.84273, 188.89304, 188.94508, 188.99475, 189.04398, 189.09485, 189.14598, 189.1965, 189.24777, 189.29964, 189.35378, 189.40587, 189.45831, 189.50987, 189.56148, 189.61368, 189.66797, 189.71982, 189.77005, 189.81833, 189.86722, 189.91873, 189.97101, 190.02145, 190.07199, 190.12384, 190.17366, 190.22346, 190.27402, 190.3253, 190.37793, 190.43097, 190.48424, 190.53532, 190.58551, 190.63808, 190.69084, 190.74536, 190.79968, 190.85349, 190.90894, 190.96626, 191.02402, 191.08208, 191.13948, 191.19746, 191.25615, 191.31114, 191.36597, 191.4203, 191.47542, 191.53027, 191.58527, 191.63684, 191.68701, 191.73514, 191.78677, 191.83801, 191.8905, 191.94266, 191.99596, 192.05061, 192.1071, 192.16386, 192.21751, 192.27289, 192.32852, 192.37949, 192.43187, 192.48483, 192.53804, 192.59248, 192.64667, 192.70181, 192.75798, 192.81502, 192.87016, 192.92496, 192.98015, 193.03481, 193.09019, 193.14693, 193.20465, 193.26526, 193.32504, 193.38451, 193.44281, 193.49977, 193.55804, 193.61533, 193.67177, 193.72891, 193.78667, 193.84259, 193.89799, 193.95425, 194.01086, 194.06876, 194.12726, 194.18596, 194.24385, 194.30168, 194.35782, 194.41516, 194.47411, 194.53342, 194.59587, 194.65793, 194.71797, 194.77441, 194.83284, 194.88989, 194.94766, 195.00539, 195.06413, 195.12605, 195.19096, 195.25722, 195.32449, 195.39157, 195.45724, 195.52281, 195.58981, 195.65671, 195.7216, 195.78194, 195.84415, 195.90858]}, "iteration-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [18.92793, 0.51136, 0.50959, 0.5023, 0.50706, 0.49889, 0.49918, 0.50787, 0.50805, 0.50023, 0.51244, 0.49782, 0.5011, 0.49829, 0.50242, 0.49765, 0.50512, 0.50815, 0.51211, 0.49886, 0.50327, 0.50436, 0.50354, 0.4972, 0.49868, 0.50277, 0.49981, 0.50008, 0.50203, 0.49718, 0.60026, 0.49876, 0.49477, 0.5046, 0.51537, 0.5196, 0.49706, 0.49993, 0.49908, 0.49804, 0.4994, 0.49794, 0.50015, 0.49859, 0.49669, 0.49649, 0.59124, 0.49837, 0.50138, 0.49717, 0.49966, 0.50461, 0.4977, 0.49673, 0.5025, 0.49998, 0.49865, 0.50151, 0.50846, 0.51111, 0.50552, 0.50429, 0.50589, 0.50627, 0.50795, 0.505, 0.50478, 0.50608, 0.5063, 0.50392, 0.50528, 0.50464, 0.50852, 0.50732, 0.50975, 0.70338, 0.50322, 0.50607, 0.5008, 0.51264, 0.50202, 0.51117, 0.50466, 0.50856, 0.50482, 0.5101, 0.50604, 0.50708, 0.50371, 0.50732, 0.50754, 0.50725, 0.50576, 0.50944, 0.50954, 0.50758, 0.50654, 0.5929, 0.50552, 0.50521, 0.50353, 0.50768, 0.50269, 0.50818, 0.50339, 0.50584, 0.50369, 0.50801, 0.50311, 0.50501, 0.50259, 0.50478, 0.50477, 0.50612, 0.50304, 0.5048, 0.50419, 0.50917, 0.50259, 0.59305, 0.71675, 0.50782, 0.50595, 0.50366, 0.50416, 0.5131, 0.50874, 0.50202, 0.5075, 0.50344, 0.50969, 0.50236, 0.50738, 0.5042, 0.50968, 0.50453, 0.50797, 0.50316, 0.50801, 0.50385, 0.51048, 0.50461, 0.60109, 0.50835, 0.50599, 0.50503, 0.50405, 0.50686, 0.50365, 0.50633, 0.51394, 0.507, 0.50416, 0.5072, 0.50187, 0.50987, 0.50554, 0.50964, 0.49997, 0.5086, 0.50287, 0.50901, 0.51253, 0.51268, 0.59174, 0.63218, 0.50352, 0.50458, 0.50663, 0.50624, 0.50529, 0.50834, 0.50628, 0.50536, 0.50697, 0.50514, 0.5058, 0.5064, 0.51003, 0.50482, 0.50622, 0.50306, 0.50955, 0.50288, 0.51052, 0.50915, 0.50819, 0.50518, 0.50395, 0.50908, 0.50261, 0.5111, 0.59558, 0.50726, 0.50659, 0.50692, 0.50765, 0.50516, 0.51034, 0.50537, 0.49111, 0.50535, 0.50465, 0.50275, 0.50558, 0.5014, 0.5079, 0.5078, 0.50568, 0.5069, 0.50614, 0.50631, 0.5066, 0.50398, 
0.50618, 0.50721, 0.51171, 0.50602, 0.50818, 0.50511, 0.51286, 0.50398, 0.50849, 0.50801, 0.50817, 0.50985, 0.50547, 0.50729, 0.50608, 0.59229, 0.50801, 0.50242, 0.51408, 0.50883, 0.5042, 0.508, 0.51821, 0.50964, 0.50309, 0.51214, 0.59459, 0.51016, 0.50757, 0.51259, 0.50854, 0.50258, 0.50468, 0.50579, 0.50859, 0.50372, 0.50798, 0.50757, 0.51184, 0.50914, 0.50776, 0.50432, 0.50917, 0.50287, 0.50616, 0.50167, 0.5065, 0.50145, 0.51091, 0.50163, 0.51326, 0.50092, 0.50601, 0.50447, 0.50502, 0.50274, 0.50572, 0.50976, 0.5047, 0.50868, 0.50316, 0.52048, 0.50699, 0.61568, 0.50722, 0.5088, 0.50773, 0.50579, 0.50532, 0.50689, 0.50615, 0.50762, 0.5023, 0.50258, 0.50262, 0.51065, 0.50567, 0.50633, 0.50361, 0.50893, 0.50511, 0.50936, 0.59793, 0.60202, 0.51102, 0.50683, 0.50341, 0.50975, 0.50313, 0.51068, 0.50494, 0.5094, 0.50552, 0.5077, 0.50574, 0.50655, 0.51164, 0.50641, 0.50789, 0.50671, 0.61258, 0.50815, 0.50767, 0.50856, 0.51335, 0.5105, 0.50233, 0.50903, 0.50975, 0.50328, 0.50987, 0.50357, 0.50951, 0.50423, 0.50818, 0.50563, 0.50771, 0.50968, 0.50443, 0.50847, 0.50717, 0.50752, 0.50453, 0.50914, 0.50657, 0.50601, 0.51204, 0.50439, 0.59526, 0.50772, 0.50461, 0.51966, 0.50388, 0.50764, 0.50335, 0.51566, 0.50622, 0.50664, 0.50857, 0.51175, 0.50837, 0.50352, 0.50963, 0.50442, 0.50747, 0.50672, 0.50844, 0.50629, 0.50717, 0.5071, 0.50387, 0.5066, 0.50594, 0.50388, 0.50981, 0.50538, 0.5055, 0.50641, 0.50813, 0.50422, 0.50345, 0.50462, 0.50731, 0.50278, 0.50356, 0.50701, 0.5066, 0.5073, 0.51, 0.50394, 0.50873, 0.50751, 0.50848, 0.59448, 0.50862, 0.5117, 0.50484, 0.51229, 0.50735, 0.50392, 0.50744, 0.50609, 0.50765, 0.51917, 0.51153, 0.50229]}, "lm loss validation": {"start_step": 0, "end_step": 4, "step_interval": 5, "values": [5.68727]}, "lm loss validation vs samples": {"start_step": 0, "end_step": 4, "step_interval": 5, "values": [5.68727]}, "lm loss validation ppl": {"start_step": 0, "end_step": 4, "step_interval": 5, "values": [295.08755]}, "lm loss validation ppl vs samples": {"start_step": 0, "end_step": 4, "step_interval": 5, "values": [295.08755]}} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_nondet_tp1_pp1_fp8_no_model_parallel/golden_values_lts.json b/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_nondet_tp1_pp1_fp8_no_model_parallel/golden_values_lts.json new file mode 100644 index 0000000000..7335b2067c --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_nondet_tp1_pp1_fp8_no_model_parallel/golden_values_lts.json @@ -0,0 +1 @@ +{"forward-backward-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [18.28053, 0.49505, 0.49249, 0.4863, 0.49126, 0.48294, 0.48297, 0.49211, 0.49244, 0.48476, 0.49685, 0.48221, 0.48444, 0.48262, 0.4868, 0.4822, 0.48935, 0.49261, 0.49648, 0.48319, 0.48763, 0.48829, 0.48803, 0.48167, 0.48323, 0.48629, 0.48421, 0.48466, 0.48642, 0.48171, 0.5845, 0.48341, 0.47926, 0.48909, 0.49939, 0.50358, 0.4812, 0.48449, 0.48356, 0.48264, 0.48384, 0.48252, 0.4847, 0.48316, 0.48125, 0.48107, 0.57559, 0.48254, 0.48595, 0.48176, 0.48343, 0.48901, 0.48231, 0.48126, 0.48705, 0.48449, 0.48313, 0.48504, 0.49265, 0.49529, 0.48979, 0.48846, 0.48904, 0.48991, 0.49197, 0.48869, 0.48889, 0.49026, 0.49051, 0.48812, 0.4895, 0.4888, 0.49274, 0.49157, 0.49398, 0.68596, 0.48574, 0.48994, 0.48496, 0.496, 0.48608, 0.49521, 0.48726, 0.49274, 0.48836, 0.49429, 0.49013, 0.49126, 0.48792, 0.49147, 0.49169, 0.48964, 0.49008, 0.49378, 0.49365, 
0.49165, 0.49075, 0.57694, 0.48973, 0.48945, 0.48773, 0.49186, 0.48699, 0.49202, 0.48785, 0.48984, 0.48807, 0.4924, 0.48739, 0.48901, 0.48669, 0.48864, 0.48892, 0.48906, 0.48729, 0.48907, 0.4886, 0.49334, 0.48702, 0.57734, 0.70083, 0.49192, 0.48993, 0.48756, 0.48839, 0.49692, 0.49292, 0.48647, 0.49172, 0.4875, 0.49397, 0.48663, 0.49145, 0.48815, 0.49401, 0.48878, 0.49212, 0.48753, 0.49235, 0.48811, 0.49451, 0.48865, 0.58524, 0.49262, 0.49011, 0.48923, 0.48823, 0.49108, 0.4881, 0.49074, 0.49805, 0.49124, 0.48831, 0.49161, 0.48613, 0.49324, 0.48948, 0.49372, 0.48427, 0.49263, 0.48691, 0.49317, 0.49667, 0.4969, 0.57482, 0.61619, 0.48773, 0.48884, 0.49076, 0.49017, 0.48952, 0.49239, 0.49075, 0.48963, 0.4911, 0.48939, 0.48983, 0.49046, 0.49409, 0.48869, 0.49044, 0.4872, 0.49356, 0.48711, 0.49475, 0.49335, 0.49242, 0.48938, 0.48799, 0.49308, 0.48649, 0.49513, 0.57985, 0.49149, 0.49028, 0.4911, 0.49172, 0.48942, 0.49435, 0.48938, 0.47502, 0.48947, 0.48882, 0.48685, 0.48977, 0.4839, 0.49208, 0.49183, 0.4899, 0.49107, 0.48954, 0.48936, 0.49081, 0.48809, 0.49012, 0.49118, 0.49592, 0.49005, 0.49234, 0.48935, 0.49702, 0.4881, 0.49255, 0.4923, 0.49215, 0.49408, 0.4896, 0.49166, 0.49036, 0.57641, 0.49203, 0.4866, 0.49827, 0.49306, 0.48826, 0.49197, 0.50213, 0.49344, 0.48736, 0.49635, 0.57884, 0.49438, 0.49181, 0.49665, 0.49267, 0.48679, 0.48884, 0.48977, 0.49284, 0.48791, 0.49204, 0.49178, 0.49595, 0.4931, 0.49191, 0.48826, 0.49306, 0.48701, 0.48992, 0.48579, 0.49069, 0.48562, 0.49508, 0.48592, 0.49748, 0.4852, 0.49001, 0.48851, 0.48928, 0.48685, 0.4898, 0.49343, 0.48889, 0.49276, 0.4874, 0.50472, 0.49085, 0.59958, 0.49141, 0.49279, 0.49191, 0.48975, 0.4895, 0.49082, 0.48927, 0.4914, 0.48634, 0.48671, 0.48679, 0.49495, 0.48847, 0.49036, 0.48784, 0.49319, 0.4893, 0.49337, 0.58198, 0.58629, 0.4953, 0.49089, 0.48763, 0.49392, 0.48743, 0.49484, 0.48893, 0.49356, 0.48948, 0.49182, 0.48987, 0.49043, 0.49529, 0.49039, 0.4921, 0.49072, 0.59678, 0.49229, 0.49187, 0.4928, 0.49741, 0.49468, 0.48644, 0.49313, 0.49332, 0.48749, 0.49394, 0.48779, 0.49346, 0.48849, 0.49244, 0.48985, 0.49183, 0.49358, 0.48865, 0.49267, 0.4914, 0.49166, 0.48871, 0.49327, 0.49077, 0.49024, 0.49629, 0.48853, 0.57947, 0.49147, 0.48886, 0.50383, 0.48817, 0.49188, 0.4873, 0.49974, 0.49014, 0.4908, 0.4922, 0.49589, 0.49266, 0.48782, 0.49383, 0.48872, 0.49176, 0.49069, 0.49264, 0.49042, 0.4914, 0.4912, 0.48803, 0.49078, 0.49007, 0.48811, 0.49406, 0.48945, 0.48976, 0.49052, 0.49238, 0.48839, 0.48749, 0.48884, 0.49154, 0.48706, 0.48761, 0.49108, 0.49077, 0.49131, 0.49425, 0.48822, 0.49246, 0.49172, 0.49273, 0.57851, 0.49276, 0.49599, 0.48901, 0.49655, 0.49128, 0.48808, 0.49162, 0.49012, 0.49189, 0.50308, 0.49552, 0.48646]}, "forward-compute-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [9.21276, 0.28687, 0.28815, 0.2833, 0.28439, 0.27844, 0.27842, 0.28317, 0.28459, 0.28018, 0.29052, 0.27923, 0.27964, 0.27881, 0.28284, 0.27894, 0.2858, 0.28599, 0.29109, 0.28083, 0.28444, 0.28303, 0.2848, 0.27728, 0.28052, 0.2809, 0.27929, 0.2805, 0.28333, 0.27803, 0.3776, 0.27848, 0.27391, 0.28208, 0.29927, 0.30354, 0.28082, 0.28432, 0.28327, 0.28318, 0.28355, 0.28207, 0.28438, 0.28242, 0.28127, 0.28045, 0.37514, 0.2813, 0.28253, 0.28106, 0.28235, 0.28881, 0.28182, 0.28128, 0.28489, 0.28348, 0.2813, 0.28279, 0.29008, 0.29295, 0.28746, 0.2869, 0.28708, 0.28818, 0.28744, 0.28543, 0.28582, 0.28782, 0.28724, 0.28631, 0.28595, 0.28734, 0.2881, 0.28983, 0.2918, 0.48123, 0.28384, 0.28784, 0.28341, 0.28813, 0.28363, 0.29108, 0.2853, 0.28861, 
0.28671, 0.29218, 0.28714, 0.29008, 0.28661, 0.29, 0.28895, 0.28724, 0.289, 0.29102, 0.28959, 0.28779, 0.28919, 0.37298, 0.28802, 0.28671, 0.28631, 0.29013, 0.28597, 0.29054, 0.28653, 0.28662, 0.28618, 0.28937, 0.285, 0.28745, 0.28473, 0.2862, 0.28623, 0.28613, 0.28465, 0.28674, 0.2875, 0.2909, 0.28626, 0.37409, 0.49531, 0.29025, 0.28653, 0.28605, 0.284, 0.29546, 0.29024, 0.28506, 0.29074, 0.28487, 0.29199, 0.28427, 0.28721, 0.28569, 0.28978, 0.28671, 0.29019, 0.2858, 0.29107, 0.28549, 0.28872, 0.28587, 0.38328, 0.28744, 0.28899, 0.28716, 0.28682, 0.28652, 0.28709, 0.28668, 0.29569, 0.28914, 0.28688, 0.28981, 0.28508, 0.29181, 0.28828, 0.29083, 0.28368, 0.28892, 0.28472, 0.2903, 0.29275, 0.29136, 0.3738, 0.41333, 0.28566, 0.28691, 0.28887, 0.2879, 0.28701, 0.2905, 0.28746, 0.28816, 0.28899, 0.28753, 0.2884, 0.28928, 0.29105, 0.28699, 0.28797, 0.28497, 0.29203, 0.28489, 0.28827, 0.29119, 0.29128, 0.28793, 0.28557, 0.29143, 0.28602, 0.29322, 0.37776, 0.28815, 0.28911, 0.28768, 0.28978, 0.2868, 0.2925, 0.28589, 0.27191, 0.28653, 0.28666, 0.28333, 0.28729, 0.28057, 0.28965, 0.2861, 0.28679, 0.28928, 0.28452, 0.28737, 0.28913, 0.28511, 0.28745, 0.28832, 0.29349, 0.28729, 0.28924, 0.28804, 0.29076, 0.28598, 0.29056, 0.28869, 0.28825, 0.29164, 0.28711, 0.28995, 0.2878, 0.37312, 0.28833, 0.28482, 0.29549, 0.28742, 0.28591, 0.28649, 0.29968, 0.29157, 0.2854, 0.29423, 0.37624, 0.29269, 0.28871, 0.29189, 0.28756, 0.28409, 0.28672, 0.28672, 0.29028, 0.28554, 0.29097, 0.28867, 0.29335, 0.29036, 0.28781, 0.28622, 0.28846, 0.28532, 0.28399, 0.28365, 0.28792, 0.28385, 0.29346, 0.28436, 0.29447, 0.28249, 0.28597, 0.28637, 0.28537, 0.28417, 0.28799, 0.28802, 0.28653, 0.29059, 0.28295, 0.30255, 0.28676, 0.39524, 0.28938, 0.28909, 0.28993, 0.28689, 0.2868, 0.28486, 0.2869, 0.28468, 0.28373, 0.28395, 0.28399, 0.29311, 0.28649, 0.28867, 0.2844, 0.29111, 0.28595, 0.29083, 0.37422, 0.38481, 0.2917, 0.28795, 0.28411, 0.29214, 0.28545, 0.29182, 0.28619, 0.29032, 0.28643, 0.28955, 0.287, 0.28693, 0.29048, 0.28673, 0.28964, 0.28608, 0.39417, 0.28909, 0.28926, 0.28892, 0.29626, 0.29035, 0.28418, 0.29096, 0.28911, 0.2861, 0.29247, 0.28616, 0.28914, 0.28625, 0.28976, 0.28808, 0.28866, 0.29068, 0.28692, 0.29086, 0.28868, 0.29004, 0.28595, 0.29148, 0.28842, 0.2886, 0.29171, 0.28773, 0.3764, 0.28898, 0.28636, 0.29892, 0.28549, 0.28973, 0.28465, 0.29697, 0.28725, 0.28663, 0.2894, 0.294, 0.29116, 0.28622, 0.29179, 0.28632, 0.29035, 0.28768, 0.28989, 0.28709, 0.2891, 0.28817, 0.28602, 0.28837, 0.28768, 0.28625, 0.28964, 0.28715, 0.287, 0.28748, 0.29025, 0.28485, 0.28473, 0.2867, 0.28777, 0.28402, 0.28515, 0.28793, 0.28644, 0.2893, 0.28758, 0.28612, 0.28687, 0.29012, 0.2871, 0.37328, 0.28876, 0.29273, 0.28732, 0.29333, 0.28722, 0.28605, 0.2878, 0.28786, 0.28733, 0.29635, 0.29189, 0.28435]}, "backward-compute-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [3.24795, 0.21194, 0.21471, 0.20869, 0.21204, 0.20759, 0.20377, 0.2107, 0.20945, 0.20618, 0.21705, 0.20521, 0.20785, 0.20627, 0.20635, 0.2064, 0.20649, 0.21053, 0.21523, 0.20491, 0.20938, 0.20895, 0.21121, 0.20684, 0.20811, 0.20914, 0.20848, 0.20944, 0.21029, 0.2088, 0.20823, 0.20765, 0.20786, 0.21144, 0.20746, 0.20856, 0.20791, 0.20961, 0.20962, 0.20803, 0.20624, 0.20748, 0.20646, 0.20637, 0.20506, 0.20636, 0.20873, 0.20709, 0.21021, 0.20645, 0.20725, 0.21067, 0.20689, 0.20484, 0.21018, 0.20758, 0.20809, 0.20663, 0.21735, 0.22092, 0.2181, 0.21664, 0.21604, 0.21705, 0.21811, 0.2175, 0.21613, 0.21894, 0.2186, 0.21706, 0.21821, 0.21776, 0.22265, 
0.21862, 0.2187, 0.21766, 0.21611, 0.217, 0.21459, 0.22041, 0.21715, 0.2188, 0.21633, 0.21946, 0.21474, 0.21906, 0.21831, 0.21662, 0.21778, 0.21777, 0.21604, 0.21593, 0.21431, 0.21926, 0.2178, 0.21741, 0.21712, 0.22133, 0.2158, 0.21733, 0.21522, 0.21854, 0.21582, 0.21924, 0.21532, 0.21807, 0.216, 0.22003, 0.21598, 0.21559, 0.21655, 0.21799, 0.21734, 0.21749, 0.21785, 0.21759, 0.21855, 0.21936, 0.21602, 0.21592, 0.21786, 0.22091, 0.21874, 0.21753, 0.21923, 0.22306, 0.22024, 0.21591, 0.22007, 0.2187, 0.222, 0.2157, 0.22232, 0.21719, 0.22251, 0.21763, 0.22074, 0.21731, 0.21953, 0.21712, 0.22337, 0.22066, 0.22071, 0.21949, 0.21972, 0.21565, 0.21695, 0.22019, 0.21716, 0.219, 0.22553, 0.21923, 0.21738, 0.2203, 0.21678, 0.22028, 0.21797, 0.22029, 0.21479, 0.22065, 0.21605, 0.22109, 0.22372, 0.22023, 0.2184, 0.21646, 0.21673, 0.21835, 0.21624, 0.21877, 0.21593, 0.21993, 0.21906, 0.21748, 0.21846, 0.21846, 0.21773, 0.21782, 0.22154, 0.21764, 0.2193, 0.2172, 0.21983, 0.21556, 0.22293, 0.22107, 0.22132, 0.21857, 0.21717, 0.22128, 0.21593, 0.22043, 0.22094, 0.22038, 0.21956, 0.21936, 0.21966, 0.21754, 0.22141, 0.21803, 0.21648, 0.21739, 0.21902, 0.21686, 0.21805, 0.21493, 0.22077, 0.22186, 0.21962, 0.22048, 0.22052, 0.21855, 0.21913, 0.21681, 0.21996, 0.22012, 0.22218, 0.22009, 0.21986, 0.21939, 0.22266, 0.2163, 0.21865, 0.22182, 0.2197, 0.22192, 0.21676, 0.22102, 0.21734, 0.22013, 0.21984, 0.21564, 0.22434, 0.22271, 0.21673, 0.22212, 0.22818, 0.22064, 0.21733, 0.22214, 0.21857, 0.2223, 0.22007, 0.22387, 0.22019, 0.21548, 0.21818, 0.21601, 0.22079, 0.21586, 0.22149, 0.2206, 0.2192, 0.22065, 0.22097, 0.21714, 0.22179, 0.21621, 0.21994, 0.21491, 0.21991, 0.21504, 0.2197, 0.21388, 0.2201, 0.21487, 0.21828, 0.21636, 0.2175, 0.2155, 0.21587, 0.22018, 0.2151, 0.21983, 0.21588, 0.22793, 0.21875, 0.21694, 0.21987, 0.21989, 0.2186, 0.21826, 0.21718, 0.21971, 0.21741, 0.22031, 0.21565, 0.21643, 0.21559, 0.22115, 0.21694, 0.21849, 0.2154, 0.2201, 0.2167, 0.21944, 0.22561, 0.21402, 0.22049, 0.21782, 0.21537, 0.22116, 0.2162, 0.21949, 0.21494, 0.21795, 0.21647, 0.2181, 0.21867, 0.21751, 0.22266, 0.21692, 0.21888, 0.218, 0.22288, 0.21842, 0.21856, 0.21818, 0.22158, 0.22161, 0.21476, 0.21952, 0.21926, 0.21497, 0.21832, 0.21576, 0.21887, 0.2162, 0.21752, 0.21687, 0.21921, 0.22035, 0.21626, 0.22133, 0.21774, 0.22037, 0.21522, 0.22047, 0.21579, 0.21844, 0.22391, 0.21642, 0.21898, 0.21906, 0.21598, 0.22975, 0.21527, 0.21717, 0.21546, 0.22404, 0.21811, 0.21888, 0.2205, 0.22021, 0.22075, 0.21565, 0.21932, 0.21653, 0.21917, 0.21911, 0.22008, 0.21787, 0.21844, 0.21948, 0.21617, 0.21938, 0.21829, 0.21659, 0.2228, 0.21857, 0.21702, 0.21841, 0.21741, 0.21545, 0.21539, 0.21773, 0.21824, 0.21609, 0.21521, 0.21832, 0.21767, 0.21765, 0.21961, 0.21554, 0.21864, 0.21727, 0.21996, 0.21834, 0.21793, 0.22003, 0.21486, 0.22016, 0.21713, 0.21621, 0.21798, 0.21593, 0.21822, 0.22518, 0.21883, 0.21389]}, "batch-generator-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.60577, 0.00374, 0.00393, 0.00334, 0.0036, 0.00342, 0.00344, 0.00397, 0.00331, 0.00323, 0.00356, 0.00332, 0.00341, 0.00356, 0.00347, 0.00308, 0.00337, 0.00327, 0.00342, 0.00359, 0.00317, 0.00312, 0.00326, 0.00315, 0.00321, 0.00318, 0.00314, 0.00309, 0.00313, 0.0031, 0.00327, 0.00314, 0.00303, 0.00338, 0.00311, 0.00306, 0.00302, 0.00321, 0.00306, 0.0032, 0.00305, 0.00309, 0.00302, 0.00328, 0.00297, 0.00295, 0.00322, 0.00301, 0.00307, 0.00325, 0.00287, 0.00312, 0.00289, 0.00302, 0.00308, 0.00307, 0.00308, 0.0035, 0.00327, 0.0032, 0.00318, 0.00312, 
0.00322, 0.00336, 0.00333, 0.00345, 0.00311, 0.00326, 0.00307, 0.00318, 0.00309, 0.00331, 0.0031, 0.00327, 0.00333, 0.0033, 0.00321, 0.00328, 0.00317, 0.00325, 0.00309, 0.0033, 0.00326, 0.00323, 0.00321, 0.00319, 0.00318, 0.00329, 0.00315, 0.00331, 0.00368, 0.00361, 0.00377, 0.00374, 0.00383, 0.00345, 0.00348, 0.00347, 0.00339, 0.0035, 0.00312, 0.00344, 0.00325, 0.00318, 0.00318, 0.00323, 0.00328, 0.00331, 0.00329, 0.00318, 0.00327, 0.0032, 0.00317, 0.00314, 0.00313, 0.00316, 0.00327, 0.00348, 0.00319, 0.00309, 0.00338, 0.00315, 0.00347, 0.00335, 0.00315, 0.00314, 0.00339, 0.00316, 0.00323, 0.00311, 0.00331, 0.00317, 0.00311, 0.00316, 0.00317, 0.00314, 0.00323, 0.00319, 0.00311, 0.00328, 0.00326, 0.00315, 0.00319, 0.0035, 0.00303, 0.00311, 0.00331, 0.00334, 0.00314, 0.00323, 0.00345, 0.00325, 0.00319, 0.00322, 0.00331, 0.00339, 0.00342, 0.00343, 0.00335, 0.00349, 0.00338, 0.00342, 0.00327, 0.00325, 0.00331, 0.00327, 0.00328, 0.00325, 0.00321, 0.00326, 0.00324, 0.00346, 0.00329, 0.00347, 0.00325, 0.00327, 0.00322, 0.0032, 0.00311, 0.00307, 0.00322, 0.00303, 0.00312, 0.00323, 0.00329, 0.00312, 0.00323, 0.00323, 0.00307, 0.00315, 0.00324, 0.00314, 0.00308, 0.00308, 0.00313, 0.00322, 0.00318, 0.0032, 0.0032, 0.00322, 0.02747, 0.00304, 0.0031, 0.00322, 0.00309, 0.00303, 0.00319, 0.00304, 0.00319, 0.00315, 0.00305, 0.00324, 0.00328, 0.00297, 0.0033, 0.00302, 0.00329, 0.00319, 0.00309, 0.00319, 0.00324, 0.00336, 0.00317, 0.00324, 0.00322, 0.00343, 0.00323, 0.00314, 0.00337, 0.00333, 0.00319, 0.00305, 0.00351, 0.00342, 0.00323, 0.00333, 0.00325, 0.00329, 0.00309, 0.00337, 0.00313, 0.00331, 0.00309, 0.00329, 0.00319, 0.00325, 0.00323, 0.00324, 0.00332, 0.0034, 0.0033, 0.00322, 0.00318, 0.00319, 0.00329, 0.00315, 0.00329, 0.00325, 0.00333, 0.00322, 0.00337, 0.00313, 0.00313, 0.00327, 0.00332, 0.00313, 0.00307, 0.00312, 0.00306, 0.00322, 0.00309, 0.0033, 0.00323, 0.00341, 0.00326, 0.0035, 0.00329, 0.00341, 0.00333, 0.00334, 0.00347, 0.00314, 0.00336, 0.00336, 0.00329, 0.0032, 0.00322, 0.00331, 0.00337, 0.00336, 0.00312, 0.00321, 0.00407, 0.00319, 0.00353, 0.00339, 0.00344, 0.00327, 0.00338, 0.00335, 0.00325, 0.00334, 0.00318, 0.00329, 0.00329, 0.00323, 0.00318, 0.00325, 0.00322, 0.00317, 0.00327, 0.00307, 0.00322, 0.00305, 0.00323, 0.00318, 0.00328, 0.00317, 0.00326, 0.00313, 0.00312, 0.00317, 0.00319, 0.00322, 0.00326, 0.00311, 0.00318, 0.00349, 0.00314, 0.00329, 0.00324, 0.00339, 0.0031, 0.00326, 0.00308, 0.00316, 0.0031, 0.0034, 0.00318, 0.00327, 0.00321, 0.00313, 0.00335, 0.00311, 0.00333, 0.00329, 0.0031, 0.00325, 0.00325, 0.00326, 0.0033, 0.00323, 0.00315, 0.00321, 0.00322, 0.003, 0.00355, 0.00301, 0.00302, 0.00319, 0.00323, 0.0032, 0.00321, 0.0031, 0.00344, 0.00317, 0.0033, 0.00322, 0.00317, 0.00318, 0.00314, 0.00328, 0.0033, 0.0033, 0.0031, 0.00321, 0.0033, 0.00315, 0.00323, 0.00342, 0.00315, 0.00321, 0.00324, 0.00312, 0.00341, 0.00323, 0.00333, 0.00335, 0.00334, 0.00324, 0.00319, 0.00335, 0.00319, 0.0032, 0.00317, 0.0033, 0.00322, 0.00334, 0.0034, 0.00306]}, "layernorm-grads-all-reduce-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [3e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 
1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05]}, "embedding-grads-all-reduce-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [4e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 3e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 3e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 
2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05]}, "all-grads-sync-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [6.03213, 0.0015, 0.00156, 0.00153, 0.00152, 0.00153, 0.00156, 0.00153, 0.00152, 0.00153, 0.00155, 0.00152, 0.00157, 0.00153, 0.00155, 0.00153, 0.00153, 0.00151, 0.00155, 0.00153, 0.00154, 0.00152, 0.00154, 0.00153, 0.00155, 0.00154, 0.00154, 0.00154, 0.00154, 0.00153, 0.00156, 0.00152, 0.00152, 0.00153, 0.00156, 0.00153, 0.00153, 0.00155, 0.00153, 0.00152, 0.00154, 0.00155, 0.00155, 0.00152, 0.00152, 0.00153, 0.00154, 0.00153, 0.00154, 0.00152, 0.00154, 0.00154, 0.00155, 0.00153, 0.00156, 0.00154, 0.00156, 0.00153, 0.00156, 0.00151, 0.00154, 0.00153, 0.00156, 0.00151, 0.00156, 0.00155, 0.00155, 0.00152, 0.00155, 0.00152, 0.00154, 0.00153, 0.00156, 0.00153, 0.00154, 0.00154, 0.00156, 0.00154, 0.00155, 0.00155, 0.00155, 0.00153, 0.00154, 0.00152, 0.00155, 0.00154, 0.00156, 0.00153, 0.00153, 0.00153, 0.00155, 0.00154, 0.00155, 0.00153, 0.00154, 0.00153, 0.00155, 0.00153, 0.00154, 0.00152, 0.00155, 0.00152, 0.00155, 0.00154, 0.00155, 0.00154, 0.00155, 0.00153, 0.00154, 0.00152, 0.00155, 0.00153, 0.00153, 0.00154, 0.00154, 0.00151, 0.00155, 0.00153, 0.00156, 0.00153, 0.00155, 0.00154, 0.00156, 0.00156, 0.00155, 0.00154, 0.00155, 0.00153, 0.00152, 0.00153, 0.00155, 0.00154, 0.00155, 0.00154, 0.00154, 0.00154, 0.00155, 0.00151, 0.00152, 0.00153, 0.00153, 0.00151, 0.00153, 0.00154, 0.00156, 0.00155, 0.00157, 0.00154, 0.00156, 0.00154, 0.00155, 0.00151, 0.00154, 0.00153, 0.00154, 0.00153, 0.00156, 0.00155, 0.00155, 0.00152, 0.00157, 0.00153, 0.00154, 0.00154, 0.00155, 0.00154, 0.00151, 0.00154, 0.00155, 0.00152, 0.00155, 0.00152, 0.00156, 0.00153, 0.00153, 0.00155, 0.00154, 0.00153, 0.00154, 0.00152, 0.00154, 0.00155, 0.00154, 0.00152, 0.00157, 0.00154, 0.00154, 0.00152, 0.00155, 0.00152, 0.00157, 0.00152, 0.00154, 0.00153, 0.00156, 
0.00153, 0.00156, 0.00154, 0.00156, 0.00153, 0.00154, 0.00153, 0.00157, 0.00155, 0.00154, 0.00156, 0.00154, 0.00153, 0.00151, 0.00156, 0.00156, 0.00155, 0.00155, 0.00154, 0.00155, 0.00154, 0.00155, 0.00152, 0.00154, 0.00154, 0.00154, 0.00156, 0.00157, 0.00154, 0.00155, 0.00155, 0.00153, 0.00153, 0.00154, 0.00155, 0.00155, 0.00155, 0.00155, 0.00154, 0.00154, 0.00154, 0.00154, 0.00153, 0.00154, 0.00154, 0.00154, 0.00154, 0.00155, 0.00154, 0.00156, 0.00156, 0.00154, 0.00155, 0.00153, 0.00155, 0.00152, 0.00156, 0.00154, 0.00156, 0.00156, 0.00152, 0.00154, 0.00153, 0.00153, 0.00155, 0.00154, 0.00157, 0.00154, 0.00153, 0.00157, 0.00155, 0.00156, 0.00155, 0.00157, 0.00155, 0.00155, 0.00153, 0.00156, 0.00158, 0.00155, 0.00155, 0.00157, 0.00153, 0.00155, 0.00154, 0.00155, 0.00153, 0.00155, 0.00155, 0.00154, 0.00151, 0.00154, 0.00156, 0.00156, 0.00155, 0.00155, 0.00155, 0.00155, 0.00153, 0.00155, 0.00156, 0.00154, 0.00155, 0.00153, 0.00155, 0.00155, 0.00153, 0.00154, 0.00154, 0.00156, 0.00156, 0.00155, 0.00155, 0.00154, 0.00153, 0.00155, 0.00155, 0.00155, 0.00154, 0.00153, 0.00154, 0.00154, 0.00155, 0.00156, 0.00156, 0.00156, 0.00156, 0.00156, 0.00156, 0.00155, 0.00155, 0.00154, 0.00156, 0.00154, 0.00156, 0.00155, 0.00154, 0.00156, 0.00154, 0.00153, 0.00155, 0.00152, 0.00156, 0.00151, 0.00155, 0.00154, 0.00155, 0.00155, 0.00156, 0.00153, 0.00155, 0.00154, 0.00156, 0.00154, 0.00154, 0.00154, 0.00155, 0.00155, 0.00155, 0.00153, 0.00155, 0.00154, 0.00154, 0.00155, 0.00156, 0.00153, 0.00153, 0.00154, 0.00155, 0.00153, 0.00154, 0.00155, 0.00154, 0.00154, 0.00155, 0.00155, 0.00155, 0.00153, 0.00155, 0.00154, 0.00157, 0.00156, 0.00153, 0.00157, 0.00157, 0.00156, 0.00157, 0.00154, 0.00155, 0.00157, 0.00155, 0.00155, 0.00153, 0.00153, 0.00152, 0.00154, 0.00155, 0.00155, 0.00154, 0.00153, 0.00155, 0.00154, 0.00155, 0.00155, 0.00155]}, "optimizer-copy-to-main-grad-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.00038, 0.00024, 0.00024, 0.00015, 0.00015, 0.00016, 0.00015, 0.00016, 0.00015, 0.00013, 0.00013, 0.00015, 0.00015, 0.00013, 0.00015, 0.00013, 0.00015, 0.00013, 0.00015, 0.00015, 0.00013, 0.00015, 0.00013, 0.00015, 0.00013, 0.00014, 0.00013, 0.00013, 0.00015, 0.00013, 0.00015, 0.00013, 0.00013, 0.00013, 0.00013, 0.00013, 0.00013, 0.00013, 0.00014, 0.00013, 0.00013, 0.00013, 0.00013, 0.00013, 0.00013, 0.00013, 0.00016, 0.00013, 0.00013, 0.00013, 0.00015, 0.00013, 0.00013, 0.00013, 0.00013, 0.00013, 0.00013, 0.00025, 0.00018, 0.00018, 0.00019, 0.00018, 0.0003, 0.00018, 0.00019, 0.00018, 0.00018, 0.00018, 0.00018, 0.00018, 0.00019, 0.00018, 0.00019, 0.00019, 0.00018, 0.00019, 0.00019, 0.00019, 0.00018, 0.00019, 0.00019, 0.00019, 0.00021, 0.00018, 0.00021, 0.00018, 0.00019, 0.00019, 0.00018, 0.00018, 0.00018, 0.0002, 0.00019, 0.00018, 0.00018, 0.00018, 0.00019, 0.00021, 0.00018, 0.00019, 0.00019, 0.00018, 0.00018, 0.00021, 0.00018, 0.00019, 0.00019, 0.00019, 0.00018, 0.00021, 0.00018, 0.00018, 0.00018, 0.0002, 0.00018, 0.00021, 0.00019, 0.00018, 0.00018, 0.0002, 0.00023, 0.00018, 0.00018, 0.0002, 0.00018, 0.00018, 0.00018, 0.00018, 0.00018, 0.00021, 0.00018, 0.00018, 0.00019, 0.00021, 0.00018, 0.00018, 0.00021, 0.00018, 0.0002, 0.00018, 0.00018, 0.00021, 0.00018, 0.00019, 0.00019, 0.00018, 0.00018, 0.00018, 0.00018, 0.00019, 0.00018, 0.00018, 0.00018, 0.00019, 0.00018, 0.0002, 0.00021, 0.00019, 0.00018, 0.00021, 0.00021, 0.00018, 0.00019, 0.00019, 0.00018, 0.00019, 0.00018, 0.00018, 0.00018, 0.00021, 0.00018, 0.00018, 0.00018, 0.00019, 0.00018, 0.00018, 0.00021, 0.00018, 0.00019, 
0.00021, 0.00018, 0.00018, 0.00021, 0.00018, 0.00021, 0.00018, 0.00018, 0.00019, 0.00021, 0.00021, 0.00021, 0.00021, 0.00018, 0.00018, 0.00019, 0.00019, 0.00018, 0.0002, 0.00021, 0.00021, 0.0002, 0.00018, 0.00019, 0.00019, 0.00018, 0.00018, 0.00018, 0.00021, 0.00019, 0.00018, 0.00019, 0.00018, 0.00018, 0.00018, 0.00018, 0.00021, 0.00018, 0.00021, 0.00018, 0.00018, 0.00018, 0.00021, 0.00018, 0.00018, 0.00021, 0.00018, 0.00018, 0.00019, 0.00018, 0.00018, 0.00018, 0.00019, 0.00018, 0.00018, 0.00018, 0.00021, 0.00018, 0.00018, 0.00018, 0.00018, 0.00018, 0.00018, 0.00018, 0.00021, 0.00018, 0.00018, 0.00018, 0.00018, 0.00019, 0.00018, 0.00021, 0.00018, 0.00018, 0.00021, 0.00018, 0.00018, 0.00018, 0.00018, 0.00018, 0.00018, 0.0002, 0.00018, 0.00018, 0.00018, 0.00018, 0.00018, 0.00021, 0.00021, 0.00018, 0.0002, 0.00018, 0.00018, 0.00018, 0.00021, 0.00019, 0.00018, 0.00018, 0.00018, 0.00018, 0.00019, 0.00019, 0.00018, 0.00018, 0.00018, 0.00018, 0.0002, 0.00018, 0.00018, 0.00019, 0.00018, 0.00019, 0.00019, 0.00018, 0.00018, 0.00018, 0.00018, 0.00021, 0.00018, 0.00018, 0.00021, 0.00019, 0.00018, 0.00018, 0.00018, 0.00021, 0.00018, 0.00018, 0.00018, 0.00018, 0.00021, 0.00018, 0.00021, 0.00018, 0.00021, 0.00018, 0.00018, 0.00018, 0.00019, 0.00018, 0.00018, 0.00018, 0.00018, 0.00019, 0.00018, 0.00018, 0.00018, 0.00018, 0.00018, 0.00018, 0.00018, 0.00018, 0.00021, 0.00018, 0.00019, 0.00018, 0.00018, 0.00018, 0.00018, 0.00018, 0.00018, 0.00018, 0.00018, 0.00018, 0.00021, 0.00019, 0.00018, 0.00018, 0.00019, 0.00021, 0.00019, 0.00021, 0.00018, 0.00018, 0.00018, 0.00018, 0.00018, 0.00018, 0.00018, 0.00018, 0.00021, 0.00018, 0.00018, 0.00018, 0.00018, 0.00018, 0.00019, 0.00018, 0.00018, 0.00018, 0.00018, 0.00018, 0.00021, 0.00018, 0.00018, 0.00021, 0.00019, 0.00019, 0.00019, 0.00021, 0.00023, 0.00018, 0.00021, 0.00019, 0.00018, 0.00021, 0.00019, 0.00019, 0.00019, 0.00019, 0.00018, 0.00019, 0.00019, 0.00018, 0.00019, 0.00018, 0.00019, 0.00018, 0.00022, 0.00021, 0.00018]}, "optimizer-clip-main-grad-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.62631, 0.00104, 0.00106, 0.00093, 0.00092, 0.00096, 0.00095, 0.00096, 0.00092, 0.00091, 0.0009, 0.00091, 0.00101, 0.00091, 0.00091, 0.0009, 0.0009, 0.0009, 0.00093, 0.00094, 0.0009, 0.00115, 0.0009, 0.00092, 0.00091, 0.00098, 0.00089, 0.00091, 0.00091, 0.0009, 0.00094, 0.0009, 0.00095, 0.00091, 0.00091, 0.0009, 0.0009, 0.00091, 0.00091, 0.00091, 0.00091, 0.00091, 0.00091, 0.00091, 0.00092, 0.0009, 0.00093, 0.00093, 0.00091, 0.00091, 0.00101, 0.00091, 0.0009, 0.0009, 0.0009, 0.00091, 0.00091, 0.00107, 0.00099, 0.001, 0.00101, 0.001, 0.00179, 0.001, 0.001, 0.00101, 0.0011, 0.00101, 0.001, 0.00101, 0.00101, 0.00101, 0.001, 0.001, 0.00101, 0.00109, 0.00106, 0.001, 0.001, 0.00102, 0.00101, 0.00102, 0.00109, 0.00101, 0.00104, 0.001, 0.00099, 0.00103, 0.00102, 0.001, 0.001, 0.00113, 0.00082, 0.00079, 0.0008, 0.001, 0.00102, 0.00105, 0.001, 0.001, 0.001, 0.00102, 0.00079, 0.00105, 0.00079, 0.00106, 0.0008, 0.00079, 0.00099, 0.00087, 0.00101, 0.0008, 0.00099, 0.00086, 0.00101, 0.00083, 0.00081, 0.001, 0.0008, 0.001, 0.00085, 0.00081, 0.001, 0.00079, 0.001, 0.00101, 0.001, 0.00079, 0.001, 0.00106, 0.001, 0.001, 0.00103, 0.00104, 0.00079, 0.00101, 0.00084, 0.00079, 0.0008, 0.0008, 0.00109, 0.00105, 0.00099, 0.0008, 0.00101, 0.00101, 0.00102, 0.00102, 0.0008, 0.00079, 0.00111, 0.00101, 0.00099, 0.0008, 0.001, 0.00108, 0.00107, 0.00103, 0.00103, 0.00084, 0.00105, 0.001, 0.00101, 0.001, 0.00101, 0.00101, 0.001, 0.00101, 0.00101, 0.00114, 0.00099, 0.0008, 
0.00079, 0.00101, 0.001, 0.001, 0.00105, 0.00101, 0.001, 0.00113, 0.00101, 0.001, 0.00106, 0.001, 0.00101, 0.00101, 0.00101, 0.00101, 0.00106, 0.00105, 0.00107, 0.00106, 0.00102, 0.001, 0.00104, 0.00101, 0.00105, 0.001, 0.00104, 0.00105, 0.00104, 0.00103, 0.001, 0.001, 0.001, 0.00109, 0.00101, 0.00104, 0.001, 0.00108, 0.00108, 0.001, 0.00101, 0.001, 0.00103, 0.00106, 0.00102, 0.00106, 0.00102, 0.00099, 0.00101, 0.00105, 0.00104, 0.00101, 0.00105, 0.00102, 0.00103, 0.00102, 0.001, 0.001, 0.00104, 0.001, 0.00101, 0.00101, 0.001, 0.00105, 0.00101, 0.00107, 0.00102, 0.001, 0.00101, 0.00101, 0.00101, 0.00108, 0.00101, 0.001, 0.00106, 0.00101, 0.001, 0.001, 0.00105, 0.00101, 0.00116, 0.00112, 0.00101, 0.001, 0.00103, 0.00101, 0.00103, 0.00101, 0.00105, 0.00103, 0.00102, 0.001, 0.00101, 0.001, 0.00108, 0.00108, 0.00101, 0.00106, 0.00109, 0.00106, 0.00102, 0.00104, 0.001, 0.001, 0.00099, 0.00101, 0.00101, 0.001, 0.001, 0.001, 0.00102, 0.00105, 0.001, 0.00103, 0.00103, 0.001, 0.00101, 0.001, 0.00107, 0.00101, 0.001, 0.001, 0.00102, 0.001, 0.00111, 0.001, 0.00102, 0.00104, 0.00099, 0.001, 0.00101, 0.00101, 0.00105, 0.00101, 0.001, 0.00101, 0.00107, 0.00113, 0.00103, 0.00105, 0.00102, 0.00105, 0.00101, 0.00101, 0.00102, 0.001, 0.00101, 0.00103, 0.001, 0.00102, 0.00108, 0.00103, 0.00103, 0.00101, 0.00104, 0.001, 0.00103, 0.00101, 0.00107, 0.00106, 0.00099, 0.00103, 0.00102, 0.00101, 0.00102, 0.001, 0.00101, 0.00101, 0.00102, 0.001, 0.00101, 0.0011, 0.00101, 0.001, 0.00101, 0.001, 0.00108, 0.001, 0.0011, 0.00108, 0.00101, 0.001, 0.00102, 0.00102, 0.00101, 0.001, 0.00102, 0.00108, 0.00101, 0.00103, 0.001, 0.00101, 0.00101, 0.001, 0.00109, 0.001, 0.001, 0.00105, 0.00101, 0.00105, 0.001, 0.00102, 0.0011, 0.00103, 0.00103, 0.00102, 0.00106, 0.00104, 0.00104, 0.00107, 0.00101, 0.001, 0.00111, 0.00102, 0.00101, 0.00103, 0.00101, 0.00102, 0.001, 0.00102, 0.00103, 0.00101, 0.00101, 0.0011, 0.001, 0.00105, 0.00106, 0.00101]}, "optimizer-count-zeros-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.00488, 0.00438, 0.00439, 0.00461, 0.00443, 0.0046, 0.00465, 0.00446, 0.00441, 0.00439, 0.00443, 0.0044, 0.00516, 0.00445, 0.0044, 0.0044, 0.00439, 0.0044, 0.0044, 0.00441, 0.00443, 0.00441, 0.00443, 0.00439, 0.00443, 0.0051, 0.0044, 0.00439, 0.00443, 0.00441, 0.0044, 0.00438, 0.00442, 0.00442, 0.00442, 0.00442, 0.00443, 0.0044, 0.00442, 0.00439, 0.0045, 0.00441, 0.00439, 0.00439, 0.0044, 0.00441, 0.00438, 0.00441, 0.00441, 0.0044, 0.00485, 0.00441, 0.00442, 0.00439, 0.0044, 0.00438, 0.00445, 0.00462, 0.00437, 0.00439, 0.0044, 0.00439, 0.0044, 0.00442, 0.00439, 0.00441, 0.00442, 0.00439, 0.00439, 0.00439, 0.00442, 0.0044, 0.00439, 0.00441, 0.00438, 0.00523, 0.00508, 0.00442, 0.00437, 0.00496, 0.00442, 0.00437, 0.00556, 0.00439, 0.00438, 0.00443, 0.00439, 0.0044, 0.00439, 0.00442, 0.00441, 0.0052, 0.00441, 0.00441, 0.00438, 0.00444, 0.00441, 0.0044, 0.00441, 0.00439, 0.00443, 0.00439, 0.00438, 0.00443, 0.0044, 0.00439, 0.00442, 0.00443, 0.00439, 0.00439, 0.00441, 0.00441, 0.0044, 0.00544, 0.00439, 0.0044, 0.0044, 0.00442, 0.00441, 0.00438, 0.00439, 0.00441, 0.00442, 0.00439, 0.00438, 0.00441, 0.00442, 0.0044, 0.0044, 0.00441, 0.00436, 0.0044, 0.00438, 0.00442, 0.00442, 0.00442, 0.00444, 0.00442, 0.00441, 0.0044, 0.00439, 0.00439, 0.00439, 0.00441, 0.00441, 0.00443, 0.00439, 0.00439, 0.00439, 0.00439, 0.00438, 0.0044, 0.00439, 0.00441, 0.00441, 0.00481, 0.00443, 0.0044, 0.0044, 0.00442, 0.0044, 0.00439, 0.0044, 0.00438, 0.00454, 0.0044, 0.00439, 0.0044, 0.00439, 0.0044, 0.0044, 0.00438, 0.00441, 
0.00437, 0.00439, 0.0044, 0.00441, 0.00438, 0.00441, 0.00439, 0.00441, 0.00442, 0.0044, 0.00439, 0.00438, 0.00441, 0.00439, 0.00441, 0.0044, 0.0044, 0.0044, 0.00439, 0.0044, 0.00442, 0.00467, 0.00439, 0.0044, 0.0044, 0.00442, 0.00441, 0.00442, 0.0044, 0.00442, 0.00442, 0.00441, 0.00509, 0.00443, 0.0044, 0.00442, 0.00438, 0.00487, 0.00531, 0.00442, 0.00442, 0.00442, 0.00442, 0.00441, 0.00439, 0.00441, 0.0044, 0.00439, 0.0044, 0.00441, 0.00439, 0.00439, 0.0044, 0.0044, 0.00439, 0.00443, 0.00441, 0.00454, 0.00439, 0.00441, 0.0044, 0.00441, 0.00439, 0.00441, 0.00442, 0.0044, 0.00441, 0.00438, 0.0044, 0.00439, 0.0044, 0.0044, 0.00442, 0.0044, 0.0044, 0.0044, 0.00438, 0.0044, 0.0044, 0.0044, 0.0044, 0.0044, 0.00441, 0.00441, 0.0044, 0.00442, 0.0044, 0.00439, 0.00439, 0.00439, 0.00439, 0.00439, 0.0044, 0.00442, 0.00441, 0.00439, 0.00443, 0.00439, 0.0044, 0.0044, 0.00439, 0.0044, 0.0044, 0.00441, 0.0044, 0.00438, 0.00441, 0.00442, 0.0044, 0.00439, 0.00443, 0.00534, 0.00438, 0.00442, 0.0044, 0.0044, 0.00441, 0.00495, 0.00439, 0.00441, 0.00438, 0.00441, 0.00441, 0.0044, 0.00437, 0.00441, 0.00439, 0.0044, 0.00442, 0.0044, 0.00442, 0.00439, 0.00437, 0.00441, 0.0044, 0.00439, 0.0044, 0.00457, 0.00441, 0.00441, 0.00442, 0.00441, 0.00443, 0.00439, 0.00443, 0.00439, 0.00439, 0.00439, 0.00441, 0.00486, 0.00439, 0.00441, 0.00441, 0.00453, 0.0044, 0.00437, 0.00441, 0.0044, 0.00442, 0.0044, 0.00442, 0.00441, 0.00441, 0.00439, 0.00439, 0.00441, 0.00438, 0.0044, 0.00442, 0.00443, 0.0044, 0.0044, 0.00442, 0.00441, 0.00439, 0.00442, 0.00441, 0.0044, 0.00439, 0.00438, 0.00439, 0.00442, 0.00439, 0.00441, 0.00439, 0.0044, 0.00441, 0.0044, 0.00442, 0.00443, 0.0044, 0.00438, 0.0044, 0.00439, 0.00444, 0.00439, 0.00442, 0.0044, 0.00439, 0.00441, 0.00439, 0.00442, 0.00439, 0.00438, 0.00439, 0.00438, 0.0044, 0.00442, 0.0044, 0.00438, 0.00442, 0.00443, 0.0044, 0.0044, 0.00439, 0.00441, 0.00439, 0.0044, 0.00444, 0.00455, 0.00442, 0.00443, 0.00441, 0.00442, 0.00442, 0.00443, 0.0044]}, "optimizer-inner-step-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.00313, 0.00096, 0.00097, 0.00093, 0.00094, 0.00094, 0.00094, 0.00093, 0.00093, 0.00093, 0.00094, 0.00093, 0.00094, 0.00092, 0.00093, 0.00092, 0.00094, 0.00092, 0.00093, 0.00093, 0.00093, 0.00092, 0.00092, 0.00092, 0.00092, 0.00094, 0.00092, 0.00093, 0.00093, 0.00092, 0.00092, 0.00092, 0.00092, 0.00093, 0.00092, 0.00092, 0.00092, 0.00099, 0.00092, 0.00093, 0.00094, 0.00093, 0.00092, 0.00092, 0.00092, 0.00092, 0.00093, 0.00093, 0.00093, 0.00093, 0.00094, 0.00092, 0.00092, 0.00092, 0.00092, 0.00092, 0.00092, 0.00096, 0.00093, 0.00093, 0.00092, 0.00093, 0.00092, 0.00092, 0.00092, 0.00094, 0.00093, 0.00092, 0.00093, 0.00092, 0.00093, 0.00093, 0.00092, 0.00093, 0.00092, 0.00097, 0.00095, 0.00092, 0.00093, 0.00093, 0.00092, 0.00099, 0.00095, 0.00093, 0.00094, 0.00093, 0.00092, 0.00093, 0.00092, 0.00093, 0.00094, 0.00095, 0.00093, 0.00093, 0.00093, 0.00092, 0.00092, 0.00093, 0.00093, 0.00092, 0.00093, 0.00093, 0.00093, 0.00094, 0.00095, 0.00092, 0.00093, 0.00093, 0.00093, 0.00093, 0.00093, 0.00093, 0.00093, 0.00093, 0.00092, 0.00093, 0.00092, 0.00092, 0.00093, 0.00092, 0.00094, 0.00093, 0.00093, 0.00092, 0.00092, 0.00093, 0.00092, 0.00093, 0.00093, 0.00093, 0.00092, 0.00092, 0.00093, 0.00093, 0.00092, 0.00092, 0.00094, 0.00093, 0.00092, 0.00093, 0.00094, 0.00094, 0.00092, 0.00093, 0.00092, 0.00093, 0.00092, 0.00093, 0.00092, 0.00093, 0.00093, 0.00092, 0.00092, 0.00093, 0.00092, 0.00093, 0.00094, 0.00093, 0.00092, 0.00094, 0.00093, 0.00093, 0.00092, 
0.00093, 0.00095, 0.00093, 0.00092, 0.00092, 0.00093, 0.00094, 0.00092, 0.00092, 0.00093, 0.00093, 0.00093, 0.00092, 0.00094, 0.00094, 0.00092, 0.00094, 0.00092, 0.00093, 0.00093, 0.00092, 0.00093, 0.00092, 0.00093, 0.00092, 0.00092, 0.00094, 0.00093, 0.00093, 0.00092, 0.00092, 0.00093, 0.00092, 0.00092, 0.00093, 0.00094, 0.00093, 0.00094, 0.00093, 0.00093, 0.00093, 0.00093, 0.00095, 0.00092, 0.00093, 0.00093, 0.00093, 0.00094, 0.00093, 0.00092, 0.00094, 0.00093, 0.00095, 0.00094, 0.00094, 0.00092, 0.00093, 0.00093, 0.00094, 0.00093, 0.00093, 0.00094, 0.00092, 0.00093, 0.00092, 0.00093, 0.00093, 0.00093, 0.00093, 0.00097, 0.00093, 0.00092, 0.00094, 0.00092, 0.00092, 0.00093, 0.00093, 0.00093, 0.00093, 0.00093, 0.00094, 0.00093, 0.00093, 0.00092, 0.00093, 0.00093, 0.00093, 0.00094, 0.00093, 0.00093, 0.00093, 0.00094, 0.00093, 0.00094, 0.00094, 0.00093, 0.00092, 0.00093, 0.00093, 0.00093, 0.00093, 0.00093, 0.00093, 0.00093, 0.00093, 0.00093, 0.00093, 0.00093, 0.00093, 0.00093, 0.00094, 0.00093, 0.00094, 0.00092, 0.00094, 0.00093, 0.00092, 0.00092, 0.00093, 0.00093, 0.00093, 0.00093, 0.00093, 0.00093, 0.00093, 0.00092, 0.00092, 0.00095, 0.00093, 0.00093, 0.00093, 0.00093, 0.00092, 0.00094, 0.00093, 0.00092, 0.00094, 0.00093, 0.00093, 0.00093, 0.00092, 0.00094, 0.00093, 0.00093, 0.00092, 0.00093, 0.00094, 0.00093, 0.00093, 0.00092, 0.00093, 0.00093, 0.00093, 0.00093, 0.00093, 0.00094, 0.00092, 0.00092, 0.00093, 0.00094, 0.00093, 0.00094, 0.00093, 0.00093, 0.00093, 0.00093, 0.00093, 0.00093, 0.00093, 0.00093, 0.00093, 0.00093, 0.00093, 0.00093, 0.00092, 0.00093, 0.00093, 0.00093, 0.00093, 0.00093, 0.00093, 0.00093, 0.00094, 0.00093, 0.00092, 0.00094, 0.00093, 0.00093, 0.00093, 0.00093, 0.00093, 0.00093, 0.00092, 0.00092, 0.00092, 0.00092, 0.00093, 0.00093, 0.00093, 0.00093, 0.00092, 0.00093, 0.00093, 0.00094, 0.00094, 0.00093, 0.00093, 0.00093, 0.00094, 0.00092, 0.00093, 0.00093, 0.00094, 0.00093, 0.00093, 0.00093, 0.00093, 0.00093, 0.00094, 0.00094, 0.00093, 0.00094, 0.00095, 0.00093, 0.00093, 0.00093, 0.00093, 0.00093, 0.00093, 0.00096, 0.00093, 0.00093, 0.00093, 0.00093, 0.00094, 0.00094, 0.00094]}, "optimizer-copy-main-to-model-params-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.0012, 0.001, 0.00119, 0.00096, 0.00096, 0.00096, 0.00097, 0.00096, 0.00096, 0.00096, 0.00095, 0.00096, 0.00097, 0.00095, 0.00096, 0.00096, 0.00096, 0.00096, 0.00096, 0.00096, 0.00097, 0.00096, 0.00096, 0.00095, 0.00096, 0.00097, 0.00096, 0.00095, 0.00096, 0.00096, 0.00096, 0.00096, 0.00096, 0.00095, 0.00095, 0.00095, 0.00096, 0.00104, 0.00096, 0.00095, 0.00097, 0.00095, 0.00096, 0.00096, 0.00096, 0.00096, 0.00096, 0.00095, 0.00096, 0.00096, 0.00097, 0.00096, 0.00096, 0.00095, 0.00096, 0.00095, 0.00096, 0.001, 0.00099, 0.00099, 0.00099, 0.00098, 0.00099, 0.00099, 0.00098, 0.00098, 0.00098, 0.00099, 0.00099, 0.00099, 0.00099, 0.00099, 0.00099, 0.00099, 0.00099, 0.00101, 0.001, 0.00099, 0.001, 0.00098, 0.00099, 0.00099, 0.001, 0.00099, 0.001, 0.00099, 0.00098, 0.00098, 0.00099, 0.00099, 0.00098, 0.00103, 0.00099, 0.00099, 0.00099, 0.001, 0.001, 0.001, 0.00099, 0.00099, 0.001, 0.00099, 0.00099, 0.00101, 0.00099, 0.00099, 0.00099, 0.00099, 0.001, 0.00103, 0.00099, 0.00099, 0.00099, 0.001, 0.00099, 0.001, 0.00099, 0.00099, 0.00099, 0.001, 0.00101, 0.00099, 0.00099, 0.00099, 0.00099, 0.00099, 0.00099, 0.00099, 0.00098, 0.001, 0.00099, 0.00099, 0.00099, 0.001, 0.00099, 0.00099, 0.00099, 0.00099, 0.00099, 0.00099, 0.00099, 0.001, 0.00099, 0.00099, 0.00099, 0.00098, 0.00099, 0.00099, 0.00099, 
0.00099, 0.00098, 0.00098, 0.00099, 0.00098, 0.00098, 0.001, 0.001, 0.001, 0.00099, 0.001, 0.001, 0.00099, 0.00099, 0.00099, 0.00101, 0.00099, 0.00099, 0.00099, 0.00099, 0.00101, 0.001, 0.00099, 0.00099, 0.00098, 0.00099, 0.00099, 0.001, 0.00099, 0.00098, 0.00099, 0.00099, 0.00099, 0.001, 0.00099, 0.00102, 0.00099, 0.00099, 0.00098, 0.001, 0.00099, 0.00099, 0.001, 0.00099, 0.00098, 0.00099, 0.00099, 0.00099, 0.00099, 0.00101, 0.001, 0.00101, 0.00099, 0.00099, 0.00099, 0.00099, 0.00101, 0.00099, 0.00101, 0.00099, 0.00099, 0.00099, 0.00101, 0.00099, 0.00099, 0.00099, 0.001, 0.00098, 0.001, 0.00099, 0.001, 0.00099, 0.00101, 0.00099, 0.00099, 0.001, 0.00099, 0.001, 0.00101, 0.00099, 0.001, 0.00098, 0.00099, 0.00105, 0.00099, 0.00099, 0.00101, 0.00099, 0.00099, 0.00099, 0.00099, 0.00099, 0.001, 0.00099, 0.00102, 0.00098, 0.00098, 0.00099, 0.001, 0.00099, 0.001, 0.001, 0.001, 0.00098, 0.00101, 0.00099, 0.001, 0.00098, 0.00099, 0.001, 0.00099, 0.001, 0.00099, 0.00099, 0.00099, 0.00099, 0.00099, 0.00101, 0.001, 0.00098, 0.00101, 0.00099, 0.00098, 0.00099, 0.00103, 0.00098, 0.00099, 0.00099, 0.001, 0.00098, 0.001, 0.00099, 0.00099, 0.00099, 0.00099, 0.00099, 0.00106, 0.00099, 0.00099, 0.00099, 0.00099, 0.00101, 0.001, 0.00099, 0.00099, 0.00099, 0.00099, 0.00101, 0.001, 0.00099, 0.001, 0.001, 0.001, 0.00098, 0.00099, 0.001, 0.00099, 0.00099, 0.00099, 0.00099, 0.001, 0.00099, 0.001, 0.00099, 0.001, 0.00099, 0.00099, 0.00099, 0.00099, 0.001, 0.00099, 0.001, 0.00099, 0.001, 0.00099, 0.00099, 0.00099, 0.00099, 0.00099, 0.00098, 0.00099, 0.00099, 0.001, 0.00099, 0.001, 0.00099, 0.001, 0.00101, 0.00099, 0.00099, 0.00099, 0.00099, 0.00099, 0.001, 0.001, 0.00101, 0.00099, 0.00099, 0.00099, 0.001, 0.001, 0.001, 0.001, 0.00099, 0.00099, 0.00099, 0.00099, 0.00099, 0.001, 0.001, 0.00101, 0.001, 0.00099, 0.00099, 0.00099, 0.001, 0.00099, 0.00099, 0.00099, 0.00099, 0.00098, 0.00099, 0.001, 0.00099, 0.00099, 0.00099, 0.001, 0.00099, 0.00099, 0.001, 0.001, 0.00099, 0.00099, 0.00099, 0.00099, 0.001, 0.00099, 0.001, 0.001, 0.00099, 0.00099, 0.00099, 0.001, 0.001, 0.001, 0.001, 0.00099, 0.00099, 0.001, 0.00101, 0.00099]}, "optimizer-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.63786, 0.00795, 0.00821, 0.00789, 0.00772, 0.00795, 0.00797, 0.00777, 0.00768, 0.00764, 0.00767, 0.00766, 0.0086, 0.00767, 0.00766, 0.00763, 0.00766, 0.00763, 0.00768, 0.0077, 0.00769, 0.0079, 0.00766, 0.00765, 0.00767, 0.00848, 0.00762, 0.00762, 0.0077, 0.00763, 0.0077, 0.0076, 0.00769, 0.00767, 0.00763, 0.00763, 0.00766, 0.0078, 0.00766, 0.00762, 0.00777, 0.00763, 0.00763, 0.00761, 0.00765, 0.00763, 0.00767, 0.00766, 0.00766, 0.00764, 0.00825, 0.00763, 0.00764, 0.00762, 0.00762, 0.00761, 0.00768, 0.00821, 0.00776, 0.00779, 0.00781, 0.00778, 0.00875, 0.00781, 0.00783, 0.00782, 0.00792, 0.00779, 0.00782, 0.00781, 0.00783, 0.00781, 0.0078, 0.00782, 0.0078, 0.00884, 0.00896, 0.00783, 0.00778, 0.00843, 0.00783, 0.00789, 0.00911, 0.0078, 0.00787, 0.00783, 0.00779, 0.00784, 0.00781, 0.00784, 0.00782, 0.00886, 0.00764, 0.00763, 0.00759, 0.00785, 0.00785, 0.0079, 0.00781, 0.0078, 0.00787, 0.00782, 0.00759, 0.00793, 0.00762, 0.00785, 0.00763, 0.00765, 0.00781, 0.00773, 0.00784, 0.00762, 0.0078, 0.00885, 0.00779, 0.00767, 0.00763, 0.00782, 0.00761, 0.0078, 0.00773, 0.00766, 0.00783, 0.00758, 0.00778, 0.00785, 0.00781, 0.00759, 0.00779, 0.00791, 0.00776, 0.0078, 0.00782, 0.0079, 0.00761, 0.00781, 0.00773, 0.0076, 0.00764, 0.0076, 0.0079, 0.00789, 0.00777, 0.00763, 0.00782, 0.00784, 0.00781, 0.00782, 0.00757, 0.0076, 0.00788, 
0.0078, 0.00778, 0.00762, 0.0078, 0.00834, 0.00794, 0.00785, 0.00783, 0.00773, 0.0079, 0.0078, 0.00783, 0.0078, 0.00801, 0.00782, 0.0078, 0.0078, 0.00781, 0.00801, 0.00781, 0.00758, 0.0076, 0.00778, 0.00779, 0.0078, 0.00791, 0.00781, 0.00781, 0.00797, 0.00782, 0.00782, 0.0079, 0.0078, 0.00784, 0.00783, 0.00781, 0.00782, 0.00788, 0.0079, 0.00791, 0.0079, 0.00782, 0.00781, 0.00814, 0.0078, 0.00785, 0.00782, 0.00793, 0.00792, 0.008, 0.00785, 0.00786, 0.00784, 0.00782, 0.00866, 0.00784, 0.00789, 0.00784, 0.00787, 0.00839, 0.0088, 0.00783, 0.00783, 0.00785, 0.00793, 0.00785, 0.0079, 0.00785, 0.0078, 0.00782, 0.00791, 0.00786, 0.00781, 0.0079, 0.00782, 0.00783, 0.00783, 0.00783, 0.00782, 0.00798, 0.00781, 0.00795, 0.00782, 0.00782, 0.00791, 0.00782, 0.00789, 0.00781, 0.00782, 0.00779, 0.00782, 0.00781, 0.00795, 0.00784, 0.00781, 0.00787, 0.00782, 0.00781, 0.0078, 0.00791, 0.00784, 0.00796, 0.00798, 0.00782, 0.00782, 0.00785, 0.00784, 0.00818, 0.00781, 0.00787, 0.00783, 0.00781, 0.0078, 0.00782, 0.00781, 0.00794, 0.00793, 0.0078, 0.00794, 0.00789, 0.00786, 0.00784, 0.0079, 0.00782, 0.00783, 0.00781, 0.00784, 0.00779, 0.00782, 0.00783, 0.00781, 0.00781, 0.00789, 0.00881, 0.00824, 0.00789, 0.00781, 0.00781, 0.0078, 0.0085, 0.00783, 0.00782, 0.00779, 0.00783, 0.0078, 0.00797, 0.00779, 0.00784, 0.00789, 0.00782, 0.00783, 0.00779, 0.00782, 0.00789, 0.00779, 0.00783, 0.00781, 0.00786, 0.00799, 0.00801, 0.0079, 0.00782, 0.00791, 0.00782, 0.00785, 0.00781, 0.00784, 0.00782, 0.00783, 0.00779, 0.00783, 0.0084, 0.00783, 0.00791, 0.00782, 0.00798, 0.00782, 0.0078, 0.00782, 0.00787, 0.00792, 0.0078, 0.00787, 0.00784, 0.00783, 0.00784, 0.00779, 0.00783, 0.00781, 0.00782, 0.00783, 0.00786, 0.00794, 0.00785, 0.00783, 0.00782, 0.00781, 0.00795, 0.00782, 0.00795, 0.00789, 0.00781, 0.00783, 0.00785, 0.00782, 0.00782, 0.0078, 0.00782, 0.00794, 0.00782, 0.00786, 0.00785, 0.00783, 0.0078, 0.00783, 0.0079, 0.00784, 0.00781, 0.00787, 0.00781, 0.0079, 0.00782, 0.00782, 0.00796, 0.00784, 0.00782, 0.00783, 0.00789, 0.00792, 0.00787, 0.00791, 0.00781, 0.00783, 0.00802, 0.00784, 0.00783, 0.00785, 0.00783, 0.00782, 0.00781, 0.00788, 0.00802, 0.00787, 0.00787, 0.00793, 0.00784, 0.00793, 0.00797, 0.00783]}, "learning-rate": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 
4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05]}, "learning-rate vs samples": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 
7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05]}, "batch-size": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 
128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0]}, "batch-size vs samples": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0]}, "lm loss": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [10.88345, 10.90291, 10.88739, 10.83435, 10.68106, 10.65239, 10.43882, 10.15796, 9.94566, 9.85031, 9.59624, 9.85805, 9.88827, 9.63311, 9.79091, 9.51415, 9.46112, 9.65226, 9.38851, 9.33535, 9.24597, 9.15002, 9.1791, 9.00048, 9.19456, 9.06645, 9.16089, 9.17249, 9.30644, 8.99568, 8.93903, 9.04853, 9.05134, 8.65891, 8.72191, 8.75857, 8.68509, 8.7367, 8.66155, 8.76648, 8.66383, 8.85312, 8.83506, 8.49989, 8.39023, 8.43268, 8.49362, 8.38495, 8.4346, 8.58278, 
8.36836, 8.19768, 8.22999, 8.22623, 8.27021, 7.91926, 8.10177, 7.89448, 8.24737, 8.23304, 8.007, 7.96876, 7.92354, 7.74219, 7.74672, 7.64691, 7.51972, 7.90702, 7.70393, 7.45184, 7.74158, 7.77006, 7.54684, 7.30265, 7.45642, 7.33883, 7.46797, 7.22942, 7.63514, 7.28131, 7.35335, 7.21286, 7.21895, 7.42346, 7.17843, 7.28509, 7.00192, 7.0089, 7.04286, 7.14056, 6.82835, 6.99014, 7.09279, 7.00447, 6.88003, 6.761, 6.99471, 7.0633, 6.70925, 6.5917, 6.73258, 6.74964, 6.73779, 6.74258, 6.66376, 6.41582, 6.64124, 6.62873, 6.45047, 6.63243, 6.75424, 6.61807, 6.73736, 6.70363, 6.63926, 6.51953, 6.61425, 6.42312, 6.67885, 6.26757, 6.26882, 6.32005, 6.41287, 6.37101, 6.46896, 6.31397, 6.36148, 6.25486, 6.22526, 6.42692, 6.35485, 6.35029, 6.19105, 6.18567, 6.26859, 6.415, 6.23334, 6.18337, 6.21035, 6.14535, 6.09626, 6.10387, 6.28772, 6.43606, 6.29503, 6.335, 6.13464, 6.21503, 6.02829, 6.06095, 5.9935, 6.28273, 6.22023, 5.99847, 5.81393, 6.16265, 5.87946, 6.14445, 5.82485, 6.19248, 6.18157, 6.12584, 5.97074, 6.14877, 5.98325, 6.23524, 5.93942, 5.83892, 5.82229, 5.72934, 6.05496, 6.0434, 6.11051, 5.93954, 6.09171, 6.01241, 6.04004, 6.0322, 5.99651, 5.89061, 6.00653, 5.67122, 5.75784, 5.94696, 5.9005, 5.91468, 5.82189, 5.89471, 5.77842, 5.61622, 5.78054, 5.69253, 5.90048, 5.66647, 5.77352, 5.78152, 5.97131, 5.71328, 5.92696, 5.81669, 5.94504, 5.4175, 5.97213, 5.95642, 5.93165, 5.48932, 5.49949, 5.70719, 5.6873, 5.5725, 5.66702, 5.76913, 5.57229, 5.82826, 5.61559, 5.69173, 5.731, 5.73072, 5.62169, 5.71676, 5.78883, 5.80232, 5.67949, 5.77122, 5.47901, 5.79612, 5.73059, 5.53929, 5.69307, 5.7447, 5.6605, 5.44825, 5.66038, 5.60993, 5.60208, 5.50359, 5.67847, 5.72987, 5.52511, 5.65798, 5.63632, 5.4706, 5.64734, 5.55245, 5.58744, 5.44937, 5.20181, 5.63792, 5.72045, 5.87194, 5.56238, 5.74796, 5.79022, 5.38902, 5.44605, 5.54282, 5.55739, 5.49575, 5.64498, 5.33577, 5.45876, 5.42673, 5.5365, 5.42129, 5.62761, 5.71678, 5.48104, 5.60527, 5.5126, 5.25058, 5.49118, 5.43681, 5.48508, 5.28923, 5.46474, 5.45286, 5.6724, 5.35082, 5.46484, 5.40053, 5.54964, 5.16851, 5.10998, 5.5302, 5.59551, 5.43932, 5.53394, 5.2946, 5.37074, 5.47423, 5.2811, 5.46993, 5.28979, 5.57821, 5.48542, 5.37281, 5.45382, 5.27315, 5.53883, 5.2931, 5.25971, 5.35796, 5.33386, 5.5094, 5.38011, 5.51219, 5.30068, 5.34103, 5.49541, 5.54901, 5.50235, 5.43059, 5.39677, 5.52711, 5.19094, 5.45817, 5.34325, 5.56956, 5.41302, 5.43584, 5.37612, 5.25951, 5.25447, 5.49422, 5.5781, 5.35768, 5.3279, 5.19136, 5.4016, 5.39747, 5.20526, 5.61362, 5.29418, 5.39709, 5.44712, 5.30146, 5.34724, 5.36676, 5.28901, 5.361, 5.45905, 5.27649, 5.47318, 5.21725, 5.22023, 5.35122, 5.28396, 5.21834, 5.10071, 5.23602, 5.43096, 5.33142, 5.33017, 5.66246, 5.3004, 5.30692, 5.39386, 5.13475, 5.06957, 5.3365, 5.37793, 5.21244, 5.29887, 5.36995, 5.34675, 5.15473, 5.24757, 5.27856, 5.16172, 5.08869, 5.37568, 5.11393, 5.55309, 5.15317, 5.32295, 5.06795, 5.13265, 5.17242, 5.01042, 5.01637, 5.20515, 5.17193, 5.18392, 5.30507, 5.25233, 5.31569, 5.14154, 5.24356, 5.12106, 5.31092, 5.36465, 5.24729, 5.09639, 5.1804, 5.29568, 5.10464, 5.27827, 5.10619, 5.10892, 5.03572]}, "lm loss vs samples": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [10.88345, 10.90291, 10.88739, 10.83435, 10.68106, 10.65239, 10.43882, 10.15796, 9.94566, 9.85031, 9.59624, 9.85805, 9.88827, 9.63311, 9.79091, 9.51415, 9.46112, 9.65226, 9.38851, 9.33535, 9.24597, 9.15002, 9.1791, 9.00048, 9.19456, 9.06645, 9.16089, 9.17249, 9.30644, 8.99568, 8.93903, 9.04853, 9.05134, 8.65891, 8.72191, 8.75857, 8.68509, 8.7367, 
8.66155, 8.76648, 8.66383, 8.85312, 8.83506, 8.49989, 8.39023, 8.43268, 8.49362, 8.38495, 8.4346, 8.58278, 8.36836, 8.19768, 8.22999, 8.22623, 8.27021, 7.91926, 8.10177, 7.89448, 8.24737, 8.23304, 8.007, 7.96876, 7.92354, 7.74219, 7.74672, 7.64691, 7.51972, 7.90702, 7.70393, 7.45184, 7.74158, 7.77006, 7.54684, 7.30265, 7.45642, 7.33883, 7.46797, 7.22942, 7.63514, 7.28131, 7.35335, 7.21286, 7.21895, 7.42346, 7.17843, 7.28509, 7.00192, 7.0089, 7.04286, 7.14056, 6.82835, 6.99014, 7.09279, 7.00447, 6.88003, 6.761, 6.99471, 7.0633, 6.70925, 6.5917, 6.73258, 6.74964, 6.73779, 6.74258, 6.66376, 6.41582, 6.64124, 6.62873, 6.45047, 6.63243, 6.75424, 6.61807, 6.73736, 6.70363, 6.63926, 6.51953, 6.61425, 6.42312, 6.67885, 6.26757, 6.26882, 6.32005, 6.41287, 6.37101, 6.46896, 6.31397, 6.36148, 6.25486, 6.22526, 6.42692, 6.35485, 6.35029, 6.19105, 6.18567, 6.26859, 6.415, 6.23334, 6.18337, 6.21035, 6.14535, 6.09626, 6.10387, 6.28772, 6.43606, 6.29503, 6.335, 6.13464, 6.21503, 6.02829, 6.06095, 5.9935, 6.28273, 6.22023, 5.99847, 5.81393, 6.16265, 5.87946, 6.14445, 5.82485, 6.19248, 6.18157, 6.12584, 5.97074, 6.14877, 5.98325, 6.23524, 5.93942, 5.83892, 5.82229, 5.72934, 6.05496, 6.0434, 6.11051, 5.93954, 6.09171, 6.01241, 6.04004, 6.0322, 5.99651, 5.89061, 6.00653, 5.67122, 5.75784, 5.94696, 5.9005, 5.91468, 5.82189, 5.89471, 5.77842, 5.61622, 5.78054, 5.69253, 5.90048, 5.66647, 5.77352, 5.78152, 5.97131, 5.71328, 5.92696, 5.81669, 5.94504, 5.4175, 5.97213, 5.95642, 5.93165, 5.48932, 5.49949, 5.70719, 5.6873, 5.5725, 5.66702, 5.76913, 5.57229, 5.82826, 5.61559, 5.69173, 5.731, 5.73072, 5.62169, 5.71676, 5.78883, 5.80232, 5.67949, 5.77122, 5.47901, 5.79612, 5.73059, 5.53929, 5.69307, 5.7447, 5.6605, 5.44825, 5.66038, 5.60993, 5.60208, 5.50359, 5.67847, 5.72987, 5.52511, 5.65798, 5.63632, 5.4706, 5.64734, 5.55245, 5.58744, 5.44937, 5.20181, 5.63792, 5.72045, 5.87194, 5.56238, 5.74796, 5.79022, 5.38902, 5.44605, 5.54282, 5.55739, 5.49575, 5.64498, 5.33577, 5.45876, 5.42673, 5.5365, 5.42129, 5.62761, 5.71678, 5.48104, 5.60527, 5.5126, 5.25058, 5.49118, 5.43681, 5.48508, 5.28923, 5.46474, 5.45286, 5.6724, 5.35082, 5.46484, 5.40053, 5.54964, 5.16851, 5.10998, 5.5302, 5.59551, 5.43932, 5.53394, 5.2946, 5.37074, 5.47423, 5.2811, 5.46993, 5.28979, 5.57821, 5.48542, 5.37281, 5.45382, 5.27315, 5.53883, 5.2931, 5.25971, 5.35796, 5.33386, 5.5094, 5.38011, 5.51219, 5.30068, 5.34103, 5.49541, 5.54901, 5.50235, 5.43059, 5.39677, 5.52711, 5.19094, 5.45817, 5.34325, 5.56956, 5.41302, 5.43584, 5.37612, 5.25951, 5.25447, 5.49422, 5.5781, 5.35768, 5.3279, 5.19136, 5.4016, 5.39747, 5.20526, 5.61362, 5.29418, 5.39709, 5.44712, 5.30146, 5.34724, 5.36676, 5.28901, 5.361, 5.45905, 5.27649, 5.47318, 5.21725, 5.22023, 5.35122, 5.28396, 5.21834, 5.10071, 5.23602, 5.43096, 5.33142, 5.33017, 5.66246, 5.3004, 5.30692, 5.39386, 5.13475, 5.06957, 5.3365, 5.37793, 5.21244, 5.29887, 5.36995, 5.34675, 5.15473, 5.24757, 5.27856, 5.16172, 5.08869, 5.37568, 5.11393, 5.55309, 5.15317, 5.32295, 5.06795, 5.13265, 5.17242, 5.01042, 5.01637, 5.20515, 5.17193, 5.18392, 5.30507, 5.25233, 5.31569, 5.14154, 5.24356, 5.12106, 5.31092, 5.36465, 5.24729, 5.09639, 5.1804, 5.29568, 5.10464, 5.27827, 5.10619, 5.10892, 5.03572]}, "loss-scale": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 
1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]}, "loss-scale vs samples": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 
1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]}, "grad-norm": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [12.43997, 12.4994, 12.67738, 12.01981, 11.40989, 9.15396, 6.91154, 7.19653, 6.10097, 4.66447, 4.20211, 2.8807, 2.37647, 2.34175, 2.05101, 2.19366, 2.12083, 1.89191, 2.18481, 2.06821, 2.11865, 2.16674, 2.00167, 2.19993, 1.94652, 2.02914, 1.87967, 1.849, 1.87625, 2.13926, 2.1644, 1.83737, 1.7865, 2.10617, 2.09168, 2.03916, 1.97963, 1.83822, 1.96495, 1.70803, 2.13244, 1.91303, 1.67031, 1.85063, 1.89388, 1.7393, 1.73696, 1.73834, 1.81384, 1.54681, 1.72306, 1.83162, 1.75476, 1.78654, 1.54973, 1.8348, 1.71396, 1.79871, 1.46752, 1.54685, 1.64797, 1.57656, 1.70218, 1.63082, 1.61792, 1.6742, 1.70617, 1.4063, 1.49439, 1.5398, 1.39435, 1.372, 1.63172, 1.45579, 1.3529, 1.50085, 1.31258, 1.33724, 1.14869, 1.28976, 1.19311, 1.38603, 1.20251, 1.31173, 1.10965, 1.18009, 1.42638, 1.54885, 1.1348, 1.01505, 1.06293, 1.23147, 0.95714, 0.89268, 0.94079, 1.27319, 1.18212, 1.01407, 1.03886, 1.50527, 1.02205, 1.09161, 0.91857, 1.10077, 0.94051, 1.19162, 0.99345, 0.96782, 1.0889, 0.98132, 1.29717, 0.8425, 1.11704, 0.95051, 1.15684, 0.97961, 0.94467, 1.05905, 0.93968, 1.14615, 0.96345, 0.97578, 1.19987, 0.96535, 1.25273, 1.46243, 1.21921, 0.99922, 1.14431, 1.34353, 1.06135, 1.14405, 1.10872, 1.1588, 0.94471, 1.01308, 0.94383, 0.99273, 0.97851, 0.89198, 1.09779, 1.31177, 1.05508, 0.91714, 1.0117, 1.28832, 1.09784, 1.19667, 0.92098, 0.98378, 1.03891, 1.07858, 1.29929, 0.94354, 1.06388, 1.50705, 1.0007, 1.35362, 1.28287, 0.84574, 1.11813, 1.1825, 1.04876, 1.12893, 1.16116, 1.12585, 1.11897, 1.15162, 1.30322, 1.20265, 1.018, 0.99879, 0.90328, 1.21092, 1.0701, 1.06218, 1.10403, 1.0926, 1.05063, 1.07573, 1.20003, 1.25848, 1.34649, 1.12066, 1.50822, 1.14324, 1.4787, 1.1305, 1.14505, 1.16533, 1.14287, 1.24641, 1.38816, 1.42518, 1.1866, 1.45857, 1.17698, 1.2263, 1.01505, 1.21325, 1.36272, 1.305, 1.19874, 1.18217, 1.01807, 1.24602, 1.46217, 1.22746, 1.20492, 1.3465, 1.12878, 1.16877, 1.06974, 1.08696, 1.6092, 1.25397, 1.20201, 1.08861, 1.34872, 1.27688, 1.5104, 1.30437, 1.05297, 1.3032, 1.2672, 1.36045, 1.15533, 1.08165, 1.20493, 1.17126, 1.18099, 1.25764, 1.52555, 1.33265, 1.17044, 1.32121, 1.21081, 1.39328, 1.50488, 1.28381, 1.24675, 1.23603, 1.3193, 1.29405, 1.23259, 1.07163, 1.1052, 1.24045, 1.37927, 1.50839, 1.32285, 1.38782, 1.13484, 1.21127, 2.00278, 1.36691, 1.32213, 1.37434, 1.00254, 1.08214, 1.17335, 1.41525, 1.25392, 1.43316, 1.39572, 1.31067, 1.2846, 1.09515, 1.18724, 1.20128, 1.30643, 1.23357, 1.11402, 1.17568, 1.29277, 1.22678, 1.1362, 1.18826, 1.25873, 1.2814, 1.22295, 1.02105, 1.29626, 1.3106, 1.38573, 1.28368, 1.04758, 1.13079, 1.06747, 1.51913, 1.45844, 1.11656, 1.1972, 1.22395, 1.4347, 1.41031, 1.11466, 1.5639, 1.36293, 1.24572, 1.4447, 1.25296, 1.14388, 1.12495, 1.31276, 1.35398, 1.2105, 1.44264, 1.16726, 1.19041, 1.35889, 1.20903, 1.15845, 1.12041, 1.06639, 1.2833, 1.21736, 1.18244, 1.41925, 1.21164, 1.17543, 1.27955, 1.27399, 1.23019, 1.33022, 1.24584, 1.546, 1.32952, 1.1706, 1.31643, 1.32431, 1.26323, 1.13097, 1.34316, 1.10348, 1.33974, 1.18037, 1.18919, 1.42354, 1.37144, 1.33382, 1.39443, 1.37347, 1.18285, 1.1776, 1.31269, 1.10901, 1.33507, 1.39353, 1.28869, 1.32106, 1.36384, 1.307, 1.2118, 1.20055, 1.076, 1.20907, 1.28103, 1.2481, 
1.49609, 1.25261, 1.22933, 1.23135, 1.40382, 1.47949, 1.50263, 1.27893, 1.27615, 1.34666, 1.30354, 1.1997, 1.51644, 1.42165, 1.35804, 1.19426, 1.23401, 1.36501, 1.05637, 1.11768, 1.22237, 1.39349, 1.3636, 1.33587, 1.44787, 1.23775, 1.25341, 1.15189, 1.07392, 1.29463, 1.16475, 1.13311, 1.32307, 1.04489, 1.17108, 1.24996, 1.21235, 1.90656, 1.20192, 1.24416, 1.32035]}, "grad-norm vs samples": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [12.43997, 12.4994, 12.67738, 12.01981, 11.40989, 9.15396, 6.91154, 7.19653, 6.10097, 4.66447, 4.20211, 2.8807, 2.37647, 2.34175, 2.05101, 2.19366, 2.12083, 1.89191, 2.18481, 2.06821, 2.11865, 2.16674, 2.00167, 2.19993, 1.94652, 2.02914, 1.87967, 1.849, 1.87625, 2.13926, 2.1644, 1.83737, 1.7865, 2.10617, 2.09168, 2.03916, 1.97963, 1.83822, 1.96495, 1.70803, 2.13244, 1.91303, 1.67031, 1.85063, 1.89388, 1.7393, 1.73696, 1.73834, 1.81384, 1.54681, 1.72306, 1.83162, 1.75476, 1.78654, 1.54973, 1.8348, 1.71396, 1.79871, 1.46752, 1.54685, 1.64797, 1.57656, 1.70218, 1.63082, 1.61792, 1.6742, 1.70617, 1.4063, 1.49439, 1.5398, 1.39435, 1.372, 1.63172, 1.45579, 1.3529, 1.50085, 1.31258, 1.33724, 1.14869, 1.28976, 1.19311, 1.38603, 1.20251, 1.31173, 1.10965, 1.18009, 1.42638, 1.54885, 1.1348, 1.01505, 1.06293, 1.23147, 0.95714, 0.89268, 0.94079, 1.27319, 1.18212, 1.01407, 1.03886, 1.50527, 1.02205, 1.09161, 0.91857, 1.10077, 0.94051, 1.19162, 0.99345, 0.96782, 1.0889, 0.98132, 1.29717, 0.8425, 1.11704, 0.95051, 1.15684, 0.97961, 0.94467, 1.05905, 0.93968, 1.14615, 0.96345, 0.97578, 1.19987, 0.96535, 1.25273, 1.46243, 1.21921, 0.99922, 1.14431, 1.34353, 1.06135, 1.14405, 1.10872, 1.1588, 0.94471, 1.01308, 0.94383, 0.99273, 0.97851, 0.89198, 1.09779, 1.31177, 1.05508, 0.91714, 1.0117, 1.28832, 1.09784, 1.19667, 0.92098, 0.98378, 1.03891, 1.07858, 1.29929, 0.94354, 1.06388, 1.50705, 1.0007, 1.35362, 1.28287, 0.84574, 1.11813, 1.1825, 1.04876, 1.12893, 1.16116, 1.12585, 1.11897, 1.15162, 1.30322, 1.20265, 1.018, 0.99879, 0.90328, 1.21092, 1.0701, 1.06218, 1.10403, 1.0926, 1.05063, 1.07573, 1.20003, 1.25848, 1.34649, 1.12066, 1.50822, 1.14324, 1.4787, 1.1305, 1.14505, 1.16533, 1.14287, 1.24641, 1.38816, 1.42518, 1.1866, 1.45857, 1.17698, 1.2263, 1.01505, 1.21325, 1.36272, 1.305, 1.19874, 1.18217, 1.01807, 1.24602, 1.46217, 1.22746, 1.20492, 1.3465, 1.12878, 1.16877, 1.06974, 1.08696, 1.6092, 1.25397, 1.20201, 1.08861, 1.34872, 1.27688, 1.5104, 1.30437, 1.05297, 1.3032, 1.2672, 1.36045, 1.15533, 1.08165, 1.20493, 1.17126, 1.18099, 1.25764, 1.52555, 1.33265, 1.17044, 1.32121, 1.21081, 1.39328, 1.50488, 1.28381, 1.24675, 1.23603, 1.3193, 1.29405, 1.23259, 1.07163, 1.1052, 1.24045, 1.37927, 1.50839, 1.32285, 1.38782, 1.13484, 1.21127, 2.00278, 1.36691, 1.32213, 1.37434, 1.00254, 1.08214, 1.17335, 1.41525, 1.25392, 1.43316, 1.39572, 1.31067, 1.2846, 1.09515, 1.18724, 1.20128, 1.30643, 1.23357, 1.11402, 1.17568, 1.29277, 1.22678, 1.1362, 1.18826, 1.25873, 1.2814, 1.22295, 1.02105, 1.29626, 1.3106, 1.38573, 1.28368, 1.04758, 1.13079, 1.06747, 1.51913, 1.45844, 1.11656, 1.1972, 1.22395, 1.4347, 1.41031, 1.11466, 1.5639, 1.36293, 1.24572, 1.4447, 1.25296, 1.14388, 1.12495, 1.31276, 1.35398, 1.2105, 1.44264, 1.16726, 1.19041, 1.35889, 1.20903, 1.15845, 1.12041, 1.06639, 1.2833, 1.21736, 1.18244, 1.41925, 1.21164, 1.17543, 1.27955, 1.27399, 1.23019, 1.33022, 1.24584, 1.546, 1.32952, 1.1706, 1.31643, 1.32431, 1.26323, 1.13097, 1.34316, 1.10348, 1.33974, 1.18037, 1.18919, 1.42354, 1.37144, 1.33382, 1.39443, 1.37347, 1.18285, 1.1776, 1.31269, 1.10901, 1.33507, 1.39353, 
1.28869, 1.32106, 1.36384, 1.307, 1.2118, 1.20055, 1.076, 1.20907, 1.28103, 1.2481, 1.49609, 1.25261, 1.22933, 1.23135, 1.40382, 1.47949, 1.50263, 1.27893, 1.27615, 1.34666, 1.30354, 1.1997, 1.51644, 1.42165, 1.35804, 1.19426, 1.23401, 1.36501, 1.05637, 1.11768, 1.22237, 1.39349, 1.3636, 1.33587, 1.44787, 1.23775, 1.25341, 1.15189, 1.07392, 1.29463, 1.16475, 1.13311, 1.32307, 1.04489, 1.17108, 1.24996, 1.21235, 1.90656, 1.20192, 1.24416, 1.32035]}, "num-zeros": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [80.0, 89.0, 102.0, 88.0, 78.0, 115.0, 125.0, 114.0, 129.0, 106.0, 125.0, 179.0, 156.0, 184.0, 179.0, 191.0, 171.0, 216.0, 169.0, 200.0, 171.0, 184.0, 206.0, 173.0, 221.0, 181.0, 188.0, 209.0, 187.0, 188.0, 167.0, 165.0, 180.0, 204.0, 152.0, 155.0, 170.0, 179.0, 177.0, 197.0, 184.0, 162.0, 194.0, 184.0, 171.0, 206.0, 198.0, 200.0, 187.0, 238.0, 208.0, 173.0, 201.0, 145.0, 199.0, 194.0, 185.0, 173.0, 266.0, 238.0, 190.0, 195.0, 182.0, 188.0, 199.0, 262.0, 210.0, 233.0, 216.0, 199.0, 257.0, 213.0, 220.0, 243.0, 218.0, 215.0, 229.0, 219.0, 289.0, 212.0, 280.0, 229.0, 196.0, 274.0, 237.0, 246.0, 170.0, 203.0, 205.0, 236.0, 201.0, 203.0, 256.0, 220.0, 191.0, 173.0, 214.0, 225.0, 183.0, 151.0, 195.0, 174.0, 218.0, 189.0, 159.0, 151.0, 154.0, 154.0, 130.0, 202.0, 162.0, 186.0, 166.0, 187.0, 136.0, 145.0, 168.0, 100.0, 161.0, 124.0, 138.0, 163.0, 108.0, 167.0, 129.0, 131.0, 141.0, 148.0, 128.0, 124.0, 137.0, 168.0, 133.0, 114.0, 139.0, 123.0, 161.0, 139.0, 133.0, 152.0, 122.0, 111.0, 135.0, 155.0, 158.0, 101.0, 134.0, 164.0, 136.0, 163.0, 110.0, 153.0, 116.0, 132.0, 120.0, 115.0, 108.0, 85.0, 97.0, 169.0, 112.0, 115.0, 134.0, 105.0, 114.0, 156.0, 115.0, 103.0, 125.0, 113.0, 121.0, 138.0, 114.0, 130.0, 122.0, 118.0, 88.0, 106.0, 113.0, 121.0, 134.0, 131.0, 118.0, 130.0, 93.0, 111.0, 114.0, 111.0, 106.0, 95.0, 105.0, 107.0, 107.0, 87.0, 112.0, 90.0, 116.0, 104.0, 135.0, 140.0, 102.0, 104.0, 142.0, 144.0, 121.0, 87.0, 99.0, 136.0, 115.0, 105.0, 126.0, 112.0, 126.0, 125.0, 115.0, 116.0, 121.0, 145.0, 109.0, 111.0, 103.0, 112.0, 129.0, 115.0, 130.0, 97.0, 119.0, 103.0, 116.0, 135.0, 109.0, 115.0, 109.0, 113.0, 119.0, 116.0, 105.0, 107.0, 105.0, 109.0, 113.0, 115.0, 101.0, 114.0, 109.0, 123.0, 111.0, 117.0, 106.0, 92.0, 103.0, 118.0, 116.0, 130.0, 99.0, 107.0, 121.0, 96.0, 124.0, 112.0, 134.0, 104.0, 115.0, 104.0, 113.0, 107.0, 119.0, 124.0, 116.0, 115.0, 123.0, 139.0, 117.0, 118.0, 110.0, 112.0, 124.0, 112.0, 104.0, 98.0, 108.0, 134.0, 108.0, 126.0, 123.0, 118.0, 120.0, 122.0, 141.0, 105.0, 81.0, 122.0, 131.0, 123.0, 122.0, 101.0, 129.0, 88.0, 131.0, 124.0, 110.0, 124.0, 130.0, 141.0, 109.0, 107.0, 95.0, 104.0, 136.0, 123.0, 121.0, 123.0, 111.0, 117.0, 142.0, 120.0, 111.0, 108.0, 86.0, 121.0, 115.0, 111.0, 125.0, 128.0, 93.0, 126.0, 116.0, 124.0, 94.0, 107.0, 107.0, 128.0, 106.0, 110.0, 128.0, 104.0, 105.0, 114.0, 118.0, 117.0, 99.0, 123.0, 108.0, 107.0, 126.0, 119.0, 121.0, 121.0, 107.0, 116.0, 116.0, 116.0, 126.0, 145.0, 132.0, 133.0, 125.0, 100.0, 98.0, 129.0, 118.0, 121.0, 105.0, 107.0, 95.0, 113.0, 106.0, 108.0, 94.0, 121.0, 139.0, 118.0, 101.0, 98.0, 111.0, 117.0, 112.0, 129.0, 113.0, 119.0, 103.0, 123.0, 124.0, 107.0, 121.0, 117.0, 126.0, 123.0, 103.0, 113.0, 131.0, 117.0, 128.0, 123.0, 103.0, 149.0, 113.0, 101.0, 122.0, 110.0]}, "num-zeros vs samples": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [80.0, 89.0, 102.0, 88.0, 78.0, 115.0, 125.0, 114.0, 129.0, 106.0, 125.0, 179.0, 156.0, 184.0, 179.0, 191.0, 171.0, 216.0, 169.0, 200.0, 171.0, 184.0, 206.0, 
173.0, 221.0, 181.0, 188.0, 209.0, 187.0, 188.0, 167.0, 165.0, 180.0, 204.0, 152.0, 155.0, 170.0, 179.0, 177.0, 197.0, 184.0, 162.0, 194.0, 184.0, 171.0, 206.0, 198.0, 200.0, 187.0, 238.0, 208.0, 173.0, 201.0, 145.0, 199.0, 194.0, 185.0, 173.0, 266.0, 238.0, 190.0, 195.0, 182.0, 188.0, 199.0, 262.0, 210.0, 233.0, 216.0, 199.0, 257.0, 213.0, 220.0, 243.0, 218.0, 215.0, 229.0, 219.0, 289.0, 212.0, 280.0, 229.0, 196.0, 274.0, 237.0, 246.0, 170.0, 203.0, 205.0, 236.0, 201.0, 203.0, 256.0, 220.0, 191.0, 173.0, 214.0, 225.0, 183.0, 151.0, 195.0, 174.0, 218.0, 189.0, 159.0, 151.0, 154.0, 154.0, 130.0, 202.0, 162.0, 186.0, 166.0, 187.0, 136.0, 145.0, 168.0, 100.0, 161.0, 124.0, 138.0, 163.0, 108.0, 167.0, 129.0, 131.0, 141.0, 148.0, 128.0, 124.0, 137.0, 168.0, 133.0, 114.0, 139.0, 123.0, 161.0, 139.0, 133.0, 152.0, 122.0, 111.0, 135.0, 155.0, 158.0, 101.0, 134.0, 164.0, 136.0, 163.0, 110.0, 153.0, 116.0, 132.0, 120.0, 115.0, 108.0, 85.0, 97.0, 169.0, 112.0, 115.0, 134.0, 105.0, 114.0, 156.0, 115.0, 103.0, 125.0, 113.0, 121.0, 138.0, 114.0, 130.0, 122.0, 118.0, 88.0, 106.0, 113.0, 121.0, 134.0, 131.0, 118.0, 130.0, 93.0, 111.0, 114.0, 111.0, 106.0, 95.0, 105.0, 107.0, 107.0, 87.0, 112.0, 90.0, 116.0, 104.0, 135.0, 140.0, 102.0, 104.0, 142.0, 144.0, 121.0, 87.0, 99.0, 136.0, 115.0, 105.0, 126.0, 112.0, 126.0, 125.0, 115.0, 116.0, 121.0, 145.0, 109.0, 111.0, 103.0, 112.0, 129.0, 115.0, 130.0, 97.0, 119.0, 103.0, 116.0, 135.0, 109.0, 115.0, 109.0, 113.0, 119.0, 116.0, 105.0, 107.0, 105.0, 109.0, 113.0, 115.0, 101.0, 114.0, 109.0, 123.0, 111.0, 117.0, 106.0, 92.0, 103.0, 118.0, 116.0, 130.0, 99.0, 107.0, 121.0, 96.0, 124.0, 112.0, 134.0, 104.0, 115.0, 104.0, 113.0, 107.0, 119.0, 124.0, 116.0, 115.0, 123.0, 139.0, 117.0, 118.0, 110.0, 112.0, 124.0, 112.0, 104.0, 98.0, 108.0, 134.0, 108.0, 126.0, 123.0, 118.0, 120.0, 122.0, 141.0, 105.0, 81.0, 122.0, 131.0, 123.0, 122.0, 101.0, 129.0, 88.0, 131.0, 124.0, 110.0, 124.0, 130.0, 141.0, 109.0, 107.0, 95.0, 104.0, 136.0, 123.0, 121.0, 123.0, 111.0, 117.0, 142.0, 120.0, 111.0, 108.0, 86.0, 121.0, 115.0, 111.0, 125.0, 128.0, 93.0, 126.0, 116.0, 124.0, 94.0, 107.0, 107.0, 128.0, 106.0, 110.0, 128.0, 104.0, 105.0, 114.0, 118.0, 117.0, 99.0, 123.0, 108.0, 107.0, 126.0, 119.0, 121.0, 121.0, 107.0, 116.0, 116.0, 116.0, 126.0, 145.0, 132.0, 133.0, 125.0, 100.0, 98.0, 129.0, 118.0, 121.0, 105.0, 107.0, 95.0, 113.0, 106.0, 108.0, 94.0, 121.0, 139.0, 118.0, 101.0, 98.0, 111.0, 117.0, 112.0, 129.0, 113.0, 119.0, 103.0, 123.0, 124.0, 107.0, 121.0, 117.0, 126.0, 123.0, 103.0, 113.0, 131.0, 117.0, 128.0, 123.0, 103.0, 149.0, 113.0, 101.0, 122.0, 110.0]}, "params-norm": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [179.95625, 179.95625, 179.95625, 179.95625, 179.95625, 179.95624, 179.95622, 179.95612, 179.95593, 179.95575, 179.95451, 179.95384, 179.95331, 179.95131, 179.95029, 179.94963, 179.94899, 179.94896, 179.94923, 179.94928, 179.94922, 179.94897, 179.94885, 179.9491, 179.94991, 179.951, 179.95213, 179.95309, 179.95415, 179.95551, 179.9574, 179.95952, 179.96179, 179.96399, 179.96649, 179.96965, 179.97318, 179.97679, 179.98051, 179.98468, 179.98955, 179.99477, 180.00044, 180.00658, 180.01337, 180.02075, 180.02858, 180.03702, 180.04625, 180.05624, 180.06699, 180.0782, 180.09018, 180.10277, 180.11606, 180.12999, 180.14421, 180.159, 180.17467, 180.19148, 180.20897, 180.22713, 180.24684, 180.26782, 180.2896, 180.31204, 180.33545, 180.35973, 180.38542, 180.41144, 180.43797, 180.46524, 180.4928, 180.52104, 180.54993, 180.57939, 180.60922, 180.63998, 
180.67151, 180.70398, 180.73651, 180.76875, 180.80157, 180.83536, 180.86948, 180.90508, 180.9411, 180.97647, 181.01176, 181.04828, 181.08588, 181.12448, 181.16327, 181.20253, 181.24295, 181.28366, 181.32249, 181.35963, 181.39644, 181.43352, 181.47067, 181.50752, 181.54518, 181.58394, 181.62318, 181.66335, 181.7032, 181.74304, 181.78291, 181.82195, 181.86037, 181.89832, 181.93773, 181.97792, 182.01897, 182.05927, 182.09976, 182.14062, 182.18091, 182.22133, 182.26169, 182.30261, 182.34355, 182.38451, 182.4248, 182.46426, 182.50208, 182.53731, 182.57451, 182.61168, 182.64999, 182.68562, 182.72139, 182.75731, 182.79347, 182.83156, 182.87192, 182.91328, 182.95439, 182.99614, 183.03891, 183.07968, 183.12061, 183.16183, 183.20284, 183.24399, 183.28496, 183.325, 183.3662, 183.40788, 183.45087, 183.49307, 183.53464, 183.57661, 183.61989, 183.66231, 183.70183, 183.7419, 183.78094, 183.81953, 183.86018, 183.90375, 183.94774, 183.9931, 184.03831, 184.08267, 184.12688, 184.16986, 184.21062, 184.25189, 184.29411, 184.3373, 184.38132, 184.42554, 184.46965, 184.51401, 184.55882, 184.60381, 184.64806, 184.69025, 184.73256, 184.7748, 184.817, 184.86073, 184.90417, 184.94685, 184.98766, 185.02675, 185.06696, 185.10852, 185.15274, 185.19722, 185.24055, 185.28352, 185.32553, 185.36723, 185.40932, 185.45212, 185.49559, 185.54068, 185.58374, 185.62703, 185.6687, 185.71231, 185.75662, 185.80209, 185.84537, 185.88788, 185.93077, 185.97299, 186.01599, 186.05911, 186.10475, 186.15176, 186.19826, 186.24303, 186.28674, 186.33194, 186.377, 186.42128, 186.46397, 186.50703, 186.55083, 186.59554, 186.63943, 186.68254, 186.72632, 186.77109, 186.81587, 186.86107, 186.90485, 186.94669, 186.9883, 187.03162, 187.07474, 187.11856, 187.16187, 187.20621, 187.25069, 187.29416, 187.33778, 187.38162, 187.42618, 187.47089, 187.51416, 187.56001, 187.60674, 187.6539, 187.70016, 187.74496, 187.7905, 187.83824, 187.88522, 187.93312, 187.98019, 188.02357, 188.06801, 188.11484, 188.1615, 188.21011, 188.26111, 188.31125, 188.35876, 188.4053, 188.45084, 188.49641, 188.54265, 188.58983, 188.64067, 188.69183, 188.74222, 188.79266, 188.84273, 188.89304, 188.94508, 188.99475, 189.04398, 189.09485, 189.14598, 189.1965, 189.24777, 189.29964, 189.35378, 189.40587, 189.45831, 189.50987, 189.56148, 189.61368, 189.66797, 189.71982, 189.77005, 189.81833, 189.86722, 189.91873, 189.97101, 190.02145, 190.07199, 190.12384, 190.17366, 190.22346, 190.27402, 190.3253, 190.37793, 190.43097, 190.48424, 190.53532, 190.58551, 190.63808, 190.69084, 190.74536, 190.79968, 190.85349, 190.90894, 190.96626, 191.02402, 191.08208, 191.13948, 191.19746, 191.25615, 191.31114, 191.36597, 191.4203, 191.47542, 191.53027, 191.58527, 191.63684, 191.68701, 191.73514, 191.78677, 191.83801, 191.8905, 191.94266, 191.99596, 192.05061, 192.1071, 192.16386, 192.21751, 192.27289, 192.32852, 192.37949, 192.43187, 192.48483, 192.53804, 192.59248, 192.64667, 192.70181, 192.75798, 192.81502, 192.87016, 192.92496, 192.98015, 193.03481, 193.09019, 193.14693, 193.20465, 193.26526, 193.32504, 193.38451, 193.44281, 193.49977, 193.55804, 193.61533, 193.67177, 193.72891, 193.78667, 193.84259, 193.89799, 193.95425, 194.01086, 194.06876, 194.12726, 194.18596, 194.24385, 194.30168, 194.35782, 194.41516, 194.47411, 194.53342, 194.59587, 194.65793, 194.71797, 194.77441, 194.83284, 194.88989, 194.94766, 195.00539, 195.06413, 195.12605, 195.19096, 195.25722, 195.32449, 195.39157, 195.45724, 195.52281, 195.58981, 195.65671, 195.7216, 195.78194, 195.84415, 195.90858]}, "params-norm vs samples": 
{"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [179.95625, 179.95625, 179.95625, 179.95625, 179.95625, 179.95624, 179.95622, 179.95612, 179.95593, 179.95575, 179.95451, 179.95384, 179.95331, 179.95131, 179.95029, 179.94963, 179.94899, 179.94896, 179.94923, 179.94928, 179.94922, 179.94897, 179.94885, 179.9491, 179.94991, 179.951, 179.95213, 179.95309, 179.95415, 179.95551, 179.9574, 179.95952, 179.96179, 179.96399, 179.96649, 179.96965, 179.97318, 179.97679, 179.98051, 179.98468, 179.98955, 179.99477, 180.00044, 180.00658, 180.01337, 180.02075, 180.02858, 180.03702, 180.04625, 180.05624, 180.06699, 180.0782, 180.09018, 180.10277, 180.11606, 180.12999, 180.14421, 180.159, 180.17467, 180.19148, 180.20897, 180.22713, 180.24684, 180.26782, 180.2896, 180.31204, 180.33545, 180.35973, 180.38542, 180.41144, 180.43797, 180.46524, 180.4928, 180.52104, 180.54993, 180.57939, 180.60922, 180.63998, 180.67151, 180.70398, 180.73651, 180.76875, 180.80157, 180.83536, 180.86948, 180.90508, 180.9411, 180.97647, 181.01176, 181.04828, 181.08588, 181.12448, 181.16327, 181.20253, 181.24295, 181.28366, 181.32249, 181.35963, 181.39644, 181.43352, 181.47067, 181.50752, 181.54518, 181.58394, 181.62318, 181.66335, 181.7032, 181.74304, 181.78291, 181.82195, 181.86037, 181.89832, 181.93773, 181.97792, 182.01897, 182.05927, 182.09976, 182.14062, 182.18091, 182.22133, 182.26169, 182.30261, 182.34355, 182.38451, 182.4248, 182.46426, 182.50208, 182.53731, 182.57451, 182.61168, 182.64999, 182.68562, 182.72139, 182.75731, 182.79347, 182.83156, 182.87192, 182.91328, 182.95439, 182.99614, 183.03891, 183.07968, 183.12061, 183.16183, 183.20284, 183.24399, 183.28496, 183.325, 183.3662, 183.40788, 183.45087, 183.49307, 183.53464, 183.57661, 183.61989, 183.66231, 183.70183, 183.7419, 183.78094, 183.81953, 183.86018, 183.90375, 183.94774, 183.9931, 184.03831, 184.08267, 184.12688, 184.16986, 184.21062, 184.25189, 184.29411, 184.3373, 184.38132, 184.42554, 184.46965, 184.51401, 184.55882, 184.60381, 184.64806, 184.69025, 184.73256, 184.7748, 184.817, 184.86073, 184.90417, 184.94685, 184.98766, 185.02675, 185.06696, 185.10852, 185.15274, 185.19722, 185.24055, 185.28352, 185.32553, 185.36723, 185.40932, 185.45212, 185.49559, 185.54068, 185.58374, 185.62703, 185.6687, 185.71231, 185.75662, 185.80209, 185.84537, 185.88788, 185.93077, 185.97299, 186.01599, 186.05911, 186.10475, 186.15176, 186.19826, 186.24303, 186.28674, 186.33194, 186.377, 186.42128, 186.46397, 186.50703, 186.55083, 186.59554, 186.63943, 186.68254, 186.72632, 186.77109, 186.81587, 186.86107, 186.90485, 186.94669, 186.9883, 187.03162, 187.07474, 187.11856, 187.16187, 187.20621, 187.25069, 187.29416, 187.33778, 187.38162, 187.42618, 187.47089, 187.51416, 187.56001, 187.60674, 187.6539, 187.70016, 187.74496, 187.7905, 187.83824, 187.88522, 187.93312, 187.98019, 188.02357, 188.06801, 188.11484, 188.1615, 188.21011, 188.26111, 188.31125, 188.35876, 188.4053, 188.45084, 188.49641, 188.54265, 188.58983, 188.64067, 188.69183, 188.74222, 188.79266, 188.84273, 188.89304, 188.94508, 188.99475, 189.04398, 189.09485, 189.14598, 189.1965, 189.24777, 189.29964, 189.35378, 189.40587, 189.45831, 189.50987, 189.56148, 189.61368, 189.66797, 189.71982, 189.77005, 189.81833, 189.86722, 189.91873, 189.97101, 190.02145, 190.07199, 190.12384, 190.17366, 190.22346, 190.27402, 190.3253, 190.37793, 190.43097, 190.48424, 190.53532, 190.58551, 190.63808, 190.69084, 190.74536, 190.79968, 190.85349, 190.90894, 190.96626, 191.02402, 191.08208, 191.13948, 191.19746, 191.25615, 
191.31114, 191.36597, 191.4203, 191.47542, 191.53027, 191.58527, 191.63684, 191.68701, 191.73514, 191.78677, 191.83801, 191.8905, 191.94266, 191.99596, 192.05061, 192.1071, 192.16386, 192.21751, 192.27289, 192.32852, 192.37949, 192.43187, 192.48483, 192.53804, 192.59248, 192.64667, 192.70181, 192.75798, 192.81502, 192.87016, 192.92496, 192.98015, 193.03481, 193.09019, 193.14693, 193.20465, 193.26526, 193.32504, 193.38451, 193.44281, 193.49977, 193.55804, 193.61533, 193.67177, 193.72891, 193.78667, 193.84259, 193.89799, 193.95425, 194.01086, 194.06876, 194.12726, 194.18596, 194.24385, 194.30168, 194.35782, 194.41516, 194.47411, 194.53342, 194.59587, 194.65793, 194.71797, 194.77441, 194.83284, 194.88989, 194.94766, 195.00539, 195.06413, 195.12605, 195.19096, 195.25722, 195.32449, 195.39157, 195.45724, 195.52281, 195.58981, 195.65671, 195.7216, 195.78194, 195.84415, 195.90858]}, "iteration-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [18.92793, 0.51136, 0.50959, 0.5023, 0.50706, 0.49889, 0.49918, 0.50787, 0.50805, 0.50023, 0.51244, 0.49782, 0.5011, 0.49829, 0.50242, 0.49765, 0.50512, 0.50815, 0.51211, 0.49886, 0.50327, 0.50436, 0.50354, 0.4972, 0.49868, 0.50277, 0.49981, 0.50008, 0.50203, 0.49718, 0.60026, 0.49876, 0.49477, 0.5046, 0.51537, 0.5196, 0.49706, 0.49993, 0.49908, 0.49804, 0.4994, 0.49794, 0.50015, 0.49859, 0.49669, 0.49649, 0.59124, 0.49837, 0.50138, 0.49717, 0.49966, 0.50461, 0.4977, 0.49673, 0.5025, 0.49998, 0.49865, 0.50151, 0.50846, 0.51111, 0.50552, 0.50429, 0.50589, 0.50627, 0.50795, 0.505, 0.50478, 0.50608, 0.5063, 0.50392, 0.50528, 0.50464, 0.50852, 0.50732, 0.50975, 0.70338, 0.50322, 0.50607, 0.5008, 0.51264, 0.50202, 0.51117, 0.50466, 0.50856, 0.50482, 0.5101, 0.50604, 0.50708, 0.50371, 0.50732, 0.50754, 0.50725, 0.50576, 0.50944, 0.50954, 0.50758, 0.50654, 0.5929, 0.50552, 0.50521, 0.50353, 0.50768, 0.50269, 0.50818, 0.50339, 0.50584, 0.50369, 0.50801, 0.50311, 0.50501, 0.50259, 0.50478, 0.50477, 0.50612, 0.50304, 0.5048, 0.50419, 0.50917, 0.50259, 0.59305, 0.71675, 0.50782, 0.50595, 0.50366, 0.50416, 0.5131, 0.50874, 0.50202, 0.5075, 0.50344, 0.50969, 0.50236, 0.50738, 0.5042, 0.50968, 0.50453, 0.50797, 0.50316, 0.50801, 0.50385, 0.51048, 0.50461, 0.60109, 0.50835, 0.50599, 0.50503, 0.50405, 0.50686, 0.50365, 0.50633, 0.51394, 0.507, 0.50416, 0.5072, 0.50187, 0.50987, 0.50554, 0.50964, 0.49997, 0.5086, 0.50287, 0.50901, 0.51253, 0.51268, 0.59174, 0.63218, 0.50352, 0.50458, 0.50663, 0.50624, 0.50529, 0.50834, 0.50628, 0.50536, 0.50697, 0.50514, 0.5058, 0.5064, 0.51003, 0.50482, 0.50622, 0.50306, 0.50955, 0.50288, 0.51052, 0.50915, 0.50819, 0.50518, 0.50395, 0.50908, 0.50261, 0.5111, 0.59558, 0.50726, 0.50659, 0.50692, 0.50765, 0.50516, 0.51034, 0.50537, 0.49111, 0.50535, 0.50465, 0.50275, 0.50558, 0.5014, 0.5079, 0.5078, 0.50568, 0.5069, 0.50614, 0.50631, 0.5066, 0.50398, 0.50618, 0.50721, 0.51171, 0.50602, 0.50818, 0.50511, 0.51286, 0.50398, 0.50849, 0.50801, 0.50817, 0.50985, 0.50547, 0.50729, 0.50608, 0.59229, 0.50801, 0.50242, 0.51408, 0.50883, 0.5042, 0.508, 0.51821, 0.50964, 0.50309, 0.51214, 0.59459, 0.51016, 0.50757, 0.51259, 0.50854, 0.50258, 0.50468, 0.50579, 0.50859, 0.50372, 0.50798, 0.50757, 0.51184, 0.50914, 0.50776, 0.50432, 0.50917, 0.50287, 0.50616, 0.50167, 0.5065, 0.50145, 0.51091, 0.50163, 0.51326, 0.50092, 0.50601, 0.50447, 0.50502, 0.50274, 0.50572, 0.50976, 0.5047, 0.50868, 0.50316, 0.52048, 0.50699, 0.61568, 0.50722, 0.5088, 0.50773, 0.50579, 0.50532, 0.50689, 0.50615, 0.50762, 0.5023, 0.50258, 0.50262, 0.51065, 
0.50567, 0.50633, 0.50361, 0.50893, 0.50511, 0.50936, 0.59793, 0.60202, 0.51102, 0.50683, 0.50341, 0.50975, 0.50313, 0.51068, 0.50494, 0.5094, 0.50552, 0.5077, 0.50574, 0.50655, 0.51164, 0.50641, 0.50789, 0.50671, 0.61258, 0.50815, 0.50767, 0.50856, 0.51335, 0.5105, 0.50233, 0.50903, 0.50975, 0.50328, 0.50987, 0.50357, 0.50951, 0.50423, 0.50818, 0.50563, 0.50771, 0.50968, 0.50443, 0.50847, 0.50717, 0.50752, 0.50453, 0.50914, 0.50657, 0.50601, 0.51204, 0.50439, 0.59526, 0.50772, 0.50461, 0.51966, 0.50388, 0.50764, 0.50335, 0.51566, 0.50622, 0.50664, 0.50857, 0.51175, 0.50837, 0.50352, 0.50963, 0.50442, 0.50747, 0.50672, 0.50844, 0.50629, 0.50717, 0.5071, 0.50387, 0.5066, 0.50594, 0.50388, 0.50981, 0.50538, 0.5055, 0.50641, 0.50813, 0.50422, 0.50345, 0.50462, 0.50731, 0.50278, 0.50356, 0.50701, 0.5066, 0.5073, 0.51, 0.50394, 0.50873, 0.50751, 0.50848, 0.59448, 0.50862, 0.5117, 0.50484, 0.51229, 0.50735, 0.50392, 0.50744, 0.50609, 0.50765, 0.51917, 0.51153, 0.50229]}, "lm loss validation": {"start_step": 0, "end_step": 4, "step_interval": 5, "values": [5.68727]}, "lm loss validation vs samples": {"start_step": 0, "end_step": 4, "step_interval": 5, "values": [5.68727]}, "lm loss validation ppl": {"start_step": 0, "end_step": 4, "step_interval": 5, "values": [295.08755]}, "lm loss validation ppl vs samples": {"start_step": 0, "end_step": 4, "step_interval": 5, "values": [295.08755]}} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_nondet_tp1_pp1_fp8_no_model_parallel/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_nondet_tp1_pp1_fp8_no_model_parallel/model_config.yaml new file mode 100644 index 0000000000..4349bc01a3 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_nondet_tp1_pp1_fp8_no_model_parallel/model_config.yaml @@ -0,0 +1,48 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 1 +MODEL_ARGS: + --num-layers: 12 + --hidden-size: 512 + --num-attention-heads: 8 + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --tensorboard-dir: ${TENSORBOARD_PATH} + --micro-batch-size: 2 + --global-batch-size: 128 + --seq-length: 1024 + --max-position-embeddings: 1024 + --train-iters: 2000 + --timing-log-level: 2 + --lr-decay-iters: 320000 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --vocab-file: ${DATA_PATH}/bpe/vocab.json + --merge-file: ${DATA_PATH}/bpe/merges.txt + --split: 949,50,1 + --distributed-backend: nccl + --lr: 0.00015 + --lr-decay-style: cosine + --min-lr: 1.0e-5 + --weight-decay: 1e-2 + --clip-grad: 1.0 + --lr-warmup-fraction: .01 + --log-interval: 1 + --save-interval: 10000 + --eval-interval: 1000 + --eval-iters: 10 + --transformer-impl: transformer_engine + --tensor-model-parallel-size: 1 + --pipeline-model-parallel-size: 1 + --no-gradient-accumulation-fusion: true + --fp8-format: hybrid + --fp8-amax-history-len: 1024 + --fp8-amax-compute-algo: max + --attention-softmax-in-fp32: true + --ckpt-format: torch_dist + --data-cache-path: ${DATA_CACHE_PATH} + --bf16: true +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp1_pp1_bf16_baseline/golden_values_dev.json b/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp1_pp1_bf16_baseline/golden_values_dev.json new file 
mode 100644 index 0000000000..fdeaa49aa1 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp1_pp1_bf16_baseline/golden_values_dev.json @@ -0,0 +1 @@ +{"forward-backward-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [17.4566, 0.37175, 0.37134, 0.37017, 0.37156, 0.37759, 0.37765, 0.37162, 0.3761, 0.37226, 0.53616, 0.37589, 0.37516, 0.37683, 0.37327, 0.37614, 0.37342, 0.3739, 0.37649, 0.37491, 0.38081, 0.37232, 0.37401, 0.37224, 0.37132, 0.38167, 0.37456, 0.37215, 0.36647, 0.37435, 0.38453, 0.36353, 0.36605, 0.36205, 0.36329, 0.36758, 0.36245, 0.36564, 0.3674, 0.38594, 0.36767, 0.36685, 0.36727, 0.36428, 0.3664, 0.36716, 0.36619, 0.36593, 0.36805, 0.36393, 0.3666, 0.36486, 0.36817, 0.36273, 0.36485, 0.36634, 0.36443, 0.3672, 0.36462, 0.36335, 0.35994, 0.36774, 0.36167, 0.36089, 0.36216, 0.36236, 0.36412, 0.36497, 0.3673, 0.36303, 0.36566, 0.36239, 0.36323, 0.36008, 0.46258, 0.36181, 0.3621, 0.36509, 0.36772, 0.36417, 0.36489, 0.36688, 0.3704, 0.36443, 0.36411, 0.36221, 0.36185, 0.36498, 0.36202, 0.36553, 0.36574, 0.36507, 0.37335, 0.36256, 0.3648, 0.36324, 0.36253, 0.36685, 0.3644, 0.36463, 0.36584, 0.36426, 0.36134, 0.36175, 0.45788, 0.36568, 0.36196, 0.38364, 0.36164, 0.36331, 0.36346, 0.3683, 0.36544, 0.36245, 0.37051, 0.37092, 0.36741, 0.3695, 0.3651, 0.37195, 0.36315, 0.36425, 0.36904, 0.36828, 0.3648, 0.36763, 0.36895, 0.37272, 0.3749, 0.36753, 0.36573, 0.36845, 0.36886, 0.37096, 0.47625, 0.36339, 0.36255, 0.36368, 0.44639, 0.51442, 0.3673, 0.36637, 0.36885, 0.37285, 0.36987, 0.36631, 0.36485, 0.36259, 0.36217, 0.364, 0.36364, 0.36588, 0.3619, 0.36604, 0.36798, 0.36772, 0.36665, 0.36769, 0.36628, 0.36592, 0.36831, 0.36583, 0.36842, 0.36695, 0.37069, 0.36526, 0.36421, 0.3661, 0.36543, 0.36845, 0.36581, 0.3674, 0.36575, 0.36568, 0.36949, 0.36761, 0.36684, 0.36852, 0.36408, 0.37073, 0.36602, 0.36769, 0.3609, 0.36264, 0.36736, 0.36549, 0.36517, 0.36003, 0.36081, 0.36006, 0.36167, 0.36361, 0.36172, 0.36296, 0.36716, 0.36645, 0.36705, 0.36621, 0.45574, 0.36247, 0.36105, 0.36408, 0.3621, 0.36088, 0.36271, 0.36349, 0.36811, 0.36958, 0.36968, 0.36582, 0.36294, 0.36436, 0.36894, 0.36266, 0.36585, 0.36633, 0.36462, 0.36885, 0.36711, 0.36754, 0.36317, 0.36285, 0.36581, 0.37564, 0.37346, 0.3622, 0.36404, 0.45901, 0.36362, 0.36726, 0.37058, 0.36812, 0.36666, 0.37189, 0.46883, 0.37275, 0.3719, 0.36704, 0.36448, 0.3629, 0.36582, 0.36225, 0.36061, 0.4845, 0.36483, 0.36652, 0.36811, 0.36819, 0.37464, 0.36516, 0.36721, 0.36426, 0.35999, 0.36267, 0.36286, 0.36833, 0.36584, 0.3632, 0.36415, 0.36569, 0.37494, 0.36226, 0.46516, 0.36495, 0.36254, 0.36943, 0.36585, 0.36664, 0.36827, 0.36557, 0.37484, 0.36946, 0.37108, 0.36825, 0.36775, 0.36137, 0.36521, 0.3697, 0.36415, 0.36338, 0.36383, 0.36505, 0.3677, 0.36976, 0.36576, 0.36964, 0.37212, 0.36584, 0.36475, 0.36537, 0.36914, 0.36892, 0.45897, 0.36567, 0.3641, 0.36657, 0.3698, 0.36867, 0.36599, 0.3679, 0.36742, 0.36813, 0.36659, 0.36737, 0.36653, 0.36785, 0.37243, 0.36895, 0.37086, 0.365, 0.36719, 0.37471, 0.36717, 0.3738, 0.37016, 0.37206, 0.3695, 0.36911, 0.36946, 0.36669, 0.36636, 0.3628, 0.3661, 0.36516, 0.36275, 0.3657, 0.3654, 0.36521, 0.3662, 0.4682, 0.36931, 0.3668, 0.37172, 0.37189, 0.36942, 0.37165, 0.37159, 0.37333, 0.37491, 0.37221, 0.36907, 0.37154, 0.37633, 0.36937, 0.36886, 0.36922, 0.36659, 0.36692, 0.36765, 0.36709, 0.3641, 0.36625, 0.36742, 0.36073, 0.36646, 0.36662, 0.36508, 0.37343, 0.36701, 0.3642, 0.36688, 0.36861, 0.36833, 0.36153, 0.36529, 0.36657, 0.36866, 0.37542, 
0.36846, 0.36817, 0.36445, 0.36398, 0.36799, 0.36631, 0.3632, 0.36525, 0.36782, 0.36786, 0.37064, 0.36604, 0.36767, 0.36737, 0.36678, 0.36919, 0.36757, 0.36912, 0.36819, 0.46929, 0.37321, 0.37017, 0.4569, 0.36994, 0.37357, 0.36984, 0.57706, 0.37035, 0.37045, 0.36802, 0.36852, 0.36742]}, "forward-compute-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [9.27486, 0.20418, 0.20397, 0.20285, 0.20434, 0.20758, 0.20634, 0.20416, 0.20426, 0.20434, 0.3669, 0.20758, 0.20442, 0.20546, 0.20278, 0.20684, 0.20447, 0.20408, 0.20756, 0.20602, 0.20443, 0.20251, 0.20574, 0.20384, 0.2029, 0.21254, 0.21029, 0.20601, 0.20107, 0.20291, 0.20989, 0.19612, 0.20052, 0.19662, 0.19784, 0.20061, 0.19675, 0.19997, 0.20194, 0.22257, 0.2025, 0.20076, 0.2025, 0.20065, 0.20083, 0.19995, 0.19982, 0.20085, 0.20083, 0.19933, 0.20226, 0.20132, 0.203, 0.19623, 0.1999, 0.19978, 0.1976, 0.19962, 0.19949, 0.19977, 0.19439, 0.19749, 0.19772, 0.19546, 0.19711, 0.19707, 0.19839, 0.19731, 0.20084, 0.19819, 0.2011, 0.1983, 0.19858, 0.1937, 0.29471, 0.19528, 0.19534, 0.19901, 0.20146, 0.19982, 0.19907, 0.20086, 0.20405, 0.19915, 0.2005, 0.19581, 0.19278, 0.19863, 0.19822, 0.1993, 0.1988, 0.19998, 0.2005, 0.19725, 0.20091, 0.19918, 0.19836, 0.2016, 0.19765, 0.19811, 0.19903, 0.19646, 0.19645, 0.19682, 0.28975, 0.19888, 0.19522, 0.21159, 0.19644, 0.19881, 0.19777, 0.20279, 0.19972, 0.19755, 0.20374, 0.20397, 0.20052, 0.20409, 0.20046, 0.20573, 0.19813, 0.19893, 0.20396, 0.20108, 0.1991, 0.20018, 0.20247, 0.20606, 0.20496, 0.20146, 0.20113, 0.20109, 0.20373, 0.20131, 0.30688, 0.19978, 0.19719, 0.19856, 0.27425, 0.34575, 0.20073, 0.20027, 0.20292, 0.20753, 0.20162, 0.19901, 0.19974, 0.19616, 0.19556, 0.19818, 0.19745, 0.20023, 0.19768, 0.1993, 0.20152, 0.20191, 0.20046, 0.19952, 0.19909, 0.20067, 0.20206, 0.20028, 0.2009, 0.20109, 0.20231, 0.20057, 0.19849, 0.2014, 0.19862, 0.20162, 0.1995, 0.20168, 0.19859, 0.20023, 0.20137, 0.19954, 0.19893, 0.20032, 0.19926, 0.20288, 0.20082, 0.20203, 0.1964, 0.19744, 0.20075, 0.19839, 0.19941, 0.19592, 0.19584, 0.19507, 0.19602, 0.19868, 0.19785, 0.19642, 0.20146, 0.20135, 0.20162, 0.20061, 0.28565, 0.19898, 0.19699, 0.20018, 0.1975, 0.19765, 0.19836, 0.20012, 0.20347, 0.20455, 0.20461, 0.20103, 0.1993, 0.20097, 0.20324, 0.19779, 0.20128, 0.20136, 0.19977, 0.20189, 0.20216, 0.19869, 0.19833, 0.19963, 0.20166, 0.21162, 0.2062, 0.19807, 0.19895, 0.29325, 0.19845, 0.1994, 0.20325, 0.20285, 0.20049, 0.20554, 0.30108, 0.20617, 0.20644, 0.20131, 0.20084, 0.19867, 0.20111, 0.19928, 0.19687, 0.31861, 0.20096, 0.20262, 0.20309, 0.20325, 0.20819, 0.20113, 0.20301, 0.19969, 0.19603, 0.19693, 0.19763, 0.2004, 0.20179, 0.19742, 0.19937, 0.20128, 0.20616, 0.19831, 0.29924, 0.19973, 0.19859, 0.20413, 0.20138, 0.20285, 0.20388, 0.20206, 0.20671, 0.20471, 0.20646, 0.20241, 0.20408, 0.19861, 0.20125, 0.20732, 0.20159, 0.20035, 0.20096, 0.20012, 0.20294, 0.20424, 0.20101, 0.20564, 0.2044, 0.2008, 0.19955, 0.20264, 0.2049, 0.20446, 0.293, 0.20181, 0.20025, 0.20162, 0.20369, 0.20417, 0.20115, 0.20265, 0.20363, 0.2044, 0.20297, 0.20322, 0.20046, 0.20222, 0.20483, 0.20332, 0.20676, 0.19998, 0.2015, 0.2054, 0.20246, 0.20845, 0.20406, 0.20619, 0.20592, 0.20453, 0.20274, 0.20274, 0.20162, 0.20007, 0.20274, 0.20276, 0.19873, 0.20293, 0.20198, 0.20198, 0.20314, 0.30676, 0.20607, 0.2049, 0.20889, 0.20967, 0.2072, 0.20824, 0.20768, 0.20857, 0.20862, 0.20898, 0.20615, 0.20827, 0.21418, 0.20637, 0.20388, 0.2067, 0.20272, 0.20336, 0.20429, 0.20148, 0.20112, 0.20264, 0.20322, 0.19861, 0.20195, 0.20314, 0.1996, 
0.20578, 0.2036, 0.20073, 0.20362, 0.20652, 0.20449, 0.19954, 0.20273, 0.203, 0.2032, 0.20757, 0.2034, 0.20482, 0.19991, 0.20078, 0.20474, 0.20356, 0.19886, 0.20118, 0.20177, 0.20291, 0.20253, 0.20141, 0.20341, 0.20352, 0.20319, 0.20478, 0.20413, 0.20568, 0.20319, 0.30235, 0.20813, 0.20681, 0.29099, 0.20567, 0.20759, 0.20528, 0.41177, 0.20714, 0.20416, 0.20342, 0.20429, 0.20393]}, "backward-compute-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [2.48483, 0.17652, 0.17828, 0.17737, 0.17731, 0.18012, 0.18059, 0.17933, 0.18228, 0.17963, 0.17741, 0.17905, 0.17875, 0.18023, 0.17598, 0.17735, 0.17563, 0.1774, 0.17814, 0.17775, 0.1797, 0.17589, 0.17512, 0.17493, 0.17423, 0.17574, 0.17442, 0.17392, 0.17429, 0.18376, 0.17762, 0.17577, 0.17608, 0.17519, 0.17371, 0.17562, 0.1743, 0.17634, 0.17747, 0.1794, 0.17639, 0.1769, 0.17749, 0.17644, 0.17597, 0.17611, 0.17772, 0.17605, 0.17799, 0.1756, 0.17762, 0.17478, 0.17987, 0.17366, 0.17669, 0.17775, 0.17802, 0.17908, 0.17514, 0.17554, 0.17388, 0.17483, 0.17431, 0.17275, 0.17497, 0.17541, 0.17514, 0.17686, 0.17728, 0.17469, 0.17508, 0.17519, 0.17517, 0.17377, 0.17594, 0.17621, 0.17553, 0.17702, 0.18, 0.17602, 0.17593, 0.17864, 0.17997, 0.1755, 0.17822, 0.17772, 0.17671, 0.17725, 0.1778, 0.17809, 0.17954, 0.17593, 0.17541, 0.17441, 0.17679, 0.17798, 0.17778, 0.17724, 0.17552, 0.17811, 0.18023, 0.17981, 0.17557, 0.17566, 0.17625, 0.17625, 0.17558, 0.19425, 0.1762, 0.17767, 0.17763, 0.18372, 0.17971, 0.17752, 0.18218, 0.18258, 0.18042, 0.18083, 0.17934, 0.18263, 0.17612, 0.17585, 0.18209, 0.17892, 0.17504, 0.18056, 0.18269, 0.18216, 0.18105, 0.18046, 0.17895, 0.18001, 0.18287, 0.18048, 0.18107, 0.1792, 0.177, 0.17595, 0.17833, 0.17997, 0.18026, 0.18064, 0.18103, 0.18122, 0.1807, 0.17741, 0.17696, 0.175, 0.17708, 0.17762, 0.17496, 0.17994, 0.17504, 0.17879, 0.18178, 0.1796, 0.18007, 0.18397, 0.18212, 0.18076, 0.18234, 0.18066, 0.18359, 0.18244, 0.18094, 0.18093, 0.17869, 0.18132, 0.18028, 0.18293, 0.17692, 0.181, 0.1778, 0.178, 0.18006, 0.18483, 0.18337, 0.18495, 0.18069, 0.18012, 0.18124, 0.18343, 0.17705, 0.17668, 0.17849, 0.18112, 0.17754, 0.1764, 0.17576, 0.17489, 0.17603, 0.17867, 0.17875, 0.17778, 0.17783, 0.18028, 0.18098, 0.18147, 0.18117, 0.17707, 0.17356, 0.17855, 0.17723, 0.175, 0.17556, 0.17674, 0.17749, 0.17698, 0.17866, 0.17541, 0.17473, 0.17725, 0.17976, 0.17814, 0.17815, 0.17912, 0.17571, 0.18059, 0.18163, 0.17964, 0.17657, 0.1773, 0.17872, 0.18756, 0.18502, 0.17691, 0.17601, 0.1773, 0.17751, 0.17745, 0.18072, 0.17998, 0.17849, 0.18172, 0.17785, 0.18296, 0.17966, 0.18029, 0.17622, 0.17684, 0.17683, 0.17525, 0.17514, 0.17546, 0.17768, 0.17616, 0.17827, 0.17873, 0.18236, 0.17864, 0.17902, 0.17866, 0.17537, 0.17824, 0.17634, 0.17765, 0.17745, 0.17691, 0.17855, 0.17773, 0.1776, 0.17553, 0.17612, 0.17682, 0.17445, 0.17573, 0.17792, 0.17697, 0.17758, 0.17799, 0.18179, 0.17862, 0.17828, 0.17902, 0.17716, 0.17378, 0.17466, 0.17969, 0.17531, 0.17449, 0.1762, 0.17533, 0.17786, 0.17799, 0.1739, 0.17695, 0.17997, 0.17727, 0.17594, 0.17599, 0.17877, 0.17835, 0.17768, 0.17619, 0.1761, 0.17947, 0.18082, 0.17999, 0.17973, 0.18161, 0.17878, 0.18107, 0.17669, 0.17787, 0.17714, 0.17987, 0.17952, 0.18139, 0.1814, 0.17879, 0.17819, 0.17967, 0.17842, 0.18204, 0.17981, 0.18039, 0.1779, 0.17786, 0.18096, 0.17907, 0.17853, 0.17539, 0.17682, 0.17666, 0.17653, 0.17793, 0.17688, 0.1782, 0.17909, 0.17471, 0.17743, 0.17531, 0.17878, 0.17697, 0.1762, 0.17958, 0.17827, 0.17938, 0.17923, 0.17797, 0.1763, 0.17776, 0.18097, 0.17754, 0.18018, 0.17934, 
0.1806, 0.1751, 0.17845, 0.18106, 0.17667, 0.17809, 0.17911, 0.17624, 0.17874, 0.1795, 0.17661, 0.18214, 0.18117, 0.17941, 0.17482, 0.17595, 0.17616, 0.17509, 0.17725, 0.17932, 0.18085, 0.18292, 0.17986, 0.17974, 0.17799, 0.17756, 0.17851, 0.17744, 0.17724, 0.17992, 0.18197, 0.18128, 0.1816, 0.17718, 0.1781, 0.18028, 0.17962, 0.18211, 0.17904, 0.18027, 0.179, 0.1805, 0.18514, 0.18111, 0.17608, 0.18024, 0.1833, 0.1823, 0.1797, 0.17902, 0.18251, 0.18061, 0.17877, 0.17926]}, "batch-generator-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.60562, 0.0038, 0.00384, 0.00379, 0.00392, 0.00392, 0.00391, 0.00387, 0.00391, 0.00397, 0.00392, 0.00405, 0.00383, 0.00388, 0.00387, 0.0042, 0.00394, 0.00394, 0.00387, 0.00379, 0.00413, 0.00393, 0.00403, 0.00383, 0.00384, 0.004, 0.0044, 0.00355, 0.00419, 0.00392, 0.00399, 0.00394, 0.0037, 0.00364, 0.00369, 0.00383, 0.00379, 0.00369, 0.0038, 0.00364, 0.00377, 0.00393, 0.00365, 0.00367, 0.00383, 0.00366, 0.00382, 0.00371, 0.00355, 0.00439, 0.00359, 0.00368, 0.00365, 0.00383, 0.00363, 0.00374, 0.00373, 0.00378, 0.00373, 0.00352, 0.00362, 0.0036, 0.00343, 0.00349, 0.00382, 0.00374, 0.00356, 0.00374, 0.00365, 0.00391, 0.0037, 0.00375, 0.00369, 0.00366, 0.00397, 0.00372, 0.00358, 0.00365, 0.00406, 0.00355, 0.00339, 0.00398, 0.00424, 0.0036, 0.00363, 0.00389, 0.00371, 0.00377, 0.00362, 0.00383, 0.00373, 0.0037, 0.00388, 0.00356, 0.00358, 0.00363, 0.00387, 0.00375, 0.00383, 0.00372, 0.00369, 0.00374, 0.00411, 0.00364, 0.0039, 0.00376, 0.00383, 0.00364, 0.00379, 0.00378, 0.00364, 0.00365, 0.00392, 0.00347, 0.00361, 0.00377, 0.00359, 0.00364, 0.00383, 0.00375, 0.00368, 0.00367, 0.0041, 0.00379, 0.00359, 0.00366, 0.00379, 0.00376, 0.00387, 0.00368, 0.00361, 0.00375, 0.00401, 0.0038, 0.00393, 0.00377, 0.00358, 0.00402, 0.00479, 0.00399, 0.00374, 0.00392, 0.00379, 0.00391, 0.00355, 0.00378, 0.00356, 0.00362, 0.0036, 0.00351, 0.00348, 0.00422, 0.00355, 0.00359, 0.00351, 0.00373, 0.00362, 0.00377, 0.00378, 0.00386, 0.0037, 0.00367, 0.00361, 0.0038, 0.00392, 0.00338, 0.00354, 0.00357, 0.00375, 0.00369, 0.0038, 0.0036, 0.00386, 0.00388, 0.00354, 0.00367, 0.00381, 0.00354, 0.00366, 0.0038, 0.00367, 0.00378, 0.00363, 0.00368, 0.00358, 0.00359, 0.00373, 0.00355, 0.00402, 0.00361, 0.00364, 0.00369, 0.0035, 0.00356, 0.00387, 0.00375, 0.00381, 0.0038, 0.00396, 0.00375, 0.03419, 0.00346, 0.00373, 0.00413, 0.0035, 0.00359, 0.00362, 0.00344, 0.00367, 0.00349, 0.00362, 0.00369, 0.00353, 0.00388, 0.00372, 0.00358, 0.0036, 0.00347, 0.00344, 0.00368, 0.00381, 0.00355, 0.00366, 0.0035, 0.00362, 0.00372, 0.0037, 0.00382, 0.00365, 0.00381, 0.00385, 0.00362, 0.00358, 0.00369, 0.00374, 0.00368, 0.00355, 0.00377, 0.00348, 0.00351, 0.00355, 0.00339, 0.00354, 0.00335, 0.00357, 0.00367, 0.00363, 0.00377, 0.00357, 0.00363, 0.00374, 0.00361, 0.00358, 0.00354, 0.00336, 0.00361, 0.00371, 0.00365, 0.00354, 0.00394, 0.00379, 0.00378, 0.00379, 0.00401, 0.00398, 0.00384, 0.00395, 0.0042, 0.00424, 0.00421, 0.00426, 0.00442, 0.00415, 0.00404, 0.0043, 0.00406, 0.00434, 0.00442, 0.00416, 0.0043, 0.00409, 0.00403, 0.00412, 0.004, 0.00407, 0.00448, 0.00415, 0.00407, 0.0041, 0.0041, 0.00402, 0.00417, 0.00421, 0.00402, 0.00399, 0.00398, 0.00422, 0.00414, 0.00414, 0.00417, 0.00412, 0.004, 0.00405, 0.00393, 0.00399, 0.00391, 0.00392, 0.00387, 0.00417, 0.00413, 0.00408, 0.004, 0.00415, 0.00409, 0.00421, 0.00397, 0.00405, 0.00396, 0.00405, 0.00404, 0.00407, 0.00408, 0.00399, 0.004, 0.00392, 0.00412, 0.00432, 0.00438, 0.00426, 0.00415, 0.00429, 0.00422, 0.00401, 0.00419, 0.0041, 0.00398, 
0.00406, 0.00453, 0.00398, 0.00413, 0.00404, 0.00406, 0.00404, 0.00404, 0.0041, 0.00409, 0.00402, 0.00399, 0.0041, 0.00413, 0.00436, 0.00417, 0.00418, 0.00424, 0.00423, 0.00429, 0.00425, 0.00417, 0.00427, 0.00432, 0.00421, 0.00425, 0.00421, 0.00433, 0.00423, 0.00439, 0.00428, 0.00423, 0.00424, 0.0041, 0.00423, 0.00424, 0.00433, 0.00424, 0.00436, 0.0043, 0.00407, 0.00429, 0.0041, 0.00429, 0.00431, 0.00428, 0.0043, 0.00425, 0.00416, 0.00427, 0.00405, 0.00443, 0.00417, 0.0042, 0.00449, 0.00406, 0.004, 0.00406, 0.0042, 0.00421, 0.00409, 0.00421, 0.00421, 0.00413]}, "layernorm-grads-all-reduce-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [3e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 2e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05]}, "embedding-grads-all-reduce-time": {"start_step": 0, "end_step": 2000, 
"step_interval": 5, "values": [4e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 3e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 3e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 5e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 3e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 3e-05, 2e-05, 2e-05, 2e-05, 2e-05, 3e-05, 2e-05, 2e-05, 2e-05, 2e-05, 3e-05, 2e-05, 2e-05, 2e-05, 2e-05, 3e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 3e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 3e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 3e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 3e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05]}, "all-grads-sync-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [5.81083, 0.0018, 0.00179, 0.00169, 0.00153, 0.00181, 0.00157, 0.00183, 0.00159, 0.00178, 0.00159, 0.00178, 0.00153, 0.00181, 0.0016, 0.0018, 0.00158, 0.00176, 0.00155, 0.00182, 0.00162, 0.00179, 0.00159, 0.00178, 0.0016, 0.00183, 0.00159, 0.00181, 0.0016, 0.00181, 0.00161, 0.0018, 0.00156, 0.00165, 0.0016, 0.00177, 0.00157, 0.00177, 0.00159, 0.00175, 0.00158, 0.00178, 0.00159, 0.00182, 0.00158, 0.00177, 0.00158, 0.00177, 0.00159, 0.00179, 0.00155, 0.00183, 0.00158, 0.00178, 0.00156, 0.00181, 0.00154, 0.0018, 0.00154, 0.00178, 0.00159, 0.00181, 0.00157, 0.00181, 0.00155, 0.00183, 0.00159, 0.0018, 0.00155, 0.00179, 0.00158, 
0.00181, 0.00159, 0.00179, 0.00153, 0.00178, 0.00157, 0.00178, 0.00156, 0.00176, 0.00156, 0.00179, 0.00157, 0.00182, 0.00152, 0.00181, 0.00152, 0.00183, 0.00157, 0.00179, 0.00159, 0.00187, 0.00159, 0.00182, 0.00156, 0.0018, 0.00161, 0.0018, 0.00157, 0.00176, 0.00159, 0.00179, 0.00157, 0.00182, 0.00158, 0.0018, 0.0016, 0.00182, 0.00159, 0.00172, 0.00157, 0.00179, 0.00154, 0.00166, 0.00158, 0.00176, 0.00159, 0.00184, 0.00156, 0.00179, 0.00157, 0.00174, 0.00157, 0.00173, 0.00157, 0.0018, 0.00159, 0.00181, 0.00156, 0.00183, 0.00157, 0.00181, 0.00158, 0.00179, 0.00157, 0.00184, 0.00158, 0.00174, 0.00163, 0.00175, 0.00158, 0.0018, 0.00152, 0.00183, 0.00158, 0.00174, 0.00159, 0.00179, 0.00155, 0.00182, 0.00157, 0.0018, 0.00159, 0.00183, 0.00156, 0.00181, 0.00158, 0.00176, 0.00158, 0.00176, 0.00156, 0.00178, 0.00158, 0.00181, 0.00153, 0.0018, 0.00155, 0.0018, 0.0016, 0.0019, 0.0016, 0.00175, 0.0016, 0.0018, 0.00153, 0.00178, 0.00158, 0.0018, 0.00156, 0.00172, 0.00159, 0.00182, 0.00157, 0.00175, 0.00157, 0.00173, 0.00156, 0.00186, 0.00158, 0.00178, 0.00158, 0.00188, 0.00159, 0.00181, 0.00153, 0.00175, 0.00155, 0.00181, 0.00156, 0.00181, 0.00177, 0.00157, 0.00162, 0.00165, 0.00173, 0.00157, 0.00173, 0.00165, 0.00167, 0.00151, 0.00172, 0.00167, 0.00174, 0.00157, 0.00168, 0.00168, 0.00174, 0.00157, 0.00175, 0.00166, 0.00174, 0.00154, 0.00174, 0.00167, 0.00171, 0.00159, 0.00174, 0.00165, 0.00173, 0.00159, 0.00174, 0.00162, 0.00175, 0.00157, 0.00174, 0.00167, 0.00172, 0.00156, 0.00174, 0.00164, 0.00175, 0.00154, 0.00161, 0.0016, 0.00174, 0.00156, 0.00179, 0.00167, 0.00167, 0.00155, 0.00175, 0.00167, 0.00173, 0.00158, 0.00176, 0.00166, 0.00173, 0.00157, 0.00173, 0.00161, 0.00176, 0.0016, 0.00168, 0.00162, 0.00174, 0.00158, 0.00174, 0.00167, 0.00174, 0.00158, 0.00168, 0.00161, 0.00175, 0.00159, 0.00173, 0.00168, 0.00175, 0.00158, 0.00174, 0.00163, 0.00176, 0.00153, 0.00175, 0.00168, 0.00168, 0.00153, 0.00172, 0.00165, 0.00175, 0.00159, 0.00174, 0.00164, 0.00176, 0.00153, 0.00171, 0.00162, 0.00173, 0.00156, 0.00174, 0.00165, 0.00168, 0.00158, 0.00174, 0.00167, 0.00176, 0.00158, 0.00175, 0.00167, 0.00174, 0.00158, 0.00168, 0.00166, 0.00173, 0.00157, 0.00176, 0.00161, 0.00173, 0.00159, 0.00178, 0.00165, 0.00174, 0.00156, 0.00167, 0.00163, 0.00165, 0.00158, 0.00173, 0.00162, 0.00176, 0.00157, 0.00173, 0.00166, 0.00173, 0.0016, 0.0018, 0.00165, 0.00172, 0.00159, 0.00168, 0.00165, 0.00175, 0.00154, 0.00171, 0.00164, 0.00169, 0.00153, 0.00175, 0.00166, 0.00175, 0.00159, 0.00176, 0.00164, 0.00172, 0.00159, 0.00169, 0.00166, 0.00173, 0.00153, 0.00167, 0.00164, 0.00172, 0.00159, 0.00167, 0.00168, 0.00175, 0.00157, 0.00173, 0.00167, 0.00172, 0.0016, 0.00173, 0.00166, 0.00175, 0.00153, 0.00174, 0.00163, 0.00172, 0.00157, 0.00167, 0.00165, 0.00171, 0.00159, 0.00175, 0.00166, 0.00166, 0.00158, 0.00166, 0.00164, 0.00167, 0.00157, 0.0017, 0.00168, 0.00169, 0.00158, 0.00176, 0.00168, 0.00172, 0.00157, 0.00173, 0.00167]}, "optimizer-copy-to-main-grad-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.00181, 0.00152, 0.00153, 0.0015, 0.00157, 0.00156, 0.00152, 0.00157, 0.00162, 0.0015, 0.00152, 0.00155, 0.00152, 0.00155, 0.00155, 0.00161, 0.00151, 0.00151, 0.00196, 0.0015, 0.00161, 0.0015, 0.00162, 0.00161, 0.00157, 0.00151, 0.0015, 0.0015, 0.00156, 0.00153, 0.00171, 0.00252, 0.00165, 0.0018, 0.00159, 0.00153, 0.00157, 0.00159, 0.00159, 0.00157, 0.00156, 0.00163, 0.00152, 0.0015, 0.00163, 0.00153, 0.00149, 0.00156, 0.00156, 0.00152, 0.00157, 0.00152, 0.0016, 0.00159, 0.00155, 0.00157, 0.00157, 
0.00156, 0.00151, 0.00156, 0.00152, 0.00151, 0.00157, 0.00157, 0.00163, 0.00153, 0.00158, 0.00155, 0.00149, 0.00161, 0.0015, 0.00156, 0.00151, 0.00162, 0.00158, 0.00148, 0.00156, 0.0015, 0.00157, 0.00151, 0.00155, 0.00155, 0.00161, 0.0027, 0.00157, 0.00156, 0.00156, 0.00151, 0.00156, 0.00149, 0.00158, 0.0015, 0.00152, 0.00156, 0.00155, 0.0024, 0.00156, 0.0016, 0.00156, 0.0015, 0.0016, 0.00155, 0.00151, 0.00154, 0.00158, 0.0015, 0.0015, 0.00155, 0.00156, 0.00155, 0.00157, 0.0015, 0.0015, 0.00155, 0.00157, 0.00155, 0.00157, 0.0015, 0.00157, 0.00155, 0.00155, 0.0015, 0.00164, 0.0016, 0.00151, 0.0015, 0.00165, 0.00151, 0.00157, 0.00157, 0.00158, 0.00154, 0.00157, 0.0016, 0.0016, 0.00149, 0.00154, 0.00156, 0.00333, 0.00159, 0.00153, 0.00149, 0.00149, 0.00166, 0.00165, 0.00158, 0.00149, 0.00155, 0.00152, 0.00155, 0.00156, 0.00152, 0.00155, 0.00156, 0.00164, 0.00155, 0.00156, 0.00152, 0.00166, 0.00153, 0.0015, 0.0015, 0.00155, 0.00156, 0.00158, 0.00149, 0.00165, 0.00155, 0.0015, 0.0015, 0.0015, 0.00154, 0.00155, 0.00165, 0.00156, 0.00155, 0.0015, 0.00148, 0.00154, 0.00156, 0.00156, 0.0015, 0.00148, 0.00157, 0.00152, 0.0015, 0.00149, 0.00157, 0.00149, 0.00149, 0.0015, 0.0028, 0.0015, 0.00151, 0.00157, 0.00155, 0.00148, 0.0015, 0.00169, 0.00149, 0.0015, 0.00159, 0.00155, 0.00149, 0.0015, 0.00148, 0.00149, 0.00154, 0.00155, 0.00149, 0.00147, 0.00149, 0.00156, 0.00148, 0.00146, 0.00151, 0.00152, 0.00147, 0.00147, 0.00147, 0.00155, 0.00147, 0.00148, 0.00144, 0.0015, 0.0015, 0.00159, 0.00156, 0.00149, 0.00151, 0.0016, 0.00149, 0.0015, 0.00154, 0.0015, 0.00147, 0.00147, 0.00154, 0.00156, 0.00153, 0.0015, 0.0015, 0.002, 0.00151, 0.00246, 0.0015, 0.00147, 0.00144, 0.00148, 0.00171, 0.00148, 0.0015, 0.00157, 0.00174, 0.00156, 0.00157, 0.00148, 0.00147, 0.00149, 0.00148, 0.0015, 0.00148, 0.00151, 0.00158, 0.00149, 0.00147, 0.00153, 0.00151, 0.00154, 0.00148, 0.00157, 0.00157, 0.00148, 0.0016, 0.00153, 0.00155, 0.00156, 0.00157, 0.00149, 0.00154, 0.00148, 0.00151, 0.00149, 0.00155, 0.00148, 0.00155, 0.00155, 0.0015, 0.00149, 0.0015, 0.00149, 0.00153, 0.00164, 0.0016, 0.0015, 0.00153, 0.00149, 0.00158, 0.00154, 0.00149, 0.00154, 0.00165, 0.00151, 0.00148, 0.00158, 0.00157, 0.00158, 0.0015, 0.00149, 0.00154, 0.00152, 0.00155, 0.00158, 0.00149, 0.00157, 0.0015, 0.00158, 0.00163, 0.00159, 0.00158, 0.00159, 0.00157, 0.00157, 0.0015, 0.00151, 0.00151, 0.00154, 0.00154, 0.00159, 0.00155, 0.00155, 0.00148, 0.00198, 0.00154, 0.00149, 0.00156, 0.00151, 0.00157, 0.00149, 0.00148, 0.00151, 0.00154, 0.00153, 0.00148, 0.00151, 0.00149, 0.0015, 0.00155, 0.00155, 0.00151, 0.00156, 0.00154, 0.0015, 0.0015, 0.00151, 0.00157, 0.00156, 0.00158, 0.0015, 0.00155, 0.00148, 0.00153, 0.00151, 0.0015, 0.0015, 0.00152, 0.00151, 0.00156, 0.00158, 0.00151, 0.0015, 0.00149, 0.00156, 0.00156, 0.00157, 0.0015, 0.00148, 0.00158, 0.00158, 0.00156, 0.00155, 0.00154, 0.00165, 0.00162, 0.00157, 0.00166, 0.0015, 0.00156, 0.00155, 0.00152, 0.00152, 0.00154, 0.0015, 0.00153, 0.0016, 0.0015, 0.00151, 0.00152, 0.00155, 0.00155]}, "optimizer-unscale-and-check-inf-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.60633, 0.00085, 0.00071, 0.0006, 0.00062, 0.0006, 0.00062, 0.00062, 0.00063, 0.00059, 0.00063, 0.00062, 0.00063, 0.00063, 0.00063, 0.00068, 0.00062, 0.00063, 0.00065, 0.00064, 0.00064, 0.0006, 0.00063, 0.00064, 0.00063, 0.00061, 0.00062, 0.00062, 0.00063, 0.00061, 0.0007, 0.00092, 0.00063, 0.00071, 0.00063, 0.00069, 0.00063, 0.00062, 0.00063, 0.00063, 0.00064, 0.0006, 0.00061, 0.00064, 0.00062, 0.00063, 0.00061, 
0.00065, 0.00062, 0.00062, 0.0006, 0.00062, 0.00067, 0.00061, 0.00062, 0.00062, 0.00061, 0.00063, 0.00061, 0.00061, 0.0006, 0.00062, 0.00061, 0.00062, 0.00062, 0.00062, 0.00064, 0.00061, 0.00062, 0.00063, 0.00061, 0.00062, 0.00061, 0.00065, 0.00063, 0.0006, 0.0006, 0.0006, 0.00064, 0.00063, 0.00064, 0.0006, 0.00061, 0.00077, 0.00062, 0.00062, 0.00062, 0.00061, 0.00061, 0.00064, 0.00062, 0.0006, 0.00062, 0.00062, 0.00059, 0.00067, 0.00061, 0.00065, 0.0006, 0.00061, 0.00063, 0.00062, 0.00063, 0.00063, 0.00062, 0.0006, 0.00061, 0.00062, 0.00062, 0.0006, 0.00063, 0.00061, 0.0006, 0.0006, 0.00059, 0.00061, 0.0006, 0.00063, 0.00062, 0.00062, 0.00062, 0.00059, 0.00063, 0.0006, 0.00062, 0.00062, 0.00062, 0.00059, 0.00062, 0.00063, 0.0006, 0.00061, 0.0006, 0.00067, 0.00069, 0.00061, 0.00061, 0.00063, 0.00074, 0.0006, 0.00061, 0.00061, 0.00061, 0.00066, 0.00071, 0.00062, 0.00061, 0.0006, 0.00061, 0.00063, 0.0006, 0.00063, 0.00062, 0.00063, 0.00061, 0.00063, 0.00063, 0.00063, 0.00064, 0.00063, 0.00065, 0.00064, 0.00062, 0.00061, 0.00063, 0.00061, 0.00062, 0.00061, 0.00062, 0.00062, 0.00061, 0.00063, 0.00063, 0.00064, 0.00063, 0.00063, 0.00062, 0.00063, 0.00061, 0.00064, 0.00067, 0.0006, 0.00061, 0.00062, 0.00071, 0.00062, 0.00059, 0.00063, 0.00062, 0.0006, 0.00061, 0.00065, 0.00061, 0.00062, 0.00063, 0.00063, 0.00062, 0.00061, 0.00065, 0.00061, 0.00059, 0.0006, 0.00062, 0.0006, 0.00063, 0.00063, 0.0006, 0.00061, 0.00059, 0.00062, 0.00062, 0.0006, 0.00064, 0.00058, 0.00059, 0.00063, 0.00059, 0.0006, 0.00059, 0.00061, 0.00063, 0.00063, 0.0006, 0.0006, 0.00062, 0.0006, 0.00061, 0.00062, 0.00059, 0.00063, 0.0006, 0.00063, 0.0006, 0.00063, 0.00061, 0.00076, 0.0006, 0.00059, 0.00059, 0.00059, 0.0006, 0.00063, 0.00067, 0.00062, 0.00096, 0.00064, 0.00063, 0.00065, 0.00059, 0.00066, 0.00059, 0.0006, 0.00063, 0.00062, 0.00061, 0.00063, 0.00062, 0.00063, 0.00063, 0.00063, 0.0006, 0.00064, 0.00062, 0.00067, 0.00059, 0.00061, 0.00062, 0.00061, 0.00062, 0.0006, 0.0006, 0.00063, 0.00062, 0.00066, 0.00063, 0.00062, 0.00061, 0.00062, 0.00063, 0.00065, 0.00063, 0.00062, 0.00064, 0.00064, 0.00062, 0.00061, 0.00062, 0.00065, 0.00062, 0.00062, 0.00059, 0.00063, 0.00064, 0.0006, 0.00063, 0.00063, 0.00062, 0.00064, 0.00061, 0.00063, 0.00061, 0.0006, 0.00063, 0.00064, 0.00067, 0.00066, 0.00063, 0.00062, 0.00061, 0.00063, 0.00061, 0.00063, 0.00062, 0.00062, 0.00063, 0.00064, 0.00063, 0.00061, 0.00063, 0.00062, 0.00066, 0.00062, 0.00062, 0.00062, 0.00062, 0.00063, 0.00066, 0.00062, 0.00067, 0.00068, 0.00094, 0.00061, 0.00091, 0.00064, 0.00062, 0.00061, 0.00062, 0.00062, 0.00061, 0.00062, 0.00061, 0.00063, 0.00059, 0.00062, 0.00062, 0.00062, 0.00062, 0.00062, 0.00062, 0.00059, 0.00066, 0.00062, 0.00062, 0.0006, 0.00062, 0.00061, 0.00063, 0.00062, 0.00062, 0.00062, 0.00059, 0.0006, 0.00061, 0.0006, 0.00062, 0.00063, 0.00063, 0.00061, 0.00063, 0.00064, 0.00061, 0.00062, 0.00062, 0.00062, 0.00093, 0.00063, 0.00063, 0.00063, 0.00062, 0.00059, 0.00061, 0.00062, 0.00062, 0.00064, 0.00062, 0.00064, 0.00063, 0.00064, 0.00064, 0.00063, 0.00062, 0.00063, 0.00062, 0.00062, 0.00066, 0.00064, 0.00074, 0.00063, 0.00063, 0.00062]}, "optimizer-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.60837, 0.00254, 0.00241, 0.00228, 0.01048, 0.01037, 0.01037, 0.01043, 0.01058, 0.01048, 0.01043, 0.01043, 0.01041, 0.0104, 0.01041, 0.01065, 0.01035, 0.01034, 0.01163, 0.01037, 0.01065, 0.01028, 0.01071, 0.01072, 0.01046, 0.0103, 0.01034, 0.01036, 0.01049, 0.01035, 0.01149, 0.01326, 0.01057, 0.0123, 0.01043, 0.0108, 0.01045, 
0.01043, 0.01054, 0.01044, 0.01042, 0.01047, 0.01038, 0.01036, 0.01051, 0.01045, 0.01031, 0.01066, 0.01039, 0.01038, 0.01045, 0.01039, 0.01082, 0.01041, 0.01037, 0.01039, 0.0104, 0.01052, 0.01036, 0.01042, 0.01043, 0.01041, 0.01041, 0.01038, 0.01048, 0.01055, 0.01067, 0.01037, 0.01034, 0.01046, 0.01031, 0.01091, 0.01032, 0.01102, 0.0105, 0.01027, 0.01037, 0.01029, 0.01047, 0.0104, 0.01046, 0.01038, 0.01047, 0.01178, 0.0104, 0.01074, 0.01048, 0.01035, 0.01038, 0.01049, 0.01045, 0.01029, 0.0104, 0.01038, 0.01035, 0.01254, 0.01037, 0.01078, 0.01036, 0.01033, 0.01045, 0.01036, 0.01034, 0.01037, 0.01041, 0.01036, 0.01033, 0.01079, 0.01038, 0.01041, 0.01023, 0.01009, 0.01031, 0.01035, 0.01038, 0.01037, 0.01044, 0.01035, 0.01041, 0.01038, 0.01021, 0.0103, 0.01049, 0.01051, 0.01036, 0.01032, 0.01054, 0.01033, 0.01041, 0.01043, 0.01041, 0.01037, 0.01014, 0.01109, 0.01092, 0.01032, 0.01033, 0.01042, 0.02222, 0.01043, 0.01036, 0.01031, 0.01034, 0.01109, 0.01102, 0.01041, 0.01027, 0.01035, 0.0103, 0.01041, 0.01036, 0.01039, 0.01035, 0.01041, 0.01048, 0.01069, 0.01042, 0.01035, 0.01064, 0.01041, 0.01045, 0.01034, 0.01039, 0.01039, 0.01043, 0.01033, 0.01133, 0.01034, 0.01033, 0.01034, 0.01031, 0.01035, 0.0104, 0.01052, 0.01043, 0.01047, 0.01036, 0.01029, 0.01035, 0.01042, 0.01057, 0.0103, 0.0103, 0.01039, 0.0109, 0.0103, 0.0103, 0.0105, 0.01036, 0.01034, 0.01033, 0.01214, 0.01032, 0.0103, 0.01039, 0.01085, 0.01031, 0.01031, 0.01064, 0.01141, 0.01028, 0.01048, 0.01035, 0.01021, 0.01033, 0.01032, 0.01023, 0.01127, 0.01075, 0.01024, 0.01023, 0.01023, 0.01033, 0.01036, 0.01017, 0.01034, 0.01026, 0.01036, 0.01019, 0.01026, 0.01033, 0.01163, 0.0102, 0.01023, 0.01031, 0.01033, 0.01042, 0.01049, 0.01036, 0.01032, 0.01053, 0.01033, 0.01034, 0.01037, 0.01037, 0.01078, 0.01026, 0.01052, 0.01028, 0.01028, 0.01025, 0.01028, 0.01147, 0.01035, 0.01173, 0.01035, 0.01038, 0.01027, 0.01027, 0.01065, 0.01023, 0.01027, 0.01043, 0.01054, 0.01038, 0.01054, 0.01028, 0.01026, 0.0103, 0.01038, 0.0104, 0.0103, 0.0104, 0.01114, 0.01027, 0.01028, 0.01042, 0.01027, 0.01037, 0.01028, 0.01061, 0.01066, 0.01034, 0.0108, 0.01035, 0.01037, 0.01038, 0.01034, 0.01138, 0.01141, 0.01027, 0.01041, 0.01039, 0.01039, 0.01031, 0.01042, 0.01036, 0.01077, 0.01045, 0.01035, 0.0105, 0.01039, 0.01057, 0.01041, 0.01033, 0.01039, 0.01029, 0.0106, 0.01032, 0.01029, 0.01034, 0.01044, 0.01035, 0.01034, 0.0111, 0.01066, 0.01041, 0.0103, 0.01025, 0.01038, 0.01037, 0.01064, 0.0105, 0.0103, 0.01048, 0.01051, 0.01052, 0.01041, 0.0104, 0.01041, 0.01044, 0.01036, 0.01043, 0.01038, 0.01034, 0.01033, 0.01126, 0.01037, 0.01044, 0.01078, 0.01116, 0.01162, 0.01139, 0.01058, 0.0105, 0.01061, 0.01053, 0.01057, 0.01058, 0.01058, 0.01057, 0.0106, 0.01051, 0.01054, 0.01067, 0.0109, 0.01057, 0.01057, 0.01057, 0.01051, 0.01063, 0.01186, 0.0105, 0.01054, 0.01053, 0.01061, 0.01062, 0.01089, 0.01057, 0.0106, 0.01047, 0.01071, 0.0105, 0.01049, 0.01052, 0.01054, 0.01057, 0.0106, 0.01078, 0.01062, 0.01067, 0.01052, 0.01059, 0.01061, 0.01212, 0.01052, 0.01054, 0.01063, 0.0106, 0.01057, 0.01098, 0.01059, 0.01077, 0.01074, 0.01076, 0.01115, 0.01053, 0.01121, 0.01063, 0.01056, 0.01057, 0.01061, 0.01059, 0.01061, 0.01076, 0.01059, 0.01075, 0.01057, 0.01058, 0.01057]}, "learning-rate": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 
1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05]}, "learning-rate vs samples": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 
3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05]}, "batch-size": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 
128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0]}, "batch-size vs samples": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 
128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0]}, "lm loss": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [10.89393, 10.90229, 10.90382, 10.89922, 10.90215, 10.87439, 10.80338, 10.63346, 10.44036, 10.2933, 10.02711, 10.16747, 10.13781, 9.86192, 9.97684, 9.67806, 9.59835, 9.78149, 9.50324, 9.44529, 9.35262, 9.25422, 9.27971, 9.09386, 9.28651, 9.15722, 9.24673, 9.26197, 9.39815, 9.08902, 9.03506, 9.14524, 9.15344, 8.76086, 8.82546, 8.85801, 8.78594, 8.83766, 8.7627, 8.8693, 8.76505, 8.95513, 8.94138, 8.60415, 8.49526, 8.5414, 8.6052, 8.49378, 8.54563, 8.69589, 8.47931, 8.31047, 8.34191, 8.33761, 8.38482, 8.03117, 8.21698, 8.01005, 8.36597, 8.35171, 8.1238, 8.08903, 8.03892, 7.85884, 7.86204, 7.76178, 7.63785, 8.03256, 7.82491, 7.57767, 7.87018, 7.89663, 7.66576, 7.41891, 7.57945, 7.45949, 7.58407, 7.3365, 7.75478, 7.39312, 7.46005, 7.32601, 7.32261, 7.53324, 7.28432, 7.3906, 7.10455, 7.1031, 7.135, 7.2333, 6.91495, 7.07308, 7.17321, 7.08148, 6.95568, 6.83552, 7.07146, 7.13597, 6.77633, 6.6537, 6.79923, 6.81094, 6.80156, 6.80623, 6.72479, 6.46997, 6.7029, 6.67891, 6.50414, 6.69017, 6.80201, 6.66742, 6.78223, 6.74908, 6.68039, 6.55851, 6.65127, 6.45882, 6.71595, 6.3003, 6.29947, 6.35127, 6.43626, 6.39728, 6.5005, 6.33652, 6.38489, 6.2805, 6.24364, 6.44007, 6.36837, 6.36408, 6.20465, 6.19665, 6.27951, 6.42484, 6.24039, 6.18602, 6.21368, 6.14857, 6.09651, 6.10359, 6.28963, 6.44182, 6.28988, 6.33247, 6.13546, 6.21108, 6.0349, 6.06273, 5.987, 6.28025, 6.22641, 5.99808, 5.81837, 6.16027, 5.88364, 6.139, 5.82189, 6.19536, 6.17777, 6.11785, 5.96408, 6.14649, 5.9753, 6.22609, 5.92665, 5.82529, 5.80636, 5.7182, 6.04353, 6.02584, 6.092, 5.9119, 6.06757, 5.99273, 6.02669, 6.01523, 5.97662, 5.86429, 5.97653, 5.6431, 5.7275, 5.9135, 5.8664, 5.88797, 5.78842, 5.86055, 5.75215, 5.58542, 5.74699, 5.6532, 5.85871, 5.63063, 5.7325, 5.73883, 5.92312, 5.66992, 5.87123, 5.76346, 5.89613, 5.35339, 5.91985, 5.89554, 5.87623, 5.43362, 5.42829, 5.64744, 5.61678, 5.5103, 5.59917, 5.6988, 5.49854, 5.77013, 5.53314, 5.61954, 5.64553, 5.64008, 5.53513, 5.63528, 5.69717, 5.71522, 5.60874, 5.6802, 5.39435, 5.70021, 5.64782, 5.44435, 5.60824, 5.65007, 5.57098, 5.36362, 5.55798, 5.50433, 5.50082, 5.39457, 5.57452, 5.62082, 5.40855, 5.54177, 5.50319, 5.34993, 5.52256, 5.42475, 5.457, 5.33418, 5.08125, 5.49351, 5.58285, 5.72877, 5.42977, 5.613, 5.64847, 5.2484, 5.28756, 5.41008, 5.40961, 5.34061, 5.51276, 5.19903, 5.31256, 5.26266, 5.3907, 5.27539, 5.46188, 5.55243, 5.32608, 5.4523, 5.34935, 5.085, 5.3281, 5.26395, 5.31744, 5.12555, 5.28677, 5.2827, 5.486, 5.17172, 5.28031, 5.22155, 5.37027, 4.99359, 4.92973, 5.33403, 5.3997, 5.23719, 5.33061, 5.11473, 5.1717, 5.27268, 5.07733, 5.2767, 5.0858, 5.35129, 5.2583, 5.16657, 5.25468, 5.05243, 5.32453, 5.06278, 5.03705, 5.15134, 5.12068, 5.28265, 5.15883, 5.28883, 5.10618, 5.10727, 5.2621, 5.33107, 5.26622, 5.20237, 5.15543, 5.29779, 4.95636, 5.21799, 5.10164, 5.30924, 5.18679, 5.19599, 5.12317, 4.99367, 5.00306, 5.23171, 5.32198, 5.10695, 5.0647, 4.92646, 5.13309, 5.12718, 4.93681, 5.34691, 5.03142, 5.11047, 5.16889, 5.01087, 5.07032, 5.07588, 
5.00122, 5.08773, 5.16951, 4.98692, 5.18998, 4.93899, 4.92741, 5.07395, 5.00085, 4.91692, 4.78186, 4.94917, 5.12365, 5.02541, 5.02437, 5.33759, 4.96582, 5.00145, 5.05138, 4.81301, 4.74456, 5.00203, 5.04679, 4.88367, 4.95882, 5.05212, 5.03024, 4.82289, 4.89705, 4.91162, 4.83722, 4.75468, 5.01694, 4.75625, 5.21634, 4.78922, 4.99899, 4.74083, 4.79117, 4.82499, 4.65555, 4.66118, 4.84502, 4.812, 4.80818, 4.93087, 4.88819, 4.92996, 4.77146, 4.88927, 4.73848, 4.91779, 4.96467, 4.87947, 4.7104, 4.78793, 4.90438, 4.71479, 4.86815, 4.69617, 4.69095, 4.65249]}, "lm loss vs samples": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [10.89393, 10.90229, 10.90382, 10.89922, 10.90215, 10.87439, 10.80338, 10.63346, 10.44036, 10.2933, 10.02711, 10.16747, 10.13781, 9.86192, 9.97684, 9.67806, 9.59835, 9.78149, 9.50324, 9.44529, 9.35262, 9.25422, 9.27971, 9.09386, 9.28651, 9.15722, 9.24673, 9.26197, 9.39815, 9.08902, 9.03506, 9.14524, 9.15344, 8.76086, 8.82546, 8.85801, 8.78594, 8.83766, 8.7627, 8.8693, 8.76505, 8.95513, 8.94138, 8.60415, 8.49526, 8.5414, 8.6052, 8.49378, 8.54563, 8.69589, 8.47931, 8.31047, 8.34191, 8.33761, 8.38482, 8.03117, 8.21698, 8.01005, 8.36597, 8.35171, 8.1238, 8.08903, 8.03892, 7.85884, 7.86204, 7.76178, 7.63785, 8.03256, 7.82491, 7.57767, 7.87018, 7.89663, 7.66576, 7.41891, 7.57945, 7.45949, 7.58407, 7.3365, 7.75478, 7.39312, 7.46005, 7.32601, 7.32261, 7.53324, 7.28432, 7.3906, 7.10455, 7.1031, 7.135, 7.2333, 6.91495, 7.07308, 7.17321, 7.08148, 6.95568, 6.83552, 7.07146, 7.13597, 6.77633, 6.6537, 6.79923, 6.81094, 6.80156, 6.80623, 6.72479, 6.46997, 6.7029, 6.67891, 6.50414, 6.69017, 6.80201, 6.66742, 6.78223, 6.74908, 6.68039, 6.55851, 6.65127, 6.45882, 6.71595, 6.3003, 6.29947, 6.35127, 6.43626, 6.39728, 6.5005, 6.33652, 6.38489, 6.2805, 6.24364, 6.44007, 6.36837, 6.36408, 6.20465, 6.19665, 6.27951, 6.42484, 6.24039, 6.18602, 6.21368, 6.14857, 6.09651, 6.10359, 6.28963, 6.44182, 6.28988, 6.33247, 6.13546, 6.21108, 6.0349, 6.06273, 5.987, 6.28025, 6.22641, 5.99808, 5.81837, 6.16027, 5.88364, 6.139, 5.82189, 6.19536, 6.17777, 6.11785, 5.96408, 6.14649, 5.9753, 6.22609, 5.92665, 5.82529, 5.80636, 5.7182, 6.04353, 6.02584, 6.092, 5.9119, 6.06757, 5.99273, 6.02669, 6.01523, 5.97662, 5.86429, 5.97653, 5.6431, 5.7275, 5.9135, 5.8664, 5.88797, 5.78842, 5.86055, 5.75215, 5.58542, 5.74699, 5.6532, 5.85871, 5.63063, 5.7325, 5.73883, 5.92312, 5.66992, 5.87123, 5.76346, 5.89613, 5.35339, 5.91985, 5.89554, 5.87623, 5.43362, 5.42829, 5.64744, 5.61678, 5.5103, 5.59917, 5.6988, 5.49854, 5.77013, 5.53314, 5.61954, 5.64553, 5.64008, 5.53513, 5.63528, 5.69717, 5.71522, 5.60874, 5.6802, 5.39435, 5.70021, 5.64782, 5.44435, 5.60824, 5.65007, 5.57098, 5.36362, 5.55798, 5.50433, 5.50082, 5.39457, 5.57452, 5.62082, 5.40855, 5.54177, 5.50319, 5.34993, 5.52256, 5.42475, 5.457, 5.33418, 5.08125, 5.49351, 5.58285, 5.72877, 5.42977, 5.613, 5.64847, 5.2484, 5.28756, 5.41008, 5.40961, 5.34061, 5.51276, 5.19903, 5.31256, 5.26266, 5.3907, 5.27539, 5.46188, 5.55243, 5.32608, 5.4523, 5.34935, 5.085, 5.3281, 5.26395, 5.31744, 5.12555, 5.28677, 5.2827, 5.486, 5.17172, 5.28031, 5.22155, 5.37027, 4.99359, 4.92973, 5.33403, 5.3997, 5.23719, 5.33061, 5.11473, 5.1717, 5.27268, 5.07733, 5.2767, 5.0858, 5.35129, 5.2583, 5.16657, 5.25468, 5.05243, 5.32453, 5.06278, 5.03705, 5.15134, 5.12068, 5.28265, 5.15883, 5.28883, 5.10618, 5.10727, 5.2621, 5.33107, 5.26622, 5.20237, 5.15543, 5.29779, 4.95636, 5.21799, 5.10164, 5.30924, 5.18679, 5.19599, 5.12317, 4.99367, 5.00306, 5.23171, 5.32198, 5.10695, 5.0647, 4.92646, 
5.13309, 5.12718, 4.93681, 5.34691, 5.03142, 5.11047, 5.16889, 5.01087, 5.07032, 5.07588, 5.00122, 5.08773, 5.16951, 4.98692, 5.18998, 4.93899, 4.92741, 5.07395, 5.00085, 4.91692, 4.78186, 4.94917, 5.12365, 5.02541, 5.02437, 5.33759, 4.96582, 5.00145, 5.05138, 4.81301, 4.74456, 5.00203, 5.04679, 4.88367, 4.95882, 5.05212, 5.03024, 4.82289, 4.89705, 4.91162, 4.83722, 4.75468, 5.01694, 4.75625, 5.21634, 4.78922, 4.99899, 4.74083, 4.79117, 4.82499, 4.65555, 4.66118, 4.84502, 4.812, 4.80818, 4.93087, 4.88819, 4.92996, 4.77146, 4.88927, 4.73848, 4.91779, 4.96467, 4.87947, 4.7104, 4.78793, 4.90438, 4.71479, 4.86815, 4.69617, 4.69095, 4.65249]}, "loss-scale": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [4294967296.0, 134217728.0, 4194304.0, 131072.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 
131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0]}, "loss-scale vs samples": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [4294967296.0, 134217728.0, 4194304.0, 131072.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 
131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0]}, "params-norm": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [179.95639, 179.95639, 179.95639, 179.95639, 179.95639, 179.95639, 179.95636, 179.95616, 179.95595, 179.9552, 179.95465, 179.95432, 179.95352, 179.953, 179.95229, 179.95172, 179.95114, 179.95059, 179.95015, 179.94978, 179.94951, 179.94933, 179.94916, 179.94899, 179.94891, 179.94894, 179.94923, 179.95026, 179.95171, 179.9529, 179.95413, 179.95543, 179.95691, 179.95865, 179.96053, 179.96269, 179.96513, 179.96796, 179.97112, 179.97466, 179.97838, 179.98239, 179.98705, 179.9922, 179.99811, 180.00458, 180.01144, 180.0188, 180.0265, 180.0349, 180.04382, 180.05347, 180.06361, 180.07454, 180.0863, 180.09869, 180.1114, 180.12436, 180.13821, 180.15294, 180.16814, 180.18376, 180.20035, 180.21758, 180.23528, 180.25388, 180.27333, 180.2935, 180.31477, 180.33707, 180.36023, 180.38481, 180.4104, 180.43663, 180.46335, 180.49043, 180.51775, 180.54597, 180.57475, 180.60458, 180.63466, 180.66501, 180.69615, 180.72832, 180.76106, 180.79457, 180.82857, 180.86211, 180.89636, 180.93251, 180.97021, 181.00865, 181.04654, 181.08444, 181.12204, 181.1591, 181.19463, 181.22873, 181.26352, 181.29965, 181.33498, 181.36926, 181.40433, 181.44101, 181.47787, 181.51541, 181.55309, 181.58995, 181.62593, 181.66238, 181.69963, 181.73865, 181.77856, 181.819, 181.85893, 181.89955, 181.94034, 181.98015, 182.01802, 182.05594, 182.09499, 182.13466, 182.17516, 182.21599, 182.25551, 182.29494, 182.33302, 182.36942, 182.40552, 182.44077, 182.47746, 182.51506, 182.55521, 182.59557, 182.63631, 182.67693, 182.71771, 182.75752, 182.79524, 182.83229, 182.8694, 182.90648, 182.94411, 182.98082, 183.01617, 183.05077, 183.08421, 183.11528, 183.14688, 183.17844, 183.21207, 183.24745, 183.28352, 183.31885, 183.35526, 183.39171, 183.42731, 183.46333, 183.49973, 183.53497, 183.57001, 183.60588, 183.64211, 183.6795, 183.71835, 183.75874, 183.79941, 183.83905, 183.87886, 183.91798, 183.95557, 183.99252, 184.02957, 184.06734, 184.1066, 184.14734, 184.18813, 184.22699, 184.26306, 184.29767, 184.33336, 184.36948, 184.40587, 184.44305, 184.48088, 184.51953, 184.55611, 184.58971, 184.62381, 184.65984, 184.6958, 184.73257, 184.76843, 184.80443, 184.84024, 
184.87787, 184.91624, 184.9561, 184.99586, 185.03816, 185.08003, 185.12041, 185.16002, 185.19998, 185.23941, 185.27916, 185.31915, 185.35942, 185.3989, 185.43639, 185.4734, 185.51125, 185.54845, 185.5865, 185.62511, 185.66444, 185.70372, 185.74438, 185.78564, 185.82716, 185.86717, 185.90334, 185.937, 185.97195, 186.00873, 186.04741, 186.0872, 186.12794, 186.16808, 186.20654, 186.24687, 186.28903, 186.3307, 186.3723, 186.4149, 186.45834, 186.50229, 186.54523, 186.58723, 186.62804, 186.66795, 186.70871, 186.75044, 186.79398, 186.83716, 186.88002, 186.92215, 186.96371, 187.00597, 187.04924, 187.09216, 187.13554, 187.17883, 187.22208, 187.26509, 187.30769, 187.34932, 187.39163, 187.43529, 187.47867, 187.52255, 187.5659, 187.6091, 187.65163, 187.6926, 187.7334, 187.77498, 187.81706, 187.85999, 187.90363, 187.94743, 187.99174, 188.03735, 188.08296, 188.12976, 188.17722, 188.22394, 188.27153, 188.31853, 188.3636, 188.40756, 188.45032, 188.49333, 188.53738, 188.58321, 188.62881, 188.67557, 188.722, 188.76859, 188.81543, 188.86082, 188.90515, 188.94725, 188.9901, 189.0343, 189.07765, 189.12099, 189.16522, 189.21011, 189.25642, 189.3047, 189.35202, 189.39963, 189.4478, 189.49484, 189.5425, 189.59079, 189.63968, 189.68971, 189.74034, 189.79134, 189.84206, 189.89209, 189.9409, 189.99072, 190.04274, 190.09349, 190.14539, 190.19702, 190.24873, 190.30104, 190.35287, 190.4046, 190.45503, 190.50591, 190.55637, 190.60674, 190.65721, 190.70746, 190.75826, 190.80876, 190.8571, 190.90599, 190.95639, 191.00842, 191.06157, 191.11446, 191.16747, 191.22209, 191.2751, 191.32907, 191.38292, 191.43571, 191.48894, 191.54314, 191.59731, 191.65059, 191.70349, 191.75555, 191.80978, 191.86417, 191.91782, 191.97105, 192.02429, 192.0775, 192.13084, 192.18512, 192.24028, 192.29506, 192.35016, 192.40651, 192.4622, 192.51747, 192.57224, 192.62952, 192.687, 192.74483, 192.80281, 192.86006, 192.91705, 192.97177, 193.02679, 193.08273, 193.13742, 193.1917, 193.24458, 193.29779, 193.35132, 193.40689, 193.46413, 193.52164, 193.57927, 193.63789, 193.69646, 193.75464, 193.81409, 193.87488, 193.93707, 193.99841, 194.05937, 194.11984, 194.17958, 194.23772, 194.29633, 194.35521, 194.41174, 194.46733, 194.52335, 194.58064, 194.6398]}, "params-norm vs samples": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [179.95639, 179.95639, 179.95639, 179.95639, 179.95639, 179.95639, 179.95636, 179.95616, 179.95595, 179.9552, 179.95465, 179.95432, 179.95352, 179.953, 179.95229, 179.95172, 179.95114, 179.95059, 179.95015, 179.94978, 179.94951, 179.94933, 179.94916, 179.94899, 179.94891, 179.94894, 179.94923, 179.95026, 179.95171, 179.9529, 179.95413, 179.95543, 179.95691, 179.95865, 179.96053, 179.96269, 179.96513, 179.96796, 179.97112, 179.97466, 179.97838, 179.98239, 179.98705, 179.9922, 179.99811, 180.00458, 180.01144, 180.0188, 180.0265, 180.0349, 180.04382, 180.05347, 180.06361, 180.07454, 180.0863, 180.09869, 180.1114, 180.12436, 180.13821, 180.15294, 180.16814, 180.18376, 180.20035, 180.21758, 180.23528, 180.25388, 180.27333, 180.2935, 180.31477, 180.33707, 180.36023, 180.38481, 180.4104, 180.43663, 180.46335, 180.49043, 180.51775, 180.54597, 180.57475, 180.60458, 180.63466, 180.66501, 180.69615, 180.72832, 180.76106, 180.79457, 180.82857, 180.86211, 180.89636, 180.93251, 180.97021, 181.00865, 181.04654, 181.08444, 181.12204, 181.1591, 181.19463, 181.22873, 181.26352, 181.29965, 181.33498, 181.36926, 181.40433, 181.44101, 181.47787, 181.51541, 181.55309, 181.58995, 181.62593, 181.66238, 181.69963, 181.73865, 181.77856, 
181.819, 181.85893, 181.89955, 181.94034, 181.98015, 182.01802, 182.05594, 182.09499, 182.13466, 182.17516, 182.21599, 182.25551, 182.29494, 182.33302, 182.36942, 182.40552, 182.44077, 182.47746, 182.51506, 182.55521, 182.59557, 182.63631, 182.67693, 182.71771, 182.75752, 182.79524, 182.83229, 182.8694, 182.90648, 182.94411, 182.98082, 183.01617, 183.05077, 183.08421, 183.11528, 183.14688, 183.17844, 183.21207, 183.24745, 183.28352, 183.31885, 183.35526, 183.39171, 183.42731, 183.46333, 183.49973, 183.53497, 183.57001, 183.60588, 183.64211, 183.6795, 183.71835, 183.75874, 183.79941, 183.83905, 183.87886, 183.91798, 183.95557, 183.99252, 184.02957, 184.06734, 184.1066, 184.14734, 184.18813, 184.22699, 184.26306, 184.29767, 184.33336, 184.36948, 184.40587, 184.44305, 184.48088, 184.51953, 184.55611, 184.58971, 184.62381, 184.65984, 184.6958, 184.73257, 184.76843, 184.80443, 184.84024, 184.87787, 184.91624, 184.9561, 184.99586, 185.03816, 185.08003, 185.12041, 185.16002, 185.19998, 185.23941, 185.27916, 185.31915, 185.35942, 185.3989, 185.43639, 185.4734, 185.51125, 185.54845, 185.5865, 185.62511, 185.66444, 185.70372, 185.74438, 185.78564, 185.82716, 185.86717, 185.90334, 185.937, 185.97195, 186.00873, 186.04741, 186.0872, 186.12794, 186.16808, 186.20654, 186.24687, 186.28903, 186.3307, 186.3723, 186.4149, 186.45834, 186.50229, 186.54523, 186.58723, 186.62804, 186.66795, 186.70871, 186.75044, 186.79398, 186.83716, 186.88002, 186.92215, 186.96371, 187.00597, 187.04924, 187.09216, 187.13554, 187.17883, 187.22208, 187.26509, 187.30769, 187.34932, 187.39163, 187.43529, 187.47867, 187.52255, 187.5659, 187.6091, 187.65163, 187.6926, 187.7334, 187.77498, 187.81706, 187.85999, 187.90363, 187.94743, 187.99174, 188.03735, 188.08296, 188.12976, 188.17722, 188.22394, 188.27153, 188.31853, 188.3636, 188.40756, 188.45032, 188.49333, 188.53738, 188.58321, 188.62881, 188.67557, 188.722, 188.76859, 188.81543, 188.86082, 188.90515, 188.94725, 188.9901, 189.0343, 189.07765, 189.12099, 189.16522, 189.21011, 189.25642, 189.3047, 189.35202, 189.39963, 189.4478, 189.49484, 189.5425, 189.59079, 189.63968, 189.68971, 189.74034, 189.79134, 189.84206, 189.89209, 189.9409, 189.99072, 190.04274, 190.09349, 190.14539, 190.19702, 190.24873, 190.30104, 190.35287, 190.4046, 190.45503, 190.50591, 190.55637, 190.60674, 190.65721, 190.70746, 190.75826, 190.80876, 190.8571, 190.90599, 190.95639, 191.00842, 191.06157, 191.11446, 191.16747, 191.22209, 191.2751, 191.32907, 191.38292, 191.43571, 191.48894, 191.54314, 191.59731, 191.65059, 191.70349, 191.75555, 191.80978, 191.86417, 191.91782, 191.97105, 192.02429, 192.0775, 192.13084, 192.18512, 192.24028, 192.29506, 192.35016, 192.40651, 192.4622, 192.51747, 192.57224, 192.62952, 192.687, 192.74483, 192.80281, 192.86006, 192.91705, 192.97177, 193.02679, 193.08273, 193.13742, 193.1917, 193.24458, 193.29779, 193.35132, 193.40689, 193.46413, 193.52164, 193.57927, 193.63789, 193.69646, 193.75464, 193.81409, 193.87488, 193.93707, 193.99841, 194.05937, 194.11984, 194.17958, 194.23772, 194.29633, 194.35521, 194.41174, 194.46733, 194.52335, 194.58064, 194.6398]}, "iteration-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [18.07681, 0.38236, 0.3815, 0.38004, 0.39049, 0.39656, 0.39642, 0.39048, 0.39523, 0.39194, 0.5552, 0.3948, 0.39398, 0.39561, 0.39214, 0.39537, 0.39216, 0.39261, 0.39694, 0.39356, 0.4003, 0.39114, 0.39355, 0.3919, 0.39064, 0.40086, 0.39355, 0.39139, 0.38492, 0.3927, 0.40428, 0.38479, 0.38466, 0.38299, 0.38174, 0.38636, 0.38086, 0.38401, 0.38601, 
0.40511, 0.38629, 0.38521, 0.3855, 0.38256, 0.38493, 0.38553, 0.38438, 0.38462, 0.38628, 0.38214, 0.38492, 0.38322, 0.38706, 0.38103, 0.38314, 0.38469, 0.38271, 0.38565, 0.38283, 0.38163, 0.37833, 0.38621, 0.37993, 0.37921, 0.38058, 0.38093, 0.38301, 0.38316, 0.38564, 0.38136, 0.38386, 0.38121, 0.38145, 0.37922, 0.48103, 0.37987, 0.38025, 0.38308, 0.38613, 0.38258, 0.38336, 0.38508, 0.3887, 0.38459, 0.38233, 0.38094, 0.38026, 0.38316, 0.3802, 0.38401, 0.38409, 0.38327, 0.39188, 0.38081, 0.38297, 0.38391, 0.38075, 0.38566, 0.38249, 0.38281, 0.38433, 0.38249, 0.37955, 0.38003, 0.47628, 0.38394, 0.38015, 0.40241, 0.37987, 0.38149, 0.38158, 0.38618, 0.38356, 0.38072, 0.3889, 0.38918, 0.38574, 0.38775, 0.38338, 0.39021, 0.38146, 0.38236, 0.38742, 0.3868, 0.38407, 0.38593, 0.38727, 0.39089, 0.39337, 0.38585, 0.38443, 0.38667, 0.3868, 0.39023, 0.49507, 0.38161, 0.38081, 0.38199, 0.48238, 0.53269, 0.38537, 0.38444, 0.38705, 0.39224, 0.38871, 0.3845, 0.38286, 0.38071, 0.38022, 0.38228, 0.38177, 0.38417, 0.3801, 0.38435, 0.38639, 0.38626, 0.38489, 0.38587, 0.38488, 0.38407, 0.3867, 0.38401, 0.3866, 0.38593, 0.38916, 0.3833, 0.38389, 0.3843, 0.38359, 0.38697, 0.38383, 0.38577, 0.38399, 0.38402, 0.38788, 0.3861, 0.38511, 0.38672, 0.38227, 0.38915, 0.38446, 0.3859, 0.37898, 0.381, 0.38613, 0.38362, 0.3831, 0.37854, 0.37897, 0.37818, 0.37983, 0.38369, 0.37982, 0.38105, 0.38549, 0.38522, 0.38518, 0.38435, 0.47441, 0.38233, 0.37927, 0.38248, 0.38035, 0.37886, 0.38094, 0.3816, 0.38623, 0.38907, 0.38824, 0.38363, 0.38085, 0.38241, 0.38688, 0.3809, 0.38401, 0.3846, 0.38278, 0.38686, 0.38509, 0.38569, 0.38138, 0.38221, 0.38366, 0.39376, 0.39173, 0.38031, 0.38231, 0.47746, 0.38191, 0.38528, 0.38919, 0.38627, 0.38485, 0.39016, 0.48709, 0.39134, 0.38991, 0.38575, 0.3826, 0.38101, 0.38387, 0.38025, 0.37997, 0.50302, 0.38436, 0.38473, 0.38639, 0.38633, 0.3928, 0.38343, 0.38522, 0.38229, 0.37817, 0.38096, 0.38116, 0.3867, 0.38377, 0.38146, 0.38226, 0.38398, 0.39339, 0.3803, 0.48334, 0.38398, 0.38072, 0.38756, 0.38406, 0.38475, 0.3865, 0.3837, 0.39344, 0.38796, 0.38926, 0.38703, 0.38603, 0.37954, 0.38341, 0.38785, 0.38335, 0.38263, 0.38197, 0.38334, 0.3861, 0.38808, 0.38389, 0.38779, 0.39044, 0.38432, 0.38303, 0.38348, 0.38756, 0.38699, 0.47757, 0.38391, 0.38223, 0.38479, 0.38831, 0.38749, 0.384, 0.3864, 0.38554, 0.38656, 0.38469, 0.38559, 0.38552, 0.38634, 0.39068, 0.38718, 0.38906, 0.38314, 0.38526, 0.39355, 0.38547, 0.3918, 0.38838, 0.39149, 0.38788, 0.38735, 0.38776, 0.38498, 0.3845, 0.3809, 0.38438, 0.38342, 0.38109, 0.38385, 0.3847, 0.38354, 0.38456, 0.48679, 0.38819, 0.38623, 0.3908, 0.39049, 0.38764, 0.39009, 0.3899, 0.39171, 0.39325, 0.39116, 0.38744, 0.38994, 0.3945, 0.38791, 0.3872, 0.3882, 0.38525, 0.38534, 0.38602, 0.38534, 0.38256, 0.38598, 0.38572, 0.37898, 0.38512, 0.38512, 0.38361, 0.39213, 0.38551, 0.38269, 0.38516, 0.38696, 0.38679, 0.37971, 0.38365, 0.38484, 0.38698, 0.39395, 0.38701, 0.38655, 0.38288, 0.38233, 0.38642, 0.38468, 0.38309, 0.38362, 0.38617, 0.3863, 0.38907, 0.38471, 0.38686, 0.38576, 0.3853, 0.38783, 0.3863, 0.38804, 0.38654, 0.48838, 0.39169, 0.38856, 0.47555, 0.38859, 0.39202, 0.38824, 0.59598, 0.38895, 0.38921, 0.38633, 0.38705, 0.38574]}, "optimizer-clip-main-grad-time": {"start_step": 0, "end_step": 1983, "step_interval": 5, "values": [0.02457, 0.00089, 0.00088, 0.00089, 0.00088, 0.00089, 0.00089, 0.00089, 0.0009, 0.00089, 0.00091, 0.00095, 0.00088, 0.0009, 0.00088, 0.00088, 0.00089, 0.0009, 0.0009, 0.00089, 0.0009, 0.00088, 0.00088, 0.00088, 0.00089, 0.00089, 0.00089, 
0.00088, 0.00087, 0.00088, 0.00088, 0.00088, 0.00088, 0.00089, 0.00093, 0.00088, 0.00088, 0.0009, 0.00092, 0.00089, 0.00088, 0.00088, 0.00089, 0.00088, 0.00089, 0.00089, 0.00089, 0.00099, 0.00088, 0.00088, 0.00089, 0.00089, 0.00088, 0.00088, 0.00088, 0.00088, 0.00088, 0.0009, 0.00126, 0.00088, 0.00088, 0.00088, 0.00094, 0.00088, 0.00087, 0.00088, 0.00087, 0.00088, 0.00088, 0.0009, 0.00087, 0.00088, 0.00088, 0.00088, 0.00087, 0.00088, 0.00087, 0.00125, 0.00093, 0.0009, 0.00088, 0.00088, 0.00088, 0.00088, 0.00088, 0.00088, 0.00088, 0.00098, 0.00088, 0.00112, 0.00088, 0.00088, 0.00089, 0.00087, 0.00088, 0.00087, 0.00088, 0.00088, 0.00088, 0.00089, 0.0009, 0.00087, 0.00088, 0.00088, 0.00091, 0.00088, 0.00088, 0.00088, 0.00088, 0.00092, 0.00087, 0.00066, 0.00088, 0.00088, 0.0009, 0.00065, 0.00088, 0.00088, 0.00066, 0.00089, 0.00089, 0.00066, 0.00088, 0.001, 0.00088, 0.00088, 0.0009, 0.00066, 0.00066, 0.00088, 0.00067, 0.00089, 0.00089, 0.00067, 0.00088, 0.00089, 0.00087, 0.00087, 0.00095, 0.00088, 0.00087, 0.00088, 0.00087, 0.00089, 0.00089, 0.00088, 0.00089, 0.00089, 0.00088, 0.00089, 0.0009, 0.00087, 0.00087, 0.00089, 0.00088, 0.00087, 0.00087, 0.00087, 0.00087, 0.00088, 0.00088, 0.00089, 0.00088, 0.0009, 0.00089, 0.00087, 0.00087, 0.00087, 0.00089, 0.00089, 0.00094, 0.00088, 0.00087, 0.00087, 0.00088, 0.00088, 0.00087, 0.00087, 0.00088, 0.00088, 0.00088, 0.00087, 0.00087, 0.00087, 0.00087, 0.00088, 0.00088, 0.00087, 0.00087, 0.00098, 0.00088, 0.00091, 0.00087, 0.00087, 0.00089, 0.00088, 0.00088, 0.00088, 0.00091, 0.00087, 0.00088, 0.00107, 0.00095, 0.00088, 0.00087, 0.00088, 0.00094, 0.00093, 0.00087, 0.00089, 0.00087, 0.00088, 0.00087, 0.00089, 0.00087, 0.00087, 0.00087, 0.00087, 0.00088, 0.00089, 0.00087, 0.00087, 0.00088, 0.00089, 0.00087, 0.00087, 0.00094, 0.00088, 0.00087, 0.00089, 0.00093, 0.00088, 0.00087, 0.00087, 0.00088, 0.00088, 0.00088, 0.00088, 0.00095, 0.00087, 0.00087, 0.00087, 0.00087, 0.00087, 0.00108, 0.00087, 0.00089, 0.00089, 0.00089, 0.00088, 0.001, 0.00088, 0.00094, 0.00088, 0.00087, 0.00088, 0.00095, 0.0009, 0.00089, 0.00089, 0.00088, 0.00088, 0.00089, 0.00088, 0.0009, 0.00089, 0.00088, 0.00088, 0.00087, 0.00088, 0.00089, 0.00088, 0.00087, 0.00088, 0.00087, 0.00089, 0.00091, 0.00088, 0.00096, 0.00088, 0.00092, 0.00088, 0.00088, 0.00088, 0.00088, 0.00088, 0.00088, 0.00087, 0.00089, 0.00088, 0.00091, 0.00095, 0.00088, 0.00088, 0.00095, 0.0009, 0.00089, 0.00092, 0.00093, 0.00099, 0.00088, 0.0009, 0.00087, 0.00088, 0.00096, 0.00088, 0.00097, 0.00087, 0.00088, 0.00087, 0.00088, 0.00088, 0.00098, 0.00089, 0.00097, 0.00087, 0.00087, 0.00087, 0.00088, 0.00089, 0.00088, 0.00089, 0.00088, 0.00088, 0.00087, 0.00087, 0.00099, 0.00089, 0.00088, 0.00088, 0.00087, 0.00088, 0.00088, 0.00089, 0.00087, 0.00088, 0.00088, 0.0009, 0.00091, 0.00089, 0.00087, 0.00088, 0.00089, 0.00089, 0.00087, 0.00088, 0.00094, 0.00088, 0.00088, 0.00088, 0.00088, 0.00089, 0.00087, 0.00106, 0.0009, 0.00089, 0.00088, 0.00096, 0.00089, 0.00098, 0.00088, 0.00088, 0.00088, 0.00091, 0.00087, 0.00089, 0.00088, 0.00088, 0.00088, 0.00088, 0.00087, 0.00089, 0.00089, 0.00088, 0.00089, 0.00089, 0.00088, 0.00091, 0.00089, 0.00087, 0.0009, 0.00088, 0.00089, 0.00088, 0.00093, 0.00116, 0.00101, 0.00088, 0.00095, 0.00092, 0.00089, 0.00088, 0.00087, 0.00089, 0.00105, 0.0009, 0.00087]}, "optimizer-count-zeros-time": {"start_step": 0, "end_step": 1983, "step_interval": 5, "values": [0.01277, 0.00497, 0.00488, 0.00489, 0.00489, 0.00494, 0.00489, 0.0049, 0.00489, 0.00488, 0.00497, 0.00521, 0.0049, 0.00492, 0.00492, 0.0049, 
0.00494, 0.00492, 0.00489, 0.00489, 0.00493, 0.0049, 0.00492, 0.0051, 0.00487, 0.00629, 0.005, 0.0049, 0.00492, 0.0049, 0.0049, 0.0049, 0.00488, 0.00492, 0.00535, 0.0049, 0.0049, 0.00494, 0.0049, 0.00494, 0.00489, 0.00489, 0.0049, 0.00491, 0.00492, 0.00491, 0.00599, 0.00523, 0.00489, 0.00489, 0.00491, 0.00491, 0.00491, 0.00494, 0.0049, 0.00489, 0.00491, 0.0049, 0.00491, 0.0049, 0.00491, 0.0049, 0.00525, 0.00492, 0.00493, 0.00489, 0.00489, 0.00492, 0.00491, 0.0049, 0.00491, 0.00491, 0.00492, 0.00489, 0.00489, 0.00493, 0.00493, 0.00498, 0.00519, 0.00491, 0.00491, 0.00492, 0.00498, 0.00492, 0.00494, 0.0049, 0.00489, 0.00567, 0.00489, 0.00491, 0.00491, 0.00524, 0.00489, 0.00491, 0.00489, 0.00504, 0.0056, 0.00501, 0.00491, 0.00493, 0.00492, 0.00491, 0.00491, 0.00491, 0.00489, 0.0049, 0.0049, 0.0049, 0.00492, 0.0049, 0.00491, 0.00491, 0.00602, 0.0049, 0.00494, 0.00489, 0.0049, 0.0049, 0.00491, 0.00492, 0.0049, 0.0049, 0.00491, 0.00598, 0.00492, 0.00491, 0.00489, 0.00494, 0.00491, 0.00491, 0.0049, 0.00494, 0.00492, 0.00544, 0.00488, 0.00491, 0.0049, 0.0049, 0.00503, 0.00491, 0.00491, 0.00491, 0.00493, 0.00494, 0.00493, 0.00492, 0.0049, 0.00492, 0.00488, 0.00489, 0.00515, 0.0049, 0.00498, 0.00492, 0.00493, 0.0049, 0.00491, 0.005, 0.00491, 0.00491, 0.00491, 0.00491, 0.00489, 0.00491, 0.0049, 0.0049, 0.00496, 0.00492, 0.00488, 0.00492, 0.00538, 0.00492, 0.00491, 0.00492, 0.00567, 0.00488, 0.00491, 0.00493, 0.00492, 0.00487, 0.00493, 0.0049, 0.00488, 0.00491, 0.00492, 0.0049, 0.00492, 0.0049, 0.0049, 0.00492, 0.0049, 0.0051, 0.0049, 0.00519, 0.00491, 0.00491, 0.00488, 0.00488, 0.00489, 0.00489, 0.00491, 0.00583, 0.0049, 0.0049, 0.00489, 0.00488, 0.0049, 0.00489, 0.00491, 0.00488, 0.0049, 0.00501, 0.00492, 0.00491, 0.0049, 0.0049, 0.0049, 0.00488, 0.0049, 0.00489, 0.00489, 0.0049, 0.00489, 0.00492, 0.00493, 0.00488, 0.0049, 0.00489, 0.0049, 0.00489, 0.00494, 0.00489, 0.00491, 0.00489, 0.00489, 0.0049, 0.00492, 0.00487, 0.00491, 0.00491, 0.00489, 0.00489, 0.00489, 0.00491, 0.00578, 0.0049, 0.00488, 0.00487, 0.00492, 0.0049, 0.00491, 0.00489, 0.00489, 0.00488, 0.0049, 0.00489, 0.00489, 0.00491, 0.00515, 0.00494, 0.0049, 0.00489, 0.00492, 0.00489, 0.00502, 0.00489, 0.00493, 0.00489, 0.00491, 0.00491, 0.00489, 0.0049, 0.00582, 0.00487, 0.00489, 0.0049, 0.00491, 0.00488, 0.00489, 0.00492, 0.00488, 0.00489, 0.00491, 0.00489, 0.00489, 0.0049, 0.00489, 0.00558, 0.00491, 0.0056, 0.00495, 0.00488, 0.00491, 0.00489, 0.00489, 0.00488, 0.0049, 0.0049, 0.00489, 0.00492, 0.00491, 0.0049, 0.00491, 0.00489, 0.0049, 0.00491, 0.00492, 0.00512, 0.00493, 0.00491, 0.00491, 0.0049, 0.00491, 0.00492, 0.00579, 0.00626, 0.00489, 0.00489, 0.0049, 0.00489, 0.00491, 0.00494, 0.00489, 0.00491, 0.0049, 0.0049, 0.00491, 0.00512, 0.0051, 0.00514, 0.00513, 0.00513, 0.00514, 0.00513, 0.00512, 0.00511, 0.00512, 0.00514, 0.0052, 0.00512, 0.00511, 0.00513, 0.00514, 0.00511, 0.00511, 0.00514, 0.00564, 0.00511, 0.00512, 0.00509, 0.00512, 0.00512, 0.00536, 0.00513, 0.00512, 0.00513, 0.00512, 0.00513, 0.00512, 0.00512, 0.00512, 0.00512, 0.00509, 0.00512, 0.00512, 0.00513, 0.00512, 0.00514, 0.00515, 0.00514, 0.00516, 0.00512, 0.00513, 0.00514, 0.00511, 0.00513, 0.00524, 0.00511, 0.00514, 0.00512, 0.00511, 0.00509, 0.00513, 0.00511, 0.00514, 0.00513, 0.00513, 0.00512, 0.0055, 0.0054, 0.00513, 0.0051, 0.0051, 0.00512, 0.00514, 0.00515, 0.00515]}, "optimizer-inner-step-time": {"start_step": 0, "end_step": 1983, "step_interval": 5, "values": [0.00686, 0.00099, 0.00098, 0.00098, 0.00098, 0.001, 0.00099, 0.00099, 0.00098, 0.00099, 0.00101, 0.00098, 
0.00099, 0.00099, 0.00099, 0.001, 0.00099, 0.001, 0.001, 0.001, 0.001, 0.00098, 0.00097, 0.00099, 0.00098, 0.00124, 0.00098, 0.00098, 0.00098, 0.00098, 0.00098, 0.00101, 0.00101, 0.001, 0.001, 0.00098, 0.00099, 0.001, 0.00102, 0.00098, 0.00098, 0.00099, 0.00098, 0.00098, 0.00098, 0.00097, 0.001, 0.00102, 0.00097, 0.00098, 0.00099, 0.001, 0.00097, 0.00102, 0.00099, 0.00098, 0.00098, 0.00098, 0.001, 0.001, 0.001, 0.00098, 0.00098, 0.00098, 0.00098, 0.00098, 0.00097, 0.00097, 0.00099, 0.00098, 0.00098, 0.00098, 0.00104, 0.00097, 0.00098, 0.00099, 0.00098, 0.00117, 0.00101, 0.00101, 0.00099, 0.00097, 0.00098, 0.00097, 0.00099, 0.00098, 0.00098, 0.00101, 0.00099, 0.00098, 0.00098, 0.00098, 0.001, 0.00097, 0.00097, 0.00098, 0.001, 0.00097, 0.00097, 0.00098, 0.00099, 0.00098, 0.00098, 0.00098, 0.00098, 0.00097, 0.00097, 0.00098, 0.001, 0.00099, 0.00097, 0.00098, 0.001, 0.00098, 0.00098, 0.00099, 0.00098, 0.00098, 0.00099, 0.00099, 0.00099, 0.00097, 0.00097, 0.00099, 0.00098, 0.00097, 0.001, 0.00099, 0.00098, 0.00099, 0.001, 0.00097, 0.00099, 0.00102, 0.00099, 0.00098, 0.00097, 0.00099, 0.00099, 0.001, 0.00097, 0.00097, 0.00098, 0.00099, 0.001, 0.001, 0.00098, 0.001, 0.001, 0.00097, 0.00101, 0.00097, 0.00099, 0.00099, 0.00098, 0.001, 0.00099, 0.00098, 0.001, 0.00097, 0.00098, 0.001, 0.00099, 0.00099, 0.00099, 0.00098, 0.00098, 0.00097, 0.00098, 0.00099, 0.00098, 0.00099, 0.00097, 0.00098, 0.00103, 0.00097, 0.00097, 0.001, 0.00099, 0.00098, 0.00098, 0.00099, 0.00097, 0.00098, 0.00098, 0.00101, 0.001, 0.00099, 0.00098, 0.00098, 0.00097, 0.00102, 0.00098, 0.00097, 0.00098, 0.00097, 0.00097, 0.00099, 0.00102, 0.00096, 0.00099, 0.00097, 0.00096, 0.00097, 0.00097, 0.00099, 0.00096, 0.00097, 0.00097, 0.00097, 0.00096, 0.00097, 0.00098, 0.00097, 0.00098, 0.00156, 0.00097, 0.00096, 0.00097, 0.00096, 0.001, 0.00101, 0.00097, 0.00099, 0.00097, 0.00096, 0.00098, 0.00098, 0.00103, 0.00097, 0.00097, 0.00097, 0.00096, 0.00097, 0.00096, 0.00097, 0.00099, 0.00097, 0.00096, 0.00098, 0.00098, 0.00097, 0.00098, 0.00099, 0.00099, 0.00098, 0.00097, 0.00098, 0.00097, 0.00098, 0.00099, 0.001, 0.00099, 0.00098, 0.001, 0.00099, 0.00099, 0.00101, 0.00102, 0.00099, 0.00099, 0.00098, 0.00098, 0.00099, 0.00098, 0.00099, 0.00099, 0.00099, 0.001, 0.00099, 0.00099, 0.00098, 0.00101, 0.00099, 0.00099, 0.00099, 0.00097, 0.00099, 0.00099, 0.00098, 0.00098, 0.00104, 0.00098, 0.00099, 0.00099, 0.00098, 0.00099, 0.00099, 0.00099, 0.00099, 0.001, 0.00097, 0.00099, 0.00098, 0.00098, 0.001, 0.00099, 0.00099, 0.00098, 0.00099, 0.00098, 0.00097, 0.00098, 0.00099, 0.00099, 0.00099, 0.00098, 0.00104, 0.00099, 0.00098, 0.00099, 0.00099, 0.001, 0.00099, 0.00098, 0.001, 0.00099, 0.00096, 0.00098, 0.00099, 0.00099, 0.001, 0.00099, 0.00097, 0.00099, 0.00099, 0.00098, 0.00099, 0.00099, 0.00103, 0.00099, 0.00098, 0.00099, 0.00097, 0.00098, 0.00099, 0.00098, 0.00098, 0.00101, 0.00098, 0.00099, 0.00099, 0.00098, 0.00156, 0.00103, 0.00098, 0.001, 0.00098, 0.00099, 0.00098, 0.00098, 0.00099, 0.00098, 0.001, 0.001, 0.00098, 0.00102, 0.00098, 0.00098, 0.00099, 0.00098, 0.00098, 0.00099, 0.001, 0.00098, 0.00098, 0.00098, 0.00098, 0.00098, 0.00099, 0.00097, 0.00099, 0.00096, 0.00102, 0.00098, 0.00099, 0.001, 0.001, 0.001, 0.001, 0.00099, 0.001, 0.00099, 0.00099, 0.001, 0.001, 0.00104, 0.00099, 0.00099, 0.00099, 0.00099, 0.001, 0.00099, 0.00099, 0.001, 0.00099, 0.00099]}, "optimizer-copy-main-to-model-params-time": {"start_step": 0, "end_step": 1983, "step_interval": 5, "values": [0.00107, 0.00103, 0.00102, 0.00102, 0.00102, 0.00102, 0.00102, 0.00102, 
0.00103, 0.00103, 0.00103, 0.00103, 0.00104, 0.00101, 0.00103, 0.00103, 0.00104, 0.00105, 0.00103, 0.00103, 0.00104, 0.00103, 0.00102, 0.00104, 0.00102, 0.00163, 0.00103, 0.00103, 0.00102, 0.00102, 0.00102, 0.00102, 0.00102, 0.00104, 0.00104, 0.00103, 0.00102, 0.00103, 0.00104, 0.00103, 0.00103, 0.00103, 0.00102, 0.00103, 0.00103, 0.00102, 0.00108, 0.00106, 0.00102, 0.00103, 0.00103, 0.00104, 0.00102, 0.00103, 0.00102, 0.00103, 0.00102, 0.00103, 0.00102, 0.00103, 0.00103, 0.00103, 0.00103, 0.00104, 0.00102, 0.00103, 0.00102, 0.00103, 0.00102, 0.00103, 0.00102, 0.00102, 0.00103, 0.00103, 0.00102, 0.00103, 0.00102, 0.00115, 0.00105, 0.00126, 0.00103, 0.00103, 0.00102, 0.00103, 0.00103, 0.00103, 0.00102, 0.00106, 0.00102, 0.00103, 0.00102, 0.00114, 0.00102, 0.00103, 0.00102, 0.00102, 0.00104, 0.00103, 0.00103, 0.00103, 0.00102, 0.00103, 0.00102, 0.00102, 0.00102, 0.00102, 0.00102, 0.00103, 0.00102, 0.00103, 0.00102, 0.00103, 0.00107, 0.00103, 0.00103, 0.00103, 0.00102, 0.00103, 0.00102, 0.00103, 0.00102, 0.00103, 0.00102, 0.00109, 0.00103, 0.00103, 0.00103, 0.00105, 0.00102, 0.00103, 0.00103, 0.00103, 0.00103, 0.00105, 0.00102, 0.00102, 0.00103, 0.00103, 0.00102, 0.00102, 0.00103, 0.00103, 0.00102, 0.00103, 0.00103, 0.00103, 0.00102, 0.00103, 0.00102, 0.00102, 0.00104, 0.00103, 0.00103, 0.00103, 0.00102, 0.00102, 0.00102, 0.00102, 0.00102, 0.00102, 0.00102, 0.00103, 0.00103, 0.00103, 0.00103, 0.00103, 0.00102, 0.00102, 0.00102, 0.00104, 0.00102, 0.00103, 0.00102, 0.00102, 0.00108, 0.00103, 0.00102, 0.00103, 0.00115, 0.00102, 0.00103, 0.00103, 0.00102, 0.00103, 0.00103, 0.00103, 0.00102, 0.00103, 0.00104, 0.00103, 0.00102, 0.00106, 0.00102, 0.00102, 0.00103, 0.00103, 0.00099, 0.001, 0.00103, 0.001, 0.001, 0.00105, 0.00101, 0.00099, 0.00099, 0.001, 0.00099, 0.001, 0.00099, 0.00099, 0.00099, 0.00099, 0.00099, 0.00099, 0.001, 0.00111, 0.001, 0.00099, 0.001, 0.00099, 0.00105, 0.00099, 0.00099, 0.001, 0.00099, 0.00099, 0.00099, 0.00099, 0.001, 0.001, 0.00099, 0.001, 0.00099, 0.00099, 0.00101, 0.00099, 0.00101, 0.001, 0.00099, 0.001, 0.00106, 0.001, 0.001, 0.001, 0.00104, 0.001, 0.001, 0.001, 0.00099, 0.00106, 0.001, 0.001, 0.001, 0.001, 0.00101, 0.00102, 0.00099, 0.00101, 0.00101, 0.001, 0.00099, 0.001, 0.00101, 0.00101, 0.00101, 0.00106, 0.001, 0.00101, 0.001, 0.00102, 0.001, 0.00101, 0.00106, 0.001, 0.001, 0.00101, 0.00099, 0.00101, 0.00101, 0.00101, 0.001, 0.00101, 0.00101, 0.00101, 0.001, 0.00101, 0.00101, 0.00105, 0.00101, 0.00103, 0.00101, 0.001, 0.001, 0.00101, 0.00107, 0.001, 0.00106, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.00101, 0.00102, 0.00102, 0.001, 0.001, 0.00101, 0.001, 0.001, 0.001, 0.00106, 0.00107, 0.00099, 0.00107, 0.001, 0.00101, 0.001, 0.001, 0.001, 0.00101, 0.001, 0.001, 0.00101, 0.001, 0.00099, 0.001, 0.00101, 0.001, 0.001, 0.001, 0.001, 0.00107, 0.001, 0.001, 0.00101, 0.001, 0.001, 0.001, 0.00101, 0.00106, 0.00099, 0.00102, 0.00102, 0.001, 0.001, 0.001, 0.001, 0.00101, 0.00099, 0.00103, 0.00101, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.00101, 0.00103, 0.00102, 0.001, 0.00101, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.00099, 0.00102, 0.001, 0.001, 0.001, 0.00101, 0.00101, 0.001, 0.00099, 0.001, 0.00101, 0.00102, 0.00101, 0.00102, 0.00101, 0.00101, 0.001, 0.00101, 0.00101, 0.00101, 0.001, 0.001, 0.00101, 0.001, 0.001]}, "grad-norm": {"start_step": 0, "end_step": 1983, "step_interval": 5, "values": [11.77525, 12.26804, 11.19281, 14.50237, 14.014, 11.57186, 8.3922, 7.10897, 4.47266, 4.00434, 3.4, 2.71736, 2.45629, 2.30739, 2.29493, 2.25132, 2.01839, 
2.41173, 2.01298, 2.00525, 2.18932, 1.91353, 1.88951, 2.28883, 2.07903, 1.8844, 1.87495, 2.08513, 2.01874, 2.01118, 2.0102, 1.89229, 1.99489, 1.65446, 2.02134, 1.98456, 2.13312, 2.05074, 1.91832, 1.88506, 1.86975, 1.90714, 2.10548, 1.83107, 1.85561, 1.89757, 1.77389, 1.83901, 1.60882, 1.67073, 1.57953, 1.73056, 1.77582, 1.85094, 1.58796, 1.69243, 2.01012, 1.72305, 1.68342, 1.77634, 1.52051, 1.58604, 1.75613, 1.50876, 1.38814, 1.4853, 1.45829, 1.51675, 1.54655, 1.47158, 1.51099, 1.4708, 1.47268, 1.47452, 1.44323, 1.32185, 1.33599, 1.35564, 1.29533, 1.27928, 1.44962, 1.33226, 1.18991, 1.39956, 1.21257, 1.16175, 1.05645, 1.15134, 1.32979, 1.15427, 1.22191, 1.18197, 1.5911, 1.3589, 1.27604, 1.13871, 1.30626, 1.67866, 1.52014, 1.03431, 1.05476, 1.3049, 1.25479, 1.22714, 1.69201, 1.08131, 1.00908, 1.10419, 1.08066, 1.12768, 1.24403, 0.87723, 0.92972, 1.02293, 1.07062, 0.98243, 1.24502, 1.2897, 0.94461, 1.09023, 1.04658, 0.90251, 1.12421, 1.65432, 1.09595, 1.17882, 1.36022, 0.96059, 0.98043, 1.05339, 0.96416, 1.13229, 1.12844, 0.93359, 1.82877, 1.40011, 1.43068, 1.3027, 1.089, 1.64716, 1.37833, 1.56985, 1.16612, 1.85125, 1.24379, 1.71309, 1.39309, 1.27937, 1.17708, 1.73543, 1.05896, 1.24373, 1.38937, 1.36918, 1.42323, 1.77943, 1.13157, 1.27948, 1.19267, 1.34154, 1.40098, 1.16252, 1.42404, 1.2011, 1.00676, 1.48416, 1.13391, 1.33486, 1.5395, 1.27609, 1.42471, 1.30575, 1.22047, 1.81347, 1.74187, 1.56562, 1.47675, 1.51655, 1.70821, 1.44154, 1.50096, 1.28826, 1.74901, 1.90029, 1.42234, 1.44455, 1.76719, 1.84971, 1.73982, 1.24814, 1.53885, 1.39306, 1.62267, 1.27091, 1.59048, 1.06674, 1.40639, 1.29128, 1.69617, 1.31246, 1.4525, 1.29959, 1.38347, 1.4963, 1.45118, 1.62261, 1.8211, 1.48622, 1.35396, 1.364, 1.22302, 1.21036, 1.59732, 1.16621, 1.43458, 1.39264, 1.50491, 1.74865, 1.69988, 1.54719, 1.66156, 1.38606, 1.43929, 1.37822, 1.30248, 1.79296, 1.45361, 1.24972, 1.59221, 1.3686, 1.22551, 1.4158, 1.49894, 1.55813, 1.52684, 1.44435, 2.05338, 1.36019, 1.34284, 1.20815, 1.7307, 1.50669, 2.1527, 1.33714, 1.40114, 1.51052, 1.35152, 1.43159, 1.42052, 1.44093, 1.62874, 1.70468, 1.84621, 1.36339, 1.49409, 1.99351, 1.25437, 1.69787, 1.77453, 1.53971, 1.98798, 1.46692, 1.21412, 1.35855, 1.61255, 1.37129, 1.69078, 1.53059, 1.31087, 1.87886, 1.31042, 1.42235, 1.38194, 1.39636, 1.83392, 1.47651, 1.46996, 1.64541, 1.53153, 1.47267, 1.75528, 1.44853, 1.39865, 1.75941, 1.63286, 1.32552, 1.6715, 2.26149, 1.61139, 1.35216, 1.34936, 1.25166, 1.69472, 1.58245, 1.4379, 1.43627, 1.60457, 1.82215, 1.39138, 1.38678, 1.55708, 1.41296, 1.29816, 1.46066, 1.39994, 1.45437, 1.25759, 1.34921, 1.47682, 1.55246, 1.48338, 1.2271, 1.36154, 1.44453, 1.47772, 1.43402, 1.21249, 1.8034, 1.50506, 1.3131, 1.37503, 1.35584, 1.41307, 1.45748, 1.26629, 1.31721, 1.47686, 1.80237, 1.55348, 1.5369, 1.32871, 1.35524, 1.76226, 1.27945, 1.40786, 1.56063, 1.18102, 1.26595, 1.41714, 1.27185, 1.59955, 1.53902, 1.50856, 1.38342, 1.3716, 1.52597, 1.55924, 1.33891, 1.44137, 1.66178, 1.44058, 1.53213, 1.34923, 1.54826, 1.51369, 1.26166, 1.22057, 1.64988, 1.4183, 1.45977, 1.27097, 1.31805, 1.24715, 1.52412, 1.48112, 1.51313, 1.58975, 1.42731, 1.32647, 1.44532, 1.53827, 1.72661, 1.53155, 1.57687, 1.2723, 1.26403, 1.36125, 1.36611, 1.46818, 1.38679, 1.58433, 1.49566, 1.44288, 1.37271, 1.45317, 1.36918, 1.35342, 1.27732, 1.37088, 1.29411, 1.25869, 1.46478, 1.43992, 1.66108, 1.34488, 1.17599, 1.3251]}, "grad-norm vs samples": {"start_step": 0, "end_step": 1983, "step_interval": 5, "values": [11.77525, 12.26804, 11.19281, 14.50237, 14.014, 11.57186, 8.3922, 
7.10897, 4.47266, 4.00434, 3.4, 2.71736, 2.45629, 2.30739, 2.29493, 2.25132, 2.01839, 2.41173, 2.01298, 2.00525, 2.18932, 1.91353, 1.88951, 2.28883, 2.07903, 1.8844, 1.87495, 2.08513, 2.01874, 2.01118, 2.0102, 1.89229, 1.99489, 1.65446, 2.02134, 1.98456, 2.13312, 2.05074, 1.91832, 1.88506, 1.86975, 1.90714, 2.10548, 1.83107, 1.85561, 1.89757, 1.77389, 1.83901, 1.60882, 1.67073, 1.57953, 1.73056, 1.77582, 1.85094, 1.58796, 1.69243, 2.01012, 1.72305, 1.68342, 1.77634, 1.52051, 1.58604, 1.75613, 1.50876, 1.38814, 1.4853, 1.45829, 1.51675, 1.54655, 1.47158, 1.51099, 1.4708, 1.47268, 1.47452, 1.44323, 1.32185, 1.33599, 1.35564, 1.29533, 1.27928, 1.44962, 1.33226, 1.18991, 1.39956, 1.21257, 1.16175, 1.05645, 1.15134, 1.32979, 1.15427, 1.22191, 1.18197, 1.5911, 1.3589, 1.27604, 1.13871, 1.30626, 1.67866, 1.52014, 1.03431, 1.05476, 1.3049, 1.25479, 1.22714, 1.69201, 1.08131, 1.00908, 1.10419, 1.08066, 1.12768, 1.24403, 0.87723, 0.92972, 1.02293, 1.07062, 0.98243, 1.24502, 1.2897, 0.94461, 1.09023, 1.04658, 0.90251, 1.12421, 1.65432, 1.09595, 1.17882, 1.36022, 0.96059, 0.98043, 1.05339, 0.96416, 1.13229, 1.12844, 0.93359, 1.82877, 1.40011, 1.43068, 1.3027, 1.089, 1.64716, 1.37833, 1.56985, 1.16612, 1.85125, 1.24379, 1.71309, 1.39309, 1.27937, 1.17708, 1.73543, 1.05896, 1.24373, 1.38937, 1.36918, 1.42323, 1.77943, 1.13157, 1.27948, 1.19267, 1.34154, 1.40098, 1.16252, 1.42404, 1.2011, 1.00676, 1.48416, 1.13391, 1.33486, 1.5395, 1.27609, 1.42471, 1.30575, 1.22047, 1.81347, 1.74187, 1.56562, 1.47675, 1.51655, 1.70821, 1.44154, 1.50096, 1.28826, 1.74901, 1.90029, 1.42234, 1.44455, 1.76719, 1.84971, 1.73982, 1.24814, 1.53885, 1.39306, 1.62267, 1.27091, 1.59048, 1.06674, 1.40639, 1.29128, 1.69617, 1.31246, 1.4525, 1.29959, 1.38347, 1.4963, 1.45118, 1.62261, 1.8211, 1.48622, 1.35396, 1.364, 1.22302, 1.21036, 1.59732, 1.16621, 1.43458, 1.39264, 1.50491, 1.74865, 1.69988, 1.54719, 1.66156, 1.38606, 1.43929, 1.37822, 1.30248, 1.79296, 1.45361, 1.24972, 1.59221, 1.3686, 1.22551, 1.4158, 1.49894, 1.55813, 1.52684, 1.44435, 2.05338, 1.36019, 1.34284, 1.20815, 1.7307, 1.50669, 2.1527, 1.33714, 1.40114, 1.51052, 1.35152, 1.43159, 1.42052, 1.44093, 1.62874, 1.70468, 1.84621, 1.36339, 1.49409, 1.99351, 1.25437, 1.69787, 1.77453, 1.53971, 1.98798, 1.46692, 1.21412, 1.35855, 1.61255, 1.37129, 1.69078, 1.53059, 1.31087, 1.87886, 1.31042, 1.42235, 1.38194, 1.39636, 1.83392, 1.47651, 1.46996, 1.64541, 1.53153, 1.47267, 1.75528, 1.44853, 1.39865, 1.75941, 1.63286, 1.32552, 1.6715, 2.26149, 1.61139, 1.35216, 1.34936, 1.25166, 1.69472, 1.58245, 1.4379, 1.43627, 1.60457, 1.82215, 1.39138, 1.38678, 1.55708, 1.41296, 1.29816, 1.46066, 1.39994, 1.45437, 1.25759, 1.34921, 1.47682, 1.55246, 1.48338, 1.2271, 1.36154, 1.44453, 1.47772, 1.43402, 1.21249, 1.8034, 1.50506, 1.3131, 1.37503, 1.35584, 1.41307, 1.45748, 1.26629, 1.31721, 1.47686, 1.80237, 1.55348, 1.5369, 1.32871, 1.35524, 1.76226, 1.27945, 1.40786, 1.56063, 1.18102, 1.26595, 1.41714, 1.27185, 1.59955, 1.53902, 1.50856, 1.38342, 1.3716, 1.52597, 1.55924, 1.33891, 1.44137, 1.66178, 1.44058, 1.53213, 1.34923, 1.54826, 1.51369, 1.26166, 1.22057, 1.64988, 1.4183, 1.45977, 1.27097, 1.31805, 1.24715, 1.52412, 1.48112, 1.51313, 1.58975, 1.42731, 1.32647, 1.44532, 1.53827, 1.72661, 1.53155, 1.57687, 1.2723, 1.26403, 1.36125, 1.36611, 1.46818, 1.38679, 1.58433, 1.49566, 1.44288, 1.37271, 1.45317, 1.36918, 1.35342, 1.27732, 1.37088, 1.29411, 1.25869, 1.46478, 1.43992, 1.66108, 1.34488, 1.17599, 1.3251]}, "num-zeros": {"start_step": 0, "end_step": 1983, "step_interval": 5, "values": 
[951.0, 1294.0, 1060.0, 971.0, 901.0, 1117.0, 1205.0, 1364.0, 1468.0, 1319.0, 1539.0, 1911.0, 2180.0, 1576.0, 2216.0, 1925.0, 2038.0, 2028.0, 2476.0, 2015.0, 2201.0, 2215.0, 2438.0, 3135.0, 2444.0, 2806.0, 2540.0, 2188.0, 2052.0, 2885.0, 2408.0, 3553.0, 2417.0, 2497.0, 2486.0, 3667.0, 2116.0, 2243.0, 2127.0, 2649.0, 3818.0, 2985.0, 2311.0, 2810.0, 2580.0, 2214.0, 2672.0, 2502.0, 2376.0, 2941.0, 3128.0, 2507.0, 2600.0, 2152.0, 2790.0, 3240.0, 2769.0, 2720.0, 2392.0, 3522.0, 2236.0, 2883.0, 2397.0, 2586.0, 2219.0, 3154.0, 2799.0, 2803.0, 2345.0, 2563.0, 2171.0, 2874.0, 2837.0, 2656.0, 3389.0, 2526.0, 2817.0, 2625.0, 3000.0, 2814.0, 2754.0, 2414.0, 3081.0, 2380.0, 2876.0, 2737.0, 2780.0, 2271.0, 2333.0, 2839.0, 2519.0, 3210.0, 2404.0, 2291.0, 2433.0, 2383.0, 2435.0, 1919.0, 2351.0, 2585.0, 2779.0, 2221.0, 2014.0, 2114.0, 1881.0, 2304.0, 2397.0, 2309.0, 2239.0, 2116.0, 2239.0, 2377.0, 2323.0, 2496.0, 2298.0, 2773.0, 2696.0, 1952.0, 2435.0, 2042.0, 2813.0, 2452.0, 2068.0, 2032.0, 2127.0, 2176.0, 2056.0, 2569.0, 2495.0, 2156.0, 2202.0, 2372.0, 2368.0, 2313.0, 1956.0, 2287.0, 2471.0, 2251.0, 2132.0, 1626.0, 2076.0, 2288.0, 2009.0, 1987.0, 2433.0, 1651.0, 2033.0, 2061.0, 1927.0, 2837.0, 2589.0, 2063.0, 1738.0, 1964.0, 2334.0, 1899.0, 2516.0, 2136.0, 2214.0, 1965.0, 1875.0, 2415.0, 1921.0, 2352.0, 2174.0, 1887.0, 2165.0, 2616.0, 1911.0, 1825.0, 1959.0, 1908.0, 1822.0, 1574.0, 1545.0, 2160.0, 1942.0, 2081.0, 1733.0, 2008.0, 2010.0, 2212.0, 1875.0, 1390.0, 1972.0, 2540.0, 1825.0, 2152.0, 1632.0, 2232.0, 1792.0, 1887.0, 1971.0, 2046.0, 1779.0, 2139.0, 2024.0, 1999.0, 1614.0, 1985.0, 1902.0, 2128.0, 2445.0, 2671.0, 2214.0, 2029.0, 2081.0, 2209.0, 2226.0, 1957.0, 2210.0, 2419.0, 2685.0, 2294.0, 1932.0, 2118.0, 1963.0, 1818.0, 1841.0, 2149.0, 2110.0, 2155.0, 1868.0, 2220.0, 2120.0, 2379.0, 1886.0, 2361.0, 1763.0, 2055.0, 1972.0, 2155.0, 1934.0, 2167.0, 1959.0, 1882.0, 1705.0, 1826.0, 1964.0, 2224.0, 1818.0, 1883.0, 1743.0, 2488.0, 2393.0, 2103.0, 2005.0, 2728.0, 2142.0, 2054.0, 1951.0, 1819.0, 2038.0, 2170.0, 2265.0, 1808.0, 2431.0, 1807.0, 2184.0, 2053.0, 1687.0, 1931.0, 2549.0, 2587.0, 1986.0, 2273.0, 2103.0, 2063.0, 2204.0, 2021.0, 2110.0, 2428.0, 2484.0, 2060.0, 2244.0, 2025.0, 1999.0, 1965.0, 1906.0, 2137.0, 2024.0, 2234.0, 1998.0, 2022.0, 1943.0, 2254.0, 2008.0, 1619.0, 1850.0, 2446.0, 2316.0, 1952.0, 2008.0, 2201.0, 2018.0, 2191.0, 1856.0, 2363.0, 2138.0, 2632.0, 1897.0, 2331.0, 1915.0, 2017.0, 2347.0, 2073.0, 2221.0, 2341.0, 1910.0, 1944.0, 2197.0, 2136.0, 2140.0, 2057.0, 2254.0, 1992.0, 2377.0, 1829.0, 2323.0, 2256.0, 2248.0, 2664.0, 2091.0, 2351.0, 2363.0, 2417.0, 1953.0, 2010.0, 2111.0, 2082.0, 2141.0, 2449.0, 2394.0, 2165.0, 2019.0, 2307.0, 2446.0, 2932.0, 2123.0, 2428.0, 2294.0, 2499.0, 2597.0, 2391.0, 2142.0, 2085.0, 2112.0, 2498.0, 2172.0, 2546.0, 2086.0, 2278.0, 2000.0, 2060.0, 2222.0, 2327.0, 2377.0, 2181.0, 1943.0, 2370.0, 2170.0, 2277.0, 2360.0, 2822.0, 2306.0, 2709.0, 2210.0, 2127.0, 2321.0, 2202.0, 2780.0, 2249.0, 2312.0, 2033.0, 2114.0, 2287.0, 2292.0, 2301.0, 2735.0, 2674.0, 2246.0, 2584.0, 2280.0, 2624.0, 2634.0, 2653.0, 2502.0, 2748.0, 2256.0, 2492.0, 2276.0, 2217.0, 1995.0, 2408.0, 2306.0, 2584.0, 2373.0]}, "num-zeros vs samples": {"start_step": 0, "end_step": 1983, "step_interval": 5, "values": [951.0, 1294.0, 1060.0, 971.0, 901.0, 1117.0, 1205.0, 1364.0, 1468.0, 1319.0, 1539.0, 1911.0, 2180.0, 1576.0, 2216.0, 1925.0, 2038.0, 2028.0, 2476.0, 2015.0, 2201.0, 2215.0, 2438.0, 3135.0, 2444.0, 2806.0, 2540.0, 2188.0, 2052.0, 2885.0, 2408.0, 3553.0, 2417.0, 2497.0, 2486.0, 3667.0, 
2116.0, 2243.0, 2127.0, 2649.0, 3818.0, 2985.0, 2311.0, 2810.0, 2580.0, 2214.0, 2672.0, 2502.0, 2376.0, 2941.0, 3128.0, 2507.0, 2600.0, 2152.0, 2790.0, 3240.0, 2769.0, 2720.0, 2392.0, 3522.0, 2236.0, 2883.0, 2397.0, 2586.0, 2219.0, 3154.0, 2799.0, 2803.0, 2345.0, 2563.0, 2171.0, 2874.0, 2837.0, 2656.0, 3389.0, 2526.0, 2817.0, 2625.0, 3000.0, 2814.0, 2754.0, 2414.0, 3081.0, 2380.0, 2876.0, 2737.0, 2780.0, 2271.0, 2333.0, 2839.0, 2519.0, 3210.0, 2404.0, 2291.0, 2433.0, 2383.0, 2435.0, 1919.0, 2351.0, 2585.0, 2779.0, 2221.0, 2014.0, 2114.0, 1881.0, 2304.0, 2397.0, 2309.0, 2239.0, 2116.0, 2239.0, 2377.0, 2323.0, 2496.0, 2298.0, 2773.0, 2696.0, 1952.0, 2435.0, 2042.0, 2813.0, 2452.0, 2068.0, 2032.0, 2127.0, 2176.0, 2056.0, 2569.0, 2495.0, 2156.0, 2202.0, 2372.0, 2368.0, 2313.0, 1956.0, 2287.0, 2471.0, 2251.0, 2132.0, 1626.0, 2076.0, 2288.0, 2009.0, 1987.0, 2433.0, 1651.0, 2033.0, 2061.0, 1927.0, 2837.0, 2589.0, 2063.0, 1738.0, 1964.0, 2334.0, 1899.0, 2516.0, 2136.0, 2214.0, 1965.0, 1875.0, 2415.0, 1921.0, 2352.0, 2174.0, 1887.0, 2165.0, 2616.0, 1911.0, 1825.0, 1959.0, 1908.0, 1822.0, 1574.0, 1545.0, 2160.0, 1942.0, 2081.0, 1733.0, 2008.0, 2010.0, 2212.0, 1875.0, 1390.0, 1972.0, 2540.0, 1825.0, 2152.0, 1632.0, 2232.0, 1792.0, 1887.0, 1971.0, 2046.0, 1779.0, 2139.0, 2024.0, 1999.0, 1614.0, 1985.0, 1902.0, 2128.0, 2445.0, 2671.0, 2214.0, 2029.0, 2081.0, 2209.0, 2226.0, 1957.0, 2210.0, 2419.0, 2685.0, 2294.0, 1932.0, 2118.0, 1963.0, 1818.0, 1841.0, 2149.0, 2110.0, 2155.0, 1868.0, 2220.0, 2120.0, 2379.0, 1886.0, 2361.0, 1763.0, 2055.0, 1972.0, 2155.0, 1934.0, 2167.0, 1959.0, 1882.0, 1705.0, 1826.0, 1964.0, 2224.0, 1818.0, 1883.0, 1743.0, 2488.0, 2393.0, 2103.0, 2005.0, 2728.0, 2142.0, 2054.0, 1951.0, 1819.0, 2038.0, 2170.0, 2265.0, 1808.0, 2431.0, 1807.0, 2184.0, 2053.0, 1687.0, 1931.0, 2549.0, 2587.0, 1986.0, 2273.0, 2103.0, 2063.0, 2204.0, 2021.0, 2110.0, 2428.0, 2484.0, 2060.0, 2244.0, 2025.0, 1999.0, 1965.0, 1906.0, 2137.0, 2024.0, 2234.0, 1998.0, 2022.0, 1943.0, 2254.0, 2008.0, 1619.0, 1850.0, 2446.0, 2316.0, 1952.0, 2008.0, 2201.0, 2018.0, 2191.0, 1856.0, 2363.0, 2138.0, 2632.0, 1897.0, 2331.0, 1915.0, 2017.0, 2347.0, 2073.0, 2221.0, 2341.0, 1910.0, 1944.0, 2197.0, 2136.0, 2140.0, 2057.0, 2254.0, 1992.0, 2377.0, 1829.0, 2323.0, 2256.0, 2248.0, 2664.0, 2091.0, 2351.0, 2363.0, 2417.0, 1953.0, 2010.0, 2111.0, 2082.0, 2141.0, 2449.0, 2394.0, 2165.0, 2019.0, 2307.0, 2446.0, 2932.0, 2123.0, 2428.0, 2294.0, 2499.0, 2597.0, 2391.0, 2142.0, 2085.0, 2112.0, 2498.0, 2172.0, 2546.0, 2086.0, 2278.0, 2000.0, 2060.0, 2222.0, 2327.0, 2377.0, 2181.0, 1943.0, 2370.0, 2170.0, 2277.0, 2360.0, 2822.0, 2306.0, 2709.0, 2210.0, 2127.0, 2321.0, 2202.0, 2780.0, 2249.0, 2312.0, 2033.0, 2114.0, 2287.0, 2292.0, 2301.0, 2735.0, 2674.0, 2246.0, 2584.0, 2280.0, 2624.0, 2634.0, 2653.0, 2502.0, 2748.0, 2256.0, 2492.0, 2276.0, 2217.0, 1995.0, 2408.0, 2306.0, 2584.0, 2373.0]}, "lm loss validation": {"start_step": 0, "end_step": 4, "step_interval": 5, "values": [5.62692]}, "lm loss validation vs samples": {"start_step": 0, "end_step": 4, "step_interval": 5, "values": [5.62692]}, "lm loss validation ppl": {"start_step": 0, "end_step": 4, "step_interval": 5, "values": [277.80627]}, "lm loss validation ppl vs samples": {"start_step": 0, "end_step": 4, "step_interval": 5, "values": [277.80627]}} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp1_pp1_bf16_baseline/golden_values_lts.json 
b/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp1_pp1_bf16_baseline/golden_values_lts.json new file mode 100644 index 0000000000..fdeaa49aa1 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp1_pp1_bf16_baseline/golden_values_lts.json @@ -0,0 +1 @@ +{"forward-backward-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [17.4566, 0.37175, 0.37134, 0.37017, 0.37156, 0.37759, 0.37765, 0.37162, 0.3761, 0.37226, 0.53616, 0.37589, 0.37516, 0.37683, 0.37327, 0.37614, 0.37342, 0.3739, 0.37649, 0.37491, 0.38081, 0.37232, 0.37401, 0.37224, 0.37132, 0.38167, 0.37456, 0.37215, 0.36647, 0.37435, 0.38453, 0.36353, 0.36605, 0.36205, 0.36329, 0.36758, 0.36245, 0.36564, 0.3674, 0.38594, 0.36767, 0.36685, 0.36727, 0.36428, 0.3664, 0.36716, 0.36619, 0.36593, 0.36805, 0.36393, 0.3666, 0.36486, 0.36817, 0.36273, 0.36485, 0.36634, 0.36443, 0.3672, 0.36462, 0.36335, 0.35994, 0.36774, 0.36167, 0.36089, 0.36216, 0.36236, 0.36412, 0.36497, 0.3673, 0.36303, 0.36566, 0.36239, 0.36323, 0.36008, 0.46258, 0.36181, 0.3621, 0.36509, 0.36772, 0.36417, 0.36489, 0.36688, 0.3704, 0.36443, 0.36411, 0.36221, 0.36185, 0.36498, 0.36202, 0.36553, 0.36574, 0.36507, 0.37335, 0.36256, 0.3648, 0.36324, 0.36253, 0.36685, 0.3644, 0.36463, 0.36584, 0.36426, 0.36134, 0.36175, 0.45788, 0.36568, 0.36196, 0.38364, 0.36164, 0.36331, 0.36346, 0.3683, 0.36544, 0.36245, 0.37051, 0.37092, 0.36741, 0.3695, 0.3651, 0.37195, 0.36315, 0.36425, 0.36904, 0.36828, 0.3648, 0.36763, 0.36895, 0.37272, 0.3749, 0.36753, 0.36573, 0.36845, 0.36886, 0.37096, 0.47625, 0.36339, 0.36255, 0.36368, 0.44639, 0.51442, 0.3673, 0.36637, 0.36885, 0.37285, 0.36987, 0.36631, 0.36485, 0.36259, 0.36217, 0.364, 0.36364, 0.36588, 0.3619, 0.36604, 0.36798, 0.36772, 0.36665, 0.36769, 0.36628, 0.36592, 0.36831, 0.36583, 0.36842, 0.36695, 0.37069, 0.36526, 0.36421, 0.3661, 0.36543, 0.36845, 0.36581, 0.3674, 0.36575, 0.36568, 0.36949, 0.36761, 0.36684, 0.36852, 0.36408, 0.37073, 0.36602, 0.36769, 0.3609, 0.36264, 0.36736, 0.36549, 0.36517, 0.36003, 0.36081, 0.36006, 0.36167, 0.36361, 0.36172, 0.36296, 0.36716, 0.36645, 0.36705, 0.36621, 0.45574, 0.36247, 0.36105, 0.36408, 0.3621, 0.36088, 0.36271, 0.36349, 0.36811, 0.36958, 0.36968, 0.36582, 0.36294, 0.36436, 0.36894, 0.36266, 0.36585, 0.36633, 0.36462, 0.36885, 0.36711, 0.36754, 0.36317, 0.36285, 0.36581, 0.37564, 0.37346, 0.3622, 0.36404, 0.45901, 0.36362, 0.36726, 0.37058, 0.36812, 0.36666, 0.37189, 0.46883, 0.37275, 0.3719, 0.36704, 0.36448, 0.3629, 0.36582, 0.36225, 0.36061, 0.4845, 0.36483, 0.36652, 0.36811, 0.36819, 0.37464, 0.36516, 0.36721, 0.36426, 0.35999, 0.36267, 0.36286, 0.36833, 0.36584, 0.3632, 0.36415, 0.36569, 0.37494, 0.36226, 0.46516, 0.36495, 0.36254, 0.36943, 0.36585, 0.36664, 0.36827, 0.36557, 0.37484, 0.36946, 0.37108, 0.36825, 0.36775, 0.36137, 0.36521, 0.3697, 0.36415, 0.36338, 0.36383, 0.36505, 0.3677, 0.36976, 0.36576, 0.36964, 0.37212, 0.36584, 0.36475, 0.36537, 0.36914, 0.36892, 0.45897, 0.36567, 0.3641, 0.36657, 0.3698, 0.36867, 0.36599, 0.3679, 0.36742, 0.36813, 0.36659, 0.36737, 0.36653, 0.36785, 0.37243, 0.36895, 0.37086, 0.365, 0.36719, 0.37471, 0.36717, 0.3738, 0.37016, 0.37206, 0.3695, 0.36911, 0.36946, 0.36669, 0.36636, 0.3628, 0.3661, 0.36516, 0.36275, 0.3657, 0.3654, 0.36521, 0.3662, 0.4682, 0.36931, 0.3668, 0.37172, 0.37189, 0.36942, 0.37165, 0.37159, 0.37333, 0.37491, 0.37221, 0.36907, 0.37154, 0.37633, 0.36937, 0.36886, 0.36922, 0.36659, 0.36692, 0.36765, 0.36709, 0.3641, 0.36625, 0.36742, 
0.36073, 0.36646, 0.36662, 0.36508, 0.37343, 0.36701, 0.3642, 0.36688, 0.36861, 0.36833, 0.36153, 0.36529, 0.36657, 0.36866, 0.37542, 0.36846, 0.36817, 0.36445, 0.36398, 0.36799, 0.36631, 0.3632, 0.36525, 0.36782, 0.36786, 0.37064, 0.36604, 0.36767, 0.36737, 0.36678, 0.36919, 0.36757, 0.36912, 0.36819, 0.46929, 0.37321, 0.37017, 0.4569, 0.36994, 0.37357, 0.36984, 0.57706, 0.37035, 0.37045, 0.36802, 0.36852, 0.36742]}, "forward-compute-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [9.27486, 0.20418, 0.20397, 0.20285, 0.20434, 0.20758, 0.20634, 0.20416, 0.20426, 0.20434, 0.3669, 0.20758, 0.20442, 0.20546, 0.20278, 0.20684, 0.20447, 0.20408, 0.20756, 0.20602, 0.20443, 0.20251, 0.20574, 0.20384, 0.2029, 0.21254, 0.21029, 0.20601, 0.20107, 0.20291, 0.20989, 0.19612, 0.20052, 0.19662, 0.19784, 0.20061, 0.19675, 0.19997, 0.20194, 0.22257, 0.2025, 0.20076, 0.2025, 0.20065, 0.20083, 0.19995, 0.19982, 0.20085, 0.20083, 0.19933, 0.20226, 0.20132, 0.203, 0.19623, 0.1999, 0.19978, 0.1976, 0.19962, 0.19949, 0.19977, 0.19439, 0.19749, 0.19772, 0.19546, 0.19711, 0.19707, 0.19839, 0.19731, 0.20084, 0.19819, 0.2011, 0.1983, 0.19858, 0.1937, 0.29471, 0.19528, 0.19534, 0.19901, 0.20146, 0.19982, 0.19907, 0.20086, 0.20405, 0.19915, 0.2005, 0.19581, 0.19278, 0.19863, 0.19822, 0.1993, 0.1988, 0.19998, 0.2005, 0.19725, 0.20091, 0.19918, 0.19836, 0.2016, 0.19765, 0.19811, 0.19903, 0.19646, 0.19645, 0.19682, 0.28975, 0.19888, 0.19522, 0.21159, 0.19644, 0.19881, 0.19777, 0.20279, 0.19972, 0.19755, 0.20374, 0.20397, 0.20052, 0.20409, 0.20046, 0.20573, 0.19813, 0.19893, 0.20396, 0.20108, 0.1991, 0.20018, 0.20247, 0.20606, 0.20496, 0.20146, 0.20113, 0.20109, 0.20373, 0.20131, 0.30688, 0.19978, 0.19719, 0.19856, 0.27425, 0.34575, 0.20073, 0.20027, 0.20292, 0.20753, 0.20162, 0.19901, 0.19974, 0.19616, 0.19556, 0.19818, 0.19745, 0.20023, 0.19768, 0.1993, 0.20152, 0.20191, 0.20046, 0.19952, 0.19909, 0.20067, 0.20206, 0.20028, 0.2009, 0.20109, 0.20231, 0.20057, 0.19849, 0.2014, 0.19862, 0.20162, 0.1995, 0.20168, 0.19859, 0.20023, 0.20137, 0.19954, 0.19893, 0.20032, 0.19926, 0.20288, 0.20082, 0.20203, 0.1964, 0.19744, 0.20075, 0.19839, 0.19941, 0.19592, 0.19584, 0.19507, 0.19602, 0.19868, 0.19785, 0.19642, 0.20146, 0.20135, 0.20162, 0.20061, 0.28565, 0.19898, 0.19699, 0.20018, 0.1975, 0.19765, 0.19836, 0.20012, 0.20347, 0.20455, 0.20461, 0.20103, 0.1993, 0.20097, 0.20324, 0.19779, 0.20128, 0.20136, 0.19977, 0.20189, 0.20216, 0.19869, 0.19833, 0.19963, 0.20166, 0.21162, 0.2062, 0.19807, 0.19895, 0.29325, 0.19845, 0.1994, 0.20325, 0.20285, 0.20049, 0.20554, 0.30108, 0.20617, 0.20644, 0.20131, 0.20084, 0.19867, 0.20111, 0.19928, 0.19687, 0.31861, 0.20096, 0.20262, 0.20309, 0.20325, 0.20819, 0.20113, 0.20301, 0.19969, 0.19603, 0.19693, 0.19763, 0.2004, 0.20179, 0.19742, 0.19937, 0.20128, 0.20616, 0.19831, 0.29924, 0.19973, 0.19859, 0.20413, 0.20138, 0.20285, 0.20388, 0.20206, 0.20671, 0.20471, 0.20646, 0.20241, 0.20408, 0.19861, 0.20125, 0.20732, 0.20159, 0.20035, 0.20096, 0.20012, 0.20294, 0.20424, 0.20101, 0.20564, 0.2044, 0.2008, 0.19955, 0.20264, 0.2049, 0.20446, 0.293, 0.20181, 0.20025, 0.20162, 0.20369, 0.20417, 0.20115, 0.20265, 0.20363, 0.2044, 0.20297, 0.20322, 0.20046, 0.20222, 0.20483, 0.20332, 0.20676, 0.19998, 0.2015, 0.2054, 0.20246, 0.20845, 0.20406, 0.20619, 0.20592, 0.20453, 0.20274, 0.20274, 0.20162, 0.20007, 0.20274, 0.20276, 0.19873, 0.20293, 0.20198, 0.20198, 0.20314, 0.30676, 0.20607, 0.2049, 0.20889, 0.20967, 0.2072, 0.20824, 0.20768, 0.20857, 0.20862, 0.20898, 0.20615, 0.20827, 
0.21418, 0.20637, 0.20388, 0.2067, 0.20272, 0.20336, 0.20429, 0.20148, 0.20112, 0.20264, 0.20322, 0.19861, 0.20195, 0.20314, 0.1996, 0.20578, 0.2036, 0.20073, 0.20362, 0.20652, 0.20449, 0.19954, 0.20273, 0.203, 0.2032, 0.20757, 0.2034, 0.20482, 0.19991, 0.20078, 0.20474, 0.20356, 0.19886, 0.20118, 0.20177, 0.20291, 0.20253, 0.20141, 0.20341, 0.20352, 0.20319, 0.20478, 0.20413, 0.20568, 0.20319, 0.30235, 0.20813, 0.20681, 0.29099, 0.20567, 0.20759, 0.20528, 0.41177, 0.20714, 0.20416, 0.20342, 0.20429, 0.20393]}, "backward-compute-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [2.48483, 0.17652, 0.17828, 0.17737, 0.17731, 0.18012, 0.18059, 0.17933, 0.18228, 0.17963, 0.17741, 0.17905, 0.17875, 0.18023, 0.17598, 0.17735, 0.17563, 0.1774, 0.17814, 0.17775, 0.1797, 0.17589, 0.17512, 0.17493, 0.17423, 0.17574, 0.17442, 0.17392, 0.17429, 0.18376, 0.17762, 0.17577, 0.17608, 0.17519, 0.17371, 0.17562, 0.1743, 0.17634, 0.17747, 0.1794, 0.17639, 0.1769, 0.17749, 0.17644, 0.17597, 0.17611, 0.17772, 0.17605, 0.17799, 0.1756, 0.17762, 0.17478, 0.17987, 0.17366, 0.17669, 0.17775, 0.17802, 0.17908, 0.17514, 0.17554, 0.17388, 0.17483, 0.17431, 0.17275, 0.17497, 0.17541, 0.17514, 0.17686, 0.17728, 0.17469, 0.17508, 0.17519, 0.17517, 0.17377, 0.17594, 0.17621, 0.17553, 0.17702, 0.18, 0.17602, 0.17593, 0.17864, 0.17997, 0.1755, 0.17822, 0.17772, 0.17671, 0.17725, 0.1778, 0.17809, 0.17954, 0.17593, 0.17541, 0.17441, 0.17679, 0.17798, 0.17778, 0.17724, 0.17552, 0.17811, 0.18023, 0.17981, 0.17557, 0.17566, 0.17625, 0.17625, 0.17558, 0.19425, 0.1762, 0.17767, 0.17763, 0.18372, 0.17971, 0.17752, 0.18218, 0.18258, 0.18042, 0.18083, 0.17934, 0.18263, 0.17612, 0.17585, 0.18209, 0.17892, 0.17504, 0.18056, 0.18269, 0.18216, 0.18105, 0.18046, 0.17895, 0.18001, 0.18287, 0.18048, 0.18107, 0.1792, 0.177, 0.17595, 0.17833, 0.17997, 0.18026, 0.18064, 0.18103, 0.18122, 0.1807, 0.17741, 0.17696, 0.175, 0.17708, 0.17762, 0.17496, 0.17994, 0.17504, 0.17879, 0.18178, 0.1796, 0.18007, 0.18397, 0.18212, 0.18076, 0.18234, 0.18066, 0.18359, 0.18244, 0.18094, 0.18093, 0.17869, 0.18132, 0.18028, 0.18293, 0.17692, 0.181, 0.1778, 0.178, 0.18006, 0.18483, 0.18337, 0.18495, 0.18069, 0.18012, 0.18124, 0.18343, 0.17705, 0.17668, 0.17849, 0.18112, 0.17754, 0.1764, 0.17576, 0.17489, 0.17603, 0.17867, 0.17875, 0.17778, 0.17783, 0.18028, 0.18098, 0.18147, 0.18117, 0.17707, 0.17356, 0.17855, 0.17723, 0.175, 0.17556, 0.17674, 0.17749, 0.17698, 0.17866, 0.17541, 0.17473, 0.17725, 0.17976, 0.17814, 0.17815, 0.17912, 0.17571, 0.18059, 0.18163, 0.17964, 0.17657, 0.1773, 0.17872, 0.18756, 0.18502, 0.17691, 0.17601, 0.1773, 0.17751, 0.17745, 0.18072, 0.17998, 0.17849, 0.18172, 0.17785, 0.18296, 0.17966, 0.18029, 0.17622, 0.17684, 0.17683, 0.17525, 0.17514, 0.17546, 0.17768, 0.17616, 0.17827, 0.17873, 0.18236, 0.17864, 0.17902, 0.17866, 0.17537, 0.17824, 0.17634, 0.17765, 0.17745, 0.17691, 0.17855, 0.17773, 0.1776, 0.17553, 0.17612, 0.17682, 0.17445, 0.17573, 0.17792, 0.17697, 0.17758, 0.17799, 0.18179, 0.17862, 0.17828, 0.17902, 0.17716, 0.17378, 0.17466, 0.17969, 0.17531, 0.17449, 0.1762, 0.17533, 0.17786, 0.17799, 0.1739, 0.17695, 0.17997, 0.17727, 0.17594, 0.17599, 0.17877, 0.17835, 0.17768, 0.17619, 0.1761, 0.17947, 0.18082, 0.17999, 0.17973, 0.18161, 0.17878, 0.18107, 0.17669, 0.17787, 0.17714, 0.17987, 0.17952, 0.18139, 0.1814, 0.17879, 0.17819, 0.17967, 0.17842, 0.18204, 0.17981, 0.18039, 0.1779, 0.17786, 0.18096, 0.17907, 0.17853, 0.17539, 0.17682, 0.17666, 0.17653, 0.17793, 0.17688, 0.1782, 0.17909, 0.17471, 0.17743, 
0.17531, 0.17878, 0.17697, 0.1762, 0.17958, 0.17827, 0.17938, 0.17923, 0.17797, 0.1763, 0.17776, 0.18097, 0.17754, 0.18018, 0.17934, 0.1806, 0.1751, 0.17845, 0.18106, 0.17667, 0.17809, 0.17911, 0.17624, 0.17874, 0.1795, 0.17661, 0.18214, 0.18117, 0.17941, 0.17482, 0.17595, 0.17616, 0.17509, 0.17725, 0.17932, 0.18085, 0.18292, 0.17986, 0.17974, 0.17799, 0.17756, 0.17851, 0.17744, 0.17724, 0.17992, 0.18197, 0.18128, 0.1816, 0.17718, 0.1781, 0.18028, 0.17962, 0.18211, 0.17904, 0.18027, 0.179, 0.1805, 0.18514, 0.18111, 0.17608, 0.18024, 0.1833, 0.1823, 0.1797, 0.17902, 0.18251, 0.18061, 0.17877, 0.17926]}, "batch-generator-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.60562, 0.0038, 0.00384, 0.00379, 0.00392, 0.00392, 0.00391, 0.00387, 0.00391, 0.00397, 0.00392, 0.00405, 0.00383, 0.00388, 0.00387, 0.0042, 0.00394, 0.00394, 0.00387, 0.00379, 0.00413, 0.00393, 0.00403, 0.00383, 0.00384, 0.004, 0.0044, 0.00355, 0.00419, 0.00392, 0.00399, 0.00394, 0.0037, 0.00364, 0.00369, 0.00383, 0.00379, 0.00369, 0.0038, 0.00364, 0.00377, 0.00393, 0.00365, 0.00367, 0.00383, 0.00366, 0.00382, 0.00371, 0.00355, 0.00439, 0.00359, 0.00368, 0.00365, 0.00383, 0.00363, 0.00374, 0.00373, 0.00378, 0.00373, 0.00352, 0.00362, 0.0036, 0.00343, 0.00349, 0.00382, 0.00374, 0.00356, 0.00374, 0.00365, 0.00391, 0.0037, 0.00375, 0.00369, 0.00366, 0.00397, 0.00372, 0.00358, 0.00365, 0.00406, 0.00355, 0.00339, 0.00398, 0.00424, 0.0036, 0.00363, 0.00389, 0.00371, 0.00377, 0.00362, 0.00383, 0.00373, 0.0037, 0.00388, 0.00356, 0.00358, 0.00363, 0.00387, 0.00375, 0.00383, 0.00372, 0.00369, 0.00374, 0.00411, 0.00364, 0.0039, 0.00376, 0.00383, 0.00364, 0.00379, 0.00378, 0.00364, 0.00365, 0.00392, 0.00347, 0.00361, 0.00377, 0.00359, 0.00364, 0.00383, 0.00375, 0.00368, 0.00367, 0.0041, 0.00379, 0.00359, 0.00366, 0.00379, 0.00376, 0.00387, 0.00368, 0.00361, 0.00375, 0.00401, 0.0038, 0.00393, 0.00377, 0.00358, 0.00402, 0.00479, 0.00399, 0.00374, 0.00392, 0.00379, 0.00391, 0.00355, 0.00378, 0.00356, 0.00362, 0.0036, 0.00351, 0.00348, 0.00422, 0.00355, 0.00359, 0.00351, 0.00373, 0.00362, 0.00377, 0.00378, 0.00386, 0.0037, 0.00367, 0.00361, 0.0038, 0.00392, 0.00338, 0.00354, 0.00357, 0.00375, 0.00369, 0.0038, 0.0036, 0.00386, 0.00388, 0.00354, 0.00367, 0.00381, 0.00354, 0.00366, 0.0038, 0.00367, 0.00378, 0.00363, 0.00368, 0.00358, 0.00359, 0.00373, 0.00355, 0.00402, 0.00361, 0.00364, 0.00369, 0.0035, 0.00356, 0.00387, 0.00375, 0.00381, 0.0038, 0.00396, 0.00375, 0.03419, 0.00346, 0.00373, 0.00413, 0.0035, 0.00359, 0.00362, 0.00344, 0.00367, 0.00349, 0.00362, 0.00369, 0.00353, 0.00388, 0.00372, 0.00358, 0.0036, 0.00347, 0.00344, 0.00368, 0.00381, 0.00355, 0.00366, 0.0035, 0.00362, 0.00372, 0.0037, 0.00382, 0.00365, 0.00381, 0.00385, 0.00362, 0.00358, 0.00369, 0.00374, 0.00368, 0.00355, 0.00377, 0.00348, 0.00351, 0.00355, 0.00339, 0.00354, 0.00335, 0.00357, 0.00367, 0.00363, 0.00377, 0.00357, 0.00363, 0.00374, 0.00361, 0.00358, 0.00354, 0.00336, 0.00361, 0.00371, 0.00365, 0.00354, 0.00394, 0.00379, 0.00378, 0.00379, 0.00401, 0.00398, 0.00384, 0.00395, 0.0042, 0.00424, 0.00421, 0.00426, 0.00442, 0.00415, 0.00404, 0.0043, 0.00406, 0.00434, 0.00442, 0.00416, 0.0043, 0.00409, 0.00403, 0.00412, 0.004, 0.00407, 0.00448, 0.00415, 0.00407, 0.0041, 0.0041, 0.00402, 0.00417, 0.00421, 0.00402, 0.00399, 0.00398, 0.00422, 0.00414, 0.00414, 0.00417, 0.00412, 0.004, 0.00405, 0.00393, 0.00399, 0.00391, 0.00392, 0.00387, 0.00417, 0.00413, 0.00408, 0.004, 0.00415, 0.00409, 0.00421, 0.00397, 0.00405, 0.00396, 0.00405, 0.00404, 0.00407, 
0.00408, 0.00399, 0.004, 0.00392, 0.00412, 0.00432, 0.00438, 0.00426, 0.00415, 0.00429, 0.00422, 0.00401, 0.00419, 0.0041, 0.00398, 0.00406, 0.00453, 0.00398, 0.00413, 0.00404, 0.00406, 0.00404, 0.00404, 0.0041, 0.00409, 0.00402, 0.00399, 0.0041, 0.00413, 0.00436, 0.00417, 0.00418, 0.00424, 0.00423, 0.00429, 0.00425, 0.00417, 0.00427, 0.00432, 0.00421, 0.00425, 0.00421, 0.00433, 0.00423, 0.00439, 0.00428, 0.00423, 0.00424, 0.0041, 0.00423, 0.00424, 0.00433, 0.00424, 0.00436, 0.0043, 0.00407, 0.00429, 0.0041, 0.00429, 0.00431, 0.00428, 0.0043, 0.00425, 0.00416, 0.00427, 0.00405, 0.00443, 0.00417, 0.0042, 0.00449, 0.00406, 0.004, 0.00406, 0.0042, 0.00421, 0.00409, 0.00421, 0.00421, 0.00413]}, "layernorm-grads-all-reduce-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [3e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 2e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 
1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05]}, "embedding-grads-all-reduce-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [4e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 3e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 3e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 5e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 3e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 3e-05, 2e-05, 2e-05, 2e-05, 2e-05, 3e-05, 2e-05, 2e-05, 2e-05, 2e-05, 3e-05, 2e-05, 2e-05, 2e-05, 2e-05, 3e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 3e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 3e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 3e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 3e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05]}, "all-grads-sync-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [5.81083, 0.0018, 0.00179, 0.00169, 0.00153, 0.00181, 0.00157, 0.00183, 0.00159, 0.00178, 0.00159, 0.00178, 0.00153, 0.00181, 0.0016, 0.0018, 0.00158, 0.00176, 0.00155, 0.00182, 0.00162, 0.00179, 0.00159, 0.00178, 0.0016, 0.00183, 0.00159, 0.00181, 0.0016, 0.00181, 0.00161, 0.0018, 0.00156, 0.00165, 0.0016, 0.00177, 0.00157, 0.00177, 0.00159, 0.00175, 0.00158, 0.00178, 0.00159, 0.00182, 0.00158, 0.00177, 0.00158, 0.00177, 0.00159, 0.00179, 0.00155, 0.00183, 0.00158, 0.00178, 0.00156, 0.00181, 0.00154, 
0.0018, 0.00154, 0.00178, 0.00159, 0.00181, 0.00157, 0.00181, 0.00155, 0.00183, 0.00159, 0.0018, 0.00155, 0.00179, 0.00158, 0.00181, 0.00159, 0.00179, 0.00153, 0.00178, 0.00157, 0.00178, 0.00156, 0.00176, 0.00156, 0.00179, 0.00157, 0.00182, 0.00152, 0.00181, 0.00152, 0.00183, 0.00157, 0.00179, 0.00159, 0.00187, 0.00159, 0.00182, 0.00156, 0.0018, 0.00161, 0.0018, 0.00157, 0.00176, 0.00159, 0.00179, 0.00157, 0.00182, 0.00158, 0.0018, 0.0016, 0.00182, 0.00159, 0.00172, 0.00157, 0.00179, 0.00154, 0.00166, 0.00158, 0.00176, 0.00159, 0.00184, 0.00156, 0.00179, 0.00157, 0.00174, 0.00157, 0.00173, 0.00157, 0.0018, 0.00159, 0.00181, 0.00156, 0.00183, 0.00157, 0.00181, 0.00158, 0.00179, 0.00157, 0.00184, 0.00158, 0.00174, 0.00163, 0.00175, 0.00158, 0.0018, 0.00152, 0.00183, 0.00158, 0.00174, 0.00159, 0.00179, 0.00155, 0.00182, 0.00157, 0.0018, 0.00159, 0.00183, 0.00156, 0.00181, 0.00158, 0.00176, 0.00158, 0.00176, 0.00156, 0.00178, 0.00158, 0.00181, 0.00153, 0.0018, 0.00155, 0.0018, 0.0016, 0.0019, 0.0016, 0.00175, 0.0016, 0.0018, 0.00153, 0.00178, 0.00158, 0.0018, 0.00156, 0.00172, 0.00159, 0.00182, 0.00157, 0.00175, 0.00157, 0.00173, 0.00156, 0.00186, 0.00158, 0.00178, 0.00158, 0.00188, 0.00159, 0.00181, 0.00153, 0.00175, 0.00155, 0.00181, 0.00156, 0.00181, 0.00177, 0.00157, 0.00162, 0.00165, 0.00173, 0.00157, 0.00173, 0.00165, 0.00167, 0.00151, 0.00172, 0.00167, 0.00174, 0.00157, 0.00168, 0.00168, 0.00174, 0.00157, 0.00175, 0.00166, 0.00174, 0.00154, 0.00174, 0.00167, 0.00171, 0.00159, 0.00174, 0.00165, 0.00173, 0.00159, 0.00174, 0.00162, 0.00175, 0.00157, 0.00174, 0.00167, 0.00172, 0.00156, 0.00174, 0.00164, 0.00175, 0.00154, 0.00161, 0.0016, 0.00174, 0.00156, 0.00179, 0.00167, 0.00167, 0.00155, 0.00175, 0.00167, 0.00173, 0.00158, 0.00176, 0.00166, 0.00173, 0.00157, 0.00173, 0.00161, 0.00176, 0.0016, 0.00168, 0.00162, 0.00174, 0.00158, 0.00174, 0.00167, 0.00174, 0.00158, 0.00168, 0.00161, 0.00175, 0.00159, 0.00173, 0.00168, 0.00175, 0.00158, 0.00174, 0.00163, 0.00176, 0.00153, 0.00175, 0.00168, 0.00168, 0.00153, 0.00172, 0.00165, 0.00175, 0.00159, 0.00174, 0.00164, 0.00176, 0.00153, 0.00171, 0.00162, 0.00173, 0.00156, 0.00174, 0.00165, 0.00168, 0.00158, 0.00174, 0.00167, 0.00176, 0.00158, 0.00175, 0.00167, 0.00174, 0.00158, 0.00168, 0.00166, 0.00173, 0.00157, 0.00176, 0.00161, 0.00173, 0.00159, 0.00178, 0.00165, 0.00174, 0.00156, 0.00167, 0.00163, 0.00165, 0.00158, 0.00173, 0.00162, 0.00176, 0.00157, 0.00173, 0.00166, 0.00173, 0.0016, 0.0018, 0.00165, 0.00172, 0.00159, 0.00168, 0.00165, 0.00175, 0.00154, 0.00171, 0.00164, 0.00169, 0.00153, 0.00175, 0.00166, 0.00175, 0.00159, 0.00176, 0.00164, 0.00172, 0.00159, 0.00169, 0.00166, 0.00173, 0.00153, 0.00167, 0.00164, 0.00172, 0.00159, 0.00167, 0.00168, 0.00175, 0.00157, 0.00173, 0.00167, 0.00172, 0.0016, 0.00173, 0.00166, 0.00175, 0.00153, 0.00174, 0.00163, 0.00172, 0.00157, 0.00167, 0.00165, 0.00171, 0.00159, 0.00175, 0.00166, 0.00166, 0.00158, 0.00166, 0.00164, 0.00167, 0.00157, 0.0017, 0.00168, 0.00169, 0.00158, 0.00176, 0.00168, 0.00172, 0.00157, 0.00173, 0.00167]}, "optimizer-copy-to-main-grad-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.00181, 0.00152, 0.00153, 0.0015, 0.00157, 0.00156, 0.00152, 0.00157, 0.00162, 0.0015, 0.00152, 0.00155, 0.00152, 0.00155, 0.00155, 0.00161, 0.00151, 0.00151, 0.00196, 0.0015, 0.00161, 0.0015, 0.00162, 0.00161, 0.00157, 0.00151, 0.0015, 0.0015, 0.00156, 0.00153, 0.00171, 0.00252, 0.00165, 0.0018, 0.00159, 0.00153, 0.00157, 0.00159, 0.00159, 0.00157, 0.00156, 0.00163, 0.00152, 
0.0015, 0.00163, 0.00153, 0.00149, 0.00156, 0.00156, 0.00152, 0.00157, 0.00152, 0.0016, 0.00159, 0.00155, 0.00157, 0.00157, 0.00156, 0.00151, 0.00156, 0.00152, 0.00151, 0.00157, 0.00157, 0.00163, 0.00153, 0.00158, 0.00155, 0.00149, 0.00161, 0.0015, 0.00156, 0.00151, 0.00162, 0.00158, 0.00148, 0.00156, 0.0015, 0.00157, 0.00151, 0.00155, 0.00155, 0.00161, 0.0027, 0.00157, 0.00156, 0.00156, 0.00151, 0.00156, 0.00149, 0.00158, 0.0015, 0.00152, 0.00156, 0.00155, 0.0024, 0.00156, 0.0016, 0.00156, 0.0015, 0.0016, 0.00155, 0.00151, 0.00154, 0.00158, 0.0015, 0.0015, 0.00155, 0.00156, 0.00155, 0.00157, 0.0015, 0.0015, 0.00155, 0.00157, 0.00155, 0.00157, 0.0015, 0.00157, 0.00155, 0.00155, 0.0015, 0.00164, 0.0016, 0.00151, 0.0015, 0.00165, 0.00151, 0.00157, 0.00157, 0.00158, 0.00154, 0.00157, 0.0016, 0.0016, 0.00149, 0.00154, 0.00156, 0.00333, 0.00159, 0.00153, 0.00149, 0.00149, 0.00166, 0.00165, 0.00158, 0.00149, 0.00155, 0.00152, 0.00155, 0.00156, 0.00152, 0.00155, 0.00156, 0.00164, 0.00155, 0.00156, 0.00152, 0.00166, 0.00153, 0.0015, 0.0015, 0.00155, 0.00156, 0.00158, 0.00149, 0.00165, 0.00155, 0.0015, 0.0015, 0.0015, 0.00154, 0.00155, 0.00165, 0.00156, 0.00155, 0.0015, 0.00148, 0.00154, 0.00156, 0.00156, 0.0015, 0.00148, 0.00157, 0.00152, 0.0015, 0.00149, 0.00157, 0.00149, 0.00149, 0.0015, 0.0028, 0.0015, 0.00151, 0.00157, 0.00155, 0.00148, 0.0015, 0.00169, 0.00149, 0.0015, 0.00159, 0.00155, 0.00149, 0.0015, 0.00148, 0.00149, 0.00154, 0.00155, 0.00149, 0.00147, 0.00149, 0.00156, 0.00148, 0.00146, 0.00151, 0.00152, 0.00147, 0.00147, 0.00147, 0.00155, 0.00147, 0.00148, 0.00144, 0.0015, 0.0015, 0.00159, 0.00156, 0.00149, 0.00151, 0.0016, 0.00149, 0.0015, 0.00154, 0.0015, 0.00147, 0.00147, 0.00154, 0.00156, 0.00153, 0.0015, 0.0015, 0.002, 0.00151, 0.00246, 0.0015, 0.00147, 0.00144, 0.00148, 0.00171, 0.00148, 0.0015, 0.00157, 0.00174, 0.00156, 0.00157, 0.00148, 0.00147, 0.00149, 0.00148, 0.0015, 0.00148, 0.00151, 0.00158, 0.00149, 0.00147, 0.00153, 0.00151, 0.00154, 0.00148, 0.00157, 0.00157, 0.00148, 0.0016, 0.00153, 0.00155, 0.00156, 0.00157, 0.00149, 0.00154, 0.00148, 0.00151, 0.00149, 0.00155, 0.00148, 0.00155, 0.00155, 0.0015, 0.00149, 0.0015, 0.00149, 0.00153, 0.00164, 0.0016, 0.0015, 0.00153, 0.00149, 0.00158, 0.00154, 0.00149, 0.00154, 0.00165, 0.00151, 0.00148, 0.00158, 0.00157, 0.00158, 0.0015, 0.00149, 0.00154, 0.00152, 0.00155, 0.00158, 0.00149, 0.00157, 0.0015, 0.00158, 0.00163, 0.00159, 0.00158, 0.00159, 0.00157, 0.00157, 0.0015, 0.00151, 0.00151, 0.00154, 0.00154, 0.00159, 0.00155, 0.00155, 0.00148, 0.00198, 0.00154, 0.00149, 0.00156, 0.00151, 0.00157, 0.00149, 0.00148, 0.00151, 0.00154, 0.00153, 0.00148, 0.00151, 0.00149, 0.0015, 0.00155, 0.00155, 0.00151, 0.00156, 0.00154, 0.0015, 0.0015, 0.00151, 0.00157, 0.00156, 0.00158, 0.0015, 0.00155, 0.00148, 0.00153, 0.00151, 0.0015, 0.0015, 0.00152, 0.00151, 0.00156, 0.00158, 0.00151, 0.0015, 0.00149, 0.00156, 0.00156, 0.00157, 0.0015, 0.00148, 0.00158, 0.00158, 0.00156, 0.00155, 0.00154, 0.00165, 0.00162, 0.00157, 0.00166, 0.0015, 0.00156, 0.00155, 0.00152, 0.00152, 0.00154, 0.0015, 0.00153, 0.0016, 0.0015, 0.00151, 0.00152, 0.00155, 0.00155]}, "optimizer-unscale-and-check-inf-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.60633, 0.00085, 0.00071, 0.0006, 0.00062, 0.0006, 0.00062, 0.00062, 0.00063, 0.00059, 0.00063, 0.00062, 0.00063, 0.00063, 0.00063, 0.00068, 0.00062, 0.00063, 0.00065, 0.00064, 0.00064, 0.0006, 0.00063, 0.00064, 0.00063, 0.00061, 0.00062, 0.00062, 0.00063, 0.00061, 0.0007, 0.00092, 0.00063, 
0.00071, 0.00063, 0.00069, 0.00063, 0.00062, 0.00063, 0.00063, 0.00064, 0.0006, 0.00061, 0.00064, 0.00062, 0.00063, 0.00061, 0.00065, 0.00062, 0.00062, 0.0006, 0.00062, 0.00067, 0.00061, 0.00062, 0.00062, 0.00061, 0.00063, 0.00061, 0.00061, 0.0006, 0.00062, 0.00061, 0.00062, 0.00062, 0.00062, 0.00064, 0.00061, 0.00062, 0.00063, 0.00061, 0.00062, 0.00061, 0.00065, 0.00063, 0.0006, 0.0006, 0.0006, 0.00064, 0.00063, 0.00064, 0.0006, 0.00061, 0.00077, 0.00062, 0.00062, 0.00062, 0.00061, 0.00061, 0.00064, 0.00062, 0.0006, 0.00062, 0.00062, 0.00059, 0.00067, 0.00061, 0.00065, 0.0006, 0.00061, 0.00063, 0.00062, 0.00063, 0.00063, 0.00062, 0.0006, 0.00061, 0.00062, 0.00062, 0.0006, 0.00063, 0.00061, 0.0006, 0.0006, 0.00059, 0.00061, 0.0006, 0.00063, 0.00062, 0.00062, 0.00062, 0.00059, 0.00063, 0.0006, 0.00062, 0.00062, 0.00062, 0.00059, 0.00062, 0.00063, 0.0006, 0.00061, 0.0006, 0.00067, 0.00069, 0.00061, 0.00061, 0.00063, 0.00074, 0.0006, 0.00061, 0.00061, 0.00061, 0.00066, 0.00071, 0.00062, 0.00061, 0.0006, 0.00061, 0.00063, 0.0006, 0.00063, 0.00062, 0.00063, 0.00061, 0.00063, 0.00063, 0.00063, 0.00064, 0.00063, 0.00065, 0.00064, 0.00062, 0.00061, 0.00063, 0.00061, 0.00062, 0.00061, 0.00062, 0.00062, 0.00061, 0.00063, 0.00063, 0.00064, 0.00063, 0.00063, 0.00062, 0.00063, 0.00061, 0.00064, 0.00067, 0.0006, 0.00061, 0.00062, 0.00071, 0.00062, 0.00059, 0.00063, 0.00062, 0.0006, 0.00061, 0.00065, 0.00061, 0.00062, 0.00063, 0.00063, 0.00062, 0.00061, 0.00065, 0.00061, 0.00059, 0.0006, 0.00062, 0.0006, 0.00063, 0.00063, 0.0006, 0.00061, 0.00059, 0.00062, 0.00062, 0.0006, 0.00064, 0.00058, 0.00059, 0.00063, 0.00059, 0.0006, 0.00059, 0.00061, 0.00063, 0.00063, 0.0006, 0.0006, 0.00062, 0.0006, 0.00061, 0.00062, 0.00059, 0.00063, 0.0006, 0.00063, 0.0006, 0.00063, 0.00061, 0.00076, 0.0006, 0.00059, 0.00059, 0.00059, 0.0006, 0.00063, 0.00067, 0.00062, 0.00096, 0.00064, 0.00063, 0.00065, 0.00059, 0.00066, 0.00059, 0.0006, 0.00063, 0.00062, 0.00061, 0.00063, 0.00062, 0.00063, 0.00063, 0.00063, 0.0006, 0.00064, 0.00062, 0.00067, 0.00059, 0.00061, 0.00062, 0.00061, 0.00062, 0.0006, 0.0006, 0.00063, 0.00062, 0.00066, 0.00063, 0.00062, 0.00061, 0.00062, 0.00063, 0.00065, 0.00063, 0.00062, 0.00064, 0.00064, 0.00062, 0.00061, 0.00062, 0.00065, 0.00062, 0.00062, 0.00059, 0.00063, 0.00064, 0.0006, 0.00063, 0.00063, 0.00062, 0.00064, 0.00061, 0.00063, 0.00061, 0.0006, 0.00063, 0.00064, 0.00067, 0.00066, 0.00063, 0.00062, 0.00061, 0.00063, 0.00061, 0.00063, 0.00062, 0.00062, 0.00063, 0.00064, 0.00063, 0.00061, 0.00063, 0.00062, 0.00066, 0.00062, 0.00062, 0.00062, 0.00062, 0.00063, 0.00066, 0.00062, 0.00067, 0.00068, 0.00094, 0.00061, 0.00091, 0.00064, 0.00062, 0.00061, 0.00062, 0.00062, 0.00061, 0.00062, 0.00061, 0.00063, 0.00059, 0.00062, 0.00062, 0.00062, 0.00062, 0.00062, 0.00062, 0.00059, 0.00066, 0.00062, 0.00062, 0.0006, 0.00062, 0.00061, 0.00063, 0.00062, 0.00062, 0.00062, 0.00059, 0.0006, 0.00061, 0.0006, 0.00062, 0.00063, 0.00063, 0.00061, 0.00063, 0.00064, 0.00061, 0.00062, 0.00062, 0.00062, 0.00093, 0.00063, 0.00063, 0.00063, 0.00062, 0.00059, 0.00061, 0.00062, 0.00062, 0.00064, 0.00062, 0.00064, 0.00063, 0.00064, 0.00064, 0.00063, 0.00062, 0.00063, 0.00062, 0.00062, 0.00066, 0.00064, 0.00074, 0.00063, 0.00063, 0.00062]}, "optimizer-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.60837, 0.00254, 0.00241, 0.00228, 0.01048, 0.01037, 0.01037, 0.01043, 0.01058, 0.01048, 0.01043, 0.01043, 0.01041, 0.0104, 0.01041, 0.01065, 0.01035, 0.01034, 0.01163, 0.01037, 0.01065, 0.01028, 0.01071, 
0.01072, 0.01046, 0.0103, 0.01034, 0.01036, 0.01049, 0.01035, 0.01149, 0.01326, 0.01057, 0.0123, 0.01043, 0.0108, 0.01045, 0.01043, 0.01054, 0.01044, 0.01042, 0.01047, 0.01038, 0.01036, 0.01051, 0.01045, 0.01031, 0.01066, 0.01039, 0.01038, 0.01045, 0.01039, 0.01082, 0.01041, 0.01037, 0.01039, 0.0104, 0.01052, 0.01036, 0.01042, 0.01043, 0.01041, 0.01041, 0.01038, 0.01048, 0.01055, 0.01067, 0.01037, 0.01034, 0.01046, 0.01031, 0.01091, 0.01032, 0.01102, 0.0105, 0.01027, 0.01037, 0.01029, 0.01047, 0.0104, 0.01046, 0.01038, 0.01047, 0.01178, 0.0104, 0.01074, 0.01048, 0.01035, 0.01038, 0.01049, 0.01045, 0.01029, 0.0104, 0.01038, 0.01035, 0.01254, 0.01037, 0.01078, 0.01036, 0.01033, 0.01045, 0.01036, 0.01034, 0.01037, 0.01041, 0.01036, 0.01033, 0.01079, 0.01038, 0.01041, 0.01023, 0.01009, 0.01031, 0.01035, 0.01038, 0.01037, 0.01044, 0.01035, 0.01041, 0.01038, 0.01021, 0.0103, 0.01049, 0.01051, 0.01036, 0.01032, 0.01054, 0.01033, 0.01041, 0.01043, 0.01041, 0.01037, 0.01014, 0.01109, 0.01092, 0.01032, 0.01033, 0.01042, 0.02222, 0.01043, 0.01036, 0.01031, 0.01034, 0.01109, 0.01102, 0.01041, 0.01027, 0.01035, 0.0103, 0.01041, 0.01036, 0.01039, 0.01035, 0.01041, 0.01048, 0.01069, 0.01042, 0.01035, 0.01064, 0.01041, 0.01045, 0.01034, 0.01039, 0.01039, 0.01043, 0.01033, 0.01133, 0.01034, 0.01033, 0.01034, 0.01031, 0.01035, 0.0104, 0.01052, 0.01043, 0.01047, 0.01036, 0.01029, 0.01035, 0.01042, 0.01057, 0.0103, 0.0103, 0.01039, 0.0109, 0.0103, 0.0103, 0.0105, 0.01036, 0.01034, 0.01033, 0.01214, 0.01032, 0.0103, 0.01039, 0.01085, 0.01031, 0.01031, 0.01064, 0.01141, 0.01028, 0.01048, 0.01035, 0.01021, 0.01033, 0.01032, 0.01023, 0.01127, 0.01075, 0.01024, 0.01023, 0.01023, 0.01033, 0.01036, 0.01017, 0.01034, 0.01026, 0.01036, 0.01019, 0.01026, 0.01033, 0.01163, 0.0102, 0.01023, 0.01031, 0.01033, 0.01042, 0.01049, 0.01036, 0.01032, 0.01053, 0.01033, 0.01034, 0.01037, 0.01037, 0.01078, 0.01026, 0.01052, 0.01028, 0.01028, 0.01025, 0.01028, 0.01147, 0.01035, 0.01173, 0.01035, 0.01038, 0.01027, 0.01027, 0.01065, 0.01023, 0.01027, 0.01043, 0.01054, 0.01038, 0.01054, 0.01028, 0.01026, 0.0103, 0.01038, 0.0104, 0.0103, 0.0104, 0.01114, 0.01027, 0.01028, 0.01042, 0.01027, 0.01037, 0.01028, 0.01061, 0.01066, 0.01034, 0.0108, 0.01035, 0.01037, 0.01038, 0.01034, 0.01138, 0.01141, 0.01027, 0.01041, 0.01039, 0.01039, 0.01031, 0.01042, 0.01036, 0.01077, 0.01045, 0.01035, 0.0105, 0.01039, 0.01057, 0.01041, 0.01033, 0.01039, 0.01029, 0.0106, 0.01032, 0.01029, 0.01034, 0.01044, 0.01035, 0.01034, 0.0111, 0.01066, 0.01041, 0.0103, 0.01025, 0.01038, 0.01037, 0.01064, 0.0105, 0.0103, 0.01048, 0.01051, 0.01052, 0.01041, 0.0104, 0.01041, 0.01044, 0.01036, 0.01043, 0.01038, 0.01034, 0.01033, 0.01126, 0.01037, 0.01044, 0.01078, 0.01116, 0.01162, 0.01139, 0.01058, 0.0105, 0.01061, 0.01053, 0.01057, 0.01058, 0.01058, 0.01057, 0.0106, 0.01051, 0.01054, 0.01067, 0.0109, 0.01057, 0.01057, 0.01057, 0.01051, 0.01063, 0.01186, 0.0105, 0.01054, 0.01053, 0.01061, 0.01062, 0.01089, 0.01057, 0.0106, 0.01047, 0.01071, 0.0105, 0.01049, 0.01052, 0.01054, 0.01057, 0.0106, 0.01078, 0.01062, 0.01067, 0.01052, 0.01059, 0.01061, 0.01212, 0.01052, 0.01054, 0.01063, 0.0106, 0.01057, 0.01098, 0.01059, 0.01077, 0.01074, 0.01076, 0.01115, 0.01053, 0.01121, 0.01063, 0.01056, 0.01057, 0.01061, 0.01059, 0.01061, 0.01076, 0.01059, 0.01075, 0.01057, 0.01058, 0.01057]}, "learning-rate": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 
0.0, 0.0, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05]}, "learning-rate vs samples": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 
3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05]}, "batch-size": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 
128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0]}, "batch-size vs samples": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 
128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0]}, "lm loss": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [10.89393, 10.90229, 10.90382, 10.89922, 10.90215, 10.87439, 10.80338, 10.63346, 10.44036, 10.2933, 10.02711, 10.16747, 10.13781, 9.86192, 9.97684, 9.67806, 9.59835, 9.78149, 9.50324, 9.44529, 9.35262, 9.25422, 9.27971, 9.09386, 9.28651, 9.15722, 9.24673, 9.26197, 9.39815, 9.08902, 9.03506, 9.14524, 9.15344, 8.76086, 8.82546, 8.85801, 8.78594, 8.83766, 8.7627, 8.8693, 8.76505, 8.95513, 8.94138, 8.60415, 8.49526, 8.5414, 8.6052, 8.49378, 8.54563, 8.69589, 8.47931, 8.31047, 8.34191, 8.33761, 8.38482, 8.03117, 8.21698, 8.01005, 8.36597, 8.35171, 8.1238, 8.08903, 8.03892, 7.85884, 7.86204, 7.76178, 7.63785, 8.03256, 7.82491, 7.57767, 7.87018, 7.89663, 7.66576, 7.41891, 7.57945, 7.45949, 7.58407, 7.3365, 7.75478, 7.39312, 7.46005, 7.32601, 7.32261, 7.53324, 7.28432, 7.3906, 7.10455, 7.1031, 7.135, 7.2333, 6.91495, 7.07308, 7.17321, 7.08148, 6.95568, 6.83552, 7.07146, 7.13597, 6.77633, 6.6537, 6.79923, 6.81094, 6.80156, 6.80623, 6.72479, 6.46997, 6.7029, 6.67891, 6.50414, 6.69017, 6.80201, 6.66742, 6.78223, 6.74908, 6.68039, 6.55851, 6.65127, 6.45882, 6.71595, 6.3003, 6.29947, 6.35127, 6.43626, 6.39728, 6.5005, 6.33652, 6.38489, 6.2805, 6.24364, 6.44007, 6.36837, 6.36408, 6.20465, 6.19665, 6.27951, 6.42484, 6.24039, 6.18602, 6.21368, 6.14857, 6.09651, 6.10359, 6.28963, 6.44182, 6.28988, 6.33247, 6.13546, 6.21108, 6.0349, 6.06273, 5.987, 6.28025, 6.22641, 5.99808, 5.81837, 6.16027, 5.88364, 6.139, 5.82189, 6.19536, 6.17777, 6.11785, 5.96408, 6.14649, 5.9753, 6.22609, 5.92665, 5.82529, 5.80636, 5.7182, 6.04353, 6.02584, 6.092, 5.9119, 6.06757, 5.99273, 6.02669, 6.01523, 5.97662, 5.86429, 5.97653, 5.6431, 5.7275, 5.9135, 5.8664, 5.88797, 5.78842, 5.86055, 5.75215, 5.58542, 5.74699, 5.6532, 5.85871, 5.63063, 5.7325, 5.73883, 5.92312, 5.66992, 5.87123, 5.76346, 5.89613, 5.35339, 5.91985, 5.89554, 5.87623, 5.43362, 5.42829, 5.64744, 5.61678, 5.5103, 5.59917, 5.6988, 5.49854, 5.77013, 5.53314, 5.61954, 5.64553, 5.64008, 5.53513, 5.63528, 5.69717, 5.71522, 5.60874, 5.6802, 5.39435, 5.70021, 5.64782, 5.44435, 5.60824, 5.65007, 5.57098, 5.36362, 5.55798, 5.50433, 5.50082, 5.39457, 5.57452, 5.62082, 5.40855, 5.54177, 5.50319, 5.34993, 5.52256, 5.42475, 5.457, 5.33418, 5.08125, 5.49351, 5.58285, 5.72877, 5.42977, 5.613, 5.64847, 5.2484, 5.28756, 5.41008, 5.40961, 5.34061, 5.51276, 5.19903, 5.31256, 5.26266, 5.3907, 5.27539, 5.46188, 5.55243, 5.32608, 5.4523, 5.34935, 5.085, 5.3281, 5.26395, 5.31744, 5.12555, 5.28677, 5.2827, 5.486, 5.17172, 5.28031, 5.22155, 5.37027, 4.99359, 4.92973, 5.33403, 5.3997, 5.23719, 5.33061, 5.11473, 5.1717, 5.27268, 5.07733, 5.2767, 5.0858, 5.35129, 5.2583, 5.16657, 5.25468, 5.05243, 5.32453, 5.06278, 5.03705, 5.15134, 5.12068, 5.28265, 5.15883, 5.28883, 5.10618, 5.10727, 5.2621, 5.33107, 5.26622, 5.20237, 5.15543, 5.29779, 4.95636, 5.21799, 5.10164, 5.30924, 5.18679, 5.19599, 5.12317, 4.99367, 5.00306, 
5.23171, 5.32198, 5.10695, 5.0647, 4.92646, 5.13309, 5.12718, 4.93681, 5.34691, 5.03142, 5.11047, 5.16889, 5.01087, 5.07032, 5.07588, 5.00122, 5.08773, 5.16951, 4.98692, 5.18998, 4.93899, 4.92741, 5.07395, 5.00085, 4.91692, 4.78186, 4.94917, 5.12365, 5.02541, 5.02437, 5.33759, 4.96582, 5.00145, 5.05138, 4.81301, 4.74456, 5.00203, 5.04679, 4.88367, 4.95882, 5.05212, 5.03024, 4.82289, 4.89705, 4.91162, 4.83722, 4.75468, 5.01694, 4.75625, 5.21634, 4.78922, 4.99899, 4.74083, 4.79117, 4.82499, 4.65555, 4.66118, 4.84502, 4.812, 4.80818, 4.93087, 4.88819, 4.92996, 4.77146, 4.88927, 4.73848, 4.91779, 4.96467, 4.87947, 4.7104, 4.78793, 4.90438, 4.71479, 4.86815, 4.69617, 4.69095, 4.65249]}, "lm loss vs samples": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [10.89393, 10.90229, 10.90382, 10.89922, 10.90215, 10.87439, 10.80338, 10.63346, 10.44036, 10.2933, 10.02711, 10.16747, 10.13781, 9.86192, 9.97684, 9.67806, 9.59835, 9.78149, 9.50324, 9.44529, 9.35262, 9.25422, 9.27971, 9.09386, 9.28651, 9.15722, 9.24673, 9.26197, 9.39815, 9.08902, 9.03506, 9.14524, 9.15344, 8.76086, 8.82546, 8.85801, 8.78594, 8.83766, 8.7627, 8.8693, 8.76505, 8.95513, 8.94138, 8.60415, 8.49526, 8.5414, 8.6052, 8.49378, 8.54563, 8.69589, 8.47931, 8.31047, 8.34191, 8.33761, 8.38482, 8.03117, 8.21698, 8.01005, 8.36597, 8.35171, 8.1238, 8.08903, 8.03892, 7.85884, 7.86204, 7.76178, 7.63785, 8.03256, 7.82491, 7.57767, 7.87018, 7.89663, 7.66576, 7.41891, 7.57945, 7.45949, 7.58407, 7.3365, 7.75478, 7.39312, 7.46005, 7.32601, 7.32261, 7.53324, 7.28432, 7.3906, 7.10455, 7.1031, 7.135, 7.2333, 6.91495, 7.07308, 7.17321, 7.08148, 6.95568, 6.83552, 7.07146, 7.13597, 6.77633, 6.6537, 6.79923, 6.81094, 6.80156, 6.80623, 6.72479, 6.46997, 6.7029, 6.67891, 6.50414, 6.69017, 6.80201, 6.66742, 6.78223, 6.74908, 6.68039, 6.55851, 6.65127, 6.45882, 6.71595, 6.3003, 6.29947, 6.35127, 6.43626, 6.39728, 6.5005, 6.33652, 6.38489, 6.2805, 6.24364, 6.44007, 6.36837, 6.36408, 6.20465, 6.19665, 6.27951, 6.42484, 6.24039, 6.18602, 6.21368, 6.14857, 6.09651, 6.10359, 6.28963, 6.44182, 6.28988, 6.33247, 6.13546, 6.21108, 6.0349, 6.06273, 5.987, 6.28025, 6.22641, 5.99808, 5.81837, 6.16027, 5.88364, 6.139, 5.82189, 6.19536, 6.17777, 6.11785, 5.96408, 6.14649, 5.9753, 6.22609, 5.92665, 5.82529, 5.80636, 5.7182, 6.04353, 6.02584, 6.092, 5.9119, 6.06757, 5.99273, 6.02669, 6.01523, 5.97662, 5.86429, 5.97653, 5.6431, 5.7275, 5.9135, 5.8664, 5.88797, 5.78842, 5.86055, 5.75215, 5.58542, 5.74699, 5.6532, 5.85871, 5.63063, 5.7325, 5.73883, 5.92312, 5.66992, 5.87123, 5.76346, 5.89613, 5.35339, 5.91985, 5.89554, 5.87623, 5.43362, 5.42829, 5.64744, 5.61678, 5.5103, 5.59917, 5.6988, 5.49854, 5.77013, 5.53314, 5.61954, 5.64553, 5.64008, 5.53513, 5.63528, 5.69717, 5.71522, 5.60874, 5.6802, 5.39435, 5.70021, 5.64782, 5.44435, 5.60824, 5.65007, 5.57098, 5.36362, 5.55798, 5.50433, 5.50082, 5.39457, 5.57452, 5.62082, 5.40855, 5.54177, 5.50319, 5.34993, 5.52256, 5.42475, 5.457, 5.33418, 5.08125, 5.49351, 5.58285, 5.72877, 5.42977, 5.613, 5.64847, 5.2484, 5.28756, 5.41008, 5.40961, 5.34061, 5.51276, 5.19903, 5.31256, 5.26266, 5.3907, 5.27539, 5.46188, 5.55243, 5.32608, 5.4523, 5.34935, 5.085, 5.3281, 5.26395, 5.31744, 5.12555, 5.28677, 5.2827, 5.486, 5.17172, 5.28031, 5.22155, 5.37027, 4.99359, 4.92973, 5.33403, 5.3997, 5.23719, 5.33061, 5.11473, 5.1717, 5.27268, 5.07733, 5.2767, 5.0858, 5.35129, 5.2583, 5.16657, 5.25468, 5.05243, 5.32453, 5.06278, 5.03705, 5.15134, 5.12068, 5.28265, 5.15883, 5.28883, 5.10618, 5.10727, 5.2621, 5.33107, 5.26622, 5.20237, 5.15543, 
5.29779, 4.95636, 5.21799, 5.10164, 5.30924, 5.18679, 5.19599, 5.12317, 4.99367, 5.00306, 5.23171, 5.32198, 5.10695, 5.0647, 4.92646, 5.13309, 5.12718, 4.93681, 5.34691, 5.03142, 5.11047, 5.16889, 5.01087, 5.07032, 5.07588, 5.00122, 5.08773, 5.16951, 4.98692, 5.18998, 4.93899, 4.92741, 5.07395, 5.00085, 4.91692, 4.78186, 4.94917, 5.12365, 5.02541, 5.02437, 5.33759, 4.96582, 5.00145, 5.05138, 4.81301, 4.74456, 5.00203, 5.04679, 4.88367, 4.95882, 5.05212, 5.03024, 4.82289, 4.89705, 4.91162, 4.83722, 4.75468, 5.01694, 4.75625, 5.21634, 4.78922, 4.99899, 4.74083, 4.79117, 4.82499, 4.65555, 4.66118, 4.84502, 4.812, 4.80818, 4.93087, 4.88819, 4.92996, 4.77146, 4.88927, 4.73848, 4.91779, 4.96467, 4.87947, 4.7104, 4.78793, 4.90438, 4.71479, 4.86815, 4.69617, 4.69095, 4.65249]}, "loss-scale": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [4294967296.0, 134217728.0, 4194304.0, 131072.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 
131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0]}, "loss-scale vs samples": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [4294967296.0, 134217728.0, 4194304.0, 131072.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 65536.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 
131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0, 131072.0]}, "params-norm": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [179.95639, 179.95639, 179.95639, 179.95639, 179.95639, 179.95639, 179.95636, 179.95616, 179.95595, 179.9552, 179.95465, 179.95432, 179.95352, 179.953, 179.95229, 179.95172, 179.95114, 179.95059, 179.95015, 179.94978, 179.94951, 179.94933, 179.94916, 179.94899, 179.94891, 179.94894, 179.94923, 179.95026, 179.95171, 179.9529, 179.95413, 179.95543, 179.95691, 179.95865, 179.96053, 179.96269, 179.96513, 179.96796, 179.97112, 179.97466, 179.97838, 179.98239, 179.98705, 179.9922, 179.99811, 180.00458, 180.01144, 180.0188, 180.0265, 180.0349, 180.04382, 180.05347, 180.06361, 180.07454, 180.0863, 180.09869, 180.1114, 180.12436, 180.13821, 180.15294, 180.16814, 180.18376, 180.20035, 180.21758, 180.23528, 180.25388, 180.27333, 180.2935, 180.31477, 180.33707, 180.36023, 180.38481, 180.4104, 180.43663, 180.46335, 180.49043, 180.51775, 180.54597, 180.57475, 180.60458, 180.63466, 180.66501, 180.69615, 180.72832, 180.76106, 180.79457, 180.82857, 180.86211, 180.89636, 180.93251, 180.97021, 181.00865, 181.04654, 181.08444, 181.12204, 181.1591, 181.19463, 181.22873, 181.26352, 181.29965, 181.33498, 181.36926, 181.40433, 181.44101, 181.47787, 181.51541, 181.55309, 181.58995, 181.62593, 181.66238, 181.69963, 181.73865, 181.77856, 181.819, 181.85893, 181.89955, 181.94034, 181.98015, 182.01802, 182.05594, 182.09499, 182.13466, 182.17516, 182.21599, 182.25551, 182.29494, 182.33302, 182.36942, 182.40552, 182.44077, 182.47746, 182.51506, 182.55521, 182.59557, 182.63631, 182.67693, 182.71771, 182.75752, 182.79524, 182.83229, 182.8694, 182.90648, 182.94411, 182.98082, 183.01617, 183.05077, 183.08421, 183.11528, 183.14688, 183.17844, 183.21207, 183.24745, 183.28352, 183.31885, 183.35526, 183.39171, 183.42731, 183.46333, 183.49973, 183.53497, 183.57001, 183.60588, 183.64211, 183.6795, 183.71835, 183.75874, 183.79941, 183.83905, 183.87886, 183.91798, 183.95557, 183.99252, 184.02957, 184.06734, 184.1066, 184.14734, 184.18813, 184.22699, 184.26306, 184.29767, 184.33336, 184.36948, 184.40587, 
184.44305, 184.48088, 184.51953, 184.55611, 184.58971, 184.62381, 184.65984, 184.6958, 184.73257, 184.76843, 184.80443, 184.84024, 184.87787, 184.91624, 184.9561, 184.99586, 185.03816, 185.08003, 185.12041, 185.16002, 185.19998, 185.23941, 185.27916, 185.31915, 185.35942, 185.3989, 185.43639, 185.4734, 185.51125, 185.54845, 185.5865, 185.62511, 185.66444, 185.70372, 185.74438, 185.78564, 185.82716, 185.86717, 185.90334, 185.937, 185.97195, 186.00873, 186.04741, 186.0872, 186.12794, 186.16808, 186.20654, 186.24687, 186.28903, 186.3307, 186.3723, 186.4149, 186.45834, 186.50229, 186.54523, 186.58723, 186.62804, 186.66795, 186.70871, 186.75044, 186.79398, 186.83716, 186.88002, 186.92215, 186.96371, 187.00597, 187.04924, 187.09216, 187.13554, 187.17883, 187.22208, 187.26509, 187.30769, 187.34932, 187.39163, 187.43529, 187.47867, 187.52255, 187.5659, 187.6091, 187.65163, 187.6926, 187.7334, 187.77498, 187.81706, 187.85999, 187.90363, 187.94743, 187.99174, 188.03735, 188.08296, 188.12976, 188.17722, 188.22394, 188.27153, 188.31853, 188.3636, 188.40756, 188.45032, 188.49333, 188.53738, 188.58321, 188.62881, 188.67557, 188.722, 188.76859, 188.81543, 188.86082, 188.90515, 188.94725, 188.9901, 189.0343, 189.07765, 189.12099, 189.16522, 189.21011, 189.25642, 189.3047, 189.35202, 189.39963, 189.4478, 189.49484, 189.5425, 189.59079, 189.63968, 189.68971, 189.74034, 189.79134, 189.84206, 189.89209, 189.9409, 189.99072, 190.04274, 190.09349, 190.14539, 190.19702, 190.24873, 190.30104, 190.35287, 190.4046, 190.45503, 190.50591, 190.55637, 190.60674, 190.65721, 190.70746, 190.75826, 190.80876, 190.8571, 190.90599, 190.95639, 191.00842, 191.06157, 191.11446, 191.16747, 191.22209, 191.2751, 191.32907, 191.38292, 191.43571, 191.48894, 191.54314, 191.59731, 191.65059, 191.70349, 191.75555, 191.80978, 191.86417, 191.91782, 191.97105, 192.02429, 192.0775, 192.13084, 192.18512, 192.24028, 192.29506, 192.35016, 192.40651, 192.4622, 192.51747, 192.57224, 192.62952, 192.687, 192.74483, 192.80281, 192.86006, 192.91705, 192.97177, 193.02679, 193.08273, 193.13742, 193.1917, 193.24458, 193.29779, 193.35132, 193.40689, 193.46413, 193.52164, 193.57927, 193.63789, 193.69646, 193.75464, 193.81409, 193.87488, 193.93707, 193.99841, 194.05937, 194.11984, 194.17958, 194.23772, 194.29633, 194.35521, 194.41174, 194.46733, 194.52335, 194.58064, 194.6398]}, "params-norm vs samples": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [179.95639, 179.95639, 179.95639, 179.95639, 179.95639, 179.95639, 179.95636, 179.95616, 179.95595, 179.9552, 179.95465, 179.95432, 179.95352, 179.953, 179.95229, 179.95172, 179.95114, 179.95059, 179.95015, 179.94978, 179.94951, 179.94933, 179.94916, 179.94899, 179.94891, 179.94894, 179.94923, 179.95026, 179.95171, 179.9529, 179.95413, 179.95543, 179.95691, 179.95865, 179.96053, 179.96269, 179.96513, 179.96796, 179.97112, 179.97466, 179.97838, 179.98239, 179.98705, 179.9922, 179.99811, 180.00458, 180.01144, 180.0188, 180.0265, 180.0349, 180.04382, 180.05347, 180.06361, 180.07454, 180.0863, 180.09869, 180.1114, 180.12436, 180.13821, 180.15294, 180.16814, 180.18376, 180.20035, 180.21758, 180.23528, 180.25388, 180.27333, 180.2935, 180.31477, 180.33707, 180.36023, 180.38481, 180.4104, 180.43663, 180.46335, 180.49043, 180.51775, 180.54597, 180.57475, 180.60458, 180.63466, 180.66501, 180.69615, 180.72832, 180.76106, 180.79457, 180.82857, 180.86211, 180.89636, 180.93251, 180.97021, 181.00865, 181.04654, 181.08444, 181.12204, 181.1591, 181.19463, 181.22873, 181.26352, 181.29965, 181.33498, 
181.36926, 181.40433, 181.44101, 181.47787, 181.51541, 181.55309, 181.58995, 181.62593, 181.66238, 181.69963, 181.73865, 181.77856, 181.819, 181.85893, 181.89955, 181.94034, 181.98015, 182.01802, 182.05594, 182.09499, 182.13466, 182.17516, 182.21599, 182.25551, 182.29494, 182.33302, 182.36942, 182.40552, 182.44077, 182.47746, 182.51506, 182.55521, 182.59557, 182.63631, 182.67693, 182.71771, 182.75752, 182.79524, 182.83229, 182.8694, 182.90648, 182.94411, 182.98082, 183.01617, 183.05077, 183.08421, 183.11528, 183.14688, 183.17844, 183.21207, 183.24745, 183.28352, 183.31885, 183.35526, 183.39171, 183.42731, 183.46333, 183.49973, 183.53497, 183.57001, 183.60588, 183.64211, 183.6795, 183.71835, 183.75874, 183.79941, 183.83905, 183.87886, 183.91798, 183.95557, 183.99252, 184.02957, 184.06734, 184.1066, 184.14734, 184.18813, 184.22699, 184.26306, 184.29767, 184.33336, 184.36948, 184.40587, 184.44305, 184.48088, 184.51953, 184.55611, 184.58971, 184.62381, 184.65984, 184.6958, 184.73257, 184.76843, 184.80443, 184.84024, 184.87787, 184.91624, 184.9561, 184.99586, 185.03816, 185.08003, 185.12041, 185.16002, 185.19998, 185.23941, 185.27916, 185.31915, 185.35942, 185.3989, 185.43639, 185.4734, 185.51125, 185.54845, 185.5865, 185.62511, 185.66444, 185.70372, 185.74438, 185.78564, 185.82716, 185.86717, 185.90334, 185.937, 185.97195, 186.00873, 186.04741, 186.0872, 186.12794, 186.16808, 186.20654, 186.24687, 186.28903, 186.3307, 186.3723, 186.4149, 186.45834, 186.50229, 186.54523, 186.58723, 186.62804, 186.66795, 186.70871, 186.75044, 186.79398, 186.83716, 186.88002, 186.92215, 186.96371, 187.00597, 187.04924, 187.09216, 187.13554, 187.17883, 187.22208, 187.26509, 187.30769, 187.34932, 187.39163, 187.43529, 187.47867, 187.52255, 187.5659, 187.6091, 187.65163, 187.6926, 187.7334, 187.77498, 187.81706, 187.85999, 187.90363, 187.94743, 187.99174, 188.03735, 188.08296, 188.12976, 188.17722, 188.22394, 188.27153, 188.31853, 188.3636, 188.40756, 188.45032, 188.49333, 188.53738, 188.58321, 188.62881, 188.67557, 188.722, 188.76859, 188.81543, 188.86082, 188.90515, 188.94725, 188.9901, 189.0343, 189.07765, 189.12099, 189.16522, 189.21011, 189.25642, 189.3047, 189.35202, 189.39963, 189.4478, 189.49484, 189.5425, 189.59079, 189.63968, 189.68971, 189.74034, 189.79134, 189.84206, 189.89209, 189.9409, 189.99072, 190.04274, 190.09349, 190.14539, 190.19702, 190.24873, 190.30104, 190.35287, 190.4046, 190.45503, 190.50591, 190.55637, 190.60674, 190.65721, 190.70746, 190.75826, 190.80876, 190.8571, 190.90599, 190.95639, 191.00842, 191.06157, 191.11446, 191.16747, 191.22209, 191.2751, 191.32907, 191.38292, 191.43571, 191.48894, 191.54314, 191.59731, 191.65059, 191.70349, 191.75555, 191.80978, 191.86417, 191.91782, 191.97105, 192.02429, 192.0775, 192.13084, 192.18512, 192.24028, 192.29506, 192.35016, 192.40651, 192.4622, 192.51747, 192.57224, 192.62952, 192.687, 192.74483, 192.80281, 192.86006, 192.91705, 192.97177, 193.02679, 193.08273, 193.13742, 193.1917, 193.24458, 193.29779, 193.35132, 193.40689, 193.46413, 193.52164, 193.57927, 193.63789, 193.69646, 193.75464, 193.81409, 193.87488, 193.93707, 193.99841, 194.05937, 194.11984, 194.17958, 194.23772, 194.29633, 194.35521, 194.41174, 194.46733, 194.52335, 194.58064, 194.6398]}, "iteration-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [18.07681, 0.38236, 0.3815, 0.38004, 0.39049, 0.39656, 0.39642, 0.39048, 0.39523, 0.39194, 0.5552, 0.3948, 0.39398, 0.39561, 0.39214, 0.39537, 0.39216, 0.39261, 0.39694, 0.39356, 0.4003, 0.39114, 0.39355, 0.3919, 
0.39064, 0.40086, 0.39355, 0.39139, 0.38492, 0.3927, 0.40428, 0.38479, 0.38466, 0.38299, 0.38174, 0.38636, 0.38086, 0.38401, 0.38601, 0.40511, 0.38629, 0.38521, 0.3855, 0.38256, 0.38493, 0.38553, 0.38438, 0.38462, 0.38628, 0.38214, 0.38492, 0.38322, 0.38706, 0.38103, 0.38314, 0.38469, 0.38271, 0.38565, 0.38283, 0.38163, 0.37833, 0.38621, 0.37993, 0.37921, 0.38058, 0.38093, 0.38301, 0.38316, 0.38564, 0.38136, 0.38386, 0.38121, 0.38145, 0.37922, 0.48103, 0.37987, 0.38025, 0.38308, 0.38613, 0.38258, 0.38336, 0.38508, 0.3887, 0.38459, 0.38233, 0.38094, 0.38026, 0.38316, 0.3802, 0.38401, 0.38409, 0.38327, 0.39188, 0.38081, 0.38297, 0.38391, 0.38075, 0.38566, 0.38249, 0.38281, 0.38433, 0.38249, 0.37955, 0.38003, 0.47628, 0.38394, 0.38015, 0.40241, 0.37987, 0.38149, 0.38158, 0.38618, 0.38356, 0.38072, 0.3889, 0.38918, 0.38574, 0.38775, 0.38338, 0.39021, 0.38146, 0.38236, 0.38742, 0.3868, 0.38407, 0.38593, 0.38727, 0.39089, 0.39337, 0.38585, 0.38443, 0.38667, 0.3868, 0.39023, 0.49507, 0.38161, 0.38081, 0.38199, 0.48238, 0.53269, 0.38537, 0.38444, 0.38705, 0.39224, 0.38871, 0.3845, 0.38286, 0.38071, 0.38022, 0.38228, 0.38177, 0.38417, 0.3801, 0.38435, 0.38639, 0.38626, 0.38489, 0.38587, 0.38488, 0.38407, 0.3867, 0.38401, 0.3866, 0.38593, 0.38916, 0.3833, 0.38389, 0.3843, 0.38359, 0.38697, 0.38383, 0.38577, 0.38399, 0.38402, 0.38788, 0.3861, 0.38511, 0.38672, 0.38227, 0.38915, 0.38446, 0.3859, 0.37898, 0.381, 0.38613, 0.38362, 0.3831, 0.37854, 0.37897, 0.37818, 0.37983, 0.38369, 0.37982, 0.38105, 0.38549, 0.38522, 0.38518, 0.38435, 0.47441, 0.38233, 0.37927, 0.38248, 0.38035, 0.37886, 0.38094, 0.3816, 0.38623, 0.38907, 0.38824, 0.38363, 0.38085, 0.38241, 0.38688, 0.3809, 0.38401, 0.3846, 0.38278, 0.38686, 0.38509, 0.38569, 0.38138, 0.38221, 0.38366, 0.39376, 0.39173, 0.38031, 0.38231, 0.47746, 0.38191, 0.38528, 0.38919, 0.38627, 0.38485, 0.39016, 0.48709, 0.39134, 0.38991, 0.38575, 0.3826, 0.38101, 0.38387, 0.38025, 0.37997, 0.50302, 0.38436, 0.38473, 0.38639, 0.38633, 0.3928, 0.38343, 0.38522, 0.38229, 0.37817, 0.38096, 0.38116, 0.3867, 0.38377, 0.38146, 0.38226, 0.38398, 0.39339, 0.3803, 0.48334, 0.38398, 0.38072, 0.38756, 0.38406, 0.38475, 0.3865, 0.3837, 0.39344, 0.38796, 0.38926, 0.38703, 0.38603, 0.37954, 0.38341, 0.38785, 0.38335, 0.38263, 0.38197, 0.38334, 0.3861, 0.38808, 0.38389, 0.38779, 0.39044, 0.38432, 0.38303, 0.38348, 0.38756, 0.38699, 0.47757, 0.38391, 0.38223, 0.38479, 0.38831, 0.38749, 0.384, 0.3864, 0.38554, 0.38656, 0.38469, 0.38559, 0.38552, 0.38634, 0.39068, 0.38718, 0.38906, 0.38314, 0.38526, 0.39355, 0.38547, 0.3918, 0.38838, 0.39149, 0.38788, 0.38735, 0.38776, 0.38498, 0.3845, 0.3809, 0.38438, 0.38342, 0.38109, 0.38385, 0.3847, 0.38354, 0.38456, 0.48679, 0.38819, 0.38623, 0.3908, 0.39049, 0.38764, 0.39009, 0.3899, 0.39171, 0.39325, 0.39116, 0.38744, 0.38994, 0.3945, 0.38791, 0.3872, 0.3882, 0.38525, 0.38534, 0.38602, 0.38534, 0.38256, 0.38598, 0.38572, 0.37898, 0.38512, 0.38512, 0.38361, 0.39213, 0.38551, 0.38269, 0.38516, 0.38696, 0.38679, 0.37971, 0.38365, 0.38484, 0.38698, 0.39395, 0.38701, 0.38655, 0.38288, 0.38233, 0.38642, 0.38468, 0.38309, 0.38362, 0.38617, 0.3863, 0.38907, 0.38471, 0.38686, 0.38576, 0.3853, 0.38783, 0.3863, 0.38804, 0.38654, 0.48838, 0.39169, 0.38856, 0.47555, 0.38859, 0.39202, 0.38824, 0.59598, 0.38895, 0.38921, 0.38633, 0.38705, 0.38574]}, "optimizer-clip-main-grad-time": {"start_step": 0, "end_step": 1983, "step_interval": 5, "values": [0.02457, 0.00089, 0.00088, 0.00089, 0.00088, 0.00089, 0.00089, 0.00089, 0.0009, 0.00089, 0.00091, 0.00095, 
0.00088, 0.0009, 0.00088, 0.00088, 0.00089, 0.0009, 0.0009, 0.00089, 0.0009, 0.00088, 0.00088, 0.00088, 0.00089, 0.00089, 0.00089, 0.00088, 0.00087, 0.00088, 0.00088, 0.00088, 0.00088, 0.00089, 0.00093, 0.00088, 0.00088, 0.0009, 0.00092, 0.00089, 0.00088, 0.00088, 0.00089, 0.00088, 0.00089, 0.00089, 0.00089, 0.00099, 0.00088, 0.00088, 0.00089, 0.00089, 0.00088, 0.00088, 0.00088, 0.00088, 0.00088, 0.0009, 0.00126, 0.00088, 0.00088, 0.00088, 0.00094, 0.00088, 0.00087, 0.00088, 0.00087, 0.00088, 0.00088, 0.0009, 0.00087, 0.00088, 0.00088, 0.00088, 0.00087, 0.00088, 0.00087, 0.00125, 0.00093, 0.0009, 0.00088, 0.00088, 0.00088, 0.00088, 0.00088, 0.00088, 0.00088, 0.00098, 0.00088, 0.00112, 0.00088, 0.00088, 0.00089, 0.00087, 0.00088, 0.00087, 0.00088, 0.00088, 0.00088, 0.00089, 0.0009, 0.00087, 0.00088, 0.00088, 0.00091, 0.00088, 0.00088, 0.00088, 0.00088, 0.00092, 0.00087, 0.00066, 0.00088, 0.00088, 0.0009, 0.00065, 0.00088, 0.00088, 0.00066, 0.00089, 0.00089, 0.00066, 0.00088, 0.001, 0.00088, 0.00088, 0.0009, 0.00066, 0.00066, 0.00088, 0.00067, 0.00089, 0.00089, 0.00067, 0.00088, 0.00089, 0.00087, 0.00087, 0.00095, 0.00088, 0.00087, 0.00088, 0.00087, 0.00089, 0.00089, 0.00088, 0.00089, 0.00089, 0.00088, 0.00089, 0.0009, 0.00087, 0.00087, 0.00089, 0.00088, 0.00087, 0.00087, 0.00087, 0.00087, 0.00088, 0.00088, 0.00089, 0.00088, 0.0009, 0.00089, 0.00087, 0.00087, 0.00087, 0.00089, 0.00089, 0.00094, 0.00088, 0.00087, 0.00087, 0.00088, 0.00088, 0.00087, 0.00087, 0.00088, 0.00088, 0.00088, 0.00087, 0.00087, 0.00087, 0.00087, 0.00088, 0.00088, 0.00087, 0.00087, 0.00098, 0.00088, 0.00091, 0.00087, 0.00087, 0.00089, 0.00088, 0.00088, 0.00088, 0.00091, 0.00087, 0.00088, 0.00107, 0.00095, 0.00088, 0.00087, 0.00088, 0.00094, 0.00093, 0.00087, 0.00089, 0.00087, 0.00088, 0.00087, 0.00089, 0.00087, 0.00087, 0.00087, 0.00087, 0.00088, 0.00089, 0.00087, 0.00087, 0.00088, 0.00089, 0.00087, 0.00087, 0.00094, 0.00088, 0.00087, 0.00089, 0.00093, 0.00088, 0.00087, 0.00087, 0.00088, 0.00088, 0.00088, 0.00088, 0.00095, 0.00087, 0.00087, 0.00087, 0.00087, 0.00087, 0.00108, 0.00087, 0.00089, 0.00089, 0.00089, 0.00088, 0.001, 0.00088, 0.00094, 0.00088, 0.00087, 0.00088, 0.00095, 0.0009, 0.00089, 0.00089, 0.00088, 0.00088, 0.00089, 0.00088, 0.0009, 0.00089, 0.00088, 0.00088, 0.00087, 0.00088, 0.00089, 0.00088, 0.00087, 0.00088, 0.00087, 0.00089, 0.00091, 0.00088, 0.00096, 0.00088, 0.00092, 0.00088, 0.00088, 0.00088, 0.00088, 0.00088, 0.00088, 0.00087, 0.00089, 0.00088, 0.00091, 0.00095, 0.00088, 0.00088, 0.00095, 0.0009, 0.00089, 0.00092, 0.00093, 0.00099, 0.00088, 0.0009, 0.00087, 0.00088, 0.00096, 0.00088, 0.00097, 0.00087, 0.00088, 0.00087, 0.00088, 0.00088, 0.00098, 0.00089, 0.00097, 0.00087, 0.00087, 0.00087, 0.00088, 0.00089, 0.00088, 0.00089, 0.00088, 0.00088, 0.00087, 0.00087, 0.00099, 0.00089, 0.00088, 0.00088, 0.00087, 0.00088, 0.00088, 0.00089, 0.00087, 0.00088, 0.00088, 0.0009, 0.00091, 0.00089, 0.00087, 0.00088, 0.00089, 0.00089, 0.00087, 0.00088, 0.00094, 0.00088, 0.00088, 0.00088, 0.00088, 0.00089, 0.00087, 0.00106, 0.0009, 0.00089, 0.00088, 0.00096, 0.00089, 0.00098, 0.00088, 0.00088, 0.00088, 0.00091, 0.00087, 0.00089, 0.00088, 0.00088, 0.00088, 0.00088, 0.00087, 0.00089, 0.00089, 0.00088, 0.00089, 0.00089, 0.00088, 0.00091, 0.00089, 0.00087, 0.0009, 0.00088, 0.00089, 0.00088, 0.00093, 0.00116, 0.00101, 0.00088, 0.00095, 0.00092, 0.00089, 0.00088, 0.00087, 0.00089, 0.00105, 0.0009, 0.00087]}, "optimizer-count-zeros-time": {"start_step": 0, "end_step": 1983, "step_interval": 5, "values": [0.01277, 
0.00497, 0.00488, 0.00489, 0.00489, 0.00494, 0.00489, 0.0049, 0.00489, 0.00488, 0.00497, 0.00521, 0.0049, 0.00492, 0.00492, 0.0049, 0.00494, 0.00492, 0.00489, 0.00489, 0.00493, 0.0049, 0.00492, 0.0051, 0.00487, 0.00629, 0.005, 0.0049, 0.00492, 0.0049, 0.0049, 0.0049, 0.00488, 0.00492, 0.00535, 0.0049, 0.0049, 0.00494, 0.0049, 0.00494, 0.00489, 0.00489, 0.0049, 0.00491, 0.00492, 0.00491, 0.00599, 0.00523, 0.00489, 0.00489, 0.00491, 0.00491, 0.00491, 0.00494, 0.0049, 0.00489, 0.00491, 0.0049, 0.00491, 0.0049, 0.00491, 0.0049, 0.00525, 0.00492, 0.00493, 0.00489, 0.00489, 0.00492, 0.00491, 0.0049, 0.00491, 0.00491, 0.00492, 0.00489, 0.00489, 0.00493, 0.00493, 0.00498, 0.00519, 0.00491, 0.00491, 0.00492, 0.00498, 0.00492, 0.00494, 0.0049, 0.00489, 0.00567, 0.00489, 0.00491, 0.00491, 0.00524, 0.00489, 0.00491, 0.00489, 0.00504, 0.0056, 0.00501, 0.00491, 0.00493, 0.00492, 0.00491, 0.00491, 0.00491, 0.00489, 0.0049, 0.0049, 0.0049, 0.00492, 0.0049, 0.00491, 0.00491, 0.00602, 0.0049, 0.00494, 0.00489, 0.0049, 0.0049, 0.00491, 0.00492, 0.0049, 0.0049, 0.00491, 0.00598, 0.00492, 0.00491, 0.00489, 0.00494, 0.00491, 0.00491, 0.0049, 0.00494, 0.00492, 0.00544, 0.00488, 0.00491, 0.0049, 0.0049, 0.00503, 0.00491, 0.00491, 0.00491, 0.00493, 0.00494, 0.00493, 0.00492, 0.0049, 0.00492, 0.00488, 0.00489, 0.00515, 0.0049, 0.00498, 0.00492, 0.00493, 0.0049, 0.00491, 0.005, 0.00491, 0.00491, 0.00491, 0.00491, 0.00489, 0.00491, 0.0049, 0.0049, 0.00496, 0.00492, 0.00488, 0.00492, 0.00538, 0.00492, 0.00491, 0.00492, 0.00567, 0.00488, 0.00491, 0.00493, 0.00492, 0.00487, 0.00493, 0.0049, 0.00488, 0.00491, 0.00492, 0.0049, 0.00492, 0.0049, 0.0049, 0.00492, 0.0049, 0.0051, 0.0049, 0.00519, 0.00491, 0.00491, 0.00488, 0.00488, 0.00489, 0.00489, 0.00491, 0.00583, 0.0049, 0.0049, 0.00489, 0.00488, 0.0049, 0.00489, 0.00491, 0.00488, 0.0049, 0.00501, 0.00492, 0.00491, 0.0049, 0.0049, 0.0049, 0.00488, 0.0049, 0.00489, 0.00489, 0.0049, 0.00489, 0.00492, 0.00493, 0.00488, 0.0049, 0.00489, 0.0049, 0.00489, 0.00494, 0.00489, 0.00491, 0.00489, 0.00489, 0.0049, 0.00492, 0.00487, 0.00491, 0.00491, 0.00489, 0.00489, 0.00489, 0.00491, 0.00578, 0.0049, 0.00488, 0.00487, 0.00492, 0.0049, 0.00491, 0.00489, 0.00489, 0.00488, 0.0049, 0.00489, 0.00489, 0.00491, 0.00515, 0.00494, 0.0049, 0.00489, 0.00492, 0.00489, 0.00502, 0.00489, 0.00493, 0.00489, 0.00491, 0.00491, 0.00489, 0.0049, 0.00582, 0.00487, 0.00489, 0.0049, 0.00491, 0.00488, 0.00489, 0.00492, 0.00488, 0.00489, 0.00491, 0.00489, 0.00489, 0.0049, 0.00489, 0.00558, 0.00491, 0.0056, 0.00495, 0.00488, 0.00491, 0.00489, 0.00489, 0.00488, 0.0049, 0.0049, 0.00489, 0.00492, 0.00491, 0.0049, 0.00491, 0.00489, 0.0049, 0.00491, 0.00492, 0.00512, 0.00493, 0.00491, 0.00491, 0.0049, 0.00491, 0.00492, 0.00579, 0.00626, 0.00489, 0.00489, 0.0049, 0.00489, 0.00491, 0.00494, 0.00489, 0.00491, 0.0049, 0.0049, 0.00491, 0.00512, 0.0051, 0.00514, 0.00513, 0.00513, 0.00514, 0.00513, 0.00512, 0.00511, 0.00512, 0.00514, 0.0052, 0.00512, 0.00511, 0.00513, 0.00514, 0.00511, 0.00511, 0.00514, 0.00564, 0.00511, 0.00512, 0.00509, 0.00512, 0.00512, 0.00536, 0.00513, 0.00512, 0.00513, 0.00512, 0.00513, 0.00512, 0.00512, 0.00512, 0.00512, 0.00509, 0.00512, 0.00512, 0.00513, 0.00512, 0.00514, 0.00515, 0.00514, 0.00516, 0.00512, 0.00513, 0.00514, 0.00511, 0.00513, 0.00524, 0.00511, 0.00514, 0.00512, 0.00511, 0.00509, 0.00513, 0.00511, 0.00514, 0.00513, 0.00513, 0.00512, 0.0055, 0.0054, 0.00513, 0.0051, 0.0051, 0.00512, 0.00514, 0.00515, 0.00515]}, "optimizer-inner-step-time": {"start_step": 0, "end_step": 1983, 
"step_interval": 5, "values": [0.00686, 0.00099, 0.00098, 0.00098, 0.00098, 0.001, 0.00099, 0.00099, 0.00098, 0.00099, 0.00101, 0.00098, 0.00099, 0.00099, 0.00099, 0.001, 0.00099, 0.001, 0.001, 0.001, 0.001, 0.00098, 0.00097, 0.00099, 0.00098, 0.00124, 0.00098, 0.00098, 0.00098, 0.00098, 0.00098, 0.00101, 0.00101, 0.001, 0.001, 0.00098, 0.00099, 0.001, 0.00102, 0.00098, 0.00098, 0.00099, 0.00098, 0.00098, 0.00098, 0.00097, 0.001, 0.00102, 0.00097, 0.00098, 0.00099, 0.001, 0.00097, 0.00102, 0.00099, 0.00098, 0.00098, 0.00098, 0.001, 0.001, 0.001, 0.00098, 0.00098, 0.00098, 0.00098, 0.00098, 0.00097, 0.00097, 0.00099, 0.00098, 0.00098, 0.00098, 0.00104, 0.00097, 0.00098, 0.00099, 0.00098, 0.00117, 0.00101, 0.00101, 0.00099, 0.00097, 0.00098, 0.00097, 0.00099, 0.00098, 0.00098, 0.00101, 0.00099, 0.00098, 0.00098, 0.00098, 0.001, 0.00097, 0.00097, 0.00098, 0.001, 0.00097, 0.00097, 0.00098, 0.00099, 0.00098, 0.00098, 0.00098, 0.00098, 0.00097, 0.00097, 0.00098, 0.001, 0.00099, 0.00097, 0.00098, 0.001, 0.00098, 0.00098, 0.00099, 0.00098, 0.00098, 0.00099, 0.00099, 0.00099, 0.00097, 0.00097, 0.00099, 0.00098, 0.00097, 0.001, 0.00099, 0.00098, 0.00099, 0.001, 0.00097, 0.00099, 0.00102, 0.00099, 0.00098, 0.00097, 0.00099, 0.00099, 0.001, 0.00097, 0.00097, 0.00098, 0.00099, 0.001, 0.001, 0.00098, 0.001, 0.001, 0.00097, 0.00101, 0.00097, 0.00099, 0.00099, 0.00098, 0.001, 0.00099, 0.00098, 0.001, 0.00097, 0.00098, 0.001, 0.00099, 0.00099, 0.00099, 0.00098, 0.00098, 0.00097, 0.00098, 0.00099, 0.00098, 0.00099, 0.00097, 0.00098, 0.00103, 0.00097, 0.00097, 0.001, 0.00099, 0.00098, 0.00098, 0.00099, 0.00097, 0.00098, 0.00098, 0.00101, 0.001, 0.00099, 0.00098, 0.00098, 0.00097, 0.00102, 0.00098, 0.00097, 0.00098, 0.00097, 0.00097, 0.00099, 0.00102, 0.00096, 0.00099, 0.00097, 0.00096, 0.00097, 0.00097, 0.00099, 0.00096, 0.00097, 0.00097, 0.00097, 0.00096, 0.00097, 0.00098, 0.00097, 0.00098, 0.00156, 0.00097, 0.00096, 0.00097, 0.00096, 0.001, 0.00101, 0.00097, 0.00099, 0.00097, 0.00096, 0.00098, 0.00098, 0.00103, 0.00097, 0.00097, 0.00097, 0.00096, 0.00097, 0.00096, 0.00097, 0.00099, 0.00097, 0.00096, 0.00098, 0.00098, 0.00097, 0.00098, 0.00099, 0.00099, 0.00098, 0.00097, 0.00098, 0.00097, 0.00098, 0.00099, 0.001, 0.00099, 0.00098, 0.001, 0.00099, 0.00099, 0.00101, 0.00102, 0.00099, 0.00099, 0.00098, 0.00098, 0.00099, 0.00098, 0.00099, 0.00099, 0.00099, 0.001, 0.00099, 0.00099, 0.00098, 0.00101, 0.00099, 0.00099, 0.00099, 0.00097, 0.00099, 0.00099, 0.00098, 0.00098, 0.00104, 0.00098, 0.00099, 0.00099, 0.00098, 0.00099, 0.00099, 0.00099, 0.00099, 0.001, 0.00097, 0.00099, 0.00098, 0.00098, 0.001, 0.00099, 0.00099, 0.00098, 0.00099, 0.00098, 0.00097, 0.00098, 0.00099, 0.00099, 0.00099, 0.00098, 0.00104, 0.00099, 0.00098, 0.00099, 0.00099, 0.001, 0.00099, 0.00098, 0.001, 0.00099, 0.00096, 0.00098, 0.00099, 0.00099, 0.001, 0.00099, 0.00097, 0.00099, 0.00099, 0.00098, 0.00099, 0.00099, 0.00103, 0.00099, 0.00098, 0.00099, 0.00097, 0.00098, 0.00099, 0.00098, 0.00098, 0.00101, 0.00098, 0.00099, 0.00099, 0.00098, 0.00156, 0.00103, 0.00098, 0.001, 0.00098, 0.00099, 0.00098, 0.00098, 0.00099, 0.00098, 0.001, 0.001, 0.00098, 0.00102, 0.00098, 0.00098, 0.00099, 0.00098, 0.00098, 0.00099, 0.001, 0.00098, 0.00098, 0.00098, 0.00098, 0.00098, 0.00099, 0.00097, 0.00099, 0.00096, 0.00102, 0.00098, 0.00099, 0.001, 0.001, 0.001, 0.001, 0.00099, 0.001, 0.00099, 0.00099, 0.001, 0.001, 0.00104, 0.00099, 0.00099, 0.00099, 0.00099, 0.001, 0.00099, 0.00099, 0.001, 0.00099, 0.00099]}, "optimizer-copy-main-to-model-params-time": 
{"start_step": 0, "end_step": 1983, "step_interval": 5, "values": [0.00107, 0.00103, 0.00102, 0.00102, 0.00102, 0.00102, 0.00102, 0.00102, 0.00103, 0.00103, 0.00103, 0.00103, 0.00104, 0.00101, 0.00103, 0.00103, 0.00104, 0.00105, 0.00103, 0.00103, 0.00104, 0.00103, 0.00102, 0.00104, 0.00102, 0.00163, 0.00103, 0.00103, 0.00102, 0.00102, 0.00102, 0.00102, 0.00102, 0.00104, 0.00104, 0.00103, 0.00102, 0.00103, 0.00104, 0.00103, 0.00103, 0.00103, 0.00102, 0.00103, 0.00103, 0.00102, 0.00108, 0.00106, 0.00102, 0.00103, 0.00103, 0.00104, 0.00102, 0.00103, 0.00102, 0.00103, 0.00102, 0.00103, 0.00102, 0.00103, 0.00103, 0.00103, 0.00103, 0.00104, 0.00102, 0.00103, 0.00102, 0.00103, 0.00102, 0.00103, 0.00102, 0.00102, 0.00103, 0.00103, 0.00102, 0.00103, 0.00102, 0.00115, 0.00105, 0.00126, 0.00103, 0.00103, 0.00102, 0.00103, 0.00103, 0.00103, 0.00102, 0.00106, 0.00102, 0.00103, 0.00102, 0.00114, 0.00102, 0.00103, 0.00102, 0.00102, 0.00104, 0.00103, 0.00103, 0.00103, 0.00102, 0.00103, 0.00102, 0.00102, 0.00102, 0.00102, 0.00102, 0.00103, 0.00102, 0.00103, 0.00102, 0.00103, 0.00107, 0.00103, 0.00103, 0.00103, 0.00102, 0.00103, 0.00102, 0.00103, 0.00102, 0.00103, 0.00102, 0.00109, 0.00103, 0.00103, 0.00103, 0.00105, 0.00102, 0.00103, 0.00103, 0.00103, 0.00103, 0.00105, 0.00102, 0.00102, 0.00103, 0.00103, 0.00102, 0.00102, 0.00103, 0.00103, 0.00102, 0.00103, 0.00103, 0.00103, 0.00102, 0.00103, 0.00102, 0.00102, 0.00104, 0.00103, 0.00103, 0.00103, 0.00102, 0.00102, 0.00102, 0.00102, 0.00102, 0.00102, 0.00102, 0.00103, 0.00103, 0.00103, 0.00103, 0.00103, 0.00102, 0.00102, 0.00102, 0.00104, 0.00102, 0.00103, 0.00102, 0.00102, 0.00108, 0.00103, 0.00102, 0.00103, 0.00115, 0.00102, 0.00103, 0.00103, 0.00102, 0.00103, 0.00103, 0.00103, 0.00102, 0.00103, 0.00104, 0.00103, 0.00102, 0.00106, 0.00102, 0.00102, 0.00103, 0.00103, 0.00099, 0.001, 0.00103, 0.001, 0.001, 0.00105, 0.00101, 0.00099, 0.00099, 0.001, 0.00099, 0.001, 0.00099, 0.00099, 0.00099, 0.00099, 0.00099, 0.00099, 0.001, 0.00111, 0.001, 0.00099, 0.001, 0.00099, 0.00105, 0.00099, 0.00099, 0.001, 0.00099, 0.00099, 0.00099, 0.00099, 0.001, 0.001, 0.00099, 0.001, 0.00099, 0.00099, 0.00101, 0.00099, 0.00101, 0.001, 0.00099, 0.001, 0.00106, 0.001, 0.001, 0.001, 0.00104, 0.001, 0.001, 0.001, 0.00099, 0.00106, 0.001, 0.001, 0.001, 0.001, 0.00101, 0.00102, 0.00099, 0.00101, 0.00101, 0.001, 0.00099, 0.001, 0.00101, 0.00101, 0.00101, 0.00106, 0.001, 0.00101, 0.001, 0.00102, 0.001, 0.00101, 0.00106, 0.001, 0.001, 0.00101, 0.00099, 0.00101, 0.00101, 0.00101, 0.001, 0.00101, 0.00101, 0.00101, 0.001, 0.00101, 0.00101, 0.00105, 0.00101, 0.00103, 0.00101, 0.001, 0.001, 0.00101, 0.00107, 0.001, 0.00106, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.00101, 0.00102, 0.00102, 0.001, 0.001, 0.00101, 0.001, 0.001, 0.001, 0.00106, 0.00107, 0.00099, 0.00107, 0.001, 0.00101, 0.001, 0.001, 0.001, 0.00101, 0.001, 0.001, 0.00101, 0.001, 0.00099, 0.001, 0.00101, 0.001, 0.001, 0.001, 0.001, 0.00107, 0.001, 0.001, 0.00101, 0.001, 0.001, 0.001, 0.00101, 0.00106, 0.00099, 0.00102, 0.00102, 0.001, 0.001, 0.001, 0.001, 0.00101, 0.00099, 0.00103, 0.00101, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.00101, 0.00103, 0.00102, 0.001, 0.00101, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.00099, 0.00102, 0.001, 0.001, 0.001, 0.00101, 0.00101, 0.001, 0.00099, 0.001, 0.00101, 0.00102, 0.00101, 0.00102, 0.00101, 0.00101, 0.001, 0.00101, 0.00101, 0.00101, 0.001, 0.001, 0.00101, 0.001, 0.001]}, "grad-norm": {"start_step": 0, "end_step": 1983, "step_interval": 5, "values": [11.77525, 
12.26804, 11.19281, 14.50237, 14.014, 11.57186, 8.3922, 7.10897, 4.47266, 4.00434, 3.4, 2.71736, 2.45629, 2.30739, 2.29493, 2.25132, 2.01839, 2.41173, 2.01298, 2.00525, 2.18932, 1.91353, 1.88951, 2.28883, 2.07903, 1.8844, 1.87495, 2.08513, 2.01874, 2.01118, 2.0102, 1.89229, 1.99489, 1.65446, 2.02134, 1.98456, 2.13312, 2.05074, 1.91832, 1.88506, 1.86975, 1.90714, 2.10548, 1.83107, 1.85561, 1.89757, 1.77389, 1.83901, 1.60882, 1.67073, 1.57953, 1.73056, 1.77582, 1.85094, 1.58796, 1.69243, 2.01012, 1.72305, 1.68342, 1.77634, 1.52051, 1.58604, 1.75613, 1.50876, 1.38814, 1.4853, 1.45829, 1.51675, 1.54655, 1.47158, 1.51099, 1.4708, 1.47268, 1.47452, 1.44323, 1.32185, 1.33599, 1.35564, 1.29533, 1.27928, 1.44962, 1.33226, 1.18991, 1.39956, 1.21257, 1.16175, 1.05645, 1.15134, 1.32979, 1.15427, 1.22191, 1.18197, 1.5911, 1.3589, 1.27604, 1.13871, 1.30626, 1.67866, 1.52014, 1.03431, 1.05476, 1.3049, 1.25479, 1.22714, 1.69201, 1.08131, 1.00908, 1.10419, 1.08066, 1.12768, 1.24403, 0.87723, 0.92972, 1.02293, 1.07062, 0.98243, 1.24502, 1.2897, 0.94461, 1.09023, 1.04658, 0.90251, 1.12421, 1.65432, 1.09595, 1.17882, 1.36022, 0.96059, 0.98043, 1.05339, 0.96416, 1.13229, 1.12844, 0.93359, 1.82877, 1.40011, 1.43068, 1.3027, 1.089, 1.64716, 1.37833, 1.56985, 1.16612, 1.85125, 1.24379, 1.71309, 1.39309, 1.27937, 1.17708, 1.73543, 1.05896, 1.24373, 1.38937, 1.36918, 1.42323, 1.77943, 1.13157, 1.27948, 1.19267, 1.34154, 1.40098, 1.16252, 1.42404, 1.2011, 1.00676, 1.48416, 1.13391, 1.33486, 1.5395, 1.27609, 1.42471, 1.30575, 1.22047, 1.81347, 1.74187, 1.56562, 1.47675, 1.51655, 1.70821, 1.44154, 1.50096, 1.28826, 1.74901, 1.90029, 1.42234, 1.44455, 1.76719, 1.84971, 1.73982, 1.24814, 1.53885, 1.39306, 1.62267, 1.27091, 1.59048, 1.06674, 1.40639, 1.29128, 1.69617, 1.31246, 1.4525, 1.29959, 1.38347, 1.4963, 1.45118, 1.62261, 1.8211, 1.48622, 1.35396, 1.364, 1.22302, 1.21036, 1.59732, 1.16621, 1.43458, 1.39264, 1.50491, 1.74865, 1.69988, 1.54719, 1.66156, 1.38606, 1.43929, 1.37822, 1.30248, 1.79296, 1.45361, 1.24972, 1.59221, 1.3686, 1.22551, 1.4158, 1.49894, 1.55813, 1.52684, 1.44435, 2.05338, 1.36019, 1.34284, 1.20815, 1.7307, 1.50669, 2.1527, 1.33714, 1.40114, 1.51052, 1.35152, 1.43159, 1.42052, 1.44093, 1.62874, 1.70468, 1.84621, 1.36339, 1.49409, 1.99351, 1.25437, 1.69787, 1.77453, 1.53971, 1.98798, 1.46692, 1.21412, 1.35855, 1.61255, 1.37129, 1.69078, 1.53059, 1.31087, 1.87886, 1.31042, 1.42235, 1.38194, 1.39636, 1.83392, 1.47651, 1.46996, 1.64541, 1.53153, 1.47267, 1.75528, 1.44853, 1.39865, 1.75941, 1.63286, 1.32552, 1.6715, 2.26149, 1.61139, 1.35216, 1.34936, 1.25166, 1.69472, 1.58245, 1.4379, 1.43627, 1.60457, 1.82215, 1.39138, 1.38678, 1.55708, 1.41296, 1.29816, 1.46066, 1.39994, 1.45437, 1.25759, 1.34921, 1.47682, 1.55246, 1.48338, 1.2271, 1.36154, 1.44453, 1.47772, 1.43402, 1.21249, 1.8034, 1.50506, 1.3131, 1.37503, 1.35584, 1.41307, 1.45748, 1.26629, 1.31721, 1.47686, 1.80237, 1.55348, 1.5369, 1.32871, 1.35524, 1.76226, 1.27945, 1.40786, 1.56063, 1.18102, 1.26595, 1.41714, 1.27185, 1.59955, 1.53902, 1.50856, 1.38342, 1.3716, 1.52597, 1.55924, 1.33891, 1.44137, 1.66178, 1.44058, 1.53213, 1.34923, 1.54826, 1.51369, 1.26166, 1.22057, 1.64988, 1.4183, 1.45977, 1.27097, 1.31805, 1.24715, 1.52412, 1.48112, 1.51313, 1.58975, 1.42731, 1.32647, 1.44532, 1.53827, 1.72661, 1.53155, 1.57687, 1.2723, 1.26403, 1.36125, 1.36611, 1.46818, 1.38679, 1.58433, 1.49566, 1.44288, 1.37271, 1.45317, 1.36918, 1.35342, 1.27732, 1.37088, 1.29411, 1.25869, 1.46478, 1.43992, 1.66108, 1.34488, 1.17599, 1.3251]}, "grad-norm vs 
samples": {"start_step": 0, "end_step": 1983, "step_interval": 5, "values": [11.77525, 12.26804, 11.19281, 14.50237, 14.014, 11.57186, 8.3922, 7.10897, 4.47266, 4.00434, 3.4, 2.71736, 2.45629, 2.30739, 2.29493, 2.25132, 2.01839, 2.41173, 2.01298, 2.00525, 2.18932, 1.91353, 1.88951, 2.28883, 2.07903, 1.8844, 1.87495, 2.08513, 2.01874, 2.01118, 2.0102, 1.89229, 1.99489, 1.65446, 2.02134, 1.98456, 2.13312, 2.05074, 1.91832, 1.88506, 1.86975, 1.90714, 2.10548, 1.83107, 1.85561, 1.89757, 1.77389, 1.83901, 1.60882, 1.67073, 1.57953, 1.73056, 1.77582, 1.85094, 1.58796, 1.69243, 2.01012, 1.72305, 1.68342, 1.77634, 1.52051, 1.58604, 1.75613, 1.50876, 1.38814, 1.4853, 1.45829, 1.51675, 1.54655, 1.47158, 1.51099, 1.4708, 1.47268, 1.47452, 1.44323, 1.32185, 1.33599, 1.35564, 1.29533, 1.27928, 1.44962, 1.33226, 1.18991, 1.39956, 1.21257, 1.16175, 1.05645, 1.15134, 1.32979, 1.15427, 1.22191, 1.18197, 1.5911, 1.3589, 1.27604, 1.13871, 1.30626, 1.67866, 1.52014, 1.03431, 1.05476, 1.3049, 1.25479, 1.22714, 1.69201, 1.08131, 1.00908, 1.10419, 1.08066, 1.12768, 1.24403, 0.87723, 0.92972, 1.02293, 1.07062, 0.98243, 1.24502, 1.2897, 0.94461, 1.09023, 1.04658, 0.90251, 1.12421, 1.65432, 1.09595, 1.17882, 1.36022, 0.96059, 0.98043, 1.05339, 0.96416, 1.13229, 1.12844, 0.93359, 1.82877, 1.40011, 1.43068, 1.3027, 1.089, 1.64716, 1.37833, 1.56985, 1.16612, 1.85125, 1.24379, 1.71309, 1.39309, 1.27937, 1.17708, 1.73543, 1.05896, 1.24373, 1.38937, 1.36918, 1.42323, 1.77943, 1.13157, 1.27948, 1.19267, 1.34154, 1.40098, 1.16252, 1.42404, 1.2011, 1.00676, 1.48416, 1.13391, 1.33486, 1.5395, 1.27609, 1.42471, 1.30575, 1.22047, 1.81347, 1.74187, 1.56562, 1.47675, 1.51655, 1.70821, 1.44154, 1.50096, 1.28826, 1.74901, 1.90029, 1.42234, 1.44455, 1.76719, 1.84971, 1.73982, 1.24814, 1.53885, 1.39306, 1.62267, 1.27091, 1.59048, 1.06674, 1.40639, 1.29128, 1.69617, 1.31246, 1.4525, 1.29959, 1.38347, 1.4963, 1.45118, 1.62261, 1.8211, 1.48622, 1.35396, 1.364, 1.22302, 1.21036, 1.59732, 1.16621, 1.43458, 1.39264, 1.50491, 1.74865, 1.69988, 1.54719, 1.66156, 1.38606, 1.43929, 1.37822, 1.30248, 1.79296, 1.45361, 1.24972, 1.59221, 1.3686, 1.22551, 1.4158, 1.49894, 1.55813, 1.52684, 1.44435, 2.05338, 1.36019, 1.34284, 1.20815, 1.7307, 1.50669, 2.1527, 1.33714, 1.40114, 1.51052, 1.35152, 1.43159, 1.42052, 1.44093, 1.62874, 1.70468, 1.84621, 1.36339, 1.49409, 1.99351, 1.25437, 1.69787, 1.77453, 1.53971, 1.98798, 1.46692, 1.21412, 1.35855, 1.61255, 1.37129, 1.69078, 1.53059, 1.31087, 1.87886, 1.31042, 1.42235, 1.38194, 1.39636, 1.83392, 1.47651, 1.46996, 1.64541, 1.53153, 1.47267, 1.75528, 1.44853, 1.39865, 1.75941, 1.63286, 1.32552, 1.6715, 2.26149, 1.61139, 1.35216, 1.34936, 1.25166, 1.69472, 1.58245, 1.4379, 1.43627, 1.60457, 1.82215, 1.39138, 1.38678, 1.55708, 1.41296, 1.29816, 1.46066, 1.39994, 1.45437, 1.25759, 1.34921, 1.47682, 1.55246, 1.48338, 1.2271, 1.36154, 1.44453, 1.47772, 1.43402, 1.21249, 1.8034, 1.50506, 1.3131, 1.37503, 1.35584, 1.41307, 1.45748, 1.26629, 1.31721, 1.47686, 1.80237, 1.55348, 1.5369, 1.32871, 1.35524, 1.76226, 1.27945, 1.40786, 1.56063, 1.18102, 1.26595, 1.41714, 1.27185, 1.59955, 1.53902, 1.50856, 1.38342, 1.3716, 1.52597, 1.55924, 1.33891, 1.44137, 1.66178, 1.44058, 1.53213, 1.34923, 1.54826, 1.51369, 1.26166, 1.22057, 1.64988, 1.4183, 1.45977, 1.27097, 1.31805, 1.24715, 1.52412, 1.48112, 1.51313, 1.58975, 1.42731, 1.32647, 1.44532, 1.53827, 1.72661, 1.53155, 1.57687, 1.2723, 1.26403, 1.36125, 1.36611, 1.46818, 1.38679, 1.58433, 1.49566, 1.44288, 1.37271, 1.45317, 1.36918, 1.35342, 1.27732, 1.37088, 1.29411, 
1.25869, 1.46478, 1.43992, 1.66108, 1.34488, 1.17599, 1.3251]}, "num-zeros": {"start_step": 0, "end_step": 1983, "step_interval": 5, "values": [951.0, 1294.0, 1060.0, 971.0, 901.0, 1117.0, 1205.0, 1364.0, 1468.0, 1319.0, 1539.0, 1911.0, 2180.0, 1576.0, 2216.0, 1925.0, 2038.0, 2028.0, 2476.0, 2015.0, 2201.0, 2215.0, 2438.0, 3135.0, 2444.0, 2806.0, 2540.0, 2188.0, 2052.0, 2885.0, 2408.0, 3553.0, 2417.0, 2497.0, 2486.0, 3667.0, 2116.0, 2243.0, 2127.0, 2649.0, 3818.0, 2985.0, 2311.0, 2810.0, 2580.0, 2214.0, 2672.0, 2502.0, 2376.0, 2941.0, 3128.0, 2507.0, 2600.0, 2152.0, 2790.0, 3240.0, 2769.0, 2720.0, 2392.0, 3522.0, 2236.0, 2883.0, 2397.0, 2586.0, 2219.0, 3154.0, 2799.0, 2803.0, 2345.0, 2563.0, 2171.0, 2874.0, 2837.0, 2656.0, 3389.0, 2526.0, 2817.0, 2625.0, 3000.0, 2814.0, 2754.0, 2414.0, 3081.0, 2380.0, 2876.0, 2737.0, 2780.0, 2271.0, 2333.0, 2839.0, 2519.0, 3210.0, 2404.0, 2291.0, 2433.0, 2383.0, 2435.0, 1919.0, 2351.0, 2585.0, 2779.0, 2221.0, 2014.0, 2114.0, 1881.0, 2304.0, 2397.0, 2309.0, 2239.0, 2116.0, 2239.0, 2377.0, 2323.0, 2496.0, 2298.0, 2773.0, 2696.0, 1952.0, 2435.0, 2042.0, 2813.0, 2452.0, 2068.0, 2032.0, 2127.0, 2176.0, 2056.0, 2569.0, 2495.0, 2156.0, 2202.0, 2372.0, 2368.0, 2313.0, 1956.0, 2287.0, 2471.0, 2251.0, 2132.0, 1626.0, 2076.0, 2288.0, 2009.0, 1987.0, 2433.0, 1651.0, 2033.0, 2061.0, 1927.0, 2837.0, 2589.0, 2063.0, 1738.0, 1964.0, 2334.0, 1899.0, 2516.0, 2136.0, 2214.0, 1965.0, 1875.0, 2415.0, 1921.0, 2352.0, 2174.0, 1887.0, 2165.0, 2616.0, 1911.0, 1825.0, 1959.0, 1908.0, 1822.0, 1574.0, 1545.0, 2160.0, 1942.0, 2081.0, 1733.0, 2008.0, 2010.0, 2212.0, 1875.0, 1390.0, 1972.0, 2540.0, 1825.0, 2152.0, 1632.0, 2232.0, 1792.0, 1887.0, 1971.0, 2046.0, 1779.0, 2139.0, 2024.0, 1999.0, 1614.0, 1985.0, 1902.0, 2128.0, 2445.0, 2671.0, 2214.0, 2029.0, 2081.0, 2209.0, 2226.0, 1957.0, 2210.0, 2419.0, 2685.0, 2294.0, 1932.0, 2118.0, 1963.0, 1818.0, 1841.0, 2149.0, 2110.0, 2155.0, 1868.0, 2220.0, 2120.0, 2379.0, 1886.0, 2361.0, 1763.0, 2055.0, 1972.0, 2155.0, 1934.0, 2167.0, 1959.0, 1882.0, 1705.0, 1826.0, 1964.0, 2224.0, 1818.0, 1883.0, 1743.0, 2488.0, 2393.0, 2103.0, 2005.0, 2728.0, 2142.0, 2054.0, 1951.0, 1819.0, 2038.0, 2170.0, 2265.0, 1808.0, 2431.0, 1807.0, 2184.0, 2053.0, 1687.0, 1931.0, 2549.0, 2587.0, 1986.0, 2273.0, 2103.0, 2063.0, 2204.0, 2021.0, 2110.0, 2428.0, 2484.0, 2060.0, 2244.0, 2025.0, 1999.0, 1965.0, 1906.0, 2137.0, 2024.0, 2234.0, 1998.0, 2022.0, 1943.0, 2254.0, 2008.0, 1619.0, 1850.0, 2446.0, 2316.0, 1952.0, 2008.0, 2201.0, 2018.0, 2191.0, 1856.0, 2363.0, 2138.0, 2632.0, 1897.0, 2331.0, 1915.0, 2017.0, 2347.0, 2073.0, 2221.0, 2341.0, 1910.0, 1944.0, 2197.0, 2136.0, 2140.0, 2057.0, 2254.0, 1992.0, 2377.0, 1829.0, 2323.0, 2256.0, 2248.0, 2664.0, 2091.0, 2351.0, 2363.0, 2417.0, 1953.0, 2010.0, 2111.0, 2082.0, 2141.0, 2449.0, 2394.0, 2165.0, 2019.0, 2307.0, 2446.0, 2932.0, 2123.0, 2428.0, 2294.0, 2499.0, 2597.0, 2391.0, 2142.0, 2085.0, 2112.0, 2498.0, 2172.0, 2546.0, 2086.0, 2278.0, 2000.0, 2060.0, 2222.0, 2327.0, 2377.0, 2181.0, 1943.0, 2370.0, 2170.0, 2277.0, 2360.0, 2822.0, 2306.0, 2709.0, 2210.0, 2127.0, 2321.0, 2202.0, 2780.0, 2249.0, 2312.0, 2033.0, 2114.0, 2287.0, 2292.0, 2301.0, 2735.0, 2674.0, 2246.0, 2584.0, 2280.0, 2624.0, 2634.0, 2653.0, 2502.0, 2748.0, 2256.0, 2492.0, 2276.0, 2217.0, 1995.0, 2408.0, 2306.0, 2584.0, 2373.0]}, "num-zeros vs samples": {"start_step": 0, "end_step": 1983, "step_interval": 5, "values": [951.0, 1294.0, 1060.0, 971.0, 901.0, 1117.0, 1205.0, 1364.0, 1468.0, 1319.0, 1539.0, 1911.0, 2180.0, 1576.0, 2216.0, 1925.0, 2038.0, 2028.0, 
2476.0, 2015.0, 2201.0, 2215.0, 2438.0, 3135.0, 2444.0, 2806.0, 2540.0, 2188.0, 2052.0, 2885.0, 2408.0, 3553.0, 2417.0, 2497.0, 2486.0, 3667.0, 2116.0, 2243.0, 2127.0, 2649.0, 3818.0, 2985.0, 2311.0, 2810.0, 2580.0, 2214.0, 2672.0, 2502.0, 2376.0, 2941.0, 3128.0, 2507.0, 2600.0, 2152.0, 2790.0, 3240.0, 2769.0, 2720.0, 2392.0, 3522.0, 2236.0, 2883.0, 2397.0, 2586.0, 2219.0, 3154.0, 2799.0, 2803.0, 2345.0, 2563.0, 2171.0, 2874.0, 2837.0, 2656.0, 3389.0, 2526.0, 2817.0, 2625.0, 3000.0, 2814.0, 2754.0, 2414.0, 3081.0, 2380.0, 2876.0, 2737.0, 2780.0, 2271.0, 2333.0, 2839.0, 2519.0, 3210.0, 2404.0, 2291.0, 2433.0, 2383.0, 2435.0, 1919.0, 2351.0, 2585.0, 2779.0, 2221.0, 2014.0, 2114.0, 1881.0, 2304.0, 2397.0, 2309.0, 2239.0, 2116.0, 2239.0, 2377.0, 2323.0, 2496.0, 2298.0, 2773.0, 2696.0, 1952.0, 2435.0, 2042.0, 2813.0, 2452.0, 2068.0, 2032.0, 2127.0, 2176.0, 2056.0, 2569.0, 2495.0, 2156.0, 2202.0, 2372.0, 2368.0, 2313.0, 1956.0, 2287.0, 2471.0, 2251.0, 2132.0, 1626.0, 2076.0, 2288.0, 2009.0, 1987.0, 2433.0, 1651.0, 2033.0, 2061.0, 1927.0, 2837.0, 2589.0, 2063.0, 1738.0, 1964.0, 2334.0, 1899.0, 2516.0, 2136.0, 2214.0, 1965.0, 1875.0, 2415.0, 1921.0, 2352.0, 2174.0, 1887.0, 2165.0, 2616.0, 1911.0, 1825.0, 1959.0, 1908.0, 1822.0, 1574.0, 1545.0, 2160.0, 1942.0, 2081.0, 1733.0, 2008.0, 2010.0, 2212.0, 1875.0, 1390.0, 1972.0, 2540.0, 1825.0, 2152.0, 1632.0, 2232.0, 1792.0, 1887.0, 1971.0, 2046.0, 1779.0, 2139.0, 2024.0, 1999.0, 1614.0, 1985.0, 1902.0, 2128.0, 2445.0, 2671.0, 2214.0, 2029.0, 2081.0, 2209.0, 2226.0, 1957.0, 2210.0, 2419.0, 2685.0, 2294.0, 1932.0, 2118.0, 1963.0, 1818.0, 1841.0, 2149.0, 2110.0, 2155.0, 1868.0, 2220.0, 2120.0, 2379.0, 1886.0, 2361.0, 1763.0, 2055.0, 1972.0, 2155.0, 1934.0, 2167.0, 1959.0, 1882.0, 1705.0, 1826.0, 1964.0, 2224.0, 1818.0, 1883.0, 1743.0, 2488.0, 2393.0, 2103.0, 2005.0, 2728.0, 2142.0, 2054.0, 1951.0, 1819.0, 2038.0, 2170.0, 2265.0, 1808.0, 2431.0, 1807.0, 2184.0, 2053.0, 1687.0, 1931.0, 2549.0, 2587.0, 1986.0, 2273.0, 2103.0, 2063.0, 2204.0, 2021.0, 2110.0, 2428.0, 2484.0, 2060.0, 2244.0, 2025.0, 1999.0, 1965.0, 1906.0, 2137.0, 2024.0, 2234.0, 1998.0, 2022.0, 1943.0, 2254.0, 2008.0, 1619.0, 1850.0, 2446.0, 2316.0, 1952.0, 2008.0, 2201.0, 2018.0, 2191.0, 1856.0, 2363.0, 2138.0, 2632.0, 1897.0, 2331.0, 1915.0, 2017.0, 2347.0, 2073.0, 2221.0, 2341.0, 1910.0, 1944.0, 2197.0, 2136.0, 2140.0, 2057.0, 2254.0, 1992.0, 2377.0, 1829.0, 2323.0, 2256.0, 2248.0, 2664.0, 2091.0, 2351.0, 2363.0, 2417.0, 1953.0, 2010.0, 2111.0, 2082.0, 2141.0, 2449.0, 2394.0, 2165.0, 2019.0, 2307.0, 2446.0, 2932.0, 2123.0, 2428.0, 2294.0, 2499.0, 2597.0, 2391.0, 2142.0, 2085.0, 2112.0, 2498.0, 2172.0, 2546.0, 2086.0, 2278.0, 2000.0, 2060.0, 2222.0, 2327.0, 2377.0, 2181.0, 1943.0, 2370.0, 2170.0, 2277.0, 2360.0, 2822.0, 2306.0, 2709.0, 2210.0, 2127.0, 2321.0, 2202.0, 2780.0, 2249.0, 2312.0, 2033.0, 2114.0, 2287.0, 2292.0, 2301.0, 2735.0, 2674.0, 2246.0, 2584.0, 2280.0, 2624.0, 2634.0, 2653.0, 2502.0, 2748.0, 2256.0, 2492.0, 2276.0, 2217.0, 1995.0, 2408.0, 2306.0, 2584.0, 2373.0]}, "lm loss validation": {"start_step": 0, "end_step": 4, "step_interval": 5, "values": [5.62692]}, "lm loss validation vs samples": {"start_step": 0, "end_step": 4, "step_interval": 5, "values": [5.62692]}, "lm loss validation ppl": {"start_step": 0, "end_step": 4, "step_interval": 5, "values": [277.80627]}, "lm loss validation ppl vs samples": {"start_step": 0, "end_step": 4, "step_interval": 5, "values": [277.80627]}} \ No newline at end of file diff --git 
a/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp1_pp1_bf16_baseline/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp1_pp1_bf16_baseline/model_config.yaml new file mode 100644 index 0000000000..e28cc2ba9b --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp1_pp1_bf16_baseline/model_config.yaml @@ -0,0 +1,48 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: Tree + CUBLAS_WORKSPACE_CONFIG: :4096:8 +MODEL_ARGS: + --num-layers: 12 + --hidden-size: 512 + --num-attention-heads: 8 + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --tensorboard-dir: ${TENSORBOARD_PATH} + --micro-batch-size: 2 + --global-batch-size: 128 + --seq-length: 1024 + --max-position-embeddings: 1024 + --train-iters: 2000 + --timing-log-level: 2 + --lr-decay-iters: 320000 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --vocab-file: ${DATA_PATH}/bpe/vocab.json + --merge-file: ${DATA_PATH}/bpe/merges.txt + --split: 949,50,1 + --distributed-backend: nccl + --lr: 0.00015 + --lr-decay-style: cosine + --min-lr: 1.0e-5 + --weight-decay: 1e-2 + --clip-grad: 1.0 + --lr-warmup-fraction: .01 + --log-interval: 1 + --save-interval: 10000 + --eval-interval: 1000 + --eval-iters: 10 + --transformer-impl: local + --tensor-model-parallel-size: 1 + --pipeline-model-parallel-size: 1 + --deterministic-mode: true + --no-gradient-accumulation-fusion: true + --ckpt-format: torch_dist + --data-cache-path: ${DATA_CACHE_PATH} + --fp16: true + --apply-query-key-layer-scaling: true +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp1_pp1_fp8_no_model_parallel/golden_values_dev.json b/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp1_pp1_fp8_no_model_parallel/golden_values_dev.json new file mode 100644 index 0000000000..6a88c3a850 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp1_pp1_fp8_no_model_parallel/golden_values_dev.json @@ -0,0 +1 @@ +{"forward-backward-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [18.14133, 0.62524, 0.62888, 0.61879, 0.62017, 0.62262, 0.61644, 0.95648, 0.62134, 0.62122, 0.62167, 0.61736, 0.62014, 0.61878, 0.61905, 0.62285, 0.62143, 0.62527, 0.6188, 0.61821, 0.62092, 0.6146, 0.62538, 0.62186, 0.62062, 0.61709, 0.61586, 0.62211, 0.62113, 0.62256, 0.91616, 0.62172, 0.62082, 0.61854, 0.61851, 0.61865, 0.61838, 0.62057, 0.62054, 0.62121, 0.62279, 0.61565, 0.61588, 0.61809, 0.6285, 0.62159, 0.619, 0.62096, 0.6161, 0.61341, 0.61939, 0.61863, 0.61901, 0.69973, 0.62205, 0.6203, 0.62205, 0.61913, 0.61593, 0.61268, 0.62209, 0.62242, 0.62178, 0.61463, 0.61723, 0.61562, 0.62222, 0.61147, 0.61537, 0.61793, 0.61712, 0.61962, 0.62226, 0.73426, 0.61519, 0.61809, 0.62057, 0.72077, 0.62008, 0.6196, 0.61771, 0.61875, 0.61628, 0.61618, 0.61608, 0.61962, 0.61838, 0.61834, 0.61866, 0.62047, 0.61852, 0.61278, 0.61478, 0.61796, 0.61939, 0.61855, 0.61816, 0.61585, 0.72525, 0.61589, 0.71497, 0.61452, 0.61899, 0.61647, 0.61769, 0.61448, 0.6133, 0.6161, 0.61341, 0.61318, 0.61661, 0.61966, 0.61316, 0.61487, 0.61573, 0.61347, 0.61386, 0.61593, 0.61745, 0.6185, 0.61792, 0.61356, 0.61533, 0.61644, 0.70276, 0.61398, 0.6159, 0.61832, 0.61774, 0.61711, 0.61411, 0.61533, 0.62272, 
0.61709, 0.61557, 0.61705, 0.61893, 0.6177, 0.61888, 0.62207, 0.6181, 0.61501, 0.61758, 0.61994, 0.62402, 0.61667, 0.61599, 0.62131, 0.62011, 0.73481, 0.61752, 0.6206, 0.61654, 0.62124, 0.61775, 0.61832, 0.62597, 0.61901, 0.6153, 0.61393, 0.62147, 0.62628, 0.62091, 0.61689, 0.61436, 0.61683, 0.61743, 0.62116, 0.62033, 0.71198, 0.71973, 0.62179, 0.61968, 0.62104, 0.73504, 0.61833, 0.62098, 0.61898, 0.62766, 0.61917, 0.61475, 0.61706, 0.62025, 0.62046, 0.62146, 0.61796, 0.61756, 0.61818, 0.61889, 0.61869, 0.61959, 0.61761, 0.79997, 0.71316, 0.7092, 0.61693, 0.61553, 0.61793, 0.62191, 0.61846, 0.60521, 0.63066, 0.62491, 0.6225, 0.62102, 0.62456, 0.6247, 0.6269, 0.62537, 0.62411, 0.6231, 0.62397, 0.61873, 0.61766, 0.72647, 0.61878, 0.70741, 0.62227, 0.71605, 0.62022, 0.61781, 0.62597, 0.62427, 0.73275, 0.61764, 0.62069, 0.61913, 0.61957, 0.62075, 0.61693, 0.62163, 0.62496, 0.62065, 0.61855, 0.62534, 0.62563, 0.63027, 0.62765, 0.62046, 0.62782, 0.6225, 0.62116, 0.71019, 0.62081, 0.62867, 0.61875, 0.61378, 0.61727, 0.6238, 0.62162, 0.62088, 0.61962, 0.62082, 0.62352, 0.62164, 0.62001, 0.62139, 0.62, 0.62818, 0.6266, 0.63112, 0.62627, 0.62702, 0.62774, 0.62831, 0.62063, 0.71258, 0.62584, 0.63033, 0.62439, 0.62649, 0.61461, 0.6209, 0.61667, 0.62067, 0.61793, 0.61954, 0.61977, 0.622, 0.6288, 0.62767, 0.62589, 0.62912, 0.62368, 0.61631, 0.73714, 0.6313, 0.61624, 0.61414, 0.62482, 0.6265, 0.62661, 0.62057, 0.62063, 0.62436, 0.62886, 0.62643, 0.62055, 0.61891, 0.62228, 0.62509, 0.62152, 0.62371, 0.62145, 0.61596, 0.62278, 0.62635, 0.63114, 0.72659, 0.72093, 0.62818, 0.62831, 0.61965, 0.62825, 0.62531, 0.6239, 0.6269, 0.6223, 0.62369, 0.62215, 0.62376, 0.62336, 0.62681, 0.62299, 0.62046, 0.61497, 0.61616, 0.61762, 0.62291, 0.61731, 0.61644, 0.61524, 0.61842, 0.62286, 0.61327, 0.61596, 0.6185, 0.61983, 0.62272, 0.61746, 0.6207, 0.6179, 0.61849, 0.62196, 0.62408, 0.62953, 0.62672, 0.62606, 0.61511, 0.61549, 0.6159, 0.62334, 0.62662, 0.75567, 0.62523, 0.62516, 0.62916, 0.62575, 0.62292, 0.62685, 0.62432, 0.62244, 0.61921, 0.61816, 0.61641, 0.61968, 0.62202, 0.6208, 0.6193, 0.61995, 0.62245, 0.61844, 0.61724, 0.61904, 0.61874, 0.62205, 0.6161, 0.61772, 0.70649, 0.62431, 0.61921, 0.62093, 0.61887, 0.62189, 0.62184, 0.62081, 0.62021, 0.62093, 0.62086, 0.62164, 0.6235, 0.61872, 0.62062, 0.61908, 0.62491, 0.62732, 0.62504, 0.61899, 0.62006, 0.6215]}, "forward-compute-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [9.27215, 0.36134, 0.36093, 0.35232, 0.35362, 0.35668, 0.35229, 0.68753, 0.35087, 0.35407, 0.35147, 0.35356, 0.35146, 0.35384, 0.35274, 0.35595, 0.35404, 0.35262, 0.35078, 0.34962, 0.35338, 0.34834, 0.35424, 0.35549, 0.35524, 0.34948, 0.35114, 0.35465, 0.35306, 0.35417, 0.64338, 0.35253, 0.35038, 0.34824, 0.3516, 0.35295, 0.35334, 0.3507, 0.3518, 0.35354, 0.35258, 0.3508, 0.35045, 0.35367, 0.35832, 0.35222, 0.35029, 0.35265, 0.35179, 0.34702, 0.35321, 0.35445, 0.35177, 0.43752, 0.35531, 0.35287, 0.3529, 0.34925, 0.35154, 0.34648, 0.34908, 0.35314, 0.34798, 0.3481, 0.35014, 0.35038, 0.35008, 0.34793, 0.34843, 0.35226, 0.35123, 0.34921, 0.351, 0.46524, 0.34642, 0.35022, 0.34926, 0.45533, 0.35075, 0.35197, 0.34952, 0.35294, 0.35156, 0.35367, 0.35231, 0.35148, 0.34881, 0.34904, 0.35192, 0.35269, 0.35151, 0.34592, 0.34953, 0.35046, 0.35109, 0.35197, 0.35201, 0.34972, 0.45764, 0.34845, 0.44993, 0.34761, 0.35227, 0.34673, 0.35005, 0.34603, 0.34781, 0.34961, 0.34726, 0.3482, 0.3514, 0.35199, 0.34526, 0.3478, 0.35064, 0.34875, 0.35162, 0.34733, 0.3494, 0.34825, 0.35136, 0.34918, 
0.34966, 0.34867, 0.43767, 0.34863, 0.35097, 0.35094, 0.34677, 0.35081, 0.35072, 0.35015, 0.35172, 0.35213, 0.34826, 0.34865, 0.35048, 0.3496, 0.34911, 0.35588, 0.35342, 0.35191, 0.35141, 0.35102, 0.35709, 0.34876, 0.34872, 0.35106, 0.35322, 0.46707, 0.35188, 0.35176, 0.35, 0.35379, 0.3509, 0.35081, 0.3551, 0.35093, 0.34933, 0.34848, 0.35167, 0.35398, 0.34723, 0.34792, 0.34845, 0.34775, 0.35079, 0.34957, 0.35345, 0.44501, 0.45138, 0.34891, 0.35082, 0.3502, 0.46589, 0.35255, 0.35187, 0.35127, 0.35483, 0.35059, 0.34896, 0.34861, 0.35247, 0.35179, 0.34935, 0.35234, 0.34933, 0.35334, 0.34686, 0.35171, 0.35547, 0.35168, 0.52709, 0.44719, 0.44161, 0.34936, 0.34954, 0.35313, 0.34988, 0.35211, 0.33688, 0.35591, 0.3569, 0.35308, 0.35372, 0.35241, 0.35314, 0.35633, 0.353, 0.35616, 0.35467, 0.35273, 0.3514, 0.35129, 0.45541, 0.3499, 0.44221, 0.35081, 0.44665, 0.35109, 0.35024, 0.35427, 0.35423, 0.46289, 0.34881, 0.35173, 0.34964, 0.35399, 0.35206, 0.35147, 0.35326, 0.35451, 0.35111, 0.35112, 0.35937, 0.35913, 0.36067, 0.35939, 0.35289, 0.35237, 0.34936, 0.35284, 0.44138, 0.35073, 0.35858, 0.35425, 0.34953, 0.35087, 0.35453, 0.35091, 0.35251, 0.34904, 0.35282, 0.35193, 0.35492, 0.35161, 0.35115, 0.35118, 0.36151, 0.35849, 0.36407, 0.35821, 0.36041, 0.35561, 0.36252, 0.35429, 0.44699, 0.36096, 0.36201, 0.35407, 0.35747, 0.35035, 0.35103, 0.34874, 0.35637, 0.3524, 0.35102, 0.35202, 0.35462, 0.35968, 0.35397, 0.35259, 0.35547, 0.35321, 0.35018, 0.46643, 0.3583, 0.35092, 0.34697, 0.3538, 0.35589, 0.35223, 0.35164, 0.35261, 0.35967, 0.36013, 0.35806, 0.35023, 0.35024, 0.3526, 0.34984, 0.35259, 0.35298, 0.35284, 0.35138, 0.35036, 0.35288, 0.35847, 0.45332, 0.44559, 0.35561, 0.35336, 0.3521, 0.35312, 0.35227, 0.35234, 0.35359, 0.35468, 0.35224, 0.35204, 0.35651, 0.35583, 0.35358, 0.35435, 0.35427, 0.3497, 0.35079, 0.35172, 0.35517, 0.35178, 0.35126, 0.34889, 0.35033, 0.35332, 0.34892, 0.35261, 0.35094, 0.35215, 0.35764, 0.35341, 0.35384, 0.35265, 0.35263, 0.35262, 0.35604, 0.36288, 0.35642, 0.35552, 0.3484, 0.34851, 0.3514, 0.36023, 0.35789, 0.48902, 0.36035, 0.36141, 0.3626, 0.35908, 0.35622, 0.35631, 0.35269, 0.35075, 0.35039, 0.35096, 0.35039, 0.34953, 0.35289, 0.34822, 0.35154, 0.35088, 0.35383, 0.35072, 0.34872, 0.34826, 0.34902, 0.35267, 0.34801, 0.34971, 0.43955, 0.35085, 0.34994, 0.35373, 0.34855, 0.3492, 0.35231, 0.34725, 0.35003, 0.3473, 0.35104, 0.34755, 0.34992, 0.35186, 0.35388, 0.35074, 0.34993, 0.35194, 0.35167, 0.34626, 0.35392, 0.35198]}, "backward-compute-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [2.80897, 0.28475, 0.2809, 0.27885, 0.27971, 0.2768, 0.27791, 0.2813, 0.2828, 0.27982, 0.28277, 0.27676, 0.28261, 0.27806, 0.28033, 0.2756, 0.28082, 0.27955, 0.28018, 0.27766, 0.27802, 0.27721, 0.28203, 0.27953, 0.27943, 0.27922, 0.27814, 0.28056, 0.28107, 0.27624, 0.28037, 0.28169, 0.2828, 0.28312, 0.28074, 0.27837, 0.27679, 0.28303, 0.2829, 0.28043, 0.27823, 0.27266, 0.27336, 0.27459, 0.28023, 0.27652, 0.27746, 0.2779, 0.27563, 0.27401, 0.27717, 0.27499, 0.27806, 0.27139, 0.27365, 0.27659, 0.28082, 0.28038, 0.27531, 0.27517, 0.28057, 0.27667, 0.28628, 0.27883, 0.27588, 0.27536, 0.27984, 0.2729, 0.27334, 0.27425, 0.27422, 0.27613, 0.27623, 0.2746, 0.27458, 0.27341, 0.27807, 0.27236, 0.27663, 0.27538, 0.27514, 0.27306, 0.2725, 0.27083, 0.27026, 0.27509, 0.27586, 0.27515, 0.27392, 0.27389, 0.27372, 0.2727, 0.27096, 0.27354, 0.27409, 0.27274, 0.27274, 0.27361, 0.27352, 0.27457, 0.27411, 0.27589, 0.27459, 0.27704, 0.27375, 0.27488, 0.27373, 0.27473, 0.27336, 0.27408, 
0.27412, 0.27621, 0.27573, 0.2757, 0.27319, 0.27286, 0.27081, 0.27628, 0.27632, 0.27773, 0.27459, 0.27302, 0.27391, 0.27706, 0.27302, 0.27235, 0.2728, 0.27422, 0.27771, 0.27408, 0.273, 0.27313, 0.27881, 0.2727, 0.27535, 0.27554, 0.27602, 0.27445, 0.27748, 0.27334, 0.27196, 0.27246, 0.27334, 0.2765, 0.27324, 0.27646, 0.27446, 0.27758, 0.27638, 0.2749, 0.27379, 0.27822, 0.27586, 0.27434, 0.27452, 0.2751, 0.27681, 0.27448, 0.27334, 0.27477, 0.27831, 0.27967, 0.28117, 0.27795, 0.27331, 0.27527, 0.27361, 0.27892, 0.27512, 0.27366, 0.27646, 0.27988, 0.27713, 0.27762, 0.27574, 0.27463, 0.27934, 0.27654, 0.28122, 0.27818, 0.27487, 0.27565, 0.27548, 0.27639, 0.27869, 0.27377, 0.27686, 0.2737, 0.27871, 0.27425, 0.27333, 0.27386, 0.27879, 0.2752, 0.27707, 0.27628, 0.27433, 0.27416, 0.28211, 0.27328, 0.27772, 0.2888, 0.28238, 0.28559, 0.28328, 0.28926, 0.29069, 0.28744, 0.28541, 0.28383, 0.28569, 0.28878, 0.28294, 0.28177, 0.28457, 0.28391, 0.27915, 0.28556, 0.28795, 0.28723, 0.28157, 0.28876, 0.288, 0.28233, 0.28245, 0.28563, 0.28586, 0.27943, 0.28324, 0.27971, 0.28335, 0.28509, 0.28373, 0.28221, 0.27996, 0.2821, 0.28282, 0.28146, 0.2827, 0.29287, 0.28819, 0.28375, 0.28224, 0.28618, 0.28593, 0.27803, 0.2775, 0.27939, 0.28305, 0.28516, 0.28387, 0.28394, 0.27989, 0.28606, 0.28244, 0.28311, 0.2822, 0.28452, 0.28083, 0.28371, 0.27966, 0.28404, 0.27905, 0.28671, 0.28017, 0.28042, 0.27826, 0.27799, 0.28104, 0.28485, 0.2833, 0.27803, 0.28505, 0.28078, 0.27731, 0.27811, 0.2825, 0.2845, 0.28366, 0.28285, 0.29128, 0.28986, 0.28737, 0.28519, 0.28008, 0.28508, 0.29026, 0.27934, 0.27842, 0.28735, 0.28334, 0.29041, 0.28444, 0.28192, 0.27975, 0.28248, 0.28157, 0.28471, 0.28418, 0.28337, 0.29038, 0.28525, 0.28937, 0.28336, 0.28092, 0.28765, 0.2938, 0.28931, 0.28955, 0.29117, 0.29147, 0.29048, 0.28242, 0.29224, 0.28996, 0.28762, 0.28995, 0.28361, 0.28955, 0.28314, 0.28125, 0.28279, 0.28923, 0.28566, 0.28096, 0.27889, 0.27987, 0.28102, 0.28378, 0.27825, 0.27822, 0.28139, 0.28151, 0.284, 0.28038, 0.27763, 0.28234, 0.28237, 0.27877, 0.27839, 0.28213, 0.27969, 0.27977, 0.28461, 0.28193, 0.28295, 0.28539, 0.28439, 0.28043, 0.28021, 0.27978, 0.27678, 0.28057, 0.28152, 0.27875, 0.27736, 0.28042, 0.28071, 0.27701, 0.28009, 0.28081, 0.28054, 0.27846, 0.27695, 0.27435, 0.28018, 0.27863, 0.2831, 0.27711, 0.27774, 0.27798, 0.27776, 0.27805, 0.27924, 0.27943, 0.27863, 0.27639, 0.27628, 0.27471, 0.28218, 0.2775, 0.27692, 0.28008, 0.28228, 0.27856, 0.28233, 0.27871, 0.28388, 0.27878, 0.2831, 0.28268, 0.27716, 0.2756, 0.27712, 0.28343, 0.28463, 0.28241, 0.28327, 0.27551, 0.27892]}, "batch-generator-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.62041, 0.00418, 0.00386, 0.00419, 0.00438, 0.0044, 0.00464, 0.00467, 0.00468, 0.00448, 0.00443, 0.00436, 0.00461, 0.00452, 0.00471, 0.00475, 0.00426, 0.00443, 0.00451, 0.00448, 0.00454, 0.00422, 0.00444, 0.00458, 0.00446, 0.00447, 0.00432, 0.00458, 0.00459, 0.00455, 0.00456, 0.0044, 0.00451, 0.00445, 0.00465, 0.00435, 0.00439, 0.00431, 0.00431, 0.00453, 0.0045, 0.00449, 0.00456, 0.00437, 0.00432, 0.0043, 0.00442, 0.0045, 0.0042, 0.00427, 0.0045, 0.00438, 0.00447, 0.00452, 0.0046, 0.00429, 0.00439, 0.00441, 0.00462, 0.00448, 0.00409, 0.00434, 0.00448, 0.0042, 0.00454, 0.00422, 0.00431, 0.00413, 0.00439, 0.00414, 0.00456, 0.00464, 0.00426, 0.00434, 0.00414, 0.00453, 0.00423, 0.00453, 0.00431, 0.00403, 0.00414, 0.0043, 0.00446, 0.00423, 0.00437, 0.00434, 0.00419, 0.0042, 0.00433, 0.00435, 0.00443, 0.00408, 0.00416, 0.00451, 0.00443, 0.00435, 0.00446, 0.00421, 
0.00467, 0.00454, 0.00431, 0.00462, 0.00433, 0.00426, 0.00437, 0.00437, 0.00433, 0.00435, 0.00426, 0.00413, 0.00435, 0.00422, 0.00431, 0.00432, 0.0043, 0.00408, 0.00435, 0.00438, 0.00439, 0.00426, 0.00438, 0.00432, 0.00449, 0.00423, 0.00444, 0.00436, 0.00417, 0.00424, 0.0042, 0.00428, 0.00425, 0.00425, 0.0042, 0.00445, 0.0043, 0.00429, 0.00441, 0.0043, 0.00412, 0.00429, 0.0042, 0.00419, 0.0042, 0.00427, 0.00427, 0.00418, 0.00464, 0.00406, 0.00435, 0.0046, 0.0043, 0.00438, 0.00417, 0.00427, 0.0044, 0.00444, 0.0045, 0.00407, 0.00421, 0.00403, 0.00442, 0.00418, 0.00425, 0.00425, 0.00434, 0.00422, 0.00432, 0.00446, 0.00435, 0.00452, 0.00428, 0.00408, 0.00445, 0.00414, 0.00441, 0.00412, 0.00434, 0.00445, 0.00425, 0.00412, 0.00432, 0.00441, 0.00432, 0.00422, 0.00429, 0.00407, 0.00434, 0.00448, 0.00434, 0.00434, 0.00423, 0.00422, 0.0046, 0.00418, 0.00445, 0.00432, 0.00422, 0.00418, 0.00408, 0.00434, 0.03441, 0.00493, 0.00506, 0.00555, 0.00518, 0.00512, 0.00537, 0.00513, 0.00501, 0.00506, 0.00504, 0.00473, 0.00488, 0.00523, 0.00528, 0.00511, 0.00526, 0.00496, 0.00546, 0.00512, 0.0054, 0.00539, 0.00514, 0.00484, 0.00515, 0.00531, 0.00515, 0.00498, 0.00509, 0.0051, 0.00516, 0.00496, 0.00494, 0.00501, 0.00511, 0.00536, 0.00517, 0.00549, 0.00531, 0.00526, 0.00531, 0.00497, 0.00498, 0.00524, 0.00486, 0.00502, 0.00497, 0.00491, 0.00509, 0.00466, 0.00519, 0.00528, 0.00486, 0.00509, 0.0049, 0.005, 0.00508, 0.005, 0.00503, 0.00473, 0.00536, 0.00516, 0.00549, 0.00528, 0.00506, 0.00513, 0.00501, 0.00563, 0.00498, 0.00498, 0.0051, 0.00528, 0.00509, 0.005, 0.00495, 0.00509, 0.00508, 0.00485, 0.00479, 0.00485, 0.00507, 0.00499, 0.00463, 0.00497, 0.00487, 0.00529, 0.00518, 0.00483, 0.00513, 0.0051, 0.005, 0.005, 0.00514, 0.00496, 0.00492, 0.00547, 0.00506, 0.00502, 0.00481, 0.0051, 0.00498, 0.0051, 0.00475, 0.00498, 0.0048, 0.00528, 0.00523, 0.0053, 0.00561, 0.00522, 0.00517, 0.00528, 0.00505, 0.00511, 0.00538, 0.00531, 0.00528, 0.00554, 0.00534, 0.00512, 0.00541, 0.00533, 0.00508, 0.00518, 0.00519, 0.00548, 0.00545, 0.00554, 0.0052, 0.00506, 0.00513, 0.00502, 0.00523, 0.00513, 0.00478, 0.00487, 0.00503, 0.00512, 0.0051, 0.00529, 0.005, 0.00521, 0.00528, 0.00511, 0.00522, 0.00513, 0.00533, 0.00502, 0.0053, 0.00492, 0.00522, 0.00496, 0.00488, 0.00513, 0.00506, 0.00519, 0.00508, 0.00521, 0.00442, 0.00409, 0.00426, 0.0043, 0.00418, 0.00428, 0.00456, 0.00443, 0.00422, 0.00426, 0.0043, 0.00429, 0.00435, 0.00446, 0.0044, 0.00447, 0.00444, 0.0043, 0.0042, 0.00438, 0.00422, 0.00429, 0.00463, 0.00435, 0.00431, 0.00447, 0.00431, 0.00441, 0.00417, 0.00425, 0.0044, 0.00438, 0.00438, 0.00439, 0.00447, 0.00402, 0.00423, 0.00447, 0.00451, 0.00457, 0.00458, 0.00426]}, "layernorm-grads-all-reduce-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [3e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 2e-05, 2e-05, 1e-05, 1e-05, 2e-05, 2e-05, 1e-05, 2e-05, 2e-05, 1e-05, 1e-05, 2e-05, 1e-05, 2e-05, 2e-05, 2e-05, 1e-05, 2e-05, 1e-05, 2e-05, 2e-05, 1e-05, 1e-05, 2e-05, 2e-05, 2e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 2e-05, 2e-05, 2e-05, 1e-05, 1e-05, 2e-05, 2e-05, 1e-05, 2e-05, 2e-05, 1e-05, 2e-05, 1e-05, 2e-05, 1e-05, 1e-05, 3e-05, 2e-05, 2e-05, 2e-05, 2e-05, 1e-05, 2e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 2e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 2e-05, 2e-05, 2e-05, 1e-05, 1e-05, 2e-05, 1e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 1e-05, 2e-05, 2e-05, 2e-05, 2e-05, 1e-05, 2e-05, 1e-05, 1e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 1e-05, 2e-05, 1e-05, 2e-05, 1e-05, 2e-05, 
1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 2e-05, 2e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 2e-05, 2e-05, 1e-05, 2e-05, 2e-05, 1e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 1e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 1e-05, 2e-05, 2e-05, 2e-05, 1e-05, 1e-05, 2e-05, 2e-05, 1e-05, 1e-05, 1e-05, 2e-05, 2e-05, 2e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 1e-05, 2e-05, 2e-05, 2e-05, 2e-05, 1e-05, 2e-05, 2e-05, 2e-05, 2e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 2e-05, 1e-05, 1e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 1e-05, 1e-05, 2e-05, 2e-05, 2e-05, 1e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 1e-05, 1e-05, 2e-05, 1e-05, 2e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 2e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 2e-05, 2e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 2e-05, 1e-05, 2e-05, 1e-05, 1e-05, 2e-05, 2e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 2e-05, 1e-05, 2e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 2e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 2e-05, 2e-05, 1e-05, 1e-05, 2e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 2e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 2e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05]}, "embedding-grads-all-reduce-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [4e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 3e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 3e-05, 3e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 3e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 3e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 3e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 3e-05, 2e-05, 2e-05, 3e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 3e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 3e-05, 2e-05, 2e-05, 2e-05, 3e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 3e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 3e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 
2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 3e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 3e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 3e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 3e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05]}, "all-grads-sync-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [6.22336, 0.00298, 0.00292, 0.00297, 0.0029, 0.00289, 0.00306, 0.00314, 0.00321, 0.003, 0.00296, 0.00297, 0.00294, 0.00288, 0.00301, 0.00324, 0.00323, 0.00298, 0.00292, 0.00298, 0.00295, 0.0029, 0.00308, 0.00319, 0.00324, 0.00299, 0.00292, 0.00301, 0.00293, 0.00291, 0.00326, 0.00322, 0.00323, 0.0029, 0.00293, 0.003, 0.00291, 0.00287, 0.00303, 0.0032, 0.00322, 0.00298, 0.00294, 0.00295, 0.00296, 0.0029, 0.00305, 0.00322, 0.00321, 0.003, 0.00295, 0.00299, 0.00295, 0.00292, 0.00306, 0.00323, 0.0032, 0.00298, 0.00291, 0.00297, 0.00296, 0.00287, 0.00304, 0.00322, 0.0032, 0.00299, 0.00296, 0.00297, 0.00296, 0.00291, 0.00308, 0.00321, 0.00326, 0.00301, 0.00294, 0.00292, 0.00295, 0.00287, 0.00307, 0.00321, 0.00318, 0.00296, 0.00285, 0.00302, 0.00297, 0.00291, 0.003, 0.00323, 0.0032, 0.003, 0.00292, 0.00294, 0.00297, 0.00285, 0.00306, 0.00318, 0.00314, 0.003, 0.00289, 0.00296, 0.00296, 0.00288, 0.00307, 0.00321, 0.00321, 0.00301, 0.00289, 0.00297, 0.00297, 0.0029, 0.00298, 0.00323, 0.00321, 0.003, 0.00289, 0.00287, 0.00295, 0.00292, 0.00302, 0.00323, 0.00323, 0.003, 0.00292, 0.00291, 0.00298, 0.00286, 0.00306, 0.00321, 0.00322, 0.00302, 0.00289, 0.00293, 0.00286, 0.00288, 0.00306, 0.00322, 0.00319, 0.00295, 0.00285, 0.00297, 0.00295, 0.00289, 0.00305, 0.0032, 0.00324, 0.00298, 0.00291, 0.00297, 0.00289, 0.00289, 0.00304, 0.0032, 0.00314, 0.003, 0.00289, 0.00297, 0.00295, 0.00288, 0.00301, 0.00317, 0.00314, 0.003, 0.00291, 0.00299, 0.00296, 0.0029, 0.00306, 0.00324, 0.00319, 0.00301, 0.0029, 0.00296, 0.00296, 0.0029, 0.00306, 0.00319, 0.0032, 0.003, 0.00285, 0.00298, 0.00296, 0.00281, 0.00305, 0.00318, 0.00322, 0.00297, 0.00291, 0.00299, 0.00294, 0.00292, 0.00307, 0.00323, 0.00324, 0.00299, 0.0029, 0.00299, 0.00295, 0.0029, 0.00305, 0.00319, 0.0029, 0.00305, 0.00311, 0.00325, 0.00324, 0.00308, 0.00284, 0.00305, 0.00295, 0.00305, 0.003, 0.00324, 0.0032, 0.00306, 0.00286, 0.00306, 0.00294, 0.00305, 0.0031, 0.00318, 0.00323, 0.00308, 0.00288, 0.00306, 0.00297, 0.00304, 0.00309, 0.00321, 0.00322, 0.00308, 0.00287, 0.00299, 0.00294, 0.00304, 0.00311, 0.00324, 
0.00325, 0.00304, 0.00281, 0.00302, 0.00293, 0.00307, 0.0031, 0.00323, 0.00319, 0.00306, 0.00286, 0.00306, 0.00291, 0.00305, 0.00311, 0.00314, 0.00323, 0.00303, 0.00285, 0.00298, 0.00294, 0.00302, 0.00307, 0.00322, 0.00318, 0.00303, 0.00287, 0.00303, 0.00294, 0.00301, 0.00322, 0.00321, 0.00326, 0.00304, 0.00288, 0.00305, 0.00292, 0.00304, 0.00303, 0.00323, 0.00323, 0.00307, 0.00289, 0.003, 0.00295, 0.00298, 0.00307, 0.00328, 0.00312, 0.00307, 0.00289, 0.00303, 0.00294, 0.00306, 0.00309, 0.00324, 0.0032, 0.00306, 0.0029, 0.00306, 0.00294, 0.00301, 0.00301, 0.00322, 0.00321, 0.00306, 0.00289, 0.00304, 0.00293, 0.00303, 0.00312, 0.00322, 0.00325, 0.00305, 0.00286, 0.00306, 0.00293, 0.00304, 0.0031, 0.00325, 0.00326, 0.00306, 0.00287, 0.00305, 0.00296, 0.00307, 0.00314, 0.00315, 0.00323, 0.00307, 0.00288, 0.00293, 0.0029, 0.00303, 0.00304, 0.00325, 0.00322, 0.00304, 0.0028, 0.00304, 0.00292, 0.00305, 0.00308, 0.00323, 0.00323, 0.00307, 0.00289, 0.00304, 0.00294, 0.00305, 0.00311, 0.00321, 0.00322, 0.00303, 0.00281, 0.00304, 0.00296, 0.003, 0.0031, 0.00322, 0.00314, 0.00301, 0.00281, 0.00298, 0.00288, 0.00303, 0.00307, 0.00321, 0.0032, 0.00301, 0.00281, 0.00303, 0.00288, 0.00301, 0.00309, 0.00316, 0.00319, 0.00302, 0.00284, 0.00306, 0.00292, 0.003, 0.00328, 0.00321, 0.0032, 0.00301, 0.00285, 0.00297, 0.00284, 0.003, 0.003, 0.00318, 0.00319, 0.00301, 0.00281, 0.00303, 0.00289, 0.003, 0.00305, 0.00315, 0.00308, 0.00303, 0.00279, 0.00299]}, "optimizer-copy-to-main-grad-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.0004, 0.00019, 0.00027, 0.00027, 0.00028, 0.00028, 0.00028, 0.00028, 0.00028, 0.00028, 0.00027, 0.00027, 0.00028, 0.00028, 0.00028, 0.00028, 0.00028, 0.00028, 0.00028, 0.00028, 0.00026, 0.00027, 0.00028, 0.00028, 0.00029, 0.00028, 0.00028, 0.00028, 0.00028, 0.00028, 0.00031, 0.00028, 0.00028, 0.00028, 0.00028, 0.00028, 0.00028, 0.00029, 0.00029, 0.00029, 0.00029, 0.00027, 0.00029, 0.00027, 0.00028, 0.00028, 0.00028, 0.00029, 0.00027, 0.00027, 0.00029, 0.00028, 0.0003, 0.00028, 0.00028, 0.00028, 0.00028, 0.00029, 0.00028, 0.00028, 0.00028, 0.00028, 0.00027, 0.00028, 0.00028, 0.00028, 0.00028, 0.00027, 0.00028, 0.00028, 0.00028, 0.00027, 0.00028, 0.00027, 0.00028, 0.00026, 0.00026, 0.00026, 0.00026, 0.00026, 0.00026, 0.00027, 0.00027, 0.00025, 0.00025, 0.00027, 0.00028, 0.00027, 0.00028, 0.00026, 0.00026, 0.00025, 0.00026, 0.00026, 0.00028, 0.00025, 0.00028, 0.00027, 0.00026, 0.00028, 0.00028, 0.00028, 0.00028, 0.00027, 0.00026, 0.00028, 0.00029, 0.00028, 0.00028, 0.00028, 0.00027, 0.00027, 0.00028, 0.00028, 0.00028, 0.00027, 0.00028, 0.00027, 0.00028, 0.00027, 0.00027, 0.00028, 0.00028, 0.00028, 0.00028, 0.00029, 0.00027, 0.00028, 0.00028, 0.00028, 0.00027, 0.00027, 0.00027, 0.00027, 0.00027, 0.00028, 0.00028, 0.00028, 0.00028, 0.00028, 0.00028, 0.00028, 0.00028, 0.00028, 0.00027, 0.00027, 0.00027, 0.00028, 0.00027, 0.00027, 0.00027, 0.00028, 0.00029, 0.00028, 0.00028, 0.00027, 0.00028, 0.00027, 0.00028, 0.00027, 0.00028, 0.00028, 0.00028, 0.00028, 0.00028, 0.00027, 0.00028, 0.00027, 0.00028, 0.00028, 0.00029, 0.00027, 0.00028, 0.00027, 0.00027, 0.00029, 0.00028, 0.00028, 0.00027, 0.00028, 0.00028, 0.00027, 0.00028, 0.00029, 0.00028, 0.00028, 0.00028, 0.00028, 0.00028, 0.00028, 0.00029, 0.00028, 0.00028, 0.00029, 0.00028, 0.00028, 0.00029, 0.00028, 0.00028, 0.00028, 0.00028, 0.00027, 0.00026, 0.00026, 0.00026, 0.00025, 0.00025, 0.00025, 0.00026, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00026, 0.00027, 0.00025, 0.00025, 0.00026, 0.00026, 
0.00025, 0.00027, 0.00025, 0.00026, 0.00025, 0.00025, 0.00025, 0.00026, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00026, 0.00025, 0.00025, 0.00026, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00027, 0.00025, 0.00025, 0.00025, 0.00027, 0.00027, 0.00025, 0.00025, 0.00025, 0.00026, 0.00026, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00027, 0.00027, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00026, 0.00025, 0.00025, 0.00025, 0.00026, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00026, 0.00025, 0.00025, 0.00027, 0.00026, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00026, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00026, 0.00026, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00026, 0.00025, 0.00025, 0.00026, 0.00025, 0.00025, 0.00024, 0.00025, 0.00025, 0.00026, 0.00025, 0.00026, 0.00025, 0.00025, 0.00025, 0.00026, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00026, 0.00025, 0.00025, 0.00026, 0.00027, 0.00029, 0.00027, 0.00027, 0.00028, 0.00027, 0.00028, 0.00028, 0.00029, 0.00028, 0.00027, 0.00027, 0.00027, 0.00027, 0.00028, 0.00027, 0.00027, 0.00028, 0.00028, 0.00028, 0.00028, 0.00028, 0.00028, 0.00025, 0.00027, 0.00025, 0.00027, 0.00028, 0.00027, 0.00027, 0.00027, 0.00027, 0.00028, 0.00027, 0.00028, 0.00027, 0.00027, 0.00027, 0.00027]}, "optimizer-clip-main-grad-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.6202, 0.00104, 0.00121, 0.00115, 0.00122, 0.00121, 0.00123, 0.00124, 0.00122, 0.00123, 0.00125, 0.00122, 0.00121, 0.0012, 0.00122, 0.00127, 0.00121, 0.00123, 0.0012, 0.00123, 0.00121, 0.00116, 0.00125, 0.00122, 0.00122, 0.00124, 0.00122, 0.00123, 0.0012, 0.00122, 0.00125, 0.00122, 0.00126, 0.0012, 0.00122, 0.00123, 0.00121, 0.00127, 0.00121, 0.00121, 0.00121, 0.00121, 0.00123, 0.00122, 0.00123, 0.00124, 0.00121, 0.0012, 0.00122, 0.00119, 0.00121, 0.00122, 0.00137, 0.00122, 0.00121, 0.00123, 0.0012, 0.00126, 0.00121, 0.00122, 0.00122, 0.00129, 0.00122, 0.00122, 0.00122, 0.00123, 0.00125, 0.00125, 0.00124, 0.00122, 0.00123, 0.0013, 0.00124, 0.00121, 0.00123, 0.00118, 0.00123, 0.00121, 0.00123, 0.00118, 0.00118, 0.00118, 0.00119, 0.00119, 0.00119, 0.00121, 0.00121, 0.00122, 0.00121, 0.00123, 0.00123, 0.0012, 0.00128, 0.00117, 0.00122, 0.00123, 0.00124, 0.00121, 0.00118, 0.00119, 0.00121, 0.00122, 0.00121, 0.0012, 0.00118, 0.00124, 0.00122, 0.0012, 0.00125, 0.0012, 0.00121, 0.00101, 0.0012, 0.00121, 0.00124, 0.00123, 0.00123, 0.00123, 0.00122, 0.001, 0.00122, 0.00121, 0.001, 0.00125, 0.00122, 0.00121, 0.00124, 0.00121, 0.00121, 0.00099, 0.0012, 0.00125, 0.00121, 0.001, 0.0012, 0.00122, 0.00122, 0.00122, 0.0013, 0.00097, 0.00124, 0.00122, 0.00125, 0.00121, 0.0012, 0.0012, 0.00121, 0.00123, 0.0012, 0.0012, 0.00121, 0.00125, 0.00135, 0.00122, 0.00122, 0.00123, 0.00124, 0.00121, 0.00122, 0.0012, 0.0013, 0.00122, 0.00124, 0.001, 0.00123, 0.00121, 0.00121, 0.00126, 0.00124, 0.00129, 0.00129, 0.00124, 0.00121, 0.00119, 0.0012, 0.00123, 0.00123, 0.00127, 0.00122, 0.00122, 0.0012, 0.00121, 0.00128, 0.0012, 0.00125, 0.00124, 0.00121, 0.00123, 0.00121, 0.00132, 0.00122, 0.00121, 0.0012, 0.00122, 0.00123, 0.00123, 0.00121, 0.0012, 0.00122, 0.00123, 0.0012, 0.00123, 0.0012, 0.00118, 0.00118, 0.00121, 0.00124, 0.0012, 
0.00121, 0.00121, 0.00119, 0.00119, 0.0012, 0.0012, 0.0012, 0.00118, 0.00126, 0.00121, 0.00118, 0.0012, 0.00117, 0.00119, 0.00121, 0.00118, 0.00119, 0.00122, 0.0012, 0.0012, 0.00126, 0.00121, 0.00128, 0.00107, 0.00115, 0.00121, 0.00119, 0.00119, 0.00116, 0.00118, 0.0012, 0.00121, 0.00119, 0.0012, 0.0012, 0.0012, 0.00116, 0.00121, 0.0012, 0.00116, 0.00121, 0.00113, 0.00119, 0.00127, 0.0012, 0.00119, 0.00118, 0.00119, 0.0012, 0.00121, 0.00119, 0.00118, 0.00119, 0.0012, 0.00119, 0.0012, 0.0012, 0.00127, 0.00122, 0.0012, 0.00118, 0.00118, 0.00121, 0.00118, 0.00123, 0.00119, 0.00122, 0.00116, 0.0012, 0.00118, 0.0012, 0.00122, 0.00122, 0.00121, 0.00117, 0.00121, 0.00117, 0.0012, 0.00118, 0.00119, 0.00122, 0.00118, 0.00125, 0.00119, 0.00121, 0.00118, 0.00133, 0.00119, 0.00119, 0.00119, 0.0012, 0.00128, 0.00121, 0.00122, 0.0012, 0.00123, 0.00115, 0.00118, 0.0012, 0.00122, 0.00119, 0.00122, 0.00121, 0.00119, 0.00126, 0.0012, 0.0012, 0.00118, 0.00116, 0.00119, 0.00118, 0.00121, 0.00119, 0.00125, 0.00122, 0.00119, 0.00116, 0.00117, 0.00119, 0.0012, 0.0012, 0.00117, 0.00118, 0.0012, 0.00124, 0.00122, 0.0012, 0.00118, 0.0012, 0.00119, 0.0012, 0.00118, 0.00119, 0.00121, 0.00119, 0.00119, 0.00121, 0.00118, 0.00126, 0.00118, 0.0012, 0.00119, 0.00117, 0.0012, 0.00118, 0.0012, 0.00119, 0.0012, 0.00119, 0.00125, 0.00117, 0.00123, 0.00118, 0.00122, 0.00122, 0.00122, 0.00117, 0.00123, 0.00122, 0.00121, 0.00121, 0.0012, 0.00121, 0.00128, 0.00123, 0.00116, 0.0012, 0.00123, 0.00123, 0.00116, 0.00123, 0.00121, 0.0012, 0.00121, 0.00122, 0.00124, 0.00128, 0.00122, 0.00117, 0.00123, 0.00124, 0.00122, 0.00118, 0.0012, 0.00117, 0.00125, 0.00122, 0.00117, 0.00115, 0.00118, 0.00113, 0.0012]}, "optimizer-count-zeros-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.00555, 0.00512, 0.0052, 0.0051, 0.00517, 0.00513, 0.00514, 0.00513, 0.00512, 0.00511, 0.00508, 0.0051, 0.0051, 0.00512, 0.00511, 0.00509, 0.00508, 0.00511, 0.00514, 0.0051, 0.00509, 0.0051, 0.00514, 0.00512, 0.00512, 0.00512, 0.00514, 0.00517, 0.00511, 0.00513, 0.00513, 0.00516, 0.00515, 0.00515, 0.00516, 0.00514, 0.00513, 0.00543, 0.00514, 0.00512, 0.00514, 0.00513, 0.00513, 0.00516, 0.00512, 0.00515, 0.00511, 0.00513, 0.00515, 0.00514, 0.0051, 0.00512, 0.0057, 0.00511, 0.00513, 0.00513, 0.00514, 0.0053, 0.00514, 0.00511, 0.00513, 0.00512, 0.00513, 0.00518, 0.00513, 0.00514, 0.00512, 0.00513, 0.00512, 0.00509, 0.00512, 0.00539, 0.00514, 0.00514, 0.0051, 0.00512, 0.00511, 0.00512, 0.00511, 0.00511, 0.00512, 0.00513, 0.00511, 0.00514, 0.00512, 0.0051, 0.00514, 0.00511, 0.00512, 0.00522, 0.0051, 0.00514, 0.00572, 0.0051, 0.00515, 0.00526, 0.00509, 0.00511, 0.00513, 0.00513, 0.00518, 0.00514, 0.00511, 0.00512, 0.00512, 0.00511, 0.00514, 0.00512, 0.00518, 0.00514, 0.00512, 0.00513, 0.00512, 0.00512, 0.00512, 0.00511, 0.00509, 0.00514, 0.00519, 0.00512, 0.0051, 0.00513, 0.0051, 0.00548, 0.00514, 0.00512, 0.00512, 0.00511, 0.00511, 0.00512, 0.00511, 0.00519, 0.00533, 0.00509, 0.00512, 0.0051, 0.00513, 0.00511, 0.00515, 0.00508, 0.00512, 0.00513, 0.0057, 0.00513, 0.00513, 0.00516, 0.00518, 0.00515, 0.00517, 0.00513, 0.00514, 0.00516, 0.0057, 0.00516, 0.00515, 0.00514, 0.00513, 0.00513, 0.00516, 0.00516, 0.00566, 0.00514, 0.00514, 0.00515, 0.00516, 0.00515, 0.00513, 0.00517, 0.00513, 0.00513, 0.00601, 0.00514, 0.00522, 0.00513, 0.00515, 0.00514, 0.00517, 0.00511, 0.00515, 0.00516, 0.00515, 0.00514, 0.00515, 0.00512, 0.00587, 0.00517, 0.00518, 0.00516, 0.00513, 0.00541, 0.00514, 0.00515, 0.00513, 0.00516, 0.00521, 0.00531, 0.00532, 0.00517, 
0.00516, 0.00515, 0.00511, 0.00529, 0.00509, 0.00511, 0.00512, 0.00512, 0.00512, 0.00515, 0.0053, 0.0051, 0.00512, 0.00512, 0.00512, 0.00511, 0.0051, 0.00513, 0.00512, 0.00513, 0.00513, 0.00512, 0.00559, 0.00511, 0.0051, 0.0051, 0.00512, 0.00515, 0.00512, 0.00511, 0.00579, 0.00512, 0.00511, 0.00512, 0.00511, 0.00511, 0.00511, 0.00513, 0.00508, 0.00513, 0.00511, 0.00509, 0.00512, 0.0051, 0.00512, 0.00511, 0.00512, 0.00513, 0.00511, 0.00514, 0.00511, 0.00512, 0.00512, 0.0059, 0.00513, 0.00514, 0.00512, 0.00511, 0.00513, 0.00511, 0.00511, 0.0051, 0.00509, 0.0051, 0.00512, 0.0051, 0.0051, 0.00511, 0.00513, 0.00513, 0.0051, 0.00513, 0.00511, 0.0051, 0.0051, 0.00511, 0.00512, 0.00511, 0.00509, 0.00513, 0.0051, 0.0051, 0.00518, 0.0051, 0.00513, 0.00509, 0.00513, 0.00512, 0.00511, 0.00515, 0.00512, 0.00512, 0.00512, 0.00512, 0.00512, 0.00511, 0.00601, 0.00512, 0.00524, 0.00512, 0.0051, 0.00511, 0.00509, 0.00512, 0.0051, 0.00512, 0.00511, 0.00511, 0.00526, 0.0051, 0.00511, 0.00512, 0.00511, 0.00511, 0.00514, 0.00511, 0.00512, 0.00509, 0.00511, 0.00512, 0.00512, 0.00509, 0.0051, 0.00511, 0.00511, 0.00513, 0.00512, 0.00541, 0.00512, 0.00515, 0.00511, 0.00509, 0.0051, 0.00512, 0.00511, 0.00512, 0.00511, 0.00517, 0.00514, 0.00513, 0.00513, 0.00512, 0.00511, 0.00514, 0.00511, 0.00514, 0.00509, 0.00508, 0.00513, 0.00509, 0.0051, 0.00513, 0.00511, 0.00571, 0.00519, 0.00511, 0.00511, 0.0051, 0.00511, 0.00512, 0.00513, 0.00511, 0.00511, 0.00511, 0.00511, 0.00512, 0.00511, 0.00509, 0.00514, 0.00511, 0.00516, 0.00512, 0.0053, 0.00511, 0.00512, 0.00521, 0.00512, 0.00513, 0.00514, 0.00512, 0.00512, 0.00514, 0.0051, 0.00511, 0.00513, 0.00512, 0.00509, 0.00519, 0.00512, 0.0051, 0.00509, 0.00596, 0.00512, 0.0051, 0.0051, 0.00513, 0.00513, 0.0051, 0.00511, 0.00509, 0.00512, 0.00511]}, "optimizer-inner-step-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.00417, 0.00096, 0.00098, 0.00098, 0.00099, 0.00097, 0.00098, 0.00098, 0.00097, 0.00096, 0.00097, 0.00097, 0.00097, 0.00097, 0.00097, 0.00099, 0.00098, 0.00097, 0.00098, 0.00097, 0.00097, 0.00096, 0.00098, 0.00098, 0.00099, 0.00099, 0.00097, 0.00096, 0.00098, 0.00098, 0.00101, 0.00097, 0.00097, 0.00097, 0.00098, 0.00097, 0.00097, 0.00097, 0.00098, 0.00098, 0.00098, 0.00096, 0.00097, 0.00097, 0.00097, 0.00098, 0.00098, 0.00097, 0.00097, 0.00097, 0.00098, 0.00098, 0.00099, 0.00098, 0.00097, 0.00097, 0.00096, 0.00097, 0.00096, 0.00097, 0.00098, 0.00097, 0.00097, 0.00098, 0.00097, 0.00097, 0.00097, 0.00097, 0.00097, 0.00097, 0.00098, 0.00097, 0.00097, 0.00097, 0.00097, 0.00097, 0.00097, 0.00097, 0.00097, 0.00098, 0.00096, 0.00097, 0.00097, 0.00098, 0.00096, 0.00096, 0.00097, 0.00098, 0.00096, 0.00097, 0.00096, 0.00097, 0.00099, 0.00096, 0.00098, 0.00098, 0.00097, 0.00097, 0.00096, 0.00097, 0.00096, 0.00097, 0.00099, 0.00096, 0.00096, 0.00097, 0.00097, 0.00097, 0.00098, 0.00096, 0.00097, 0.00097, 0.00098, 0.00099, 0.00098, 0.00096, 0.00096, 0.00097, 0.00097, 0.00097, 0.00097, 0.00097, 0.00097, 0.00098, 0.00099, 0.00097, 0.00096, 0.00097, 0.00097, 0.00097, 0.00097, 0.00097, 0.00097, 0.00097, 0.00097, 0.00097, 0.00098, 0.00097, 0.00098, 0.00097, 0.00096, 0.00097, 0.00099, 0.00098, 0.00097, 0.00098, 0.00096, 0.00097, 0.00097, 0.00096, 0.00097, 0.00098, 0.00099, 0.00097, 0.00097, 0.00097, 0.00098, 0.00097, 0.00097, 0.00097, 0.00099, 0.00098, 0.00097, 0.00097, 0.00098, 0.00097, 0.00096, 0.00097, 0.00098, 0.00097, 0.001, 0.00097, 0.00097, 0.00097, 0.00098, 0.00097, 0.00098, 0.00097, 0.00099, 0.00097, 0.00097, 0.00096, 0.00098, 0.00097, 0.00098, 
0.00097, 0.00097, 0.00097, 0.00098, 0.00098, 0.00097, 0.00097, 0.00099, 0.00097, 0.00098, 0.00098, 0.00097, 0.00097, 0.00098, 0.00098, 0.001, 0.00097, 0.00096, 0.00097, 0.00097, 0.00097, 0.00097, 0.00097, 0.00097, 0.00097, 0.00096, 0.00096, 0.00096, 0.00097, 0.001, 0.00096, 0.00099, 0.00097, 0.00098, 0.00097, 0.00099, 0.00096, 0.00128, 0.00096, 0.00096, 0.00097, 0.00097, 0.00097, 0.00099, 0.00096, 0.00097, 0.00096, 0.00097, 0.00097, 0.00096, 0.00096, 0.00096, 0.00097, 0.00096, 0.00097, 0.00097, 0.00097, 0.00097, 0.00097, 0.00097, 0.00096, 0.00098, 0.00097, 0.00097, 0.00096, 0.00097, 0.001, 0.00097, 0.00096, 0.00097, 0.00097, 0.00097, 0.00096, 0.00097, 0.00096, 0.00096, 0.00097, 0.00096, 0.00096, 0.00096, 0.00097, 0.001, 0.00097, 0.00096, 0.00097, 0.00097, 0.00096, 0.00097, 0.00096, 0.00097, 0.00096, 0.00097, 0.00096, 0.00096, 0.00096, 0.00097, 0.00099, 0.00096, 0.00097, 0.00096, 0.00096, 0.00097, 0.00096, 0.00097, 0.00096, 0.00097, 0.00098, 0.00097, 0.00097, 0.00099, 0.00096, 0.00097, 0.00096, 0.00096, 0.00098, 0.00096, 0.00096, 0.00097, 0.00098, 0.00096, 0.00097, 0.00097, 0.00096, 0.00098, 0.00097, 0.00096, 0.00097, 0.00096, 0.00097, 0.00097, 0.00096, 0.00097, 0.00097, 0.00096, 0.00096, 0.00097, 0.00096, 0.00096, 0.00097, 0.00097, 0.00097, 0.00097, 0.00096, 0.00097, 0.00097, 0.00097, 0.00096, 0.00096, 0.00096, 0.00097, 0.00097, 0.00097, 0.00097, 0.00097, 0.00097, 0.00097, 0.00096, 0.00097, 0.00096, 0.00097, 0.00097, 0.00097, 0.00097, 0.00096, 0.00097, 0.00095, 0.00096, 0.00097, 0.00098, 0.00097, 0.00097, 0.00097, 0.00097, 0.00096, 0.00096, 0.00096, 0.00098, 0.00097, 0.00097, 0.00098, 0.00097, 0.00098, 0.00098, 0.00098, 0.00098, 0.001, 0.00098, 0.00098, 0.00098, 0.00097, 0.00097, 0.00098, 0.00098, 0.00101, 0.00098, 0.00098, 0.00097, 0.00098, 0.00097, 0.00097, 0.00099, 0.00097, 0.00098, 0.00098, 0.00096, 0.00098, 0.00097, 0.00098, 0.00099, 0.00097, 0.00098, 0.00097, 0.00097, 0.00098, 0.00098]}, "optimizer-copy-main-to-model-params-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.00118, 0.00099, 0.00101, 0.00101, 0.00101, 0.00101, 0.00101, 0.00101, 0.00101, 0.00101, 0.00101, 0.00101, 0.00101, 0.00101, 0.001, 0.001, 0.00101, 0.00101, 0.00101, 0.00103, 0.001, 0.00101, 0.00101, 0.00101, 0.00101, 0.00101, 0.00101, 0.00101, 0.00102, 0.00101, 0.00102, 0.00101, 0.001, 0.001, 0.00101, 0.00101, 0.00101, 0.00101, 0.00101, 0.001, 0.00101, 0.001, 0.00102, 0.00102, 0.001, 0.00101, 0.00101, 0.00101, 0.001, 0.00101, 0.00101, 0.00101, 0.00101, 0.00102, 0.00101, 0.00101, 0.001, 0.001, 0.00101, 0.00102, 0.00102, 0.001, 0.00101, 0.001, 0.00101, 0.001, 0.00101, 0.00101, 0.00101, 0.00105, 0.00101, 0.00102, 0.001, 0.00101, 0.00101, 0.00101, 0.00101, 0.00101, 0.00101, 0.00101, 0.00101, 0.001, 0.001, 0.00102, 0.001, 0.00101, 0.00102, 0.00101, 0.00101, 0.00101, 0.001, 0.00101, 0.00103, 0.00101, 0.00101, 0.00101, 0.00101, 0.00101, 0.00101, 0.00102, 0.00101, 0.00101, 0.00106, 0.00101, 0.00102, 0.00101, 0.00101, 0.00101, 0.001, 0.00101, 0.001, 0.00101, 0.00101, 0.00101, 0.00101, 0.00101, 0.00102, 0.00102, 0.00102, 0.00101, 0.00101, 0.00102, 0.001, 0.00106, 0.00101, 0.00101, 0.00101, 0.00101, 0.00101, 0.00103, 0.00102, 0.00101, 0.00102, 0.00101, 0.00101, 0.00101, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.00102, 0.00101, 0.001, 0.001, 0.001, 0.001, 0.001, 0.00101, 0.00101, 0.00101, 0.00102, 0.00102, 0.00101, 0.00102, 0.00103, 0.00102, 0.00101, 0.00101, 0.00102, 0.00101, 0.00101, 0.00101, 0.00102, 0.00101, 0.00101, 0.00101, 0.00102, 0.00101, 0.00103, 0.00101, 0.00101, 0.00101, 0.00101, 
0.00102, 0.00102, 0.00101, 0.00102, 0.00101, 0.00101, 0.00102, 0.00102, 0.00102, 0.00105, 0.00102, 0.00102, 0.00101, 0.00101, 0.00102, 0.00101, 0.00103, 0.00102, 0.00102, 0.00101, 0.00106, 0.00102, 0.00101, 0.00103, 0.00103, 0.00102, 0.00103, 0.00102, 0.00102, 0.00103, 0.00102, 0.00102, 0.00102, 0.00102, 0.00103, 0.00102, 0.00102, 0.00102, 0.00102, 0.00108, 0.00102, 0.00104, 0.00102, 0.00102, 0.00103, 0.00103, 0.00102, 0.00107, 0.00102, 0.00102, 0.00102, 0.00102, 0.00102, 0.00102, 0.00102, 0.00103, 0.00102, 0.00102, 0.00102, 0.00102, 0.00102, 0.00102, 0.00102, 0.00102, 0.00102, 0.00107, 0.00102, 0.00102, 0.00102, 0.00102, 0.00103, 0.00103, 0.00102, 0.00102, 0.00102, 0.00102, 0.00103, 0.00103, 0.00103, 0.00102, 0.00103, 0.00102, 0.00102, 0.00104, 0.00102, 0.00104, 0.00102, 0.00102, 0.00103, 0.00103, 0.00102, 0.00103, 0.00102, 0.00102, 0.00102, 0.00102, 0.00101, 0.00103, 0.00101, 0.00102, 0.00102, 0.00102, 0.00102, 0.00103, 0.00102, 0.00102, 0.00105, 0.00102, 0.00102, 0.00104, 0.00102, 0.00102, 0.00102, 0.00102, 0.00103, 0.00102, 0.00102, 0.00101, 0.00103, 0.00104, 0.00103, 0.00103, 0.00102, 0.00102, 0.00102, 0.00102, 0.00102, 0.00102, 0.00102, 0.00102, 0.00102, 0.00108, 0.00102, 0.00102, 0.00102, 0.00102, 0.00102, 0.00103, 0.00102, 0.00122, 0.00102, 0.00103, 0.00102, 0.00102, 0.00102, 0.00102, 0.00103, 0.00102, 0.00102, 0.00101, 0.00102, 0.00102, 0.00102, 0.00103, 0.00102, 0.00102, 0.00102, 0.00102, 0.00102, 0.00102, 0.00102, 0.00102, 0.00103, 0.00102, 0.00102, 0.00101, 0.00102, 0.00103, 0.00103, 0.00103, 0.00101, 0.00102, 0.00102, 0.00102, 0.00101, 0.00101, 0.00105, 0.00102, 0.00103, 0.00102, 0.00102, 0.00101, 0.00102, 0.00102, 0.00102, 0.00101, 0.00101, 0.00101, 0.00102, 0.00101, 0.00101, 0.00102, 0.00102, 0.00102, 0.00101, 0.00102, 0.00103, 0.00101, 0.00102, 0.00102, 0.00102, 0.00102, 0.00101, 0.00104, 0.00102, 0.00102, 0.00102, 0.00102, 0.00101, 0.00102, 0.00102, 0.00102, 0.00102, 0.00102, 0.00103, 0.00102, 0.00102, 0.00102, 0.00103, 0.00102, 0.00102, 0.00101, 0.00102, 0.00101, 0.00101]}, "optimizer-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.63386, 0.00867, 0.00903, 0.00886, 0.00906, 0.00897, 0.00901, 0.009, 0.00896, 0.00895, 0.00895, 0.00895, 0.00894, 0.00894, 0.00896, 0.009, 0.00892, 0.00896, 0.00899, 0.00897, 0.00892, 0.00887, 0.00902, 0.00897, 0.009, 0.00906, 0.00899, 0.00902, 0.00897, 0.00898, 0.0091, 0.00901, 0.00904, 0.00898, 0.00901, 0.009, 0.00902, 0.00937, 0.00899, 0.00896, 0.00901, 0.00897, 0.00899, 0.00902, 0.00897, 0.00903, 0.00895, 0.00898, 0.00899, 0.00895, 0.00896, 0.00898, 0.00978, 0.00897, 0.00898, 0.009, 0.00895, 0.0092, 0.00896, 0.00901, 0.009, 0.00904, 0.00898, 0.00902, 0.00897, 0.00899, 0.00902, 0.00902, 0.00899, 0.00899, 0.00898, 0.00934, 0.00904, 0.00896, 0.00897, 0.00891, 0.00895, 0.00892, 0.00894, 0.0089, 0.00889, 0.0089, 0.00891, 0.00892, 0.00888, 0.0089, 0.009, 0.00896, 0.00895, 0.0091, 0.00889, 0.00892, 0.00967, 0.00886, 0.009, 0.00913, 0.00896, 0.00896, 0.00889, 0.00895, 0.00901, 0.00899, 0.00903, 0.00893, 0.00893, 0.00898, 0.009, 0.00894, 0.00905, 0.00897, 0.00894, 0.00877, 0.00897, 0.00898, 0.00902, 0.00895, 0.00895, 0.009, 0.00905, 0.00875, 0.00895, 0.00897, 0.00872, 0.00942, 0.00901, 0.00898, 0.00897, 0.00894, 0.00895, 0.00876, 0.00895, 0.00907, 0.00917, 0.00872, 0.00895, 0.00893, 0.00898, 0.00897, 0.00906, 0.00866, 0.00896, 0.00897, 0.00964, 0.00897, 0.00897, 0.00898, 0.009, 0.009, 0.009, 0.00894, 0.00898, 0.00904, 0.00977, 0.00905, 0.00899, 0.00901, 0.00905, 0.00898, 0.00901, 0.00898, 0.00965, 0.009, 0.009, 0.00878, 
0.00905, 0.00899, 0.00898, 0.00904, 0.00902, 0.00906, 0.01008, 0.00901, 0.00907, 0.00895, 0.00899, 0.00902, 0.00905, 0.00902, 0.00902, 0.00901, 0.00899, 0.00898, 0.00908, 0.00899, 0.00979, 0.00905, 0.00904, 0.00903, 0.009, 0.00938, 0.00899, 0.00901, 0.00904, 0.00902, 0.00909, 0.00923, 0.00917, 0.00901, 0.00905, 0.00903, 0.00899, 0.00918, 0.00889, 0.00891, 0.00894, 0.00894, 0.00896, 0.00895, 0.00912, 0.00892, 0.00889, 0.00896, 0.0089, 0.00891, 0.00901, 0.0089, 0.00904, 0.00893, 0.00893, 0.00894, 0.00942, 0.00889, 0.00938, 0.00887, 0.00892, 0.00897, 0.00893, 0.00896, 0.00974, 0.00891, 0.009, 0.00879, 0.00886, 0.00891, 0.0089, 0.00892, 0.00885, 0.00891, 0.0089, 0.00892, 0.00896, 0.0089, 0.00892, 0.00893, 0.00891, 0.00894, 0.00892, 0.00891, 0.00894, 0.00885, 0.00891, 0.00986, 0.00894, 0.00893, 0.00892, 0.00894, 0.00896, 0.00889, 0.00893, 0.00888, 0.0089, 0.00891, 0.0089, 0.0089, 0.00894, 0.00901, 0.00902, 0.00898, 0.00887, 0.00892, 0.00897, 0.00888, 0.00894, 0.00889, 0.00893, 0.00887, 0.00889, 0.00895, 0.00891, 0.00891, 0.00904, 0.00901, 0.00889, 0.00892, 0.00891, 0.00892, 0.00891, 0.00892, 0.00895, 0.00891, 0.00902, 0.00891, 0.00892, 0.00889, 0.01004, 0.00891, 0.00907, 0.00893, 0.00889, 0.00901, 0.00889, 0.00893, 0.00895, 0.00898, 0.00885, 0.00891, 0.00914, 0.00891, 0.00891, 0.00894, 0.00892, 0.00888, 0.009, 0.0089, 0.00948, 0.00889, 0.00887, 0.00893, 0.00889, 0.00889, 0.00891, 0.00896, 0.00894, 0.00893, 0.00888, 0.00921, 0.00895, 0.00893, 0.00894, 0.00887, 0.0089, 0.00897, 0.00896, 0.00894, 0.00893, 0.00896, 0.009, 0.00892, 0.00897, 0.00891, 0.00889, 0.00895, 0.0089, 0.00893, 0.00891, 0.00886, 0.009, 0.00888, 0.00889, 0.00894, 0.00885, 0.00955, 0.00901, 0.00895, 0.00891, 0.0089, 0.00889, 0.00898, 0.00888, 0.00898, 0.00889, 0.00895, 0.00895, 0.00896, 0.00891, 0.00895, 0.00904, 0.00897, 0.00901, 0.00897, 0.00919, 0.00904, 0.00899, 0.00902, 0.00895, 0.00901, 0.00901, 0.00892, 0.00909, 0.00899, 0.00896, 0.00901, 0.00899, 0.009, 0.00896, 0.00905, 0.0089, 0.00897, 0.00898, 0.00984, 0.00894, 0.00894, 0.00891, 0.00903, 0.00898, 0.00894, 0.00889, 0.0089, 0.0089, 0.00894]}, "learning-rate": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 
5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05]}, "learning-rate vs samples": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 
7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05]}, "batch-size": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 
128.0, 128.0]}, "batch-size vs samples": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0]}, "lm loss": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [10.88321, 10.90268, 10.88687, 10.83314, 10.67636, 10.64925, 10.43407, 10.15143, 9.939, 9.84142, 9.58871, 9.85432, 9.88466, 9.62953, 9.78812, 9.5115, 9.45845, 9.64924, 9.38622, 9.33216, 9.24226, 9.14549, 9.17557, 8.99547, 9.18942, 9.05996, 9.15554, 9.16495, 9.29785, 8.98464, 8.92921, 9.04391, 9.04317, 8.65502, 8.71709, 8.75344, 8.68371, 8.7343, 8.65869, 8.76488, 8.66084, 8.84969, 8.83212, 8.4992, 8.38905, 8.43151, 8.49327, 8.38449, 8.43266, 8.57974, 8.36712, 8.19218, 8.22599, 8.22213, 8.26761, 7.91363, 8.09574, 7.89107, 8.2463, 8.23044, 8.00478, 7.9653, 
7.91788, 7.73983, 7.73952, 7.64266, 7.51535, 7.9067, 7.6981, 7.45174, 7.74028, 7.76751, 7.54113, 7.29838, 7.45192, 7.33549, 7.46187, 7.22351, 7.63653, 7.27884, 7.35151, 7.2129, 7.2187, 7.42237, 7.17713, 7.28373, 7.00153, 7.00528, 7.04066, 7.1397, 6.8246, 6.98624, 7.08901, 7.00075, 6.87398, 6.75446, 6.98902, 7.05484, 6.70056, 6.57618, 6.7239, 6.73842, 6.73087, 6.73636, 6.65702, 6.40579, 6.6386, 6.62005, 6.44721, 6.63067, 6.74344, 6.6111, 6.7266, 6.69523, 6.62503, 6.50683, 6.59892, 6.4067, 6.66402, 6.24864, 6.25205, 6.30302, 6.38991, 6.35064, 6.45057, 6.2892, 6.34021, 6.23934, 6.20441, 6.39672, 6.32669, 6.3228, 6.16602, 6.15875, 6.24058, 6.38585, 6.20055, 6.14534, 6.17669, 6.1094, 6.05525, 6.06665, 6.2527, 6.40409, 6.25252, 6.2934, 6.0919, 6.17395, 5.99575, 6.02272, 5.94996, 6.23797, 6.18154, 5.95877, 5.77498, 6.11727, 5.84271, 6.09751, 5.78563, 6.15394, 6.14296, 6.08411, 5.92729, 6.11238, 5.94309, 6.19339, 5.89494, 5.792, 5.77614, 5.6837, 6.01618, 5.99613, 6.06338, 5.88778, 6.04018, 5.96996, 5.99544, 5.98695, 5.94778, 5.84144, 5.95287, 5.61942, 5.70133, 5.88893, 5.84402, 5.86128, 5.76114, 5.83707, 5.72343, 5.55889, 5.72351, 5.62534, 5.83303, 5.60569, 5.7102, 5.70991, 5.89681, 5.64325, 5.84924, 5.73928, 5.87114, 5.33228, 5.89693, 5.872, 5.85316, 5.40988, 5.4088, 5.62665, 5.59641, 5.48639, 5.57896, 5.67332, 5.47579, 5.74541, 5.50851, 5.59461, 5.621, 5.62129, 5.51073, 5.61357, 5.67793, 5.68632, 5.58943, 5.66035, 5.37294, 5.67985, 5.62736, 5.42133, 5.58734, 5.63109, 5.55307, 5.34119, 5.53841, 5.48634, 5.48174, 5.37484, 5.55776, 5.60342, 5.38738, 5.52728, 5.4859, 5.33181, 5.50554, 5.40833, 5.44, 5.31717, 5.06482, 5.47629, 5.56511, 5.71212, 5.41184, 5.59499, 5.63272, 5.23153, 5.27192, 5.3912, 5.39311, 5.32484, 5.49539, 5.18175, 5.29693, 5.24506, 5.37468, 5.25384, 5.44332, 5.53548, 5.3125, 5.43753, 5.3339, 5.07, 5.31161, 5.25178, 5.30057, 5.1086, 5.27262, 5.26395, 5.46902, 5.15667, 5.26704, 5.20746, 5.35466, 4.98016, 4.91076, 5.3213, 5.39019, 5.22162, 5.3164, 5.10162, 5.1553, 5.25943, 5.06435, 5.26075, 5.07101, 5.33638, 5.24297, 5.14623, 5.23826, 5.03699, 5.31101, 5.04764, 5.02142, 5.13778, 5.10838, 5.26722, 5.14671, 5.27266, 5.09162, 5.0919, 5.24829, 5.3185, 5.25029, 5.18579, 5.14206, 5.28335, 4.94328, 5.20523, 5.08657, 5.29719, 5.17312, 5.18231, 5.10943, 4.98051, 4.99195, 5.21896, 5.30825, 5.09051, 5.05174, 4.91264, 5.11732, 5.11518, 4.92322, 5.33386, 5.02007, 5.09792, 5.16007, 4.99811, 5.05898, 5.06488, 4.98971, 5.07389, 5.15699, 4.97292, 5.17835, 4.92646, 4.91925, 5.06679, 4.99198, 4.90773, 4.77047, 4.93905, 5.10914, 5.0148, 5.01342, 5.32728, 4.95518, 4.99041, 5.04238, 4.79783, 4.72965, 4.99227, 5.0394, 4.87169, 4.95051, 5.03887, 5.01995, 4.81482, 4.88854, 4.89947, 4.82779, 4.74234, 5.00778, 4.7467, 5.20619, 4.78181, 4.98955, 4.73414, 4.78105, 4.81703, 4.64628, 4.65374, 4.83873, 4.80327, 4.79812, 4.9214, 4.87849, 4.92132, 4.76615, 4.87858, 4.72843, 4.9077, 4.95342, 4.86965, 4.70236, 4.77862, 4.89666, 4.70572, 4.85677, 4.68692, 4.68192, 4.64505]}, "lm loss vs samples": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [10.88321, 10.90268, 10.88687, 10.83314, 10.67636, 10.64925, 10.43407, 10.15143, 9.939, 9.84142, 9.58871, 9.85432, 9.88466, 9.62953, 9.78812, 9.5115, 9.45845, 9.64924, 9.38622, 9.33216, 9.24226, 9.14549, 9.17557, 8.99547, 9.18942, 9.05996, 9.15554, 9.16495, 9.29785, 8.98464, 8.92921, 9.04391, 9.04317, 8.65502, 8.71709, 8.75344, 8.68371, 8.7343, 8.65869, 8.76488, 8.66084, 8.84969, 8.83212, 8.4992, 8.38905, 8.43151, 8.49327, 8.38449, 8.43266, 8.57974, 8.36712, 
8.19218, 8.22599, 8.22213, 8.26761, 7.91363, 8.09574, 7.89107, 8.2463, 8.23044, 8.00478, 7.9653, 7.91788, 7.73983, 7.73952, 7.64266, 7.51535, 7.9067, 7.6981, 7.45174, 7.74028, 7.76751, 7.54113, 7.29838, 7.45192, 7.33549, 7.46187, 7.22351, 7.63653, 7.27884, 7.35151, 7.2129, 7.2187, 7.42237, 7.17713, 7.28373, 7.00153, 7.00528, 7.04066, 7.1397, 6.8246, 6.98624, 7.08901, 7.00075, 6.87398, 6.75446, 6.98902, 7.05484, 6.70056, 6.57618, 6.7239, 6.73842, 6.73087, 6.73636, 6.65702, 6.40579, 6.6386, 6.62005, 6.44721, 6.63067, 6.74344, 6.6111, 6.7266, 6.69523, 6.62503, 6.50683, 6.59892, 6.4067, 6.66402, 6.24864, 6.25205, 6.30302, 6.38991, 6.35064, 6.45057, 6.2892, 6.34021, 6.23934, 6.20441, 6.39672, 6.32669, 6.3228, 6.16602, 6.15875, 6.24058, 6.38585, 6.20055, 6.14534, 6.17669, 6.1094, 6.05525, 6.06665, 6.2527, 6.40409, 6.25252, 6.2934, 6.0919, 6.17395, 5.99575, 6.02272, 5.94996, 6.23797, 6.18154, 5.95877, 5.77498, 6.11727, 5.84271, 6.09751, 5.78563, 6.15394, 6.14296, 6.08411, 5.92729, 6.11238, 5.94309, 6.19339, 5.89494, 5.792, 5.77614, 5.6837, 6.01618, 5.99613, 6.06338, 5.88778, 6.04018, 5.96996, 5.99544, 5.98695, 5.94778, 5.84144, 5.95287, 5.61942, 5.70133, 5.88893, 5.84402, 5.86128, 5.76114, 5.83707, 5.72343, 5.55889, 5.72351, 5.62534, 5.83303, 5.60569, 5.7102, 5.70991, 5.89681, 5.64325, 5.84924, 5.73928, 5.87114, 5.33228, 5.89693, 5.872, 5.85316, 5.40988, 5.4088, 5.62665, 5.59641, 5.48639, 5.57896, 5.67332, 5.47579, 5.74541, 5.50851, 5.59461, 5.621, 5.62129, 5.51073, 5.61357, 5.67793, 5.68632, 5.58943, 5.66035, 5.37294, 5.67985, 5.62736, 5.42133, 5.58734, 5.63109, 5.55307, 5.34119, 5.53841, 5.48634, 5.48174, 5.37484, 5.55776, 5.60342, 5.38738, 5.52728, 5.4859, 5.33181, 5.50554, 5.40833, 5.44, 5.31717, 5.06482, 5.47629, 5.56511, 5.71212, 5.41184, 5.59499, 5.63272, 5.23153, 5.27192, 5.3912, 5.39311, 5.32484, 5.49539, 5.18175, 5.29693, 5.24506, 5.37468, 5.25384, 5.44332, 5.53548, 5.3125, 5.43753, 5.3339, 5.07, 5.31161, 5.25178, 5.30057, 5.1086, 5.27262, 5.26395, 5.46902, 5.15667, 5.26704, 5.20746, 5.35466, 4.98016, 4.91076, 5.3213, 5.39019, 5.22162, 5.3164, 5.10162, 5.1553, 5.25943, 5.06435, 5.26075, 5.07101, 5.33638, 5.24297, 5.14623, 5.23826, 5.03699, 5.31101, 5.04764, 5.02142, 5.13778, 5.10838, 5.26722, 5.14671, 5.27266, 5.09162, 5.0919, 5.24829, 5.3185, 5.25029, 5.18579, 5.14206, 5.28335, 4.94328, 5.20523, 5.08657, 5.29719, 5.17312, 5.18231, 5.10943, 4.98051, 4.99195, 5.21896, 5.30825, 5.09051, 5.05174, 4.91264, 5.11732, 5.11518, 4.92322, 5.33386, 5.02007, 5.09792, 5.16007, 4.99811, 5.05898, 5.06488, 4.98971, 5.07389, 5.15699, 4.97292, 5.17835, 4.92646, 4.91925, 5.06679, 4.99198, 4.90773, 4.77047, 4.93905, 5.10914, 5.0148, 5.01342, 5.32728, 4.95518, 4.99041, 5.04238, 4.79783, 4.72965, 4.99227, 5.0394, 4.87169, 4.95051, 5.03887, 5.01995, 4.81482, 4.88854, 4.89947, 4.82779, 4.74234, 5.00778, 4.7467, 5.20619, 4.78181, 4.98955, 4.73414, 4.78105, 4.81703, 4.64628, 4.65374, 4.83873, 4.80327, 4.79812, 4.9214, 4.87849, 4.92132, 4.76615, 4.87858, 4.72843, 4.9077, 4.95342, 4.86965, 4.70236, 4.77862, 4.89666, 4.70572, 4.85677, 4.68692, 4.68192, 4.64505]}, "loss-scale": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 
1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]}, "loss-scale vs samples": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 
1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]}, "grad-norm": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [12.95641, 13.2384, 13.63492, 12.46753, 12.09519, 9.48185, 7.05331, 7.26898, 6.13791, 4.65533, 4.16677, 2.85409, 2.39258, 2.35693, 2.05902, 2.22136, 2.15373, 1.91319, 2.28507, 2.08136, 2.12587, 2.16293, 2.01255, 2.22443, 1.98488, 2.10576, 1.90696, 1.9543, 1.94666, 2.19132, 2.07534, 1.9973, 1.90676, 2.17071, 2.13949, 2.12242, 2.00142, 1.85779, 1.93941, 1.74128, 2.19131, 1.80266, 1.76804, 1.92184, 1.89627, 1.81829, 1.73892, 1.73316, 1.7548, 1.56741, 1.70661, 1.78909, 1.75371, 1.8099, 1.69083, 1.80378, 1.72805, 1.87537, 1.64718, 1.47793, 1.64751, 1.54177, 1.73678, 1.93709, 1.70003, 1.61404, 1.65733, 1.60718, 1.41019, 1.66006, 1.44415, 1.3449, 1.59801, 1.38078, 1.40657, 1.58642, 1.37384, 1.47591, 1.51235, 1.32276, 1.27695, 1.35665, 1.39793, 1.46181, 1.25641, 1.39278, 1.37555, 1.31206, 1.25327, 1.08729, 1.11608, 1.26073, 1.05493, 1.26676, 1.03825, 1.22449, 1.31527, 1.17458, 1.05643, 1.32651, 1.60257, 1.2771, 1.33646, 1.31918, 1.248, 1.20478, 1.17877, 1.39792, 1.21711, 1.31304, 1.06851, 0.90225, 1.00231, 1.02701, 1.08335, 1.06592, 1.11157, 1.35469, 1.11475, 0.96782, 1.00793, 1.10818, 0.98621, 1.2088, 1.33881, 1.44029, 1.6209, 1.4596, 1.76932, 0.95989, 1.18019, 1.10796, 1.01963, 0.97229, 1.12326, 1.18955, 1.04787, 1.17124, 1.15064, 0.95989, 1.2251, 1.2379, 1.76155, 1.26203, 1.48837, 1.2467, 1.12532, 1.2807, 1.00776, 1.29835, 1.39203, 1.19636, 1.4484, 1.31191, 1.0452, 1.72246, 1.72833, 1.28959, 1.84591, 1.35158, 1.59884, 1.36455, 1.22883, 0.94147, 1.4872, 1.47058, 1.60177, 1.17187, 1.32032, 1.16147, 1.85664, 1.34438, 1.41884, 1.939, 1.3293, 1.75251, 1.4942, 1.19914, 1.25112, 1.47923, 1.19903, 1.70249, 1.28382, 1.22996, 1.38428, 1.04416, 1.49206, 1.45812, 1.5496, 1.42558, 1.5666, 1.60373, 1.50198, 2.14466, 1.64657, 1.23816, 1.19399, 1.20748, 1.27992, 1.28244, 1.01251, 1.42205, 1.36197, 1.11149, 1.15089, 1.21404, 1.39311, 1.5652, 1.38265, 1.4134, 1.55375, 1.48078, 1.28046, 1.56958, 1.42513, 1.45697, 1.27067, 1.6129, 1.30064, 1.30128, 1.59962, 2.07562, 1.66274, 1.53273, 1.30633, 1.38281, 1.30251, 1.26134, 1.59835, 1.39505, 1.20665, 1.50419, 1.33709, 1.53729, 1.35211, 1.18328, 1.72786, 1.56925, 1.48159, 1.79747, 1.32018, 1.29802, 1.45777, 1.41144, 1.32018, 1.82833, 1.47341, 1.38161, 1.37728, 1.47317, 1.22182, 1.50379, 1.40184, 1.43299, 1.38574, 1.54027, 1.3871, 1.51693, 1.73604, 1.27623, 1.30004, 1.43266, 1.26605, 1.31063, 1.40554, 1.47355, 1.43481, 1.66877, 1.27269, 1.36414, 1.39902, 1.36787, 1.30634, 1.35432, 1.33569, 1.38439, 1.38254, 1.48327, 1.3313, 1.47336, 1.54266, 1.45093, 1.39023, 1.42073, 1.71873, 1.24142, 1.27025, 1.75206, 1.19488, 1.72063, 1.35861, 1.46103, 1.32756, 1.38252, 1.44831, 1.49026, 1.5017, 1.67806, 1.49633, 1.40813, 1.2821, 1.34708, 1.20139, 1.33134, 1.30935, 1.28049, 1.39953, 1.36021, 1.30784, 1.55113, 1.45126, 1.35267, 1.8948, 1.31989, 1.26079, 1.54872, 1.25987, 1.49108, 1.31905, 1.39623, 1.42575, 1.70894, 1.69908, 1.44957, 1.53553, 1.41451, 1.68745, 1.45251, 1.2816, 1.33701, 1.40832, 1.76682, 1.43394, 1.35911, 1.42618, 1.36908, 1.37004, 1.25362, 1.44167, 1.3631, 1.32537, 1.0708, 1.21959, 1.38245, 1.69458, 1.66343, 1.49487, 1.64475, 1.18445, 1.24234, 1.37689, 1.3449, 1.29452, 1.57163, 1.48364, 1.39813, 1.46563, 1.16757, 1.33935, 1.37732, 1.74665, 1.43255, 1.6591, 1.35981, 1.18773, 1.72037, 1.57868, 1.47314, 1.60009, 1.70452, 1.52569, 
1.35993, 1.71308, 1.55029, 1.45496, 1.45713, 1.21934, 1.34612, 1.35689, 1.29738, 1.27919, 1.35703, 1.34356, 1.23723, 1.16682, 1.55154, 1.54928, 1.31127, 1.22661, 1.39907, 1.23896, 1.39069, 1.35517, 1.4518, 1.74352, 1.41812, 1.48035, 1.43537, 1.2798, 1.31958]}, "grad-norm vs samples": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [12.95641, 13.2384, 13.63492, 12.46753, 12.09519, 9.48185, 7.05331, 7.26898, 6.13791, 4.65533, 4.16677, 2.85409, 2.39258, 2.35693, 2.05902, 2.22136, 2.15373, 1.91319, 2.28507, 2.08136, 2.12587, 2.16293, 2.01255, 2.22443, 1.98488, 2.10576, 1.90696, 1.9543, 1.94666, 2.19132, 2.07534, 1.9973, 1.90676, 2.17071, 2.13949, 2.12242, 2.00142, 1.85779, 1.93941, 1.74128, 2.19131, 1.80266, 1.76804, 1.92184, 1.89627, 1.81829, 1.73892, 1.73316, 1.7548, 1.56741, 1.70661, 1.78909, 1.75371, 1.8099, 1.69083, 1.80378, 1.72805, 1.87537, 1.64718, 1.47793, 1.64751, 1.54177, 1.73678, 1.93709, 1.70003, 1.61404, 1.65733, 1.60718, 1.41019, 1.66006, 1.44415, 1.3449, 1.59801, 1.38078, 1.40657, 1.58642, 1.37384, 1.47591, 1.51235, 1.32276, 1.27695, 1.35665, 1.39793, 1.46181, 1.25641, 1.39278, 1.37555, 1.31206, 1.25327, 1.08729, 1.11608, 1.26073, 1.05493, 1.26676, 1.03825, 1.22449, 1.31527, 1.17458, 1.05643, 1.32651, 1.60257, 1.2771, 1.33646, 1.31918, 1.248, 1.20478, 1.17877, 1.39792, 1.21711, 1.31304, 1.06851, 0.90225, 1.00231, 1.02701, 1.08335, 1.06592, 1.11157, 1.35469, 1.11475, 0.96782, 1.00793, 1.10818, 0.98621, 1.2088, 1.33881, 1.44029, 1.6209, 1.4596, 1.76932, 0.95989, 1.18019, 1.10796, 1.01963, 0.97229, 1.12326, 1.18955, 1.04787, 1.17124, 1.15064, 0.95989, 1.2251, 1.2379, 1.76155, 1.26203, 1.48837, 1.2467, 1.12532, 1.2807, 1.00776, 1.29835, 1.39203, 1.19636, 1.4484, 1.31191, 1.0452, 1.72246, 1.72833, 1.28959, 1.84591, 1.35158, 1.59884, 1.36455, 1.22883, 0.94147, 1.4872, 1.47058, 1.60177, 1.17187, 1.32032, 1.16147, 1.85664, 1.34438, 1.41884, 1.939, 1.3293, 1.75251, 1.4942, 1.19914, 1.25112, 1.47923, 1.19903, 1.70249, 1.28382, 1.22996, 1.38428, 1.04416, 1.49206, 1.45812, 1.5496, 1.42558, 1.5666, 1.60373, 1.50198, 2.14466, 1.64657, 1.23816, 1.19399, 1.20748, 1.27992, 1.28244, 1.01251, 1.42205, 1.36197, 1.11149, 1.15089, 1.21404, 1.39311, 1.5652, 1.38265, 1.4134, 1.55375, 1.48078, 1.28046, 1.56958, 1.42513, 1.45697, 1.27067, 1.6129, 1.30064, 1.30128, 1.59962, 2.07562, 1.66274, 1.53273, 1.30633, 1.38281, 1.30251, 1.26134, 1.59835, 1.39505, 1.20665, 1.50419, 1.33709, 1.53729, 1.35211, 1.18328, 1.72786, 1.56925, 1.48159, 1.79747, 1.32018, 1.29802, 1.45777, 1.41144, 1.32018, 1.82833, 1.47341, 1.38161, 1.37728, 1.47317, 1.22182, 1.50379, 1.40184, 1.43299, 1.38574, 1.54027, 1.3871, 1.51693, 1.73604, 1.27623, 1.30004, 1.43266, 1.26605, 1.31063, 1.40554, 1.47355, 1.43481, 1.66877, 1.27269, 1.36414, 1.39902, 1.36787, 1.30634, 1.35432, 1.33569, 1.38439, 1.38254, 1.48327, 1.3313, 1.47336, 1.54266, 1.45093, 1.39023, 1.42073, 1.71873, 1.24142, 1.27025, 1.75206, 1.19488, 1.72063, 1.35861, 1.46103, 1.32756, 1.38252, 1.44831, 1.49026, 1.5017, 1.67806, 1.49633, 1.40813, 1.2821, 1.34708, 1.20139, 1.33134, 1.30935, 1.28049, 1.39953, 1.36021, 1.30784, 1.55113, 1.45126, 1.35267, 1.8948, 1.31989, 1.26079, 1.54872, 1.25987, 1.49108, 1.31905, 1.39623, 1.42575, 1.70894, 1.69908, 1.44957, 1.53553, 1.41451, 1.68745, 1.45251, 1.2816, 1.33701, 1.40832, 1.76682, 1.43394, 1.35911, 1.42618, 1.36908, 1.37004, 1.25362, 1.44167, 1.3631, 1.32537, 1.0708, 1.21959, 1.38245, 1.69458, 1.66343, 1.49487, 1.64475, 1.18445, 1.24234, 1.37689, 1.3449, 1.29452, 1.57163, 1.48364, 1.39813, 1.46563, 1.16757, 1.33935, 
1.37732, 1.74665, 1.43255, 1.6591, 1.35981, 1.18773, 1.72037, 1.57868, 1.47314, 1.60009, 1.70452, 1.52569, 1.35993, 1.71308, 1.55029, 1.45496, 1.45713, 1.21934, 1.34612, 1.35689, 1.29738, 1.27919, 1.35703, 1.34356, 1.23723, 1.16682, 1.55154, 1.54928, 1.31127, 1.22661, 1.39907, 1.23896, 1.39069, 1.35517, 1.4518, 1.74352, 1.41812, 1.48035, 1.43537, 1.2798, 1.31958]}, "num-zeros": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [58.0, 81.0, 78.0, 82.0, 76.0, 95.0, 104.0, 114.0, 114.0, 147.0, 119.0, 159.0, 165.0, 173.0, 182.0, 167.0, 188.0, 176.0, 167.0, 165.0, 187.0, 162.0, 191.0, 164.0, 181.0, 170.0, 168.0, 172.0, 182.0, 180.0, 164.0, 171.0, 169.0, 154.0, 144.0, 172.0, 173.0, 198.0, 168.0, 210.0, 178.0, 156.0, 174.0, 177.0, 163.0, 172.0, 206.0, 172.0, 184.0, 197.0, 223.0, 153.0, 162.0, 187.0, 173.0, 201.0, 146.0, 152.0, 240.0, 231.0, 192.0, 208.0, 162.0, 210.0, 192.0, 282.0, 232.0, 174.0, 215.0, 186.0, 227.0, 258.0, 202.0, 265.0, 192.0, 216.0, 239.0, 200.0, 265.0, 210.0, 264.0, 231.0, 179.0, 221.0, 234.0, 184.0, 188.0, 206.0, 157.0, 228.0, 217.0, 227.0, 219.0, 233.0, 191.0, 187.0, 214.0, 190.0, 237.0, 168.0, 155.0, 174.0, 165.0, 157.0, 155.0, 136.0, 154.0, 133.0, 124.0, 167.0, 187.0, 158.0, 188.0, 161.0, 168.0, 130.0, 164.0, 109.0, 181.0, 166.0, 146.0, 145.0, 130.0, 132.0, 130.0, 145.0, 125.0, 107.0, 130.0, 147.0, 128.0, 137.0, 149.0, 151.0, 133.0, 117.0, 167.0, 153.0, 134.0, 131.0, 117.0, 116.0, 100.0, 125.0, 121.0, 139.0, 125.0, 139.0, 124.0, 118.0, 103.0, 142.0, 95.0, 127.0, 109.0, 102.0, 110.0, 119.0, 101.0, 129.0, 122.0, 143.0, 119.0, 131.0, 102.0, 117.0, 98.0, 140.0, 129.0, 106.0, 76.0, 115.0, 81.0, 87.0, 118.0, 84.0, 101.0, 118.0, 99.0, 99.0, 107.0, 108.0, 137.0, 131.0, 109.0, 123.0, 107.0, 104.0, 102.0, 138.0, 125.0, 119.0, 91.0, 79.0, 87.0, 112.0, 104.0, 98.0, 101.0, 109.0, 135.0, 98.0, 89.0, 117.0, 106.0, 127.0, 103.0, 111.0, 122.0, 102.0, 92.0, 99.0, 110.0, 93.0, 123.0, 114.0, 133.0, 87.0, 114.0, 121.0, 111.0, 95.0, 93.0, 102.0, 127.0, 88.0, 127.0, 114.0, 107.0, 110.0, 101.0, 110.0, 108.0, 99.0, 106.0, 126.0, 92.0, 96.0, 94.0, 77.0, 124.0, 119.0, 91.0, 105.0, 110.0, 103.0, 97.0, 116.0, 104.0, 97.0, 117.0, 92.0, 110.0, 114.0, 97.0, 101.0, 92.0, 105.0, 93.0, 141.0, 93.0, 106.0, 116.0, 107.0, 122.0, 107.0, 128.0, 100.0, 94.0, 105.0, 124.0, 114.0, 94.0, 80.0, 98.0, 105.0, 97.0, 99.0, 132.0, 94.0, 99.0, 93.0, 108.0, 108.0, 107.0, 111.0, 134.0, 114.0, 104.0, 102.0, 123.0, 108.0, 109.0, 107.0, 110.0, 121.0, 92.0, 94.0, 130.0, 128.0, 130.0, 83.0, 110.0, 130.0, 105.0, 99.0, 106.0, 107.0, 101.0, 100.0, 98.0, 131.0, 101.0, 116.0, 89.0, 106.0, 114.0, 115.0, 112.0, 110.0, 128.0, 92.0, 88.0, 112.0, 108.0, 106.0, 83.0, 113.0, 129.0, 126.0, 99.0, 118.0, 98.0, 101.0, 102.0, 103.0, 119.0, 126.0, 128.0, 110.0, 107.0, 128.0, 125.0, 119.0, 113.0, 89.0, 102.0, 103.0, 126.0, 141.0, 95.0, 106.0, 117.0, 109.0, 93.0, 109.0, 111.0, 138.0, 124.0, 114.0, 106.0, 92.0, 109.0, 105.0, 144.0, 122.0, 108.0, 112.0, 86.0, 100.0, 127.0, 108.0, 100.0, 113.0, 99.0, 103.0, 104.0, 96.0, 125.0, 122.0, 97.0, 128.0, 117.0, 121.0, 133.0, 115.0, 95.0, 126.0, 117.0, 136.0, 118.0, 108.0, 135.0, 109.0, 114.0, 124.0, 122.0, 106.0, 110.0, 124.0]}, "num-zeros vs samples": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [58.0, 81.0, 78.0, 82.0, 76.0, 95.0, 104.0, 114.0, 114.0, 147.0, 119.0, 159.0, 165.0, 173.0, 182.0, 167.0, 188.0, 176.0, 167.0, 165.0, 187.0, 162.0, 191.0, 164.0, 181.0, 170.0, 168.0, 172.0, 182.0, 180.0, 164.0, 171.0, 169.0, 154.0, 144.0, 172.0, 173.0, 198.0, 168.0, 210.0, 178.0, 
156.0, 174.0, 177.0, 163.0, 172.0, 206.0, 172.0, 184.0, 197.0, 223.0, 153.0, 162.0, 187.0, 173.0, 201.0, 146.0, 152.0, 240.0, 231.0, 192.0, 208.0, 162.0, 210.0, 192.0, 282.0, 232.0, 174.0, 215.0, 186.0, 227.0, 258.0, 202.0, 265.0, 192.0, 216.0, 239.0, 200.0, 265.0, 210.0, 264.0, 231.0, 179.0, 221.0, 234.0, 184.0, 188.0, 206.0, 157.0, 228.0, 217.0, 227.0, 219.0, 233.0, 191.0, 187.0, 214.0, 190.0, 237.0, 168.0, 155.0, 174.0, 165.0, 157.0, 155.0, 136.0, 154.0, 133.0, 124.0, 167.0, 187.0, 158.0, 188.0, 161.0, 168.0, 130.0, 164.0, 109.0, 181.0, 166.0, 146.0, 145.0, 130.0, 132.0, 130.0, 145.0, 125.0, 107.0, 130.0, 147.0, 128.0, 137.0, 149.0, 151.0, 133.0, 117.0, 167.0, 153.0, 134.0, 131.0, 117.0, 116.0, 100.0, 125.0, 121.0, 139.0, 125.0, 139.0, 124.0, 118.0, 103.0, 142.0, 95.0, 127.0, 109.0, 102.0, 110.0, 119.0, 101.0, 129.0, 122.0, 143.0, 119.0, 131.0, 102.0, 117.0, 98.0, 140.0, 129.0, 106.0, 76.0, 115.0, 81.0, 87.0, 118.0, 84.0, 101.0, 118.0, 99.0, 99.0, 107.0, 108.0, 137.0, 131.0, 109.0, 123.0, 107.0, 104.0, 102.0, 138.0, 125.0, 119.0, 91.0, 79.0, 87.0, 112.0, 104.0, 98.0, 101.0, 109.0, 135.0, 98.0, 89.0, 117.0, 106.0, 127.0, 103.0, 111.0, 122.0, 102.0, 92.0, 99.0, 110.0, 93.0, 123.0, 114.0, 133.0, 87.0, 114.0, 121.0, 111.0, 95.0, 93.0, 102.0, 127.0, 88.0, 127.0, 114.0, 107.0, 110.0, 101.0, 110.0, 108.0, 99.0, 106.0, 126.0, 92.0, 96.0, 94.0, 77.0, 124.0, 119.0, 91.0, 105.0, 110.0, 103.0, 97.0, 116.0, 104.0, 97.0, 117.0, 92.0, 110.0, 114.0, 97.0, 101.0, 92.0, 105.0, 93.0, 141.0, 93.0, 106.0, 116.0, 107.0, 122.0, 107.0, 128.0, 100.0, 94.0, 105.0, 124.0, 114.0, 94.0, 80.0, 98.0, 105.0, 97.0, 99.0, 132.0, 94.0, 99.0, 93.0, 108.0, 108.0, 107.0, 111.0, 134.0, 114.0, 104.0, 102.0, 123.0, 108.0, 109.0, 107.0, 110.0, 121.0, 92.0, 94.0, 130.0, 128.0, 130.0, 83.0, 110.0, 130.0, 105.0, 99.0, 106.0, 107.0, 101.0, 100.0, 98.0, 131.0, 101.0, 116.0, 89.0, 106.0, 114.0, 115.0, 112.0, 110.0, 128.0, 92.0, 88.0, 112.0, 108.0, 106.0, 83.0, 113.0, 129.0, 126.0, 99.0, 118.0, 98.0, 101.0, 102.0, 103.0, 119.0, 126.0, 128.0, 110.0, 107.0, 128.0, 125.0, 119.0, 113.0, 89.0, 102.0, 103.0, 126.0, 141.0, 95.0, 106.0, 117.0, 109.0, 93.0, 109.0, 111.0, 138.0, 124.0, 114.0, 106.0, 92.0, 109.0, 105.0, 144.0, 122.0, 108.0, 112.0, 86.0, 100.0, 127.0, 108.0, 100.0, 113.0, 99.0, 103.0, 104.0, 96.0, 125.0, 122.0, 97.0, 128.0, 117.0, 121.0, 133.0, 115.0, 95.0, 126.0, 117.0, 136.0, 118.0, 108.0, 135.0, 109.0, 114.0, 124.0, 122.0, 106.0, 110.0, 124.0]}, "params-norm": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [179.95625, 179.95625, 179.95625, 179.95625, 179.95625, 179.95624, 179.95621, 179.95612, 179.95593, 179.95575, 179.95447, 179.9538, 179.95322, 179.95126, 179.95035, 179.94966, 179.94905, 179.94916, 179.94939, 179.94966, 179.9496, 179.94933, 179.94919, 179.94952, 179.95036, 179.95168, 179.95288, 179.95392, 179.95509, 179.9565, 179.95851, 179.96069, 179.963, 179.96532, 179.96788, 179.97118, 179.97482, 179.97873, 179.98279, 179.98714, 179.99208, 179.99753, 180.00325, 180.00955, 180.01634, 180.02382, 180.03171, 180.04016, 180.04951, 180.05969, 180.07059, 180.08221, 180.09441, 180.10721, 180.12059, 180.13457, 180.14899, 180.16373, 180.1792, 180.19586, 180.21344, 180.23199, 180.25226, 180.2733, 180.2948, 180.31709, 180.34032, 180.36464, 180.38991, 180.41573, 180.44231, 180.46947, 180.49721, 180.52528, 180.55406, 180.5829, 180.61168, 180.64125, 180.67117, 180.70154, 180.73244, 180.76378, 180.79633, 180.82928, 180.86198, 180.89581, 180.92958, 180.96359, 180.99808, 181.03401, 181.07187, 181.1104, 181.14795, 
181.18536, 181.22249, 181.26071, 181.29898, 181.33658, 181.37422, 181.41164, 181.4467, 181.47968, 181.5123, 181.54552, 181.57919, 181.61421, 181.65012, 181.68695, 181.72267, 181.7587, 181.79526, 181.83344, 181.87288, 181.91354, 181.9543, 181.99518, 182.03568, 182.07515, 182.11353, 182.15218, 182.19164, 182.23108, 182.2708, 182.30989, 182.34795, 182.3871, 182.42479, 182.46089, 182.49536, 182.52867, 182.5638, 182.60063, 182.63989, 182.67992, 182.72049, 182.76151, 182.80296, 182.8448, 182.88582, 182.92665, 182.96825, 183.00778, 183.04619, 183.08208, 183.117, 183.15222, 183.18738, 183.22598, 183.2657, 183.30598, 183.34494, 183.38196, 183.41934, 183.45613, 183.49393, 183.53142, 183.56673, 183.60075, 183.63268, 183.66296, 183.69357, 183.7247, 183.76031, 183.79965, 183.83946, 183.87967, 183.91869, 183.95782, 183.99774, 184.03601, 184.07205, 184.10704, 184.14296, 184.17989, 184.21503, 184.24945, 184.28268, 184.31783, 184.35512, 184.39378, 184.43393, 184.47366, 184.51508, 184.55717, 184.59872, 184.64001, 184.68074, 184.71964, 184.75798, 184.79604, 184.83191, 184.86661, 184.90184, 184.9364, 184.96959, 185.00362, 185.0423, 185.08412, 185.12758, 185.17178, 185.21582, 185.26006, 185.30214, 185.34361, 185.3847, 185.42496, 185.46634, 185.50591, 185.54526, 185.58424, 185.62386, 185.6624, 185.7025, 185.74159, 185.78154, 185.82208, 185.86279, 185.90271, 185.94293, 185.98375, 186.0233, 186.05884, 186.09236, 186.12791, 186.16458, 186.20477, 186.24573, 186.28658, 186.32719, 186.36766, 186.40819, 186.44913, 186.48967, 186.53146, 186.57472, 186.61908, 186.66409, 186.70798, 186.75232, 186.79475, 186.83501, 186.8761, 186.91815, 186.96135, 187.00375, 187.04543, 187.08774, 187.13051, 187.17398, 187.21738, 187.26135, 187.30682, 187.3519, 187.39789, 187.44398, 187.48967, 187.53412, 187.57758, 187.62079, 187.66299, 187.70578, 187.74741, 187.79074, 187.83516, 187.8799, 187.92366, 187.9662, 188.00873, 188.0517, 188.09543, 188.13933, 188.183, 188.2269, 188.2719, 188.31848, 188.36552, 188.41412, 188.46288, 188.51031, 188.55696, 188.60126, 188.64514, 188.68958, 188.7356, 188.78317, 188.82912, 188.87651, 188.92406, 188.97069, 189.0186, 189.06526, 189.11108, 189.15532, 189.20073, 189.24802, 189.29507, 189.3419, 189.38878, 189.43637, 189.48433, 189.53323, 189.58208, 189.63031, 189.67888, 189.72659, 189.7742, 189.82292, 189.87331, 189.92422, 189.97572, 190.02654, 190.07675, 190.12685, 190.17654, 190.22655, 190.27744, 190.32918, 190.38191, 190.43228, 190.48412, 190.53688, 190.58897, 190.6412, 190.69144, 190.74126, 190.79027, 190.84029, 190.89107, 190.94135, 190.99312, 191.04454, 191.09538, 191.14601, 191.19763, 191.25024, 191.3022, 191.35342, 191.40527, 191.45781, 191.51038, 191.56477, 191.61903, 191.67284, 191.72745, 191.78351, 191.83809, 191.89211, 191.94516, 191.99768, 192.0515, 192.10683, 192.16144, 192.21646, 192.27127, 192.3248, 192.37834, 192.43166, 192.48701, 192.54335, 192.59961, 192.65665, 192.71281, 192.76929, 192.82428, 192.88118, 192.93932, 192.99641, 193.05295, 193.10945, 193.16679, 193.22235, 193.27766, 193.33466, 193.38956, 193.44543, 193.4995, 193.55339, 193.60861, 193.66547, 193.72427, 193.78304, 193.84152, 193.8996, 193.95851, 194.01683, 194.07661, 194.13618, 194.19662, 194.25862, 194.32071, 194.3831, 194.44382, 194.50331, 194.56212, 194.62186, 194.67973, 194.73642, 194.7941, 194.85469, 194.91579]}, "params-norm vs samples": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [179.95625, 179.95625, 179.95625, 179.95625, 179.95625, 179.95624, 179.95621, 179.95612, 179.95593, 179.95575, 179.95447, 
179.9538, 179.95322, 179.95126, 179.95035, 179.94966, 179.94905, 179.94916, 179.94939, 179.94966, 179.9496, 179.94933, 179.94919, 179.94952, 179.95036, 179.95168, 179.95288, 179.95392, 179.95509, 179.9565, 179.95851, 179.96069, 179.963, 179.96532, 179.96788, 179.97118, 179.97482, 179.97873, 179.98279, 179.98714, 179.99208, 179.99753, 180.00325, 180.00955, 180.01634, 180.02382, 180.03171, 180.04016, 180.04951, 180.05969, 180.07059, 180.08221, 180.09441, 180.10721, 180.12059, 180.13457, 180.14899, 180.16373, 180.1792, 180.19586, 180.21344, 180.23199, 180.25226, 180.2733, 180.2948, 180.31709, 180.34032, 180.36464, 180.38991, 180.41573, 180.44231, 180.46947, 180.49721, 180.52528, 180.55406, 180.5829, 180.61168, 180.64125, 180.67117, 180.70154, 180.73244, 180.76378, 180.79633, 180.82928, 180.86198, 180.89581, 180.92958, 180.96359, 180.99808, 181.03401, 181.07187, 181.1104, 181.14795, 181.18536, 181.22249, 181.26071, 181.29898, 181.33658, 181.37422, 181.41164, 181.4467, 181.47968, 181.5123, 181.54552, 181.57919, 181.61421, 181.65012, 181.68695, 181.72267, 181.7587, 181.79526, 181.83344, 181.87288, 181.91354, 181.9543, 181.99518, 182.03568, 182.07515, 182.11353, 182.15218, 182.19164, 182.23108, 182.2708, 182.30989, 182.34795, 182.3871, 182.42479, 182.46089, 182.49536, 182.52867, 182.5638, 182.60063, 182.63989, 182.67992, 182.72049, 182.76151, 182.80296, 182.8448, 182.88582, 182.92665, 182.96825, 183.00778, 183.04619, 183.08208, 183.117, 183.15222, 183.18738, 183.22598, 183.2657, 183.30598, 183.34494, 183.38196, 183.41934, 183.45613, 183.49393, 183.53142, 183.56673, 183.60075, 183.63268, 183.66296, 183.69357, 183.7247, 183.76031, 183.79965, 183.83946, 183.87967, 183.91869, 183.95782, 183.99774, 184.03601, 184.07205, 184.10704, 184.14296, 184.17989, 184.21503, 184.24945, 184.28268, 184.31783, 184.35512, 184.39378, 184.43393, 184.47366, 184.51508, 184.55717, 184.59872, 184.64001, 184.68074, 184.71964, 184.75798, 184.79604, 184.83191, 184.86661, 184.90184, 184.9364, 184.96959, 185.00362, 185.0423, 185.08412, 185.12758, 185.17178, 185.21582, 185.26006, 185.30214, 185.34361, 185.3847, 185.42496, 185.46634, 185.50591, 185.54526, 185.58424, 185.62386, 185.6624, 185.7025, 185.74159, 185.78154, 185.82208, 185.86279, 185.90271, 185.94293, 185.98375, 186.0233, 186.05884, 186.09236, 186.12791, 186.16458, 186.20477, 186.24573, 186.28658, 186.32719, 186.36766, 186.40819, 186.44913, 186.48967, 186.53146, 186.57472, 186.61908, 186.66409, 186.70798, 186.75232, 186.79475, 186.83501, 186.8761, 186.91815, 186.96135, 187.00375, 187.04543, 187.08774, 187.13051, 187.17398, 187.21738, 187.26135, 187.30682, 187.3519, 187.39789, 187.44398, 187.48967, 187.53412, 187.57758, 187.62079, 187.66299, 187.70578, 187.74741, 187.79074, 187.83516, 187.8799, 187.92366, 187.9662, 188.00873, 188.0517, 188.09543, 188.13933, 188.183, 188.2269, 188.2719, 188.31848, 188.36552, 188.41412, 188.46288, 188.51031, 188.55696, 188.60126, 188.64514, 188.68958, 188.7356, 188.78317, 188.82912, 188.87651, 188.92406, 188.97069, 189.0186, 189.06526, 189.11108, 189.15532, 189.20073, 189.24802, 189.29507, 189.3419, 189.38878, 189.43637, 189.48433, 189.53323, 189.58208, 189.63031, 189.67888, 189.72659, 189.7742, 189.82292, 189.87331, 189.92422, 189.97572, 190.02654, 190.07675, 190.12685, 190.17654, 190.22655, 190.27744, 190.32918, 190.38191, 190.43228, 190.48412, 190.53688, 190.58897, 190.6412, 190.69144, 190.74126, 190.79027, 190.84029, 190.89107, 190.94135, 190.99312, 191.04454, 191.09538, 191.14601, 191.19763, 191.25024, 191.3022, 191.35342, 191.40527, 
191.45781, 191.51038, 191.56477, 191.61903, 191.67284, 191.72745, 191.78351, 191.83809, 191.89211, 191.94516, 191.99768, 192.0515, 192.10683, 192.16144, 192.21646, 192.27127, 192.3248, 192.37834, 192.43166, 192.48701, 192.54335, 192.59961, 192.65665, 192.71281, 192.76929, 192.82428, 192.88118, 192.93932, 192.99641, 193.05295, 193.10945, 193.16679, 193.22235, 193.27766, 193.33466, 193.38956, 193.44543, 193.4995, 193.55339, 193.60861, 193.66547, 193.72427, 193.78304, 193.84152, 193.8996, 193.95851, 194.01683, 194.07661, 194.13618, 194.19662, 194.25862, 194.32071, 194.3831, 194.44382, 194.50331, 194.56212, 194.62186, 194.67973, 194.73642, 194.7941, 194.85469, 194.91579]}, "iteration-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [18.78556, 0.6433, 0.64729, 0.63688, 0.63863, 0.64094, 0.6349, 0.97491, 0.63959, 0.63938, 0.63992, 0.63559, 0.63842, 0.63697, 0.63738, 0.64112, 0.63959, 0.64348, 0.63705, 0.6364, 0.63918, 0.63292, 0.6437, 0.64018, 0.639, 0.63548, 0.63416, 0.64052, 0.6394, 0.64087, 0.93505, 0.64011, 0.63922, 0.63683, 0.63698, 0.63707, 0.63678, 0.63951, 0.63884, 0.63971, 0.64127, 0.63397, 0.63425, 0.63678, 0.64689, 0.63996, 0.6373, 0.63968, 0.63439, 0.63168, 0.63761, 0.63699, 0.63824, 0.71804, 0.64031, 0.63865, 0.64029, 0.63765, 0.63483, 0.63106, 0.64044, 0.64084, 0.64009, 0.63302, 0.63552, 0.634, 0.64042, 0.62983, 0.63367, 0.63643, 0.6354, 0.63829, 0.64059, 0.75259, 0.63372, 0.63627, 0.6387, 0.73904, 0.63828, 0.63771, 0.6359, 0.63693, 0.63456, 0.63441, 0.63425, 0.63785, 0.63673, 0.63659, 0.63691, 0.63886, 0.63666, 0.63099, 0.63434, 0.63606, 0.63766, 0.63693, 0.63641, 0.63421, 0.74335, 0.63417, 0.73325, 0.63333, 0.63749, 0.63466, 0.63579, 0.6328, 0.63166, 0.63446, 0.63178, 0.63147, 0.63478, 0.63778, 0.63144, 0.63332, 0.63409, 0.63176, 0.63302, 0.63438, 0.63574, 0.63649, 0.63622, 0.63188, 0.63339, 0.63517, 0.72118, 0.63229, 0.63429, 0.63655, 0.63599, 0.6353, 0.63271, 0.63372, 0.64125, 0.63512, 0.63455, 0.63532, 0.63725, 0.63591, 0.63729, 0.63999, 0.63638, 0.63338, 0.63695, 0.63822, 0.64221, 0.635, 0.63426, 0.63954, 0.63843, 0.75293, 0.63573, 0.63901, 0.63561, 0.63959, 0.6361, 0.63665, 0.64435, 0.63719, 0.63371, 0.63219, 0.6406, 0.64456, 0.63924, 0.635, 0.6327, 0.6352, 0.63564, 0.63957, 0.63877, 0.73034, 0.73934, 0.64019, 0.63815, 0.63937, 0.75337, 0.63669, 0.63936, 0.63737, 0.6461, 0.63756, 0.63312, 0.63542, 0.63878, 0.6388, 0.64047, 0.63637, 0.63586, 0.63666, 0.63721, 0.63734, 0.63786, 0.63594, 0.8184, 0.73163, 0.72764, 0.63564, 0.63408, 0.63622, 0.64045, 0.63686, 0.62364, 0.64914, 0.64308, 0.64069, 0.63927, 0.64269, 0.64288, 0.64533, 0.64376, 0.64236, 0.64125, 0.64212, 0.6369, 0.63583, 0.74464, 0.63698, 0.72591, 0.64074, 0.73419, 0.63849, 0.63726, 0.64412, 0.64282, 0.75083, 0.63592, 0.63941, 0.63766, 0.63791, 0.63977, 0.63509, 0.6399, 0.64297, 0.63884, 0.63671, 0.6435, 0.64374, 0.64843, 0.64579, 0.63861, 0.64594, 0.64077, 0.63925, 0.72846, 0.639, 0.64699, 0.6369, 0.63194, 0.63558, 0.64203, 0.63965, 0.63904, 0.63895, 0.63899, 0.64164, 0.63997, 0.63805, 0.63955, 0.63823, 0.64646, 0.64468, 0.64926, 0.64434, 0.6452, 0.64591, 0.64664, 0.63886, 0.731, 0.64411, 0.64842, 0.6425, 0.64476, 0.63269, 0.63913, 0.63471, 0.63896, 0.63597, 0.63778, 0.63815, 0.6401, 0.64693, 0.64595, 0.64455, 0.64718, 0.64189, 0.63449, 0.75535, 0.6495, 0.6344, 0.63238, 0.64302, 0.6447, 0.64478, 0.63878, 0.63865, 0.64385, 0.64709, 0.64475, 0.63872, 0.63717, 0.64047, 0.64341, 0.6397, 0.64191, 0.63957, 0.63403, 0.64098, 0.64479, 0.64926, 0.74478, 0.73898, 0.64632, 0.64647, 0.63797, 0.64641, 
0.64397, 0.64203, 0.645, 0.64045, 0.64179, 0.64038, 0.64201, 0.64156, 0.64501, 0.64116, 0.63858, 0.63331, 0.63441, 0.63583, 0.64119, 0.6353, 0.63464, 0.63359, 0.63663, 0.64109, 0.6316, 0.63418, 0.63702, 0.63806, 0.64097, 0.63561, 0.63886, 0.63666, 0.63662, 0.64007, 0.64226, 0.64759, 0.64499, 0.6441, 0.63331, 0.63366, 0.63388, 0.64218, 0.6449, 0.7739, 0.64344, 0.64344, 0.64738, 0.64398, 0.64107, 0.64511, 0.64245, 0.64068, 0.6375, 0.63653, 0.63463, 0.63795, 0.64039, 0.6391, 0.63754, 0.63814, 0.64098, 0.63698, 0.63569, 0.63797, 0.63695, 0.64036, 0.63449, 0.63592, 0.72519, 0.64273, 0.63744, 0.63929, 0.63719, 0.64021, 0.64007, 0.63925, 0.63833, 0.63918, 0.63915, 0.64067, 0.64172, 0.63687, 0.63877, 0.63737, 0.64309, 0.6455, 0.64316, 0.63731, 0.6383, 0.63962]}, "lm loss validation": {"start_step": 0, "end_step": 4, "step_interval": 5, "values": [5.60423]}, "lm loss validation vs samples": {"start_step": 0, "end_step": 4, "step_interval": 5, "values": [5.60423]}, "lm loss validation ppl": {"start_step": 0, "end_step": 4, "step_interval": 5, "values": [271.57376]}, "lm loss validation ppl vs samples": {"start_step": 0, "end_step": 4, "step_interval": 5, "values": [271.57376]}} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp1_pp1_fp8_no_model_parallel/golden_values_lts.json b/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp1_pp1_fp8_no_model_parallel/golden_values_lts.json new file mode 100644 index 0000000000..6a88c3a850 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp1_pp1_fp8_no_model_parallel/golden_values_lts.json @@ -0,0 +1 @@ +{"forward-backward-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [18.14133, 0.62524, 0.62888, 0.61879, 0.62017, 0.62262, 0.61644, 0.95648, 0.62134, 0.62122, 0.62167, 0.61736, 0.62014, 0.61878, 0.61905, 0.62285, 0.62143, 0.62527, 0.6188, 0.61821, 0.62092, 0.6146, 0.62538, 0.62186, 0.62062, 0.61709, 0.61586, 0.62211, 0.62113, 0.62256, 0.91616, 0.62172, 0.62082, 0.61854, 0.61851, 0.61865, 0.61838, 0.62057, 0.62054, 0.62121, 0.62279, 0.61565, 0.61588, 0.61809, 0.6285, 0.62159, 0.619, 0.62096, 0.6161, 0.61341, 0.61939, 0.61863, 0.61901, 0.69973, 0.62205, 0.6203, 0.62205, 0.61913, 0.61593, 0.61268, 0.62209, 0.62242, 0.62178, 0.61463, 0.61723, 0.61562, 0.62222, 0.61147, 0.61537, 0.61793, 0.61712, 0.61962, 0.62226, 0.73426, 0.61519, 0.61809, 0.62057, 0.72077, 0.62008, 0.6196, 0.61771, 0.61875, 0.61628, 0.61618, 0.61608, 0.61962, 0.61838, 0.61834, 0.61866, 0.62047, 0.61852, 0.61278, 0.61478, 0.61796, 0.61939, 0.61855, 0.61816, 0.61585, 0.72525, 0.61589, 0.71497, 0.61452, 0.61899, 0.61647, 0.61769, 0.61448, 0.6133, 0.6161, 0.61341, 0.61318, 0.61661, 0.61966, 0.61316, 0.61487, 0.61573, 0.61347, 0.61386, 0.61593, 0.61745, 0.6185, 0.61792, 0.61356, 0.61533, 0.61644, 0.70276, 0.61398, 0.6159, 0.61832, 0.61774, 0.61711, 0.61411, 0.61533, 0.62272, 0.61709, 0.61557, 0.61705, 0.61893, 0.6177, 0.61888, 0.62207, 0.6181, 0.61501, 0.61758, 0.61994, 0.62402, 0.61667, 0.61599, 0.62131, 0.62011, 0.73481, 0.61752, 0.6206, 0.61654, 0.62124, 0.61775, 0.61832, 0.62597, 0.61901, 0.6153, 0.61393, 0.62147, 0.62628, 0.62091, 0.61689, 0.61436, 0.61683, 0.61743, 0.62116, 0.62033, 0.71198, 0.71973, 0.62179, 0.61968, 0.62104, 0.73504, 0.61833, 0.62098, 0.61898, 0.62766, 0.61917, 0.61475, 0.61706, 0.62025, 0.62046, 0.62146, 0.61796, 0.61756, 0.61818, 0.61889, 0.61869, 0.61959, 0.61761, 0.79997, 0.71316, 0.7092, 0.61693, 0.61553, 
0.61793, 0.62191, 0.61846, 0.60521, 0.63066, 0.62491, 0.6225, 0.62102, 0.62456, 0.6247, 0.6269, 0.62537, 0.62411, 0.6231, 0.62397, 0.61873, 0.61766, 0.72647, 0.61878, 0.70741, 0.62227, 0.71605, 0.62022, 0.61781, 0.62597, 0.62427, 0.73275, 0.61764, 0.62069, 0.61913, 0.61957, 0.62075, 0.61693, 0.62163, 0.62496, 0.62065, 0.61855, 0.62534, 0.62563, 0.63027, 0.62765, 0.62046, 0.62782, 0.6225, 0.62116, 0.71019, 0.62081, 0.62867, 0.61875, 0.61378, 0.61727, 0.6238, 0.62162, 0.62088, 0.61962, 0.62082, 0.62352, 0.62164, 0.62001, 0.62139, 0.62, 0.62818, 0.6266, 0.63112, 0.62627, 0.62702, 0.62774, 0.62831, 0.62063, 0.71258, 0.62584, 0.63033, 0.62439, 0.62649, 0.61461, 0.6209, 0.61667, 0.62067, 0.61793, 0.61954, 0.61977, 0.622, 0.6288, 0.62767, 0.62589, 0.62912, 0.62368, 0.61631, 0.73714, 0.6313, 0.61624, 0.61414, 0.62482, 0.6265, 0.62661, 0.62057, 0.62063, 0.62436, 0.62886, 0.62643, 0.62055, 0.61891, 0.62228, 0.62509, 0.62152, 0.62371, 0.62145, 0.61596, 0.62278, 0.62635, 0.63114, 0.72659, 0.72093, 0.62818, 0.62831, 0.61965, 0.62825, 0.62531, 0.6239, 0.6269, 0.6223, 0.62369, 0.62215, 0.62376, 0.62336, 0.62681, 0.62299, 0.62046, 0.61497, 0.61616, 0.61762, 0.62291, 0.61731, 0.61644, 0.61524, 0.61842, 0.62286, 0.61327, 0.61596, 0.6185, 0.61983, 0.62272, 0.61746, 0.6207, 0.6179, 0.61849, 0.62196, 0.62408, 0.62953, 0.62672, 0.62606, 0.61511, 0.61549, 0.6159, 0.62334, 0.62662, 0.75567, 0.62523, 0.62516, 0.62916, 0.62575, 0.62292, 0.62685, 0.62432, 0.62244, 0.61921, 0.61816, 0.61641, 0.61968, 0.62202, 0.6208, 0.6193, 0.61995, 0.62245, 0.61844, 0.61724, 0.61904, 0.61874, 0.62205, 0.6161, 0.61772, 0.70649, 0.62431, 0.61921, 0.62093, 0.61887, 0.62189, 0.62184, 0.62081, 0.62021, 0.62093, 0.62086, 0.62164, 0.6235, 0.61872, 0.62062, 0.61908, 0.62491, 0.62732, 0.62504, 0.61899, 0.62006, 0.6215]}, "forward-compute-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [9.27215, 0.36134, 0.36093, 0.35232, 0.35362, 0.35668, 0.35229, 0.68753, 0.35087, 0.35407, 0.35147, 0.35356, 0.35146, 0.35384, 0.35274, 0.35595, 0.35404, 0.35262, 0.35078, 0.34962, 0.35338, 0.34834, 0.35424, 0.35549, 0.35524, 0.34948, 0.35114, 0.35465, 0.35306, 0.35417, 0.64338, 0.35253, 0.35038, 0.34824, 0.3516, 0.35295, 0.35334, 0.3507, 0.3518, 0.35354, 0.35258, 0.3508, 0.35045, 0.35367, 0.35832, 0.35222, 0.35029, 0.35265, 0.35179, 0.34702, 0.35321, 0.35445, 0.35177, 0.43752, 0.35531, 0.35287, 0.3529, 0.34925, 0.35154, 0.34648, 0.34908, 0.35314, 0.34798, 0.3481, 0.35014, 0.35038, 0.35008, 0.34793, 0.34843, 0.35226, 0.35123, 0.34921, 0.351, 0.46524, 0.34642, 0.35022, 0.34926, 0.45533, 0.35075, 0.35197, 0.34952, 0.35294, 0.35156, 0.35367, 0.35231, 0.35148, 0.34881, 0.34904, 0.35192, 0.35269, 0.35151, 0.34592, 0.34953, 0.35046, 0.35109, 0.35197, 0.35201, 0.34972, 0.45764, 0.34845, 0.44993, 0.34761, 0.35227, 0.34673, 0.35005, 0.34603, 0.34781, 0.34961, 0.34726, 0.3482, 0.3514, 0.35199, 0.34526, 0.3478, 0.35064, 0.34875, 0.35162, 0.34733, 0.3494, 0.34825, 0.35136, 0.34918, 0.34966, 0.34867, 0.43767, 0.34863, 0.35097, 0.35094, 0.34677, 0.35081, 0.35072, 0.35015, 0.35172, 0.35213, 0.34826, 0.34865, 0.35048, 0.3496, 0.34911, 0.35588, 0.35342, 0.35191, 0.35141, 0.35102, 0.35709, 0.34876, 0.34872, 0.35106, 0.35322, 0.46707, 0.35188, 0.35176, 0.35, 0.35379, 0.3509, 0.35081, 0.3551, 0.35093, 0.34933, 0.34848, 0.35167, 0.35398, 0.34723, 0.34792, 0.34845, 0.34775, 0.35079, 0.34957, 0.35345, 0.44501, 0.45138, 0.34891, 0.35082, 0.3502, 0.46589, 0.35255, 0.35187, 0.35127, 0.35483, 0.35059, 0.34896, 0.34861, 0.35247, 0.35179, 0.34935, 0.35234, 
0.34933, 0.35334, 0.34686, 0.35171, 0.35547, 0.35168, 0.52709, 0.44719, 0.44161, 0.34936, 0.34954, 0.35313, 0.34988, 0.35211, 0.33688, 0.35591, 0.3569, 0.35308, 0.35372, 0.35241, 0.35314, 0.35633, 0.353, 0.35616, 0.35467, 0.35273, 0.3514, 0.35129, 0.45541, 0.3499, 0.44221, 0.35081, 0.44665, 0.35109, 0.35024, 0.35427, 0.35423, 0.46289, 0.34881, 0.35173, 0.34964, 0.35399, 0.35206, 0.35147, 0.35326, 0.35451, 0.35111, 0.35112, 0.35937, 0.35913, 0.36067, 0.35939, 0.35289, 0.35237, 0.34936, 0.35284, 0.44138, 0.35073, 0.35858, 0.35425, 0.34953, 0.35087, 0.35453, 0.35091, 0.35251, 0.34904, 0.35282, 0.35193, 0.35492, 0.35161, 0.35115, 0.35118, 0.36151, 0.35849, 0.36407, 0.35821, 0.36041, 0.35561, 0.36252, 0.35429, 0.44699, 0.36096, 0.36201, 0.35407, 0.35747, 0.35035, 0.35103, 0.34874, 0.35637, 0.3524, 0.35102, 0.35202, 0.35462, 0.35968, 0.35397, 0.35259, 0.35547, 0.35321, 0.35018, 0.46643, 0.3583, 0.35092, 0.34697, 0.3538, 0.35589, 0.35223, 0.35164, 0.35261, 0.35967, 0.36013, 0.35806, 0.35023, 0.35024, 0.3526, 0.34984, 0.35259, 0.35298, 0.35284, 0.35138, 0.35036, 0.35288, 0.35847, 0.45332, 0.44559, 0.35561, 0.35336, 0.3521, 0.35312, 0.35227, 0.35234, 0.35359, 0.35468, 0.35224, 0.35204, 0.35651, 0.35583, 0.35358, 0.35435, 0.35427, 0.3497, 0.35079, 0.35172, 0.35517, 0.35178, 0.35126, 0.34889, 0.35033, 0.35332, 0.34892, 0.35261, 0.35094, 0.35215, 0.35764, 0.35341, 0.35384, 0.35265, 0.35263, 0.35262, 0.35604, 0.36288, 0.35642, 0.35552, 0.3484, 0.34851, 0.3514, 0.36023, 0.35789, 0.48902, 0.36035, 0.36141, 0.3626, 0.35908, 0.35622, 0.35631, 0.35269, 0.35075, 0.35039, 0.35096, 0.35039, 0.34953, 0.35289, 0.34822, 0.35154, 0.35088, 0.35383, 0.35072, 0.34872, 0.34826, 0.34902, 0.35267, 0.34801, 0.34971, 0.43955, 0.35085, 0.34994, 0.35373, 0.34855, 0.3492, 0.35231, 0.34725, 0.35003, 0.3473, 0.35104, 0.34755, 0.34992, 0.35186, 0.35388, 0.35074, 0.34993, 0.35194, 0.35167, 0.34626, 0.35392, 0.35198]}, "backward-compute-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [2.80897, 0.28475, 0.2809, 0.27885, 0.27971, 0.2768, 0.27791, 0.2813, 0.2828, 0.27982, 0.28277, 0.27676, 0.28261, 0.27806, 0.28033, 0.2756, 0.28082, 0.27955, 0.28018, 0.27766, 0.27802, 0.27721, 0.28203, 0.27953, 0.27943, 0.27922, 0.27814, 0.28056, 0.28107, 0.27624, 0.28037, 0.28169, 0.2828, 0.28312, 0.28074, 0.27837, 0.27679, 0.28303, 0.2829, 0.28043, 0.27823, 0.27266, 0.27336, 0.27459, 0.28023, 0.27652, 0.27746, 0.2779, 0.27563, 0.27401, 0.27717, 0.27499, 0.27806, 0.27139, 0.27365, 0.27659, 0.28082, 0.28038, 0.27531, 0.27517, 0.28057, 0.27667, 0.28628, 0.27883, 0.27588, 0.27536, 0.27984, 0.2729, 0.27334, 0.27425, 0.27422, 0.27613, 0.27623, 0.2746, 0.27458, 0.27341, 0.27807, 0.27236, 0.27663, 0.27538, 0.27514, 0.27306, 0.2725, 0.27083, 0.27026, 0.27509, 0.27586, 0.27515, 0.27392, 0.27389, 0.27372, 0.2727, 0.27096, 0.27354, 0.27409, 0.27274, 0.27274, 0.27361, 0.27352, 0.27457, 0.27411, 0.27589, 0.27459, 0.27704, 0.27375, 0.27488, 0.27373, 0.27473, 0.27336, 0.27408, 0.27412, 0.27621, 0.27573, 0.2757, 0.27319, 0.27286, 0.27081, 0.27628, 0.27632, 0.27773, 0.27459, 0.27302, 0.27391, 0.27706, 0.27302, 0.27235, 0.2728, 0.27422, 0.27771, 0.27408, 0.273, 0.27313, 0.27881, 0.2727, 0.27535, 0.27554, 0.27602, 0.27445, 0.27748, 0.27334, 0.27196, 0.27246, 0.27334, 0.2765, 0.27324, 0.27646, 0.27446, 0.27758, 0.27638, 0.2749, 0.27379, 0.27822, 0.27586, 0.27434, 0.27452, 0.2751, 0.27681, 0.27448, 0.27334, 0.27477, 0.27831, 0.27967, 0.28117, 0.27795, 0.27331, 0.27527, 0.27361, 0.27892, 0.27512, 0.27366, 0.27646, 0.27988, 0.27713, 0.27762, 
0.27574, 0.27463, 0.27934, 0.27654, 0.28122, 0.27818, 0.27487, 0.27565, 0.27548, 0.27639, 0.27869, 0.27377, 0.27686, 0.2737, 0.27871, 0.27425, 0.27333, 0.27386, 0.27879, 0.2752, 0.27707, 0.27628, 0.27433, 0.27416, 0.28211, 0.27328, 0.27772, 0.2888, 0.28238, 0.28559, 0.28328, 0.28926, 0.29069, 0.28744, 0.28541, 0.28383, 0.28569, 0.28878, 0.28294, 0.28177, 0.28457, 0.28391, 0.27915, 0.28556, 0.28795, 0.28723, 0.28157, 0.28876, 0.288, 0.28233, 0.28245, 0.28563, 0.28586, 0.27943, 0.28324, 0.27971, 0.28335, 0.28509, 0.28373, 0.28221, 0.27996, 0.2821, 0.28282, 0.28146, 0.2827, 0.29287, 0.28819, 0.28375, 0.28224, 0.28618, 0.28593, 0.27803, 0.2775, 0.27939, 0.28305, 0.28516, 0.28387, 0.28394, 0.27989, 0.28606, 0.28244, 0.28311, 0.2822, 0.28452, 0.28083, 0.28371, 0.27966, 0.28404, 0.27905, 0.28671, 0.28017, 0.28042, 0.27826, 0.27799, 0.28104, 0.28485, 0.2833, 0.27803, 0.28505, 0.28078, 0.27731, 0.27811, 0.2825, 0.2845, 0.28366, 0.28285, 0.29128, 0.28986, 0.28737, 0.28519, 0.28008, 0.28508, 0.29026, 0.27934, 0.27842, 0.28735, 0.28334, 0.29041, 0.28444, 0.28192, 0.27975, 0.28248, 0.28157, 0.28471, 0.28418, 0.28337, 0.29038, 0.28525, 0.28937, 0.28336, 0.28092, 0.28765, 0.2938, 0.28931, 0.28955, 0.29117, 0.29147, 0.29048, 0.28242, 0.29224, 0.28996, 0.28762, 0.28995, 0.28361, 0.28955, 0.28314, 0.28125, 0.28279, 0.28923, 0.28566, 0.28096, 0.27889, 0.27987, 0.28102, 0.28378, 0.27825, 0.27822, 0.28139, 0.28151, 0.284, 0.28038, 0.27763, 0.28234, 0.28237, 0.27877, 0.27839, 0.28213, 0.27969, 0.27977, 0.28461, 0.28193, 0.28295, 0.28539, 0.28439, 0.28043, 0.28021, 0.27978, 0.27678, 0.28057, 0.28152, 0.27875, 0.27736, 0.28042, 0.28071, 0.27701, 0.28009, 0.28081, 0.28054, 0.27846, 0.27695, 0.27435, 0.28018, 0.27863, 0.2831, 0.27711, 0.27774, 0.27798, 0.27776, 0.27805, 0.27924, 0.27943, 0.27863, 0.27639, 0.27628, 0.27471, 0.28218, 0.2775, 0.27692, 0.28008, 0.28228, 0.27856, 0.28233, 0.27871, 0.28388, 0.27878, 0.2831, 0.28268, 0.27716, 0.2756, 0.27712, 0.28343, 0.28463, 0.28241, 0.28327, 0.27551, 0.27892]}, "batch-generator-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.62041, 0.00418, 0.00386, 0.00419, 0.00438, 0.0044, 0.00464, 0.00467, 0.00468, 0.00448, 0.00443, 0.00436, 0.00461, 0.00452, 0.00471, 0.00475, 0.00426, 0.00443, 0.00451, 0.00448, 0.00454, 0.00422, 0.00444, 0.00458, 0.00446, 0.00447, 0.00432, 0.00458, 0.00459, 0.00455, 0.00456, 0.0044, 0.00451, 0.00445, 0.00465, 0.00435, 0.00439, 0.00431, 0.00431, 0.00453, 0.0045, 0.00449, 0.00456, 0.00437, 0.00432, 0.0043, 0.00442, 0.0045, 0.0042, 0.00427, 0.0045, 0.00438, 0.00447, 0.00452, 0.0046, 0.00429, 0.00439, 0.00441, 0.00462, 0.00448, 0.00409, 0.00434, 0.00448, 0.0042, 0.00454, 0.00422, 0.00431, 0.00413, 0.00439, 0.00414, 0.00456, 0.00464, 0.00426, 0.00434, 0.00414, 0.00453, 0.00423, 0.00453, 0.00431, 0.00403, 0.00414, 0.0043, 0.00446, 0.00423, 0.00437, 0.00434, 0.00419, 0.0042, 0.00433, 0.00435, 0.00443, 0.00408, 0.00416, 0.00451, 0.00443, 0.00435, 0.00446, 0.00421, 0.00467, 0.00454, 0.00431, 0.00462, 0.00433, 0.00426, 0.00437, 0.00437, 0.00433, 0.00435, 0.00426, 0.00413, 0.00435, 0.00422, 0.00431, 0.00432, 0.0043, 0.00408, 0.00435, 0.00438, 0.00439, 0.00426, 0.00438, 0.00432, 0.00449, 0.00423, 0.00444, 0.00436, 0.00417, 0.00424, 0.0042, 0.00428, 0.00425, 0.00425, 0.0042, 0.00445, 0.0043, 0.00429, 0.00441, 0.0043, 0.00412, 0.00429, 0.0042, 0.00419, 0.0042, 0.00427, 0.00427, 0.00418, 0.00464, 0.00406, 0.00435, 0.0046, 0.0043, 0.00438, 0.00417, 0.00427, 0.0044, 0.00444, 0.0045, 0.00407, 0.00421, 0.00403, 0.00442, 0.00418, 0.00425, 
0.00425, 0.00434, 0.00422, 0.00432, 0.00446, 0.00435, 0.00452, 0.00428, 0.00408, 0.00445, 0.00414, 0.00441, 0.00412, 0.00434, 0.00445, 0.00425, 0.00412, 0.00432, 0.00441, 0.00432, 0.00422, 0.00429, 0.00407, 0.00434, 0.00448, 0.00434, 0.00434, 0.00423, 0.00422, 0.0046, 0.00418, 0.00445, 0.00432, 0.00422, 0.00418, 0.00408, 0.00434, 0.03441, 0.00493, 0.00506, 0.00555, 0.00518, 0.00512, 0.00537, 0.00513, 0.00501, 0.00506, 0.00504, 0.00473, 0.00488, 0.00523, 0.00528, 0.00511, 0.00526, 0.00496, 0.00546, 0.00512, 0.0054, 0.00539, 0.00514, 0.00484, 0.00515, 0.00531, 0.00515, 0.00498, 0.00509, 0.0051, 0.00516, 0.00496, 0.00494, 0.00501, 0.00511, 0.00536, 0.00517, 0.00549, 0.00531, 0.00526, 0.00531, 0.00497, 0.00498, 0.00524, 0.00486, 0.00502, 0.00497, 0.00491, 0.00509, 0.00466, 0.00519, 0.00528, 0.00486, 0.00509, 0.0049, 0.005, 0.00508, 0.005, 0.00503, 0.00473, 0.00536, 0.00516, 0.00549, 0.00528, 0.00506, 0.00513, 0.00501, 0.00563, 0.00498, 0.00498, 0.0051, 0.00528, 0.00509, 0.005, 0.00495, 0.00509, 0.00508, 0.00485, 0.00479, 0.00485, 0.00507, 0.00499, 0.00463, 0.00497, 0.00487, 0.00529, 0.00518, 0.00483, 0.00513, 0.0051, 0.005, 0.005, 0.00514, 0.00496, 0.00492, 0.00547, 0.00506, 0.00502, 0.00481, 0.0051, 0.00498, 0.0051, 0.00475, 0.00498, 0.0048, 0.00528, 0.00523, 0.0053, 0.00561, 0.00522, 0.00517, 0.00528, 0.00505, 0.00511, 0.00538, 0.00531, 0.00528, 0.00554, 0.00534, 0.00512, 0.00541, 0.00533, 0.00508, 0.00518, 0.00519, 0.00548, 0.00545, 0.00554, 0.0052, 0.00506, 0.00513, 0.00502, 0.00523, 0.00513, 0.00478, 0.00487, 0.00503, 0.00512, 0.0051, 0.00529, 0.005, 0.00521, 0.00528, 0.00511, 0.00522, 0.00513, 0.00533, 0.00502, 0.0053, 0.00492, 0.00522, 0.00496, 0.00488, 0.00513, 0.00506, 0.00519, 0.00508, 0.00521, 0.00442, 0.00409, 0.00426, 0.0043, 0.00418, 0.00428, 0.00456, 0.00443, 0.00422, 0.00426, 0.0043, 0.00429, 0.00435, 0.00446, 0.0044, 0.00447, 0.00444, 0.0043, 0.0042, 0.00438, 0.00422, 0.00429, 0.00463, 0.00435, 0.00431, 0.00447, 0.00431, 0.00441, 0.00417, 0.00425, 0.0044, 0.00438, 0.00438, 0.00439, 0.00447, 0.00402, 0.00423, 0.00447, 0.00451, 0.00457, 0.00458, 0.00426]}, "layernorm-grads-all-reduce-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [3e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 2e-05, 2e-05, 1e-05, 1e-05, 2e-05, 2e-05, 1e-05, 2e-05, 2e-05, 1e-05, 1e-05, 2e-05, 1e-05, 2e-05, 2e-05, 2e-05, 1e-05, 2e-05, 1e-05, 2e-05, 2e-05, 1e-05, 1e-05, 2e-05, 2e-05, 2e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 2e-05, 2e-05, 2e-05, 1e-05, 1e-05, 2e-05, 2e-05, 1e-05, 2e-05, 2e-05, 1e-05, 2e-05, 1e-05, 2e-05, 1e-05, 1e-05, 3e-05, 2e-05, 2e-05, 2e-05, 2e-05, 1e-05, 2e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 2e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 2e-05, 2e-05, 2e-05, 1e-05, 1e-05, 2e-05, 1e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 1e-05, 2e-05, 2e-05, 2e-05, 2e-05, 1e-05, 2e-05, 1e-05, 1e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 1e-05, 2e-05, 1e-05, 2e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 2e-05, 2e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 2e-05, 2e-05, 1e-05, 2e-05, 2e-05, 1e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 1e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 1e-05, 2e-05, 2e-05, 2e-05, 1e-05, 1e-05, 2e-05, 2e-05, 1e-05, 1e-05, 1e-05, 2e-05, 2e-05, 2e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 1e-05, 2e-05, 2e-05, 2e-05, 2e-05, 1e-05, 2e-05, 2e-05, 2e-05, 2e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 2e-05, 1e-05, 1e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 
1e-05, 1e-05, 2e-05, 2e-05, 2e-05, 1e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 1e-05, 1e-05, 2e-05, 1e-05, 2e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 2e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 2e-05, 2e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 2e-05, 1e-05, 2e-05, 1e-05, 1e-05, 2e-05, 2e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 2e-05, 1e-05, 2e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 2e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 2e-05, 2e-05, 1e-05, 1e-05, 2e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 2e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 2e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05]}, "embedding-grads-all-reduce-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [4e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 3e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 3e-05, 3e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 3e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 3e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 3e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 3e-05, 2e-05, 2e-05, 3e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 3e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 3e-05, 2e-05, 2e-05, 2e-05, 3e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 3e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 3e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 3e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 
2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 3e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 3e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 3e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05]}, "all-grads-sync-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [6.22336, 0.00298, 0.00292, 0.00297, 0.0029, 0.00289, 0.00306, 0.00314, 0.00321, 0.003, 0.00296, 0.00297, 0.00294, 0.00288, 0.00301, 0.00324, 0.00323, 0.00298, 0.00292, 0.00298, 0.00295, 0.0029, 0.00308, 0.00319, 0.00324, 0.00299, 0.00292, 0.00301, 0.00293, 0.00291, 0.00326, 0.00322, 0.00323, 0.0029, 0.00293, 0.003, 0.00291, 0.00287, 0.00303, 0.0032, 0.00322, 0.00298, 0.00294, 0.00295, 0.00296, 0.0029, 0.00305, 0.00322, 0.00321, 0.003, 0.00295, 0.00299, 0.00295, 0.00292, 0.00306, 0.00323, 0.0032, 0.00298, 0.00291, 0.00297, 0.00296, 0.00287, 0.00304, 0.00322, 0.0032, 0.00299, 0.00296, 0.00297, 0.00296, 0.00291, 0.00308, 0.00321, 0.00326, 0.00301, 0.00294, 0.00292, 0.00295, 0.00287, 0.00307, 0.00321, 0.00318, 0.00296, 0.00285, 0.00302, 0.00297, 0.00291, 0.003, 0.00323, 0.0032, 0.003, 0.00292, 0.00294, 0.00297, 0.00285, 0.00306, 0.00318, 0.00314, 0.003, 0.00289, 0.00296, 0.00296, 0.00288, 0.00307, 0.00321, 0.00321, 0.00301, 0.00289, 0.00297, 0.00297, 0.0029, 0.00298, 0.00323, 0.00321, 0.003, 0.00289, 0.00287, 0.00295, 0.00292, 0.00302, 0.00323, 0.00323, 0.003, 0.00292, 0.00291, 0.00298, 0.00286, 0.00306, 0.00321, 0.00322, 0.00302, 0.00289, 0.00293, 0.00286, 0.00288, 0.00306, 0.00322, 0.00319, 0.00295, 0.00285, 0.00297, 0.00295, 0.00289, 0.00305, 0.0032, 0.00324, 0.00298, 0.00291, 0.00297, 0.00289, 0.00289, 0.00304, 0.0032, 0.00314, 0.003, 0.00289, 0.00297, 0.00295, 0.00288, 0.00301, 0.00317, 0.00314, 0.003, 0.00291, 0.00299, 0.00296, 0.0029, 0.00306, 0.00324, 0.00319, 0.00301, 0.0029, 0.00296, 0.00296, 0.0029, 0.00306, 0.00319, 0.0032, 0.003, 0.00285, 0.00298, 0.00296, 0.00281, 0.00305, 0.00318, 0.00322, 0.00297, 0.00291, 0.00299, 0.00294, 0.00292, 0.00307, 0.00323, 0.00324, 0.00299, 0.0029, 0.00299, 0.00295, 0.0029, 0.00305, 0.00319, 0.0029, 0.00305, 0.00311, 0.00325, 0.00324, 0.00308, 0.00284, 0.00305, 0.00295, 0.00305, 0.003, 0.00324, 0.0032, 0.00306, 0.00286, 0.00306, 0.00294, 0.00305, 0.0031, 0.00318, 0.00323, 0.00308, 0.00288, 0.00306, 0.00297, 0.00304, 0.00309, 0.00321, 0.00322, 0.00308, 0.00287, 0.00299, 0.00294, 0.00304, 0.00311, 0.00324, 0.00325, 0.00304, 0.00281, 0.00302, 0.00293, 0.00307, 0.0031, 0.00323, 0.00319, 0.00306, 0.00286, 0.00306, 0.00291, 0.00305, 0.00311, 0.00314, 0.00323, 0.00303, 0.00285, 0.00298, 0.00294, 0.00302, 0.00307, 0.00322, 0.00318, 0.00303, 0.00287, 0.00303, 0.00294, 0.00301, 0.00322, 0.00321, 0.00326, 0.00304, 0.00288, 0.00305, 0.00292, 0.00304, 0.00303, 0.00323, 0.00323, 0.00307, 0.00289, 0.003, 0.00295, 0.00298, 0.00307, 0.00328, 0.00312, 0.00307, 0.00289, 0.00303, 0.00294, 0.00306, 0.00309, 0.00324, 0.0032, 0.00306, 0.0029, 0.00306, 0.00294, 0.00301, 0.00301, 0.00322, 
0.00321, 0.00306, 0.00289, 0.00304, 0.00293, 0.00303, 0.00312, 0.00322, 0.00325, 0.00305, 0.00286, 0.00306, 0.00293, 0.00304, 0.0031, 0.00325, 0.00326, 0.00306, 0.00287, 0.00305, 0.00296, 0.00307, 0.00314, 0.00315, 0.00323, 0.00307, 0.00288, 0.00293, 0.0029, 0.00303, 0.00304, 0.00325, 0.00322, 0.00304, 0.0028, 0.00304, 0.00292, 0.00305, 0.00308, 0.00323, 0.00323, 0.00307, 0.00289, 0.00304, 0.00294, 0.00305, 0.00311, 0.00321, 0.00322, 0.00303, 0.00281, 0.00304, 0.00296, 0.003, 0.0031, 0.00322, 0.00314, 0.00301, 0.00281, 0.00298, 0.00288, 0.00303, 0.00307, 0.00321, 0.0032, 0.00301, 0.00281, 0.00303, 0.00288, 0.00301, 0.00309, 0.00316, 0.00319, 0.00302, 0.00284, 0.00306, 0.00292, 0.003, 0.00328, 0.00321, 0.0032, 0.00301, 0.00285, 0.00297, 0.00284, 0.003, 0.003, 0.00318, 0.00319, 0.00301, 0.00281, 0.00303, 0.00289, 0.003, 0.00305, 0.00315, 0.00308, 0.00303, 0.00279, 0.00299]}, "optimizer-copy-to-main-grad-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.0004, 0.00019, 0.00027, 0.00027, 0.00028, 0.00028, 0.00028, 0.00028, 0.00028, 0.00028, 0.00027, 0.00027, 0.00028, 0.00028, 0.00028, 0.00028, 0.00028, 0.00028, 0.00028, 0.00028, 0.00026, 0.00027, 0.00028, 0.00028, 0.00029, 0.00028, 0.00028, 0.00028, 0.00028, 0.00028, 0.00031, 0.00028, 0.00028, 0.00028, 0.00028, 0.00028, 0.00028, 0.00029, 0.00029, 0.00029, 0.00029, 0.00027, 0.00029, 0.00027, 0.00028, 0.00028, 0.00028, 0.00029, 0.00027, 0.00027, 0.00029, 0.00028, 0.0003, 0.00028, 0.00028, 0.00028, 0.00028, 0.00029, 0.00028, 0.00028, 0.00028, 0.00028, 0.00027, 0.00028, 0.00028, 0.00028, 0.00028, 0.00027, 0.00028, 0.00028, 0.00028, 0.00027, 0.00028, 0.00027, 0.00028, 0.00026, 0.00026, 0.00026, 0.00026, 0.00026, 0.00026, 0.00027, 0.00027, 0.00025, 0.00025, 0.00027, 0.00028, 0.00027, 0.00028, 0.00026, 0.00026, 0.00025, 0.00026, 0.00026, 0.00028, 0.00025, 0.00028, 0.00027, 0.00026, 0.00028, 0.00028, 0.00028, 0.00028, 0.00027, 0.00026, 0.00028, 0.00029, 0.00028, 0.00028, 0.00028, 0.00027, 0.00027, 0.00028, 0.00028, 0.00028, 0.00027, 0.00028, 0.00027, 0.00028, 0.00027, 0.00027, 0.00028, 0.00028, 0.00028, 0.00028, 0.00029, 0.00027, 0.00028, 0.00028, 0.00028, 0.00027, 0.00027, 0.00027, 0.00027, 0.00027, 0.00028, 0.00028, 0.00028, 0.00028, 0.00028, 0.00028, 0.00028, 0.00028, 0.00028, 0.00027, 0.00027, 0.00027, 0.00028, 0.00027, 0.00027, 0.00027, 0.00028, 0.00029, 0.00028, 0.00028, 0.00027, 0.00028, 0.00027, 0.00028, 0.00027, 0.00028, 0.00028, 0.00028, 0.00028, 0.00028, 0.00027, 0.00028, 0.00027, 0.00028, 0.00028, 0.00029, 0.00027, 0.00028, 0.00027, 0.00027, 0.00029, 0.00028, 0.00028, 0.00027, 0.00028, 0.00028, 0.00027, 0.00028, 0.00029, 0.00028, 0.00028, 0.00028, 0.00028, 0.00028, 0.00028, 0.00029, 0.00028, 0.00028, 0.00029, 0.00028, 0.00028, 0.00029, 0.00028, 0.00028, 0.00028, 0.00028, 0.00027, 0.00026, 0.00026, 0.00026, 0.00025, 0.00025, 0.00025, 0.00026, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00026, 0.00027, 0.00025, 0.00025, 0.00026, 0.00026, 0.00025, 0.00027, 0.00025, 0.00026, 0.00025, 0.00025, 0.00025, 0.00026, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00026, 0.00025, 0.00025, 0.00026, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00027, 0.00025, 0.00025, 0.00025, 0.00027, 0.00027, 0.00025, 0.00025, 0.00025, 0.00026, 0.00026, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00027, 0.00027, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 
0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00026, 0.00025, 0.00025, 0.00025, 0.00026, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00026, 0.00025, 0.00025, 0.00027, 0.00026, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00026, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00026, 0.00026, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00026, 0.00025, 0.00025, 0.00026, 0.00025, 0.00025, 0.00024, 0.00025, 0.00025, 0.00026, 0.00025, 0.00026, 0.00025, 0.00025, 0.00025, 0.00026, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00026, 0.00025, 0.00025, 0.00026, 0.00027, 0.00029, 0.00027, 0.00027, 0.00028, 0.00027, 0.00028, 0.00028, 0.00029, 0.00028, 0.00027, 0.00027, 0.00027, 0.00027, 0.00028, 0.00027, 0.00027, 0.00028, 0.00028, 0.00028, 0.00028, 0.00028, 0.00028, 0.00025, 0.00027, 0.00025, 0.00027, 0.00028, 0.00027, 0.00027, 0.00027, 0.00027, 0.00028, 0.00027, 0.00028, 0.00027, 0.00027, 0.00027, 0.00027]}, "optimizer-clip-main-grad-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.6202, 0.00104, 0.00121, 0.00115, 0.00122, 0.00121, 0.00123, 0.00124, 0.00122, 0.00123, 0.00125, 0.00122, 0.00121, 0.0012, 0.00122, 0.00127, 0.00121, 0.00123, 0.0012, 0.00123, 0.00121, 0.00116, 0.00125, 0.00122, 0.00122, 0.00124, 0.00122, 0.00123, 0.0012, 0.00122, 0.00125, 0.00122, 0.00126, 0.0012, 0.00122, 0.00123, 0.00121, 0.00127, 0.00121, 0.00121, 0.00121, 0.00121, 0.00123, 0.00122, 0.00123, 0.00124, 0.00121, 0.0012, 0.00122, 0.00119, 0.00121, 0.00122, 0.00137, 0.00122, 0.00121, 0.00123, 0.0012, 0.00126, 0.00121, 0.00122, 0.00122, 0.00129, 0.00122, 0.00122, 0.00122, 0.00123, 0.00125, 0.00125, 0.00124, 0.00122, 0.00123, 0.0013, 0.00124, 0.00121, 0.00123, 0.00118, 0.00123, 0.00121, 0.00123, 0.00118, 0.00118, 0.00118, 0.00119, 0.00119, 0.00119, 0.00121, 0.00121, 0.00122, 0.00121, 0.00123, 0.00123, 0.0012, 0.00128, 0.00117, 0.00122, 0.00123, 0.00124, 0.00121, 0.00118, 0.00119, 0.00121, 0.00122, 0.00121, 0.0012, 0.00118, 0.00124, 0.00122, 0.0012, 0.00125, 0.0012, 0.00121, 0.00101, 0.0012, 0.00121, 0.00124, 0.00123, 0.00123, 0.00123, 0.00122, 0.001, 0.00122, 0.00121, 0.001, 0.00125, 0.00122, 0.00121, 0.00124, 0.00121, 0.00121, 0.00099, 0.0012, 0.00125, 0.00121, 0.001, 0.0012, 0.00122, 0.00122, 0.00122, 0.0013, 0.00097, 0.00124, 0.00122, 0.00125, 0.00121, 0.0012, 0.0012, 0.00121, 0.00123, 0.0012, 0.0012, 0.00121, 0.00125, 0.00135, 0.00122, 0.00122, 0.00123, 0.00124, 0.00121, 0.00122, 0.0012, 0.0013, 0.00122, 0.00124, 0.001, 0.00123, 0.00121, 0.00121, 0.00126, 0.00124, 0.00129, 0.00129, 0.00124, 0.00121, 0.00119, 0.0012, 0.00123, 0.00123, 0.00127, 0.00122, 0.00122, 0.0012, 0.00121, 0.00128, 0.0012, 0.00125, 0.00124, 0.00121, 0.00123, 0.00121, 0.00132, 0.00122, 0.00121, 0.0012, 0.00122, 0.00123, 0.00123, 0.00121, 0.0012, 0.00122, 0.00123, 0.0012, 0.00123, 0.0012, 0.00118, 0.00118, 0.00121, 0.00124, 0.0012, 0.00121, 0.00121, 0.00119, 0.00119, 0.0012, 0.0012, 0.0012, 0.00118, 0.00126, 0.00121, 0.00118, 0.0012, 0.00117, 0.00119, 0.00121, 0.00118, 0.00119, 0.00122, 0.0012, 0.0012, 0.00126, 0.00121, 0.00128, 0.00107, 0.00115, 0.00121, 0.00119, 0.00119, 0.00116, 0.00118, 0.0012, 0.00121, 0.00119, 0.0012, 0.0012, 0.0012, 0.00116, 0.00121, 0.0012, 0.00116, 0.00121, 0.00113, 0.00119, 0.00127, 0.0012, 0.00119, 0.00118, 0.00119, 0.0012, 0.00121, 0.00119, 0.00118, 0.00119, 0.0012, 0.00119, 0.0012, 0.0012, 0.00127, 0.00122, 0.0012, 0.00118, 0.00118, 0.00121, 0.00118, 0.00123, 0.00119, 
0.00122, 0.00116, 0.0012, 0.00118, 0.0012, 0.00122, 0.00122, 0.00121, 0.00117, 0.00121, 0.00117, 0.0012, 0.00118, 0.00119, 0.00122, 0.00118, 0.00125, 0.00119, 0.00121, 0.00118, 0.00133, 0.00119, 0.00119, 0.00119, 0.0012, 0.00128, 0.00121, 0.00122, 0.0012, 0.00123, 0.00115, 0.00118, 0.0012, 0.00122, 0.00119, 0.00122, 0.00121, 0.00119, 0.00126, 0.0012, 0.0012, 0.00118, 0.00116, 0.00119, 0.00118, 0.00121, 0.00119, 0.00125, 0.00122, 0.00119, 0.00116, 0.00117, 0.00119, 0.0012, 0.0012, 0.00117, 0.00118, 0.0012, 0.00124, 0.00122, 0.0012, 0.00118, 0.0012, 0.00119, 0.0012, 0.00118, 0.00119, 0.00121, 0.00119, 0.00119, 0.00121, 0.00118, 0.00126, 0.00118, 0.0012, 0.00119, 0.00117, 0.0012, 0.00118, 0.0012, 0.00119, 0.0012, 0.00119, 0.00125, 0.00117, 0.00123, 0.00118, 0.00122, 0.00122, 0.00122, 0.00117, 0.00123, 0.00122, 0.00121, 0.00121, 0.0012, 0.00121, 0.00128, 0.00123, 0.00116, 0.0012, 0.00123, 0.00123, 0.00116, 0.00123, 0.00121, 0.0012, 0.00121, 0.00122, 0.00124, 0.00128, 0.00122, 0.00117, 0.00123, 0.00124, 0.00122, 0.00118, 0.0012, 0.00117, 0.00125, 0.00122, 0.00117, 0.00115, 0.00118, 0.00113, 0.0012]}, "optimizer-count-zeros-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.00555, 0.00512, 0.0052, 0.0051, 0.00517, 0.00513, 0.00514, 0.00513, 0.00512, 0.00511, 0.00508, 0.0051, 0.0051, 0.00512, 0.00511, 0.00509, 0.00508, 0.00511, 0.00514, 0.0051, 0.00509, 0.0051, 0.00514, 0.00512, 0.00512, 0.00512, 0.00514, 0.00517, 0.00511, 0.00513, 0.00513, 0.00516, 0.00515, 0.00515, 0.00516, 0.00514, 0.00513, 0.00543, 0.00514, 0.00512, 0.00514, 0.00513, 0.00513, 0.00516, 0.00512, 0.00515, 0.00511, 0.00513, 0.00515, 0.00514, 0.0051, 0.00512, 0.0057, 0.00511, 0.00513, 0.00513, 0.00514, 0.0053, 0.00514, 0.00511, 0.00513, 0.00512, 0.00513, 0.00518, 0.00513, 0.00514, 0.00512, 0.00513, 0.00512, 0.00509, 0.00512, 0.00539, 0.00514, 0.00514, 0.0051, 0.00512, 0.00511, 0.00512, 0.00511, 0.00511, 0.00512, 0.00513, 0.00511, 0.00514, 0.00512, 0.0051, 0.00514, 0.00511, 0.00512, 0.00522, 0.0051, 0.00514, 0.00572, 0.0051, 0.00515, 0.00526, 0.00509, 0.00511, 0.00513, 0.00513, 0.00518, 0.00514, 0.00511, 0.00512, 0.00512, 0.00511, 0.00514, 0.00512, 0.00518, 0.00514, 0.00512, 0.00513, 0.00512, 0.00512, 0.00512, 0.00511, 0.00509, 0.00514, 0.00519, 0.00512, 0.0051, 0.00513, 0.0051, 0.00548, 0.00514, 0.00512, 0.00512, 0.00511, 0.00511, 0.00512, 0.00511, 0.00519, 0.00533, 0.00509, 0.00512, 0.0051, 0.00513, 0.00511, 0.00515, 0.00508, 0.00512, 0.00513, 0.0057, 0.00513, 0.00513, 0.00516, 0.00518, 0.00515, 0.00517, 0.00513, 0.00514, 0.00516, 0.0057, 0.00516, 0.00515, 0.00514, 0.00513, 0.00513, 0.00516, 0.00516, 0.00566, 0.00514, 0.00514, 0.00515, 0.00516, 0.00515, 0.00513, 0.00517, 0.00513, 0.00513, 0.00601, 0.00514, 0.00522, 0.00513, 0.00515, 0.00514, 0.00517, 0.00511, 0.00515, 0.00516, 0.00515, 0.00514, 0.00515, 0.00512, 0.00587, 0.00517, 0.00518, 0.00516, 0.00513, 0.00541, 0.00514, 0.00515, 0.00513, 0.00516, 0.00521, 0.00531, 0.00532, 0.00517, 0.00516, 0.00515, 0.00511, 0.00529, 0.00509, 0.00511, 0.00512, 0.00512, 0.00512, 0.00515, 0.0053, 0.0051, 0.00512, 0.00512, 0.00512, 0.00511, 0.0051, 0.00513, 0.00512, 0.00513, 0.00513, 0.00512, 0.00559, 0.00511, 0.0051, 0.0051, 0.00512, 0.00515, 0.00512, 0.00511, 0.00579, 0.00512, 0.00511, 0.00512, 0.00511, 0.00511, 0.00511, 0.00513, 0.00508, 0.00513, 0.00511, 0.00509, 0.00512, 0.0051, 0.00512, 0.00511, 0.00512, 0.00513, 0.00511, 0.00514, 0.00511, 0.00512, 0.00512, 0.0059, 0.00513, 0.00514, 0.00512, 0.00511, 0.00513, 0.00511, 0.00511, 0.0051, 0.00509, 0.0051, 0.00512, 
0.0051, 0.0051, 0.00511, 0.00513, 0.00513, 0.0051, 0.00513, 0.00511, 0.0051, 0.0051, 0.00511, 0.00512, 0.00511, 0.00509, 0.00513, 0.0051, 0.0051, 0.00518, 0.0051, 0.00513, 0.00509, 0.00513, 0.00512, 0.00511, 0.00515, 0.00512, 0.00512, 0.00512, 0.00512, 0.00512, 0.00511, 0.00601, 0.00512, 0.00524, 0.00512, 0.0051, 0.00511, 0.00509, 0.00512, 0.0051, 0.00512, 0.00511, 0.00511, 0.00526, 0.0051, 0.00511, 0.00512, 0.00511, 0.00511, 0.00514, 0.00511, 0.00512, 0.00509, 0.00511, 0.00512, 0.00512, 0.00509, 0.0051, 0.00511, 0.00511, 0.00513, 0.00512, 0.00541, 0.00512, 0.00515, 0.00511, 0.00509, 0.0051, 0.00512, 0.00511, 0.00512, 0.00511, 0.00517, 0.00514, 0.00513, 0.00513, 0.00512, 0.00511, 0.00514, 0.00511, 0.00514, 0.00509, 0.00508, 0.00513, 0.00509, 0.0051, 0.00513, 0.00511, 0.00571, 0.00519, 0.00511, 0.00511, 0.0051, 0.00511, 0.00512, 0.00513, 0.00511, 0.00511, 0.00511, 0.00511, 0.00512, 0.00511, 0.00509, 0.00514, 0.00511, 0.00516, 0.00512, 0.0053, 0.00511, 0.00512, 0.00521, 0.00512, 0.00513, 0.00514, 0.00512, 0.00512, 0.00514, 0.0051, 0.00511, 0.00513, 0.00512, 0.00509, 0.00519, 0.00512, 0.0051, 0.00509, 0.00596, 0.00512, 0.0051, 0.0051, 0.00513, 0.00513, 0.0051, 0.00511, 0.00509, 0.00512, 0.00511]}, "optimizer-inner-step-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.00417, 0.00096, 0.00098, 0.00098, 0.00099, 0.00097, 0.00098, 0.00098, 0.00097, 0.00096, 0.00097, 0.00097, 0.00097, 0.00097, 0.00097, 0.00099, 0.00098, 0.00097, 0.00098, 0.00097, 0.00097, 0.00096, 0.00098, 0.00098, 0.00099, 0.00099, 0.00097, 0.00096, 0.00098, 0.00098, 0.00101, 0.00097, 0.00097, 0.00097, 0.00098, 0.00097, 0.00097, 0.00097, 0.00098, 0.00098, 0.00098, 0.00096, 0.00097, 0.00097, 0.00097, 0.00098, 0.00098, 0.00097, 0.00097, 0.00097, 0.00098, 0.00098, 0.00099, 0.00098, 0.00097, 0.00097, 0.00096, 0.00097, 0.00096, 0.00097, 0.00098, 0.00097, 0.00097, 0.00098, 0.00097, 0.00097, 0.00097, 0.00097, 0.00097, 0.00097, 0.00098, 0.00097, 0.00097, 0.00097, 0.00097, 0.00097, 0.00097, 0.00097, 0.00097, 0.00098, 0.00096, 0.00097, 0.00097, 0.00098, 0.00096, 0.00096, 0.00097, 0.00098, 0.00096, 0.00097, 0.00096, 0.00097, 0.00099, 0.00096, 0.00098, 0.00098, 0.00097, 0.00097, 0.00096, 0.00097, 0.00096, 0.00097, 0.00099, 0.00096, 0.00096, 0.00097, 0.00097, 0.00097, 0.00098, 0.00096, 0.00097, 0.00097, 0.00098, 0.00099, 0.00098, 0.00096, 0.00096, 0.00097, 0.00097, 0.00097, 0.00097, 0.00097, 0.00097, 0.00098, 0.00099, 0.00097, 0.00096, 0.00097, 0.00097, 0.00097, 0.00097, 0.00097, 0.00097, 0.00097, 0.00097, 0.00097, 0.00098, 0.00097, 0.00098, 0.00097, 0.00096, 0.00097, 0.00099, 0.00098, 0.00097, 0.00098, 0.00096, 0.00097, 0.00097, 0.00096, 0.00097, 0.00098, 0.00099, 0.00097, 0.00097, 0.00097, 0.00098, 0.00097, 0.00097, 0.00097, 0.00099, 0.00098, 0.00097, 0.00097, 0.00098, 0.00097, 0.00096, 0.00097, 0.00098, 0.00097, 0.001, 0.00097, 0.00097, 0.00097, 0.00098, 0.00097, 0.00098, 0.00097, 0.00099, 0.00097, 0.00097, 0.00096, 0.00098, 0.00097, 0.00098, 0.00097, 0.00097, 0.00097, 0.00098, 0.00098, 0.00097, 0.00097, 0.00099, 0.00097, 0.00098, 0.00098, 0.00097, 0.00097, 0.00098, 0.00098, 0.001, 0.00097, 0.00096, 0.00097, 0.00097, 0.00097, 0.00097, 0.00097, 0.00097, 0.00097, 0.00096, 0.00096, 0.00096, 0.00097, 0.001, 0.00096, 0.00099, 0.00097, 0.00098, 0.00097, 0.00099, 0.00096, 0.00128, 0.00096, 0.00096, 0.00097, 0.00097, 0.00097, 0.00099, 0.00096, 0.00097, 0.00096, 0.00097, 0.00097, 0.00096, 0.00096, 0.00096, 0.00097, 0.00096, 0.00097, 0.00097, 0.00097, 0.00097, 0.00097, 0.00097, 0.00096, 0.00098, 0.00097, 0.00097, 0.00096, 
0.00097, 0.001, 0.00097, 0.00096, 0.00097, 0.00097, 0.00097, 0.00096, 0.00097, 0.00096, 0.00096, 0.00097, 0.00096, 0.00096, 0.00096, 0.00097, 0.001, 0.00097, 0.00096, 0.00097, 0.00097, 0.00096, 0.00097, 0.00096, 0.00097, 0.00096, 0.00097, 0.00096, 0.00096, 0.00096, 0.00097, 0.00099, 0.00096, 0.00097, 0.00096, 0.00096, 0.00097, 0.00096, 0.00097, 0.00096, 0.00097, 0.00098, 0.00097, 0.00097, 0.00099, 0.00096, 0.00097, 0.00096, 0.00096, 0.00098, 0.00096, 0.00096, 0.00097, 0.00098, 0.00096, 0.00097, 0.00097, 0.00096, 0.00098, 0.00097, 0.00096, 0.00097, 0.00096, 0.00097, 0.00097, 0.00096, 0.00097, 0.00097, 0.00096, 0.00096, 0.00097, 0.00096, 0.00096, 0.00097, 0.00097, 0.00097, 0.00097, 0.00096, 0.00097, 0.00097, 0.00097, 0.00096, 0.00096, 0.00096, 0.00097, 0.00097, 0.00097, 0.00097, 0.00097, 0.00097, 0.00097, 0.00096, 0.00097, 0.00096, 0.00097, 0.00097, 0.00097, 0.00097, 0.00096, 0.00097, 0.00095, 0.00096, 0.00097, 0.00098, 0.00097, 0.00097, 0.00097, 0.00097, 0.00096, 0.00096, 0.00096, 0.00098, 0.00097, 0.00097, 0.00098, 0.00097, 0.00098, 0.00098, 0.00098, 0.00098, 0.001, 0.00098, 0.00098, 0.00098, 0.00097, 0.00097, 0.00098, 0.00098, 0.00101, 0.00098, 0.00098, 0.00097, 0.00098, 0.00097, 0.00097, 0.00099, 0.00097, 0.00098, 0.00098, 0.00096, 0.00098, 0.00097, 0.00098, 0.00099, 0.00097, 0.00098, 0.00097, 0.00097, 0.00098, 0.00098]}, "optimizer-copy-main-to-model-params-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.00118, 0.00099, 0.00101, 0.00101, 0.00101, 0.00101, 0.00101, 0.00101, 0.00101, 0.00101, 0.00101, 0.00101, 0.00101, 0.00101, 0.001, 0.001, 0.00101, 0.00101, 0.00101, 0.00103, 0.001, 0.00101, 0.00101, 0.00101, 0.00101, 0.00101, 0.00101, 0.00101, 0.00102, 0.00101, 0.00102, 0.00101, 0.001, 0.001, 0.00101, 0.00101, 0.00101, 0.00101, 0.00101, 0.001, 0.00101, 0.001, 0.00102, 0.00102, 0.001, 0.00101, 0.00101, 0.00101, 0.001, 0.00101, 0.00101, 0.00101, 0.00101, 0.00102, 0.00101, 0.00101, 0.001, 0.001, 0.00101, 0.00102, 0.00102, 0.001, 0.00101, 0.001, 0.00101, 0.001, 0.00101, 0.00101, 0.00101, 0.00105, 0.00101, 0.00102, 0.001, 0.00101, 0.00101, 0.00101, 0.00101, 0.00101, 0.00101, 0.00101, 0.00101, 0.001, 0.001, 0.00102, 0.001, 0.00101, 0.00102, 0.00101, 0.00101, 0.00101, 0.001, 0.00101, 0.00103, 0.00101, 0.00101, 0.00101, 0.00101, 0.00101, 0.00101, 0.00102, 0.00101, 0.00101, 0.00106, 0.00101, 0.00102, 0.00101, 0.00101, 0.00101, 0.001, 0.00101, 0.001, 0.00101, 0.00101, 0.00101, 0.00101, 0.00101, 0.00102, 0.00102, 0.00102, 0.00101, 0.00101, 0.00102, 0.001, 0.00106, 0.00101, 0.00101, 0.00101, 0.00101, 0.00101, 0.00103, 0.00102, 0.00101, 0.00102, 0.00101, 0.00101, 0.00101, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.00102, 0.00101, 0.001, 0.001, 0.001, 0.001, 0.001, 0.00101, 0.00101, 0.00101, 0.00102, 0.00102, 0.00101, 0.00102, 0.00103, 0.00102, 0.00101, 0.00101, 0.00102, 0.00101, 0.00101, 0.00101, 0.00102, 0.00101, 0.00101, 0.00101, 0.00102, 0.00101, 0.00103, 0.00101, 0.00101, 0.00101, 0.00101, 0.00102, 0.00102, 0.00101, 0.00102, 0.00101, 0.00101, 0.00102, 0.00102, 0.00102, 0.00105, 0.00102, 0.00102, 0.00101, 0.00101, 0.00102, 0.00101, 0.00103, 0.00102, 0.00102, 0.00101, 0.00106, 0.00102, 0.00101, 0.00103, 0.00103, 0.00102, 0.00103, 0.00102, 0.00102, 0.00103, 0.00102, 0.00102, 0.00102, 0.00102, 0.00103, 0.00102, 0.00102, 0.00102, 0.00102, 0.00108, 0.00102, 0.00104, 0.00102, 0.00102, 0.00103, 0.00103, 0.00102, 0.00107, 0.00102, 0.00102, 0.00102, 0.00102, 0.00102, 0.00102, 0.00102, 0.00103, 0.00102, 0.00102, 0.00102, 0.00102, 0.00102, 0.00102, 0.00102, 0.00102, 0.00102, 
0.00107, 0.00102, 0.00102, 0.00102, 0.00102, 0.00103, 0.00103, 0.00102, 0.00102, 0.00102, 0.00102, 0.00103, 0.00103, 0.00103, 0.00102, 0.00103, 0.00102, 0.00102, 0.00104, 0.00102, 0.00104, 0.00102, 0.00102, 0.00103, 0.00103, 0.00102, 0.00103, 0.00102, 0.00102, 0.00102, 0.00102, 0.00101, 0.00103, 0.00101, 0.00102, 0.00102, 0.00102, 0.00102, 0.00103, 0.00102, 0.00102, 0.00105, 0.00102, 0.00102, 0.00104, 0.00102, 0.00102, 0.00102, 0.00102, 0.00103, 0.00102, 0.00102, 0.00101, 0.00103, 0.00104, 0.00103, 0.00103, 0.00102, 0.00102, 0.00102, 0.00102, 0.00102, 0.00102, 0.00102, 0.00102, 0.00102, 0.00108, 0.00102, 0.00102, 0.00102, 0.00102, 0.00102, 0.00103, 0.00102, 0.00122, 0.00102, 0.00103, 0.00102, 0.00102, 0.00102, 0.00102, 0.00103, 0.00102, 0.00102, 0.00101, 0.00102, 0.00102, 0.00102, 0.00103, 0.00102, 0.00102, 0.00102, 0.00102, 0.00102, 0.00102, 0.00102, 0.00102, 0.00103, 0.00102, 0.00102, 0.00101, 0.00102, 0.00103, 0.00103, 0.00103, 0.00101, 0.00102, 0.00102, 0.00102, 0.00101, 0.00101, 0.00105, 0.00102, 0.00103, 0.00102, 0.00102, 0.00101, 0.00102, 0.00102, 0.00102, 0.00101, 0.00101, 0.00101, 0.00102, 0.00101, 0.00101, 0.00102, 0.00102, 0.00102, 0.00101, 0.00102, 0.00103, 0.00101, 0.00102, 0.00102, 0.00102, 0.00102, 0.00101, 0.00104, 0.00102, 0.00102, 0.00102, 0.00102, 0.00101, 0.00102, 0.00102, 0.00102, 0.00102, 0.00102, 0.00103, 0.00102, 0.00102, 0.00102, 0.00103, 0.00102, 0.00102, 0.00101, 0.00102, 0.00101, 0.00101]}, "optimizer-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.63386, 0.00867, 0.00903, 0.00886, 0.00906, 0.00897, 0.00901, 0.009, 0.00896, 0.00895, 0.00895, 0.00895, 0.00894, 0.00894, 0.00896, 0.009, 0.00892, 0.00896, 0.00899, 0.00897, 0.00892, 0.00887, 0.00902, 0.00897, 0.009, 0.00906, 0.00899, 0.00902, 0.00897, 0.00898, 0.0091, 0.00901, 0.00904, 0.00898, 0.00901, 0.009, 0.00902, 0.00937, 0.00899, 0.00896, 0.00901, 0.00897, 0.00899, 0.00902, 0.00897, 0.00903, 0.00895, 0.00898, 0.00899, 0.00895, 0.00896, 0.00898, 0.00978, 0.00897, 0.00898, 0.009, 0.00895, 0.0092, 0.00896, 0.00901, 0.009, 0.00904, 0.00898, 0.00902, 0.00897, 0.00899, 0.00902, 0.00902, 0.00899, 0.00899, 0.00898, 0.00934, 0.00904, 0.00896, 0.00897, 0.00891, 0.00895, 0.00892, 0.00894, 0.0089, 0.00889, 0.0089, 0.00891, 0.00892, 0.00888, 0.0089, 0.009, 0.00896, 0.00895, 0.0091, 0.00889, 0.00892, 0.00967, 0.00886, 0.009, 0.00913, 0.00896, 0.00896, 0.00889, 0.00895, 0.00901, 0.00899, 0.00903, 0.00893, 0.00893, 0.00898, 0.009, 0.00894, 0.00905, 0.00897, 0.00894, 0.00877, 0.00897, 0.00898, 0.00902, 0.00895, 0.00895, 0.009, 0.00905, 0.00875, 0.00895, 0.00897, 0.00872, 0.00942, 0.00901, 0.00898, 0.00897, 0.00894, 0.00895, 0.00876, 0.00895, 0.00907, 0.00917, 0.00872, 0.00895, 0.00893, 0.00898, 0.00897, 0.00906, 0.00866, 0.00896, 0.00897, 0.00964, 0.00897, 0.00897, 0.00898, 0.009, 0.009, 0.009, 0.00894, 0.00898, 0.00904, 0.00977, 0.00905, 0.00899, 0.00901, 0.00905, 0.00898, 0.00901, 0.00898, 0.00965, 0.009, 0.009, 0.00878, 0.00905, 0.00899, 0.00898, 0.00904, 0.00902, 0.00906, 0.01008, 0.00901, 0.00907, 0.00895, 0.00899, 0.00902, 0.00905, 0.00902, 0.00902, 0.00901, 0.00899, 0.00898, 0.00908, 0.00899, 0.00979, 0.00905, 0.00904, 0.00903, 0.009, 0.00938, 0.00899, 0.00901, 0.00904, 0.00902, 0.00909, 0.00923, 0.00917, 0.00901, 0.00905, 0.00903, 0.00899, 0.00918, 0.00889, 0.00891, 0.00894, 0.00894, 0.00896, 0.00895, 0.00912, 0.00892, 0.00889, 0.00896, 0.0089, 0.00891, 0.00901, 0.0089, 0.00904, 0.00893, 0.00893, 0.00894, 0.00942, 0.00889, 0.00938, 0.00887, 0.00892, 0.00897, 0.00893, 0.00896, 0.00974, 
0.00891, 0.009, 0.00879, 0.00886, 0.00891, 0.0089, 0.00892, 0.00885, 0.00891, 0.0089, 0.00892, 0.00896, 0.0089, 0.00892, 0.00893, 0.00891, 0.00894, 0.00892, 0.00891, 0.00894, 0.00885, 0.00891, 0.00986, 0.00894, 0.00893, 0.00892, 0.00894, 0.00896, 0.00889, 0.00893, 0.00888, 0.0089, 0.00891, 0.0089, 0.0089, 0.00894, 0.00901, 0.00902, 0.00898, 0.00887, 0.00892, 0.00897, 0.00888, 0.00894, 0.00889, 0.00893, 0.00887, 0.00889, 0.00895, 0.00891, 0.00891, 0.00904, 0.00901, 0.00889, 0.00892, 0.00891, 0.00892, 0.00891, 0.00892, 0.00895, 0.00891, 0.00902, 0.00891, 0.00892, 0.00889, 0.01004, 0.00891, 0.00907, 0.00893, 0.00889, 0.00901, 0.00889, 0.00893, 0.00895, 0.00898, 0.00885, 0.00891, 0.00914, 0.00891, 0.00891, 0.00894, 0.00892, 0.00888, 0.009, 0.0089, 0.00948, 0.00889, 0.00887, 0.00893, 0.00889, 0.00889, 0.00891, 0.00896, 0.00894, 0.00893, 0.00888, 0.00921, 0.00895, 0.00893, 0.00894, 0.00887, 0.0089, 0.00897, 0.00896, 0.00894, 0.00893, 0.00896, 0.009, 0.00892, 0.00897, 0.00891, 0.00889, 0.00895, 0.0089, 0.00893, 0.00891, 0.00886, 0.009, 0.00888, 0.00889, 0.00894, 0.00885, 0.00955, 0.00901, 0.00895, 0.00891, 0.0089, 0.00889, 0.00898, 0.00888, 0.00898, 0.00889, 0.00895, 0.00895, 0.00896, 0.00891, 0.00895, 0.00904, 0.00897, 0.00901, 0.00897, 0.00919, 0.00904, 0.00899, 0.00902, 0.00895, 0.00901, 0.00901, 0.00892, 0.00909, 0.00899, 0.00896, 0.00901, 0.00899, 0.009, 0.00896, 0.00905, 0.0089, 0.00897, 0.00898, 0.00984, 0.00894, 0.00894, 0.00891, 0.00903, 0.00898, 0.00894, 0.00889, 0.0089, 0.0089, 0.00894]}, "learning-rate": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 
7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05]}, "learning-rate vs samples": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 
9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05]}, "batch-size": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0]}, "batch-size vs samples": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 
128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0]}, "lm loss": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [10.88321, 10.90268, 10.88687, 10.83314, 10.67636, 10.64925, 10.43407, 10.15143, 9.939, 9.84142, 9.58871, 9.85432, 9.88466, 9.62953, 9.78812, 9.5115, 9.45845, 9.64924, 9.38622, 9.33216, 9.24226, 9.14549, 9.17557, 8.99547, 9.18942, 9.05996, 9.15554, 9.16495, 9.29785, 8.98464, 8.92921, 9.04391, 9.04317, 8.65502, 8.71709, 8.75344, 8.68371, 8.7343, 8.65869, 8.76488, 8.66084, 8.84969, 8.83212, 8.4992, 8.38905, 8.43151, 8.49327, 8.38449, 8.43266, 8.57974, 8.36712, 8.19218, 8.22599, 8.22213, 8.26761, 7.91363, 8.09574, 7.89107, 8.2463, 8.23044, 8.00478, 7.9653, 7.91788, 7.73983, 7.73952, 7.64266, 7.51535, 7.9067, 7.6981, 7.45174, 7.74028, 7.76751, 7.54113, 7.29838, 7.45192, 7.33549, 7.46187, 7.22351, 7.63653, 7.27884, 7.35151, 7.2129, 7.2187, 7.42237, 7.17713, 7.28373, 7.00153, 7.00528, 7.04066, 7.1397, 6.8246, 6.98624, 7.08901, 7.00075, 6.87398, 6.75446, 6.98902, 7.05484, 6.70056, 6.57618, 6.7239, 6.73842, 6.73087, 6.73636, 6.65702, 6.40579, 6.6386, 6.62005, 6.44721, 6.63067, 6.74344, 6.6111, 6.7266, 6.69523, 6.62503, 6.50683, 6.59892, 6.4067, 6.66402, 6.24864, 6.25205, 6.30302, 6.38991, 6.35064, 6.45057, 6.2892, 6.34021, 6.23934, 
6.20441, 6.39672, 6.32669, 6.3228, 6.16602, 6.15875, 6.24058, 6.38585, 6.20055, 6.14534, 6.17669, 6.1094, 6.05525, 6.06665, 6.2527, 6.40409, 6.25252, 6.2934, 6.0919, 6.17395, 5.99575, 6.02272, 5.94996, 6.23797, 6.18154, 5.95877, 5.77498, 6.11727, 5.84271, 6.09751, 5.78563, 6.15394, 6.14296, 6.08411, 5.92729, 6.11238, 5.94309, 6.19339, 5.89494, 5.792, 5.77614, 5.6837, 6.01618, 5.99613, 6.06338, 5.88778, 6.04018, 5.96996, 5.99544, 5.98695, 5.94778, 5.84144, 5.95287, 5.61942, 5.70133, 5.88893, 5.84402, 5.86128, 5.76114, 5.83707, 5.72343, 5.55889, 5.72351, 5.62534, 5.83303, 5.60569, 5.7102, 5.70991, 5.89681, 5.64325, 5.84924, 5.73928, 5.87114, 5.33228, 5.89693, 5.872, 5.85316, 5.40988, 5.4088, 5.62665, 5.59641, 5.48639, 5.57896, 5.67332, 5.47579, 5.74541, 5.50851, 5.59461, 5.621, 5.62129, 5.51073, 5.61357, 5.67793, 5.68632, 5.58943, 5.66035, 5.37294, 5.67985, 5.62736, 5.42133, 5.58734, 5.63109, 5.55307, 5.34119, 5.53841, 5.48634, 5.48174, 5.37484, 5.55776, 5.60342, 5.38738, 5.52728, 5.4859, 5.33181, 5.50554, 5.40833, 5.44, 5.31717, 5.06482, 5.47629, 5.56511, 5.71212, 5.41184, 5.59499, 5.63272, 5.23153, 5.27192, 5.3912, 5.39311, 5.32484, 5.49539, 5.18175, 5.29693, 5.24506, 5.37468, 5.25384, 5.44332, 5.53548, 5.3125, 5.43753, 5.3339, 5.07, 5.31161, 5.25178, 5.30057, 5.1086, 5.27262, 5.26395, 5.46902, 5.15667, 5.26704, 5.20746, 5.35466, 4.98016, 4.91076, 5.3213, 5.39019, 5.22162, 5.3164, 5.10162, 5.1553, 5.25943, 5.06435, 5.26075, 5.07101, 5.33638, 5.24297, 5.14623, 5.23826, 5.03699, 5.31101, 5.04764, 5.02142, 5.13778, 5.10838, 5.26722, 5.14671, 5.27266, 5.09162, 5.0919, 5.24829, 5.3185, 5.25029, 5.18579, 5.14206, 5.28335, 4.94328, 5.20523, 5.08657, 5.29719, 5.17312, 5.18231, 5.10943, 4.98051, 4.99195, 5.21896, 5.30825, 5.09051, 5.05174, 4.91264, 5.11732, 5.11518, 4.92322, 5.33386, 5.02007, 5.09792, 5.16007, 4.99811, 5.05898, 5.06488, 4.98971, 5.07389, 5.15699, 4.97292, 5.17835, 4.92646, 4.91925, 5.06679, 4.99198, 4.90773, 4.77047, 4.93905, 5.10914, 5.0148, 5.01342, 5.32728, 4.95518, 4.99041, 5.04238, 4.79783, 4.72965, 4.99227, 5.0394, 4.87169, 4.95051, 5.03887, 5.01995, 4.81482, 4.88854, 4.89947, 4.82779, 4.74234, 5.00778, 4.7467, 5.20619, 4.78181, 4.98955, 4.73414, 4.78105, 4.81703, 4.64628, 4.65374, 4.83873, 4.80327, 4.79812, 4.9214, 4.87849, 4.92132, 4.76615, 4.87858, 4.72843, 4.9077, 4.95342, 4.86965, 4.70236, 4.77862, 4.89666, 4.70572, 4.85677, 4.68692, 4.68192, 4.64505]}, "lm loss vs samples": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [10.88321, 10.90268, 10.88687, 10.83314, 10.67636, 10.64925, 10.43407, 10.15143, 9.939, 9.84142, 9.58871, 9.85432, 9.88466, 9.62953, 9.78812, 9.5115, 9.45845, 9.64924, 9.38622, 9.33216, 9.24226, 9.14549, 9.17557, 8.99547, 9.18942, 9.05996, 9.15554, 9.16495, 9.29785, 8.98464, 8.92921, 9.04391, 9.04317, 8.65502, 8.71709, 8.75344, 8.68371, 8.7343, 8.65869, 8.76488, 8.66084, 8.84969, 8.83212, 8.4992, 8.38905, 8.43151, 8.49327, 8.38449, 8.43266, 8.57974, 8.36712, 8.19218, 8.22599, 8.22213, 8.26761, 7.91363, 8.09574, 7.89107, 8.2463, 8.23044, 8.00478, 7.9653, 7.91788, 7.73983, 7.73952, 7.64266, 7.51535, 7.9067, 7.6981, 7.45174, 7.74028, 7.76751, 7.54113, 7.29838, 7.45192, 7.33549, 7.46187, 7.22351, 7.63653, 7.27884, 7.35151, 7.2129, 7.2187, 7.42237, 7.17713, 7.28373, 7.00153, 7.00528, 7.04066, 7.1397, 6.8246, 6.98624, 7.08901, 7.00075, 6.87398, 6.75446, 6.98902, 7.05484, 6.70056, 6.57618, 6.7239, 6.73842, 6.73087, 6.73636, 6.65702, 6.40579, 6.6386, 6.62005, 6.44721, 6.63067, 6.74344, 6.6111, 6.7266, 6.69523, 6.62503, 6.50683, 6.59892, 
6.4067, 6.66402, 6.24864, 6.25205, 6.30302, 6.38991, 6.35064, 6.45057, 6.2892, 6.34021, 6.23934, 6.20441, 6.39672, 6.32669, 6.3228, 6.16602, 6.15875, 6.24058, 6.38585, 6.20055, 6.14534, 6.17669, 6.1094, 6.05525, 6.06665, 6.2527, 6.40409, 6.25252, 6.2934, 6.0919, 6.17395, 5.99575, 6.02272, 5.94996, 6.23797, 6.18154, 5.95877, 5.77498, 6.11727, 5.84271, 6.09751, 5.78563, 6.15394, 6.14296, 6.08411, 5.92729, 6.11238, 5.94309, 6.19339, 5.89494, 5.792, 5.77614, 5.6837, 6.01618, 5.99613, 6.06338, 5.88778, 6.04018, 5.96996, 5.99544, 5.98695, 5.94778, 5.84144, 5.95287, 5.61942, 5.70133, 5.88893, 5.84402, 5.86128, 5.76114, 5.83707, 5.72343, 5.55889, 5.72351, 5.62534, 5.83303, 5.60569, 5.7102, 5.70991, 5.89681, 5.64325, 5.84924, 5.73928, 5.87114, 5.33228, 5.89693, 5.872, 5.85316, 5.40988, 5.4088, 5.62665, 5.59641, 5.48639, 5.57896, 5.67332, 5.47579, 5.74541, 5.50851, 5.59461, 5.621, 5.62129, 5.51073, 5.61357, 5.67793, 5.68632, 5.58943, 5.66035, 5.37294, 5.67985, 5.62736, 5.42133, 5.58734, 5.63109, 5.55307, 5.34119, 5.53841, 5.48634, 5.48174, 5.37484, 5.55776, 5.60342, 5.38738, 5.52728, 5.4859, 5.33181, 5.50554, 5.40833, 5.44, 5.31717, 5.06482, 5.47629, 5.56511, 5.71212, 5.41184, 5.59499, 5.63272, 5.23153, 5.27192, 5.3912, 5.39311, 5.32484, 5.49539, 5.18175, 5.29693, 5.24506, 5.37468, 5.25384, 5.44332, 5.53548, 5.3125, 5.43753, 5.3339, 5.07, 5.31161, 5.25178, 5.30057, 5.1086, 5.27262, 5.26395, 5.46902, 5.15667, 5.26704, 5.20746, 5.35466, 4.98016, 4.91076, 5.3213, 5.39019, 5.22162, 5.3164, 5.10162, 5.1553, 5.25943, 5.06435, 5.26075, 5.07101, 5.33638, 5.24297, 5.14623, 5.23826, 5.03699, 5.31101, 5.04764, 5.02142, 5.13778, 5.10838, 5.26722, 5.14671, 5.27266, 5.09162, 5.0919, 5.24829, 5.3185, 5.25029, 5.18579, 5.14206, 5.28335, 4.94328, 5.20523, 5.08657, 5.29719, 5.17312, 5.18231, 5.10943, 4.98051, 4.99195, 5.21896, 5.30825, 5.09051, 5.05174, 4.91264, 5.11732, 5.11518, 4.92322, 5.33386, 5.02007, 5.09792, 5.16007, 4.99811, 5.05898, 5.06488, 4.98971, 5.07389, 5.15699, 4.97292, 5.17835, 4.92646, 4.91925, 5.06679, 4.99198, 4.90773, 4.77047, 4.93905, 5.10914, 5.0148, 5.01342, 5.32728, 4.95518, 4.99041, 5.04238, 4.79783, 4.72965, 4.99227, 5.0394, 4.87169, 4.95051, 5.03887, 5.01995, 4.81482, 4.88854, 4.89947, 4.82779, 4.74234, 5.00778, 4.7467, 5.20619, 4.78181, 4.98955, 4.73414, 4.78105, 4.81703, 4.64628, 4.65374, 4.83873, 4.80327, 4.79812, 4.9214, 4.87849, 4.92132, 4.76615, 4.87858, 4.72843, 4.9077, 4.95342, 4.86965, 4.70236, 4.77862, 4.89666, 4.70572, 4.85677, 4.68692, 4.68192, 4.64505]}, "loss-scale": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 
1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]}, "loss-scale vs samples": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]}, "grad-norm": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [12.95641, 13.2384, 13.63492, 12.46753, 12.09519, 9.48185, 7.05331, 7.26898, 6.13791, 4.65533, 4.16677, 2.85409, 2.39258, 2.35693, 2.05902, 2.22136, 2.15373, 1.91319, 2.28507, 2.08136, 2.12587, 2.16293, 2.01255, 2.22443, 1.98488, 2.10576, 1.90696, 1.9543, 1.94666, 2.19132, 2.07534, 1.9973, 1.90676, 2.17071, 2.13949, 2.12242, 2.00142, 
1.85779, 1.93941, 1.74128, 2.19131, 1.80266, 1.76804, 1.92184, 1.89627, 1.81829, 1.73892, 1.73316, 1.7548, 1.56741, 1.70661, 1.78909, 1.75371, 1.8099, 1.69083, 1.80378, 1.72805, 1.87537, 1.64718, 1.47793, 1.64751, 1.54177, 1.73678, 1.93709, 1.70003, 1.61404, 1.65733, 1.60718, 1.41019, 1.66006, 1.44415, 1.3449, 1.59801, 1.38078, 1.40657, 1.58642, 1.37384, 1.47591, 1.51235, 1.32276, 1.27695, 1.35665, 1.39793, 1.46181, 1.25641, 1.39278, 1.37555, 1.31206, 1.25327, 1.08729, 1.11608, 1.26073, 1.05493, 1.26676, 1.03825, 1.22449, 1.31527, 1.17458, 1.05643, 1.32651, 1.60257, 1.2771, 1.33646, 1.31918, 1.248, 1.20478, 1.17877, 1.39792, 1.21711, 1.31304, 1.06851, 0.90225, 1.00231, 1.02701, 1.08335, 1.06592, 1.11157, 1.35469, 1.11475, 0.96782, 1.00793, 1.10818, 0.98621, 1.2088, 1.33881, 1.44029, 1.6209, 1.4596, 1.76932, 0.95989, 1.18019, 1.10796, 1.01963, 0.97229, 1.12326, 1.18955, 1.04787, 1.17124, 1.15064, 0.95989, 1.2251, 1.2379, 1.76155, 1.26203, 1.48837, 1.2467, 1.12532, 1.2807, 1.00776, 1.29835, 1.39203, 1.19636, 1.4484, 1.31191, 1.0452, 1.72246, 1.72833, 1.28959, 1.84591, 1.35158, 1.59884, 1.36455, 1.22883, 0.94147, 1.4872, 1.47058, 1.60177, 1.17187, 1.32032, 1.16147, 1.85664, 1.34438, 1.41884, 1.939, 1.3293, 1.75251, 1.4942, 1.19914, 1.25112, 1.47923, 1.19903, 1.70249, 1.28382, 1.22996, 1.38428, 1.04416, 1.49206, 1.45812, 1.5496, 1.42558, 1.5666, 1.60373, 1.50198, 2.14466, 1.64657, 1.23816, 1.19399, 1.20748, 1.27992, 1.28244, 1.01251, 1.42205, 1.36197, 1.11149, 1.15089, 1.21404, 1.39311, 1.5652, 1.38265, 1.4134, 1.55375, 1.48078, 1.28046, 1.56958, 1.42513, 1.45697, 1.27067, 1.6129, 1.30064, 1.30128, 1.59962, 2.07562, 1.66274, 1.53273, 1.30633, 1.38281, 1.30251, 1.26134, 1.59835, 1.39505, 1.20665, 1.50419, 1.33709, 1.53729, 1.35211, 1.18328, 1.72786, 1.56925, 1.48159, 1.79747, 1.32018, 1.29802, 1.45777, 1.41144, 1.32018, 1.82833, 1.47341, 1.38161, 1.37728, 1.47317, 1.22182, 1.50379, 1.40184, 1.43299, 1.38574, 1.54027, 1.3871, 1.51693, 1.73604, 1.27623, 1.30004, 1.43266, 1.26605, 1.31063, 1.40554, 1.47355, 1.43481, 1.66877, 1.27269, 1.36414, 1.39902, 1.36787, 1.30634, 1.35432, 1.33569, 1.38439, 1.38254, 1.48327, 1.3313, 1.47336, 1.54266, 1.45093, 1.39023, 1.42073, 1.71873, 1.24142, 1.27025, 1.75206, 1.19488, 1.72063, 1.35861, 1.46103, 1.32756, 1.38252, 1.44831, 1.49026, 1.5017, 1.67806, 1.49633, 1.40813, 1.2821, 1.34708, 1.20139, 1.33134, 1.30935, 1.28049, 1.39953, 1.36021, 1.30784, 1.55113, 1.45126, 1.35267, 1.8948, 1.31989, 1.26079, 1.54872, 1.25987, 1.49108, 1.31905, 1.39623, 1.42575, 1.70894, 1.69908, 1.44957, 1.53553, 1.41451, 1.68745, 1.45251, 1.2816, 1.33701, 1.40832, 1.76682, 1.43394, 1.35911, 1.42618, 1.36908, 1.37004, 1.25362, 1.44167, 1.3631, 1.32537, 1.0708, 1.21959, 1.38245, 1.69458, 1.66343, 1.49487, 1.64475, 1.18445, 1.24234, 1.37689, 1.3449, 1.29452, 1.57163, 1.48364, 1.39813, 1.46563, 1.16757, 1.33935, 1.37732, 1.74665, 1.43255, 1.6591, 1.35981, 1.18773, 1.72037, 1.57868, 1.47314, 1.60009, 1.70452, 1.52569, 1.35993, 1.71308, 1.55029, 1.45496, 1.45713, 1.21934, 1.34612, 1.35689, 1.29738, 1.27919, 1.35703, 1.34356, 1.23723, 1.16682, 1.55154, 1.54928, 1.31127, 1.22661, 1.39907, 1.23896, 1.39069, 1.35517, 1.4518, 1.74352, 1.41812, 1.48035, 1.43537, 1.2798, 1.31958]}, "grad-norm vs samples": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [12.95641, 13.2384, 13.63492, 12.46753, 12.09519, 9.48185, 7.05331, 7.26898, 6.13791, 4.65533, 4.16677, 2.85409, 2.39258, 2.35693, 2.05902, 2.22136, 2.15373, 1.91319, 2.28507, 2.08136, 2.12587, 2.16293, 2.01255, 2.22443, 1.98488, 
2.10576, 1.90696, 1.9543, 1.94666, 2.19132, 2.07534, 1.9973, 1.90676, 2.17071, 2.13949, 2.12242, 2.00142, 1.85779, 1.93941, 1.74128, 2.19131, 1.80266, 1.76804, 1.92184, 1.89627, 1.81829, 1.73892, 1.73316, 1.7548, 1.56741, 1.70661, 1.78909, 1.75371, 1.8099, 1.69083, 1.80378, 1.72805, 1.87537, 1.64718, 1.47793, 1.64751, 1.54177, 1.73678, 1.93709, 1.70003, 1.61404, 1.65733, 1.60718, 1.41019, 1.66006, 1.44415, 1.3449, 1.59801, 1.38078, 1.40657, 1.58642, 1.37384, 1.47591, 1.51235, 1.32276, 1.27695, 1.35665, 1.39793, 1.46181, 1.25641, 1.39278, 1.37555, 1.31206, 1.25327, 1.08729, 1.11608, 1.26073, 1.05493, 1.26676, 1.03825, 1.22449, 1.31527, 1.17458, 1.05643, 1.32651, 1.60257, 1.2771, 1.33646, 1.31918, 1.248, 1.20478, 1.17877, 1.39792, 1.21711, 1.31304, 1.06851, 0.90225, 1.00231, 1.02701, 1.08335, 1.06592, 1.11157, 1.35469, 1.11475, 0.96782, 1.00793, 1.10818, 0.98621, 1.2088, 1.33881, 1.44029, 1.6209, 1.4596, 1.76932, 0.95989, 1.18019, 1.10796, 1.01963, 0.97229, 1.12326, 1.18955, 1.04787, 1.17124, 1.15064, 0.95989, 1.2251, 1.2379, 1.76155, 1.26203, 1.48837, 1.2467, 1.12532, 1.2807, 1.00776, 1.29835, 1.39203, 1.19636, 1.4484, 1.31191, 1.0452, 1.72246, 1.72833, 1.28959, 1.84591, 1.35158, 1.59884, 1.36455, 1.22883, 0.94147, 1.4872, 1.47058, 1.60177, 1.17187, 1.32032, 1.16147, 1.85664, 1.34438, 1.41884, 1.939, 1.3293, 1.75251, 1.4942, 1.19914, 1.25112, 1.47923, 1.19903, 1.70249, 1.28382, 1.22996, 1.38428, 1.04416, 1.49206, 1.45812, 1.5496, 1.42558, 1.5666, 1.60373, 1.50198, 2.14466, 1.64657, 1.23816, 1.19399, 1.20748, 1.27992, 1.28244, 1.01251, 1.42205, 1.36197, 1.11149, 1.15089, 1.21404, 1.39311, 1.5652, 1.38265, 1.4134, 1.55375, 1.48078, 1.28046, 1.56958, 1.42513, 1.45697, 1.27067, 1.6129, 1.30064, 1.30128, 1.59962, 2.07562, 1.66274, 1.53273, 1.30633, 1.38281, 1.30251, 1.26134, 1.59835, 1.39505, 1.20665, 1.50419, 1.33709, 1.53729, 1.35211, 1.18328, 1.72786, 1.56925, 1.48159, 1.79747, 1.32018, 1.29802, 1.45777, 1.41144, 1.32018, 1.82833, 1.47341, 1.38161, 1.37728, 1.47317, 1.22182, 1.50379, 1.40184, 1.43299, 1.38574, 1.54027, 1.3871, 1.51693, 1.73604, 1.27623, 1.30004, 1.43266, 1.26605, 1.31063, 1.40554, 1.47355, 1.43481, 1.66877, 1.27269, 1.36414, 1.39902, 1.36787, 1.30634, 1.35432, 1.33569, 1.38439, 1.38254, 1.48327, 1.3313, 1.47336, 1.54266, 1.45093, 1.39023, 1.42073, 1.71873, 1.24142, 1.27025, 1.75206, 1.19488, 1.72063, 1.35861, 1.46103, 1.32756, 1.38252, 1.44831, 1.49026, 1.5017, 1.67806, 1.49633, 1.40813, 1.2821, 1.34708, 1.20139, 1.33134, 1.30935, 1.28049, 1.39953, 1.36021, 1.30784, 1.55113, 1.45126, 1.35267, 1.8948, 1.31989, 1.26079, 1.54872, 1.25987, 1.49108, 1.31905, 1.39623, 1.42575, 1.70894, 1.69908, 1.44957, 1.53553, 1.41451, 1.68745, 1.45251, 1.2816, 1.33701, 1.40832, 1.76682, 1.43394, 1.35911, 1.42618, 1.36908, 1.37004, 1.25362, 1.44167, 1.3631, 1.32537, 1.0708, 1.21959, 1.38245, 1.69458, 1.66343, 1.49487, 1.64475, 1.18445, 1.24234, 1.37689, 1.3449, 1.29452, 1.57163, 1.48364, 1.39813, 1.46563, 1.16757, 1.33935, 1.37732, 1.74665, 1.43255, 1.6591, 1.35981, 1.18773, 1.72037, 1.57868, 1.47314, 1.60009, 1.70452, 1.52569, 1.35993, 1.71308, 1.55029, 1.45496, 1.45713, 1.21934, 1.34612, 1.35689, 1.29738, 1.27919, 1.35703, 1.34356, 1.23723, 1.16682, 1.55154, 1.54928, 1.31127, 1.22661, 1.39907, 1.23896, 1.39069, 1.35517, 1.4518, 1.74352, 1.41812, 1.48035, 1.43537, 1.2798, 1.31958]}, "num-zeros": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [58.0, 81.0, 78.0, 82.0, 76.0, 95.0, 104.0, 114.0, 114.0, 147.0, 119.0, 159.0, 165.0, 173.0, 182.0, 167.0, 188.0, 176.0, 167.0, 165.0, 
187.0, 162.0, 191.0, 164.0, 181.0, 170.0, 168.0, 172.0, 182.0, 180.0, 164.0, 171.0, 169.0, 154.0, 144.0, 172.0, 173.0, 198.0, 168.0, 210.0, 178.0, 156.0, 174.0, 177.0, 163.0, 172.0, 206.0, 172.0, 184.0, 197.0, 223.0, 153.0, 162.0, 187.0, 173.0, 201.0, 146.0, 152.0, 240.0, 231.0, 192.0, 208.0, 162.0, 210.0, 192.0, 282.0, 232.0, 174.0, 215.0, 186.0, 227.0, 258.0, 202.0, 265.0, 192.0, 216.0, 239.0, 200.0, 265.0, 210.0, 264.0, 231.0, 179.0, 221.0, 234.0, 184.0, 188.0, 206.0, 157.0, 228.0, 217.0, 227.0, 219.0, 233.0, 191.0, 187.0, 214.0, 190.0, 237.0, 168.0, 155.0, 174.0, 165.0, 157.0, 155.0, 136.0, 154.0, 133.0, 124.0, 167.0, 187.0, 158.0, 188.0, 161.0, 168.0, 130.0, 164.0, 109.0, 181.0, 166.0, 146.0, 145.0, 130.0, 132.0, 130.0, 145.0, 125.0, 107.0, 130.0, 147.0, 128.0, 137.0, 149.0, 151.0, 133.0, 117.0, 167.0, 153.0, 134.0, 131.0, 117.0, 116.0, 100.0, 125.0, 121.0, 139.0, 125.0, 139.0, 124.0, 118.0, 103.0, 142.0, 95.0, 127.0, 109.0, 102.0, 110.0, 119.0, 101.0, 129.0, 122.0, 143.0, 119.0, 131.0, 102.0, 117.0, 98.0, 140.0, 129.0, 106.0, 76.0, 115.0, 81.0, 87.0, 118.0, 84.0, 101.0, 118.0, 99.0, 99.0, 107.0, 108.0, 137.0, 131.0, 109.0, 123.0, 107.0, 104.0, 102.0, 138.0, 125.0, 119.0, 91.0, 79.0, 87.0, 112.0, 104.0, 98.0, 101.0, 109.0, 135.0, 98.0, 89.0, 117.0, 106.0, 127.0, 103.0, 111.0, 122.0, 102.0, 92.0, 99.0, 110.0, 93.0, 123.0, 114.0, 133.0, 87.0, 114.0, 121.0, 111.0, 95.0, 93.0, 102.0, 127.0, 88.0, 127.0, 114.0, 107.0, 110.0, 101.0, 110.0, 108.0, 99.0, 106.0, 126.0, 92.0, 96.0, 94.0, 77.0, 124.0, 119.0, 91.0, 105.0, 110.0, 103.0, 97.0, 116.0, 104.0, 97.0, 117.0, 92.0, 110.0, 114.0, 97.0, 101.0, 92.0, 105.0, 93.0, 141.0, 93.0, 106.0, 116.0, 107.0, 122.0, 107.0, 128.0, 100.0, 94.0, 105.0, 124.0, 114.0, 94.0, 80.0, 98.0, 105.0, 97.0, 99.0, 132.0, 94.0, 99.0, 93.0, 108.0, 108.0, 107.0, 111.0, 134.0, 114.0, 104.0, 102.0, 123.0, 108.0, 109.0, 107.0, 110.0, 121.0, 92.0, 94.0, 130.0, 128.0, 130.0, 83.0, 110.0, 130.0, 105.0, 99.0, 106.0, 107.0, 101.0, 100.0, 98.0, 131.0, 101.0, 116.0, 89.0, 106.0, 114.0, 115.0, 112.0, 110.0, 128.0, 92.0, 88.0, 112.0, 108.0, 106.0, 83.0, 113.0, 129.0, 126.0, 99.0, 118.0, 98.0, 101.0, 102.0, 103.0, 119.0, 126.0, 128.0, 110.0, 107.0, 128.0, 125.0, 119.0, 113.0, 89.0, 102.0, 103.0, 126.0, 141.0, 95.0, 106.0, 117.0, 109.0, 93.0, 109.0, 111.0, 138.0, 124.0, 114.0, 106.0, 92.0, 109.0, 105.0, 144.0, 122.0, 108.0, 112.0, 86.0, 100.0, 127.0, 108.0, 100.0, 113.0, 99.0, 103.0, 104.0, 96.0, 125.0, 122.0, 97.0, 128.0, 117.0, 121.0, 133.0, 115.0, 95.0, 126.0, 117.0, 136.0, 118.0, 108.0, 135.0, 109.0, 114.0, 124.0, 122.0, 106.0, 110.0, 124.0]}, "num-zeros vs samples": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [58.0, 81.0, 78.0, 82.0, 76.0, 95.0, 104.0, 114.0, 114.0, 147.0, 119.0, 159.0, 165.0, 173.0, 182.0, 167.0, 188.0, 176.0, 167.0, 165.0, 187.0, 162.0, 191.0, 164.0, 181.0, 170.0, 168.0, 172.0, 182.0, 180.0, 164.0, 171.0, 169.0, 154.0, 144.0, 172.0, 173.0, 198.0, 168.0, 210.0, 178.0, 156.0, 174.0, 177.0, 163.0, 172.0, 206.0, 172.0, 184.0, 197.0, 223.0, 153.0, 162.0, 187.0, 173.0, 201.0, 146.0, 152.0, 240.0, 231.0, 192.0, 208.0, 162.0, 210.0, 192.0, 282.0, 232.0, 174.0, 215.0, 186.0, 227.0, 258.0, 202.0, 265.0, 192.0, 216.0, 239.0, 200.0, 265.0, 210.0, 264.0, 231.0, 179.0, 221.0, 234.0, 184.0, 188.0, 206.0, 157.0, 228.0, 217.0, 227.0, 219.0, 233.0, 191.0, 187.0, 214.0, 190.0, 237.0, 168.0, 155.0, 174.0, 165.0, 157.0, 155.0, 136.0, 154.0, 133.0, 124.0, 167.0, 187.0, 158.0, 188.0, 161.0, 168.0, 130.0, 164.0, 109.0, 181.0, 166.0, 146.0, 145.0, 130.0, 132.0, 
130.0, 145.0, 125.0, 107.0, 130.0, 147.0, 128.0, 137.0, 149.0, 151.0, 133.0, 117.0, 167.0, 153.0, 134.0, 131.0, 117.0, 116.0, 100.0, 125.0, 121.0, 139.0, 125.0, 139.0, 124.0, 118.0, 103.0, 142.0, 95.0, 127.0, 109.0, 102.0, 110.0, 119.0, 101.0, 129.0, 122.0, 143.0, 119.0, 131.0, 102.0, 117.0, 98.0, 140.0, 129.0, 106.0, 76.0, 115.0, 81.0, 87.0, 118.0, 84.0, 101.0, 118.0, 99.0, 99.0, 107.0, 108.0, 137.0, 131.0, 109.0, 123.0, 107.0, 104.0, 102.0, 138.0, 125.0, 119.0, 91.0, 79.0, 87.0, 112.0, 104.0, 98.0, 101.0, 109.0, 135.0, 98.0, 89.0, 117.0, 106.0, 127.0, 103.0, 111.0, 122.0, 102.0, 92.0, 99.0, 110.0, 93.0, 123.0, 114.0, 133.0, 87.0, 114.0, 121.0, 111.0, 95.0, 93.0, 102.0, 127.0, 88.0, 127.0, 114.0, 107.0, 110.0, 101.0, 110.0, 108.0, 99.0, 106.0, 126.0, 92.0, 96.0, 94.0, 77.0, 124.0, 119.0, 91.0, 105.0, 110.0, 103.0, 97.0, 116.0, 104.0, 97.0, 117.0, 92.0, 110.0, 114.0, 97.0, 101.0, 92.0, 105.0, 93.0, 141.0, 93.0, 106.0, 116.0, 107.0, 122.0, 107.0, 128.0, 100.0, 94.0, 105.0, 124.0, 114.0, 94.0, 80.0, 98.0, 105.0, 97.0, 99.0, 132.0, 94.0, 99.0, 93.0, 108.0, 108.0, 107.0, 111.0, 134.0, 114.0, 104.0, 102.0, 123.0, 108.0, 109.0, 107.0, 110.0, 121.0, 92.0, 94.0, 130.0, 128.0, 130.0, 83.0, 110.0, 130.0, 105.0, 99.0, 106.0, 107.0, 101.0, 100.0, 98.0, 131.0, 101.0, 116.0, 89.0, 106.0, 114.0, 115.0, 112.0, 110.0, 128.0, 92.0, 88.0, 112.0, 108.0, 106.0, 83.0, 113.0, 129.0, 126.0, 99.0, 118.0, 98.0, 101.0, 102.0, 103.0, 119.0, 126.0, 128.0, 110.0, 107.0, 128.0, 125.0, 119.0, 113.0, 89.0, 102.0, 103.0, 126.0, 141.0, 95.0, 106.0, 117.0, 109.0, 93.0, 109.0, 111.0, 138.0, 124.0, 114.0, 106.0, 92.0, 109.0, 105.0, 144.0, 122.0, 108.0, 112.0, 86.0, 100.0, 127.0, 108.0, 100.0, 113.0, 99.0, 103.0, 104.0, 96.0, 125.0, 122.0, 97.0, 128.0, 117.0, 121.0, 133.0, 115.0, 95.0, 126.0, 117.0, 136.0, 118.0, 108.0, 135.0, 109.0, 114.0, 124.0, 122.0, 106.0, 110.0, 124.0]}, "params-norm": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [179.95625, 179.95625, 179.95625, 179.95625, 179.95625, 179.95624, 179.95621, 179.95612, 179.95593, 179.95575, 179.95447, 179.9538, 179.95322, 179.95126, 179.95035, 179.94966, 179.94905, 179.94916, 179.94939, 179.94966, 179.9496, 179.94933, 179.94919, 179.94952, 179.95036, 179.95168, 179.95288, 179.95392, 179.95509, 179.9565, 179.95851, 179.96069, 179.963, 179.96532, 179.96788, 179.97118, 179.97482, 179.97873, 179.98279, 179.98714, 179.99208, 179.99753, 180.00325, 180.00955, 180.01634, 180.02382, 180.03171, 180.04016, 180.04951, 180.05969, 180.07059, 180.08221, 180.09441, 180.10721, 180.12059, 180.13457, 180.14899, 180.16373, 180.1792, 180.19586, 180.21344, 180.23199, 180.25226, 180.2733, 180.2948, 180.31709, 180.34032, 180.36464, 180.38991, 180.41573, 180.44231, 180.46947, 180.49721, 180.52528, 180.55406, 180.5829, 180.61168, 180.64125, 180.67117, 180.70154, 180.73244, 180.76378, 180.79633, 180.82928, 180.86198, 180.89581, 180.92958, 180.96359, 180.99808, 181.03401, 181.07187, 181.1104, 181.14795, 181.18536, 181.22249, 181.26071, 181.29898, 181.33658, 181.37422, 181.41164, 181.4467, 181.47968, 181.5123, 181.54552, 181.57919, 181.61421, 181.65012, 181.68695, 181.72267, 181.7587, 181.79526, 181.83344, 181.87288, 181.91354, 181.9543, 181.99518, 182.03568, 182.07515, 182.11353, 182.15218, 182.19164, 182.23108, 182.2708, 182.30989, 182.34795, 182.3871, 182.42479, 182.46089, 182.49536, 182.52867, 182.5638, 182.60063, 182.63989, 182.67992, 182.72049, 182.76151, 182.80296, 182.8448, 182.88582, 182.92665, 182.96825, 183.00778, 183.04619, 183.08208, 183.117, 183.15222, 183.18738, 
183.22598, 183.2657, 183.30598, 183.34494, 183.38196, 183.41934, 183.45613, 183.49393, 183.53142, 183.56673, 183.60075, 183.63268, 183.66296, 183.69357, 183.7247, 183.76031, 183.79965, 183.83946, 183.87967, 183.91869, 183.95782, 183.99774, 184.03601, 184.07205, 184.10704, 184.14296, 184.17989, 184.21503, 184.24945, 184.28268, 184.31783, 184.35512, 184.39378, 184.43393, 184.47366, 184.51508, 184.55717, 184.59872, 184.64001, 184.68074, 184.71964, 184.75798, 184.79604, 184.83191, 184.86661, 184.90184, 184.9364, 184.96959, 185.00362, 185.0423, 185.08412, 185.12758, 185.17178, 185.21582, 185.26006, 185.30214, 185.34361, 185.3847, 185.42496, 185.46634, 185.50591, 185.54526, 185.58424, 185.62386, 185.6624, 185.7025, 185.74159, 185.78154, 185.82208, 185.86279, 185.90271, 185.94293, 185.98375, 186.0233, 186.05884, 186.09236, 186.12791, 186.16458, 186.20477, 186.24573, 186.28658, 186.32719, 186.36766, 186.40819, 186.44913, 186.48967, 186.53146, 186.57472, 186.61908, 186.66409, 186.70798, 186.75232, 186.79475, 186.83501, 186.8761, 186.91815, 186.96135, 187.00375, 187.04543, 187.08774, 187.13051, 187.17398, 187.21738, 187.26135, 187.30682, 187.3519, 187.39789, 187.44398, 187.48967, 187.53412, 187.57758, 187.62079, 187.66299, 187.70578, 187.74741, 187.79074, 187.83516, 187.8799, 187.92366, 187.9662, 188.00873, 188.0517, 188.09543, 188.13933, 188.183, 188.2269, 188.2719, 188.31848, 188.36552, 188.41412, 188.46288, 188.51031, 188.55696, 188.60126, 188.64514, 188.68958, 188.7356, 188.78317, 188.82912, 188.87651, 188.92406, 188.97069, 189.0186, 189.06526, 189.11108, 189.15532, 189.20073, 189.24802, 189.29507, 189.3419, 189.38878, 189.43637, 189.48433, 189.53323, 189.58208, 189.63031, 189.67888, 189.72659, 189.7742, 189.82292, 189.87331, 189.92422, 189.97572, 190.02654, 190.07675, 190.12685, 190.17654, 190.22655, 190.27744, 190.32918, 190.38191, 190.43228, 190.48412, 190.53688, 190.58897, 190.6412, 190.69144, 190.74126, 190.79027, 190.84029, 190.89107, 190.94135, 190.99312, 191.04454, 191.09538, 191.14601, 191.19763, 191.25024, 191.3022, 191.35342, 191.40527, 191.45781, 191.51038, 191.56477, 191.61903, 191.67284, 191.72745, 191.78351, 191.83809, 191.89211, 191.94516, 191.99768, 192.0515, 192.10683, 192.16144, 192.21646, 192.27127, 192.3248, 192.37834, 192.43166, 192.48701, 192.54335, 192.59961, 192.65665, 192.71281, 192.76929, 192.82428, 192.88118, 192.93932, 192.99641, 193.05295, 193.10945, 193.16679, 193.22235, 193.27766, 193.33466, 193.38956, 193.44543, 193.4995, 193.55339, 193.60861, 193.66547, 193.72427, 193.78304, 193.84152, 193.8996, 193.95851, 194.01683, 194.07661, 194.13618, 194.19662, 194.25862, 194.32071, 194.3831, 194.44382, 194.50331, 194.56212, 194.62186, 194.67973, 194.73642, 194.7941, 194.85469, 194.91579]}, "params-norm vs samples": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [179.95625, 179.95625, 179.95625, 179.95625, 179.95625, 179.95624, 179.95621, 179.95612, 179.95593, 179.95575, 179.95447, 179.9538, 179.95322, 179.95126, 179.95035, 179.94966, 179.94905, 179.94916, 179.94939, 179.94966, 179.9496, 179.94933, 179.94919, 179.94952, 179.95036, 179.95168, 179.95288, 179.95392, 179.95509, 179.9565, 179.95851, 179.96069, 179.963, 179.96532, 179.96788, 179.97118, 179.97482, 179.97873, 179.98279, 179.98714, 179.99208, 179.99753, 180.00325, 180.00955, 180.01634, 180.02382, 180.03171, 180.04016, 180.04951, 180.05969, 180.07059, 180.08221, 180.09441, 180.10721, 180.12059, 180.13457, 180.14899, 180.16373, 180.1792, 180.19586, 180.21344, 180.23199, 180.25226, 180.2733, 
180.2948, 180.31709, 180.34032, 180.36464, 180.38991, 180.41573, 180.44231, 180.46947, 180.49721, 180.52528, 180.55406, 180.5829, 180.61168, 180.64125, 180.67117, 180.70154, 180.73244, 180.76378, 180.79633, 180.82928, 180.86198, 180.89581, 180.92958, 180.96359, 180.99808, 181.03401, 181.07187, 181.1104, 181.14795, 181.18536, 181.22249, 181.26071, 181.29898, 181.33658, 181.37422, 181.41164, 181.4467, 181.47968, 181.5123, 181.54552, 181.57919, 181.61421, 181.65012, 181.68695, 181.72267, 181.7587, 181.79526, 181.83344, 181.87288, 181.91354, 181.9543, 181.99518, 182.03568, 182.07515, 182.11353, 182.15218, 182.19164, 182.23108, 182.2708, 182.30989, 182.34795, 182.3871, 182.42479, 182.46089, 182.49536, 182.52867, 182.5638, 182.60063, 182.63989, 182.67992, 182.72049, 182.76151, 182.80296, 182.8448, 182.88582, 182.92665, 182.96825, 183.00778, 183.04619, 183.08208, 183.117, 183.15222, 183.18738, 183.22598, 183.2657, 183.30598, 183.34494, 183.38196, 183.41934, 183.45613, 183.49393, 183.53142, 183.56673, 183.60075, 183.63268, 183.66296, 183.69357, 183.7247, 183.76031, 183.79965, 183.83946, 183.87967, 183.91869, 183.95782, 183.99774, 184.03601, 184.07205, 184.10704, 184.14296, 184.17989, 184.21503, 184.24945, 184.28268, 184.31783, 184.35512, 184.39378, 184.43393, 184.47366, 184.51508, 184.55717, 184.59872, 184.64001, 184.68074, 184.71964, 184.75798, 184.79604, 184.83191, 184.86661, 184.90184, 184.9364, 184.96959, 185.00362, 185.0423, 185.08412, 185.12758, 185.17178, 185.21582, 185.26006, 185.30214, 185.34361, 185.3847, 185.42496, 185.46634, 185.50591, 185.54526, 185.58424, 185.62386, 185.6624, 185.7025, 185.74159, 185.78154, 185.82208, 185.86279, 185.90271, 185.94293, 185.98375, 186.0233, 186.05884, 186.09236, 186.12791, 186.16458, 186.20477, 186.24573, 186.28658, 186.32719, 186.36766, 186.40819, 186.44913, 186.48967, 186.53146, 186.57472, 186.61908, 186.66409, 186.70798, 186.75232, 186.79475, 186.83501, 186.8761, 186.91815, 186.96135, 187.00375, 187.04543, 187.08774, 187.13051, 187.17398, 187.21738, 187.26135, 187.30682, 187.3519, 187.39789, 187.44398, 187.48967, 187.53412, 187.57758, 187.62079, 187.66299, 187.70578, 187.74741, 187.79074, 187.83516, 187.8799, 187.92366, 187.9662, 188.00873, 188.0517, 188.09543, 188.13933, 188.183, 188.2269, 188.2719, 188.31848, 188.36552, 188.41412, 188.46288, 188.51031, 188.55696, 188.60126, 188.64514, 188.68958, 188.7356, 188.78317, 188.82912, 188.87651, 188.92406, 188.97069, 189.0186, 189.06526, 189.11108, 189.15532, 189.20073, 189.24802, 189.29507, 189.3419, 189.38878, 189.43637, 189.48433, 189.53323, 189.58208, 189.63031, 189.67888, 189.72659, 189.7742, 189.82292, 189.87331, 189.92422, 189.97572, 190.02654, 190.07675, 190.12685, 190.17654, 190.22655, 190.27744, 190.32918, 190.38191, 190.43228, 190.48412, 190.53688, 190.58897, 190.6412, 190.69144, 190.74126, 190.79027, 190.84029, 190.89107, 190.94135, 190.99312, 191.04454, 191.09538, 191.14601, 191.19763, 191.25024, 191.3022, 191.35342, 191.40527, 191.45781, 191.51038, 191.56477, 191.61903, 191.67284, 191.72745, 191.78351, 191.83809, 191.89211, 191.94516, 191.99768, 192.0515, 192.10683, 192.16144, 192.21646, 192.27127, 192.3248, 192.37834, 192.43166, 192.48701, 192.54335, 192.59961, 192.65665, 192.71281, 192.76929, 192.82428, 192.88118, 192.93932, 192.99641, 193.05295, 193.10945, 193.16679, 193.22235, 193.27766, 193.33466, 193.38956, 193.44543, 193.4995, 193.55339, 193.60861, 193.66547, 193.72427, 193.78304, 193.84152, 193.8996, 193.95851, 194.01683, 194.07661, 194.13618, 194.19662, 194.25862, 194.32071, 
194.3831, 194.44382, 194.50331, 194.56212, 194.62186, 194.67973, 194.73642, 194.7941, 194.85469, 194.91579]}, "iteration-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [18.78556, 0.6433, 0.64729, 0.63688, 0.63863, 0.64094, 0.6349, 0.97491, 0.63959, 0.63938, 0.63992, 0.63559, 0.63842, 0.63697, 0.63738, 0.64112, 0.63959, 0.64348, 0.63705, 0.6364, 0.63918, 0.63292, 0.6437, 0.64018, 0.639, 0.63548, 0.63416, 0.64052, 0.6394, 0.64087, 0.93505, 0.64011, 0.63922, 0.63683, 0.63698, 0.63707, 0.63678, 0.63951, 0.63884, 0.63971, 0.64127, 0.63397, 0.63425, 0.63678, 0.64689, 0.63996, 0.6373, 0.63968, 0.63439, 0.63168, 0.63761, 0.63699, 0.63824, 0.71804, 0.64031, 0.63865, 0.64029, 0.63765, 0.63483, 0.63106, 0.64044, 0.64084, 0.64009, 0.63302, 0.63552, 0.634, 0.64042, 0.62983, 0.63367, 0.63643, 0.6354, 0.63829, 0.64059, 0.75259, 0.63372, 0.63627, 0.6387, 0.73904, 0.63828, 0.63771, 0.6359, 0.63693, 0.63456, 0.63441, 0.63425, 0.63785, 0.63673, 0.63659, 0.63691, 0.63886, 0.63666, 0.63099, 0.63434, 0.63606, 0.63766, 0.63693, 0.63641, 0.63421, 0.74335, 0.63417, 0.73325, 0.63333, 0.63749, 0.63466, 0.63579, 0.6328, 0.63166, 0.63446, 0.63178, 0.63147, 0.63478, 0.63778, 0.63144, 0.63332, 0.63409, 0.63176, 0.63302, 0.63438, 0.63574, 0.63649, 0.63622, 0.63188, 0.63339, 0.63517, 0.72118, 0.63229, 0.63429, 0.63655, 0.63599, 0.6353, 0.63271, 0.63372, 0.64125, 0.63512, 0.63455, 0.63532, 0.63725, 0.63591, 0.63729, 0.63999, 0.63638, 0.63338, 0.63695, 0.63822, 0.64221, 0.635, 0.63426, 0.63954, 0.63843, 0.75293, 0.63573, 0.63901, 0.63561, 0.63959, 0.6361, 0.63665, 0.64435, 0.63719, 0.63371, 0.63219, 0.6406, 0.64456, 0.63924, 0.635, 0.6327, 0.6352, 0.63564, 0.63957, 0.63877, 0.73034, 0.73934, 0.64019, 0.63815, 0.63937, 0.75337, 0.63669, 0.63936, 0.63737, 0.6461, 0.63756, 0.63312, 0.63542, 0.63878, 0.6388, 0.64047, 0.63637, 0.63586, 0.63666, 0.63721, 0.63734, 0.63786, 0.63594, 0.8184, 0.73163, 0.72764, 0.63564, 0.63408, 0.63622, 0.64045, 0.63686, 0.62364, 0.64914, 0.64308, 0.64069, 0.63927, 0.64269, 0.64288, 0.64533, 0.64376, 0.64236, 0.64125, 0.64212, 0.6369, 0.63583, 0.74464, 0.63698, 0.72591, 0.64074, 0.73419, 0.63849, 0.63726, 0.64412, 0.64282, 0.75083, 0.63592, 0.63941, 0.63766, 0.63791, 0.63977, 0.63509, 0.6399, 0.64297, 0.63884, 0.63671, 0.6435, 0.64374, 0.64843, 0.64579, 0.63861, 0.64594, 0.64077, 0.63925, 0.72846, 0.639, 0.64699, 0.6369, 0.63194, 0.63558, 0.64203, 0.63965, 0.63904, 0.63895, 0.63899, 0.64164, 0.63997, 0.63805, 0.63955, 0.63823, 0.64646, 0.64468, 0.64926, 0.64434, 0.6452, 0.64591, 0.64664, 0.63886, 0.731, 0.64411, 0.64842, 0.6425, 0.64476, 0.63269, 0.63913, 0.63471, 0.63896, 0.63597, 0.63778, 0.63815, 0.6401, 0.64693, 0.64595, 0.64455, 0.64718, 0.64189, 0.63449, 0.75535, 0.6495, 0.6344, 0.63238, 0.64302, 0.6447, 0.64478, 0.63878, 0.63865, 0.64385, 0.64709, 0.64475, 0.63872, 0.63717, 0.64047, 0.64341, 0.6397, 0.64191, 0.63957, 0.63403, 0.64098, 0.64479, 0.64926, 0.74478, 0.73898, 0.64632, 0.64647, 0.63797, 0.64641, 0.64397, 0.64203, 0.645, 0.64045, 0.64179, 0.64038, 0.64201, 0.64156, 0.64501, 0.64116, 0.63858, 0.63331, 0.63441, 0.63583, 0.64119, 0.6353, 0.63464, 0.63359, 0.63663, 0.64109, 0.6316, 0.63418, 0.63702, 0.63806, 0.64097, 0.63561, 0.63886, 0.63666, 0.63662, 0.64007, 0.64226, 0.64759, 0.64499, 0.6441, 0.63331, 0.63366, 0.63388, 0.64218, 0.6449, 0.7739, 0.64344, 0.64344, 0.64738, 0.64398, 0.64107, 0.64511, 0.64245, 0.64068, 0.6375, 0.63653, 0.63463, 0.63795, 0.64039, 0.6391, 0.63754, 0.63814, 0.64098, 0.63698, 0.63569, 0.63797, 0.63695, 0.64036, 0.63449, 0.63592, 
0.72519, 0.64273, 0.63744, 0.63929, 0.63719, 0.64021, 0.64007, 0.63925, 0.63833, 0.63918, 0.63915, 0.64067, 0.64172, 0.63687, 0.63877, 0.63737, 0.64309, 0.6455, 0.64316, 0.63731, 0.6383, 0.63962]}, "lm loss validation": {"start_step": 0, "end_step": 4, "step_interval": 5, "values": [5.60423]}, "lm loss validation vs samples": {"start_step": 0, "end_step": 4, "step_interval": 5, "values": [5.60423]}, "lm loss validation ppl": {"start_step": 0, "end_step": 4, "step_interval": 5, "values": [271.57376]}, "lm loss validation ppl vs samples": {"start_step": 0, "end_step": 4, "step_interval": 5, "values": [271.57376]}} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp1_pp1_fp8_no_model_parallel/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp1_pp1_fp8_no_model_parallel/model_config.yaml new file mode 100644 index 0000000000..399dbd1c6e --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp1_pp1_fp8_no_model_parallel/model_config.yaml @@ -0,0 +1,52 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NVTE_FUSED_ATTN: 0 + NCCL_ALGO: Tree + CUBLAS_WORKSPACE_CONFIG: :4096:8 +MODEL_ARGS: + --num-layers: 12 + --hidden-size: 512 + --num-attention-heads: 8 + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --tensorboard-dir: ${TENSORBOARD_PATH} + --micro-batch-size: 2 + --global-batch-size: 128 + --seq-length: 1024 + --max-position-embeddings: 1024 + --train-iters: 2000 + --timing-log-level: 2 + --lr-decay-iters: 320000 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --vocab-file: ${DATA_PATH}/bpe/vocab.json + --merge-file: ${DATA_PATH}/bpe/merges.txt + --split: 949,50,1 + --distributed-backend: nccl + --lr: 0.00015 + --lr-decay-style: cosine + --min-lr: 1.0e-5 + --weight-decay: 1e-2 + --clip-grad: 1.0 + --lr-warmup-fraction: .01 + --log-interval: 1 + --save-interval: 10000 + --eval-interval: 1000 + --eval-iters: 10 + --transformer-impl: transformer_engine + --tensor-model-parallel-size: 1 + --pipeline-model-parallel-size: 1 + --deterministic-mode: true + --no-gradient-accumulation-fusion: true + --fp8-format: hybrid + --fp8-amax-history-len: 1024 + --fp8-amax-compute-algo: max + --attention-softmax-in-fp32: true + --ckpt-format: torch_dist + --data-cache-path: ${DATA_CACHE_PATH} + --bf16: true +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp1_pp2_fp8_pp/golden_values_dev.json b/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp1_pp2_fp8_pp/golden_values_dev.json new file mode 100644 index 0000000000..e59a5682c9 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp1_pp2_fp8_pp/golden_values_dev.json @@ -0,0 +1 @@ +{"forward-backward-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [21.16929, 0.69842, 0.69865, 0.68092, 0.68114, 0.68076, 0.68553, 0.6784, 0.70132, 0.68656, 0.68867, 0.69143, 0.69023, 0.68774, 0.70094, 0.68596, 0.68549, 0.6811, 0.68151, 0.67743, 0.6818, 0.67512, 0.68645, 0.67903, 0.68158, 0.68543, 0.68715, 0.68897, 0.70747, 0.68759, 0.68732, 0.68723, 0.69033, 0.68094, 0.68856, 0.6856, 0.69221, 0.68087, 0.69125, 0.68605, 0.69475, 0.68504, 0.6893, 0.69096, 0.69541, 0.70004, 0.69576, 0.69211, 
0.70539, 0.69068, 0.68902, 0.69335, 0.68369, 0.68436, 0.68239, 0.68834, 0.6958, 0.68962, 0.68485, 0.69578, 0.6843, 0.68984, 0.69245, 0.68747, 0.68675, 0.69129, 0.68873, 0.68069, 0.69138, 0.69036, 0.68756, 0.68003, 0.68118, 0.68219, 0.68967, 0.68462, 0.68795, 0.68699, 0.6881, 0.6895, 0.6908, 0.68981, 0.68371, 0.68631, 0.68376, 0.81573, 0.69039, 0.69127, 0.69453, 0.69743, 0.69357, 0.68918, 0.68915, 0.68957, 0.69407, 0.68945, 0.69186, 0.68603, 0.68977, 0.70044, 0.69469, 0.69533, 0.69415, 0.69884, 0.69538, 0.69372, 0.69623, 0.69454, 0.6948, 0.69135, 0.69206, 0.68673, 0.68936, 0.68303, 0.68538, 0.68582, 0.69851, 0.70083, 0.69592, 0.69452, 0.69303, 0.69071, 0.70246, 0.6973, 0.69795, 0.69114, 0.69795, 0.69698, 0.69429, 0.69158, 0.69376, 0.69794, 0.69244, 0.69205, 0.69394, 0.69551, 0.69657, 0.69487, 0.69462, 0.69874, 0.69622, 0.69596, 0.69702, 0.69605, 0.69381, 0.68895, 0.69096, 0.69099, 0.69224, 0.68822, 0.69238, 0.68894, 0.69956, 0.69462, 0.69596, 0.69826, 0.69791, 0.69829, 0.69528, 0.69581, 0.69246, 0.69712, 0.69164, 0.69373, 0.69112, 0.69522, 0.68973, 0.69375, 0.69191, 0.69554, 0.69908, 0.69725, 0.69744, 0.69566, 0.69832, 0.69791, 0.69806, 0.69817, 0.69569, 0.69697, 0.69849, 0.69511, 0.69491, 0.69873, 0.69972, 0.70371, 0.69973, 0.70041, 0.69955, 0.69404, 0.69642, 0.69525, 0.70125, 0.69189, 0.70768, 0.71527, 0.70077, 0.69532, 0.6961, 0.7031, 0.67909, 0.68793, 0.70461, 0.69523, 0.69673, 0.70017, 0.69796, 0.69461, 0.70307, 0.69829, 0.69545, 0.69288, 0.75214, 0.70015, 0.70134, 0.69495, 0.70155, 0.70094, 0.69651, 0.69772, 0.69954, 0.69592, 0.6977, 0.69059, 0.69677, 0.69829, 0.69779, 0.69192, 0.69617, 0.69978, 0.68964, 0.69432, 0.69761, 0.69629, 0.69975, 0.69141, 0.69977, 0.69704, 0.70403, 0.68958, 0.69117, 0.68705, 0.69675, 0.68817, 0.69828, 0.69189, 0.69446, 0.6924, 0.69063, 0.691, 0.69163, 0.69402, 0.69605, 0.69383, 0.69327, 0.69636, 0.69175, 0.69468, 0.69281, 0.70044, 0.70067, 0.7016, 0.69557, 0.69614, 0.69761, 0.69793, 0.69322, 0.69689, 0.70043, 0.69446, 0.69543, 0.69346, 0.69441, 0.68931, 0.69592, 0.6914, 0.6929, 0.69539, 0.69954, 0.69999, 0.69447, 0.69508, 0.69638, 0.69699, 0.69614, 0.69655, 0.6957, 0.69348, 0.698, 0.70136, 0.69861, 0.69224, 0.69369, 0.69763, 0.69759, 0.69166, 0.69413, 0.69071, 0.69463, 0.69072, 0.69754, 0.69663, 0.69249, 0.69603, 0.80113, 0.69556, 0.69325, 0.69439, 0.69712, 0.69274, 0.69473, 0.68837, 0.69493, 0.69602, 0.69314, 0.69884, 0.70264, 0.70625, 0.69696, 0.69541, 0.69344, 0.70656, 0.69704, 0.69417, 0.70121, 0.69558, 0.7002, 0.815, 0.69817, 0.69499, 0.70038, 0.70281, 0.70226, 0.69884, 0.69724, 0.69581, 0.69287, 0.69618, 0.71318, 0.69943, 0.70407, 0.69607, 0.69718, 0.68881, 0.69211, 0.69118, 0.69873, 0.69888, 0.70284, 0.6967, 0.70012, 0.69679, 0.69994, 0.69768, 0.7015, 0.70388, 0.69342, 0.69641, 0.70208, 0.6909, 0.69959, 0.69723, 0.69969, 0.70232, 0.69828, 0.697, 0.69714, 0.69676, 0.69506, 0.69683, 0.69519, 0.68973, 0.70075, 0.69457, 0.69842, 0.69584, 0.69872, 0.69358, 0.69875, 0.69346, 0.70004, 0.69971, 0.70151, 0.70016, 0.70414, 0.70754, 0.70082, 0.69723, 0.70207, 0.70466, 0.70276, 0.69824, 0.70085, 0.70049, 0.70134, 0.70037, 0.705, 0.70761, 0.70114, 0.69824]}, "forward-compute-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [4.92979, 0.36862, 0.36896, 0.35994, 0.3634, 0.36131, 0.37528, 0.36745, 0.39414, 0.37596, 0.3798, 0.38001, 0.38263, 0.37794, 0.39251, 0.3769, 0.37612, 0.3675, 0.37072, 0.36701, 0.37163, 0.36679, 0.37704, 0.36833, 0.37308, 0.37264, 0.37893, 0.37759, 0.39953, 0.37377, 0.37903, 0.37511, 0.37891, 0.37243, 0.38146, 0.37534, 
0.38244, 0.37164, 0.38228, 0.37646, 0.38605, 0.37539, 0.38035, 0.38244, 0.38642, 0.3893, 0.38511, 0.3827, 0.39156, 0.3782, 0.37799, 0.38401, 0.37401, 0.37169, 0.37072, 0.37641, 0.38295, 0.38051, 0.37444, 0.38482, 0.37469, 0.38129, 0.38054, 0.37571, 0.37578, 0.37992, 0.37782, 0.37386, 0.3813, 0.38374, 0.3775, 0.37428, 0.37254, 0.37234, 0.37719, 0.37627, 0.37853, 0.37526, 0.38087, 0.38099, 0.38071, 0.38191, 0.37329, 0.3773, 0.3734, 0.5018, 0.38253, 0.38164, 0.38606, 0.38733, 0.38592, 0.38071, 0.37964, 0.37907, 0.38532, 0.37904, 0.38222, 0.37656, 0.38031, 0.38646, 0.38574, 0.38602, 0.37899, 0.38893, 0.38764, 0.38446, 0.38488, 0.38659, 0.38646, 0.38256, 0.38198, 0.37894, 0.38195, 0.37524, 0.37462, 0.37752, 0.38757, 0.39104, 0.38931, 0.38235, 0.38351, 0.38268, 0.39375, 0.3868, 0.38798, 0.38182, 0.39008, 0.38803, 0.38668, 0.38465, 0.38639, 0.38737, 0.38331, 0.37911, 0.38492, 0.38652, 0.38697, 0.38654, 0.38596, 0.39074, 0.38492, 0.38717, 0.38731, 0.38942, 0.386, 0.38148, 0.38444, 0.38374, 0.38416, 0.37792, 0.37748, 0.37957, 0.39104, 0.38581, 0.38566, 0.38678, 0.38966, 0.38882, 0.38683, 0.38264, 0.38507, 0.38712, 0.38306, 0.38289, 0.38103, 0.38363, 0.37743, 0.37875, 0.37956, 0.38316, 0.3891, 0.38796, 0.38596, 0.38565, 0.38554, 0.38556, 0.38505, 0.38092, 0.38387, 0.38393, 0.38859, 0.37887, 0.38497, 0.38623, 0.39043, 0.39246, 0.38914, 0.38962, 0.38901, 0.38336, 0.38644, 0.38387, 0.38958, 0.38133, 0.39066, 0.39461, 0.39129, 0.38237, 0.3862, 0.39181, 0.37212, 0.37912, 0.39389, 0.384, 0.38439, 0.38586, 0.38505, 0.38157, 0.38622, 0.38765, 0.38617, 0.38274, 0.44388, 0.39087, 0.3907, 0.38612, 0.38867, 0.39114, 0.38539, 0.38934, 0.38921, 0.38784, 0.38206, 0.38157, 0.38685, 0.39031, 0.38789, 0.38326, 0.38644, 0.38897, 0.38075, 0.3856, 0.38903, 0.3866, 0.38941, 0.37995, 0.38647, 0.388, 0.3933, 0.38074, 0.38111, 0.37964, 0.38635, 0.37942, 0.38546, 0.38117, 0.38291, 0.38281, 0.38246, 0.38276, 0.38171, 0.382, 0.3865, 0.37957, 0.3856, 0.38543, 0.38204, 0.38551, 0.38485, 0.39262, 0.39183, 0.38966, 0.38778, 0.38805, 0.3857, 0.3903, 0.38332, 0.38621, 0.38966, 0.38839, 0.3794, 0.38725, 0.38481, 0.38106, 0.38522, 0.3806, 0.38384, 0.38521, 0.38656, 0.39255, 0.38382, 0.38686, 0.38703, 0.38844, 0.38459, 0.38745, 0.38311, 0.38465, 0.38785, 0.39146, 0.38846, 0.38178, 0.38121, 0.38932, 0.38613, 0.38272, 0.38328, 0.38309, 0.38433, 0.38086, 0.38574, 0.38715, 0.38325, 0.38613, 0.4565, 0.38631, 0.38538, 0.38553, 0.38639, 0.38282, 0.38384, 0.37918, 0.38658, 0.38666, 0.38487, 0.39121, 0.3908, 0.39786, 0.3849, 0.38844, 0.38522, 0.394, 0.38769, 0.38524, 0.39367, 0.38775, 0.39338, 0.50382, 0.39159, 0.38743, 0.39102, 0.39523, 0.39356, 0.39205, 0.38578, 0.38801, 0.38304, 0.38678, 0.3987, 0.39171, 0.39597, 0.38708, 0.3908, 0.38146, 0.38222, 0.38202, 0.39012, 0.39068, 0.39269, 0.38682, 0.39099, 0.38924, 0.39219, 0.38971, 0.39066, 0.39542, 0.38474, 0.38829, 0.39181, 0.38288, 0.38918, 0.3886, 0.39087, 0.39457, 0.3877, 0.3877, 0.38997, 0.39047, 0.38458, 0.38887, 0.3875, 0.38266, 0.38907, 0.38748, 0.38772, 0.387, 0.38822, 0.38247, 0.39155, 0.38528, 0.39151, 0.39019, 0.39332, 0.39078, 0.3911, 0.39847, 0.3899, 0.39043, 0.39299, 0.39763, 0.39582, 0.39107, 0.39252, 0.39507, 0.39717, 0.3953, 0.40187, 0.40236, 0.39559, 0.39145]}, "backward-compute-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [2.8012, 0.29387, 0.2986, 0.28406, 0.28522, 0.28969, 0.29061, 0.28796, 0.29063, 0.28667, 0.29358, 0.29506, 0.2922, 0.2852, 0.28989, 0.28483, 0.28642, 0.28342, 0.28232, 0.28136, 0.28422, 0.28036, 0.28492, 0.28314, 0.281, 0.28245, 
0.28442, 0.28445, 0.28814, 0.28551, 0.2857, 0.28486, 0.28705, 0.28407, 0.28536, 0.28489, 0.28989, 0.28255, 0.28845, 0.28647, 0.28944, 0.28337, 0.28838, 0.28849, 0.2897, 0.29269, 0.28788, 0.28852, 0.29394, 0.28953, 0.28786, 0.28768, 0.28428, 0.28563, 0.28458, 0.28775, 0.29324, 0.28892, 0.28616, 0.29034, 0.28456, 0.28682, 0.28841, 0.28729, 0.28425, 0.28778, 0.28741, 0.2839, 0.28832, 0.28804, 0.2861, 0.28333, 0.28362, 0.28274, 0.28476, 0.28495, 0.28365, 0.28409, 0.28405, 0.28625, 0.28429, 0.28647, 0.28314, 0.28367, 0.28409, 0.28622, 0.28505, 0.28438, 0.28134, 0.28462, 0.28536, 0.28398, 0.28654, 0.2869, 0.28809, 0.28601, 0.28761, 0.28425, 0.28676, 0.2862, 0.28997, 0.28934, 0.28731, 0.29342, 0.28795, 0.28707, 0.2867, 0.28661, 0.28811, 0.28616, 0.28592, 0.28428, 0.28508, 0.28396, 0.28659, 0.28265, 0.28697, 0.2894, 0.28687, 0.28772, 0.28913, 0.28621, 0.29195, 0.28847, 0.29125, 0.28862, 0.29011, 0.29025, 0.28931, 0.28814, 0.28955, 0.2908, 0.28871, 0.28801, 0.28793, 0.28964, 0.29306, 0.29007, 0.28963, 0.29251, 0.29069, 0.29194, 0.28984, 0.29084, 0.28995, 0.28615, 0.28778, 0.28795, 0.2882, 0.28737, 0.2876, 0.28691, 0.29135, 0.28807, 0.28993, 0.29202, 0.29116, 0.29034, 0.28863, 0.29346, 0.29111, 0.29416, 0.29263, 0.293, 0.29317, 0.2931, 0.28845, 0.288, 0.28664, 0.28885, 0.29051, 0.28976, 0.28937, 0.29252, 0.29727, 0.29583, 0.29602, 0.29658, 0.2931, 0.29603, 0.29621, 0.29395, 0.29259, 0.29542, 0.29412, 0.29939, 0.29634, 0.2902, 0.29267, 0.28896, 0.2887, 0.28951, 0.29196, 0.29075, 0.29727, 0.30019, 0.29535, 0.2896, 0.28882, 0.29318, 0.28687, 0.28581, 0.29387, 0.28979, 0.28852, 0.29025, 0.28988, 0.28996, 0.2906, 0.29127, 0.29091, 0.29027, 0.34386, 0.29092, 0.29145, 0.28886, 0.29332, 0.29127, 0.29064, 0.29054, 0.29117, 0.28886, 0.28689, 0.28524, 0.29113, 0.29077, 0.28956, 0.28788, 0.28875, 0.29066, 0.28696, 0.28828, 0.28986, 0.28975, 0.29179, 0.28765, 0.29054, 0.29018, 0.29236, 0.28513, 0.28796, 0.28625, 0.28988, 0.28486, 0.2901, 0.28715, 0.28807, 0.29103, 0.28636, 0.28731, 0.28709, 0.2878, 0.28863, 0.28922, 0.28858, 0.28861, 0.28721, 0.28911, 0.28891, 0.29009, 0.29181, 0.29183, 0.2921, 0.28906, 0.29246, 0.29132, 0.28922, 0.29183, 0.29154, 0.29016, 0.29033, 0.29069, 0.28941, 0.28627, 0.28999, 0.28617, 0.28792, 0.2909, 0.29099, 0.29284, 0.29202, 0.28998, 0.29186, 0.29297, 0.29177, 0.2896, 0.29112, 0.28824, 0.29124, 0.29518, 0.29288, 0.28876, 0.29026, 0.29318, 0.2932, 0.2894, 0.28931, 0.28848, 0.28934, 0.28881, 0.29144, 0.28798, 0.28986, 0.29212, 0.28958, 0.2898, 0.28969, 0.2893, 0.29213, 0.29, 0.29098, 0.29085, 0.29077, 0.29035, 0.29027, 0.29142, 0.29441, 0.29571, 0.29203, 0.29018, 0.29127, 0.29433, 0.29091, 0.28877, 0.29354, 0.29063, 0.29084, 0.29118, 0.29114, 0.29201, 0.29191, 0.29316, 0.29428, 0.29139, 0.29115, 0.29268, 0.28887, 0.29386, 0.29765, 0.29295, 0.29535, 0.29245, 0.29159, 0.28784, 0.29096, 0.28864, 0.2923, 0.29471, 0.29453, 0.2914, 0.29447, 0.29151, 0.29226, 0.29155, 0.29343, 0.29271, 0.28917, 0.29026, 0.2943, 0.28854, 0.29114, 0.29123, 0.2918, 0.29223, 0.29626, 0.29746, 0.29042, 0.29175, 0.29069, 0.29, 0.2892, 0.28808, 0.29535, 0.28977, 0.29205, 0.29056, 0.29189, 0.2899, 0.28981, 0.2895, 0.2929, 0.29123, 0.29288, 0.29252, 0.29518, 0.29616, 0.29356, 0.29361, 0.29532, 0.29564, 0.29465, 0.29223, 0.29483, 0.29279, 0.29075, 0.29144, 0.29105, 0.29375, 0.28857, 0.288]}, "batch-generator-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.30565, 0.00631, 0.0066, 0.00601, 0.00609, 0.00586, 0.00613, 0.00583, 0.00602, 0.00583, 0.00598, 0.00604, 0.00582, 0.00568, 0.00583, 0.0058, 
0.00563, 0.00578, 0.00557, 0.0058, 0.00592, 0.00586, 0.0058, 0.00562, 0.00562, 0.00571, 0.00557, 0.00573, 0.00596, 0.00583, 0.00566, 0.00601, 0.00607, 0.00572, 0.00607, 0.00595, 0.00598, 0.00592, 0.00585, 0.00609, 0.00585, 0.0059, 0.00582, 0.00578, 0.00588, 0.00604, 0.00563, 0.00593, 0.00592, 0.00559, 0.00549, 0.00584, 0.00593, 0.00559, 0.00713, 0.00734, 0.00689, 0.00723, 0.00685, 0.00763, 0.00701, 0.00722, 0.0072, 0.00755, 0.00717, 0.00727, 0.00721, 0.00707, 0.00703, 0.00729, 0.00703, 0.00682, 0.00659, 0.00573, 0.00594, 0.00596, 0.00621, 0.00602, 0.00602, 0.00599, 0.00597, 0.00616, 0.0059, 0.00598, 0.00575, 0.00606, 0.00592, 0.00596, 0.00602, 0.00605, 0.00587, 0.00585, 0.00596, 0.00675, 0.00617, 0.0062, 0.00592, 0.00581, 0.00613, 0.00611, 0.00624, 0.00629, 0.00603, 0.00622, 0.00608, 0.00595, 0.00632, 0.00599, 0.00611, 0.00597, 0.00588, 0.00587, 0.0057, 0.00574, 0.00589, 0.00569, 0.00565, 0.00566, 0.0061, 0.00592, 0.00603, 0.00553, 0.00587, 0.00577, 0.00567, 0.00584, 0.00581, 0.00607, 0.00583, 0.00565, 0.00581, 0.0058, 0.00582, 0.00595, 0.0057, 0.00596, 0.00605, 0.00582, 0.00559, 0.00575, 0.00572, 0.00562, 0.00565, 0.00583, 0.00603, 0.00568, 0.00564, 0.00603, 0.00593, 0.0059, 0.00581, 0.0055, 0.00598, 0.00604, 0.00607, 0.00585, 0.00585, 0.00603, 0.00588, 0.00599, 0.00567, 0.00593, 0.00614, 0.0058, 0.00592, 0.00575, 0.00581, 0.00624, 0.00582, 0.00616, 0.00572, 0.00591, 0.0061, 0.00614, 0.00597, 0.00606, 0.00588, 0.00578, 0.00631, 0.00589, 0.00584, 0.00574, 0.00613, 0.00566, 0.0061, 0.00599, 0.0059, 0.00589, 0.00595, 0.00596, 0.00595, 0.00595, 0.00613, 0.00585, 0.00569, 0.00609, 0.00603, 0.00615, 0.00617, 0.00606, 0.06212, 0.00708, 0.00731, 0.00708, 0.00688, 0.0068, 0.00715, 0.00694, 0.00689, 0.00682, 0.00592, 0.00599, 0.00671, 0.00709, 0.00695, 0.00727, 0.00736, 0.00727, 0.00737, 0.00678, 0.00708, 0.00694, 0.00721, 0.00727, 0.00742, 0.00681, 0.00707, 0.00694, 0.00708, 0.00695, 0.00706, 0.00698, 0.00707, 0.0067, 0.00718, 0.00733, 0.00718, 0.00687, 0.00725, 0.00712, 0.00718, 0.00685, 0.00603, 0.00744, 0.00676, 0.00683, 0.00724, 0.00706, 0.00733, 0.00734, 0.00681, 0.00744, 0.00713, 0.00687, 0.00667, 0.00687, 0.00723, 0.00685, 0.00677, 0.00724, 0.00676, 0.00673, 0.0071, 0.00721, 0.00713, 0.00707, 0.00719, 0.00656, 0.00681, 0.0069, 0.00711, 0.00704, 0.00728, 0.00686, 0.00705, 0.00647, 0.00678, 0.00724, 0.00671, 0.00729, 0.00729, 0.00693, 0.00727, 0.00705, 0.0073, 0.0069, 0.00703, 0.00703, 0.00673, 0.00641, 0.00649, 0.0059, 0.00591, 0.00589, 0.00611, 0.00602, 0.00581, 0.00591, 0.006, 0.00615, 0.00591, 0.00611, 0.00606, 0.00605, 0.00645, 0.00595, 0.00594, 0.00596, 0.006, 0.00598, 0.00594, 0.00601, 0.00655, 0.00617, 0.00603, 0.0059, 0.00628, 0.00583, 0.00608, 0.00585, 0.00604, 0.00603, 0.00594, 0.00582, 0.00576, 0.00596, 0.00605, 0.00641, 0.00601, 0.00602, 0.0061, 0.00618, 0.00595, 0.00602, 0.00597, 0.00581, 0.00598, 0.00598, 0.00614, 0.00599, 0.00582, 0.00612, 0.00597, 0.00575, 0.00572, 0.00623, 0.00601, 0.00597, 0.00619, 0.00626, 0.00606, 0.00592, 0.00607, 0.00584, 0.00593, 0.00602, 0.00617, 0.00621, 0.00612, 0.00602, 0.00597, 0.00594, 0.00615, 0.00599, 0.00604, 0.00617, 0.00631, 0.00558, 0.00552, 0.0057, 0.00568, 0.00594, 0.00614, 0.00588, 0.006, 0.00605, 0.00607, 0.00624, 0.00636, 0.00582, 0.00604, 0.00595, 0.0061, 0.00615, 0.00599, 0.00599, 0.00621, 0.00604, 0.00599, 0.00599, 0.00589, 0.00621, 0.00584, 0.00586, 0.00593, 0.00614, 0.00623, 0.00591, 0.00632, 0.00604]}, "forward-recv-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [4.95821, 0.02363, 0.0227, 0.02332, 
0.02256, 0.02319, 0.0228, 0.02261, 0.0228, 0.02242, 0.02284, 0.02259, 0.02245, 0.02309, 0.02332, 0.02185, 0.02227, 0.02241, 0.02251, 0.02246, 0.02257, 0.02259, 0.02212, 0.02254, 0.02299, 0.02339, 0.02258, 0.02339, 0.02279, 0.02234, 0.0221, 0.02333, 0.02239, 0.02203, 0.02184, 0.02211, 0.02224, 0.022, 0.0223, 0.02282, 0.02196, 0.02285, 0.02194, 0.02233, 0.02238, 0.0221, 0.02287, 0.02259, 0.02353, 0.02258, 0.02174, 0.02244, 0.02248, 0.02249, 0.02286, 0.02274, 0.02231, 0.02301, 0.02252, 0.02226, 0.02309, 0.0226, 0.02248, 0.02257, 0.02247, 0.02239, 0.02245, 0.02239, 0.02245, 0.02226, 0.02251, 0.02235, 0.02229, 0.02229, 0.02224, 0.02218, 0.02269, 0.02222, 0.02297, 0.0233, 0.02355, 0.02353, 0.02351, 0.02353, 0.0231, 0.02266, 0.02205, 0.02248, 0.02239, 0.02243, 0.02337, 0.02243, 0.02265, 0.02251, 0.0227, 0.02251, 0.02262, 0.0223, 0.02239, 0.02302, 0.02253, 0.0224, 0.02341, 0.02267, 0.02201, 0.02288, 0.02223, 0.02234, 0.02247, 0.02274, 0.0227, 0.02223, 0.02278, 0.02249, 0.02233, 0.02353, 0.02284, 0.02293, 0.02146, 0.02395, 0.02287, 0.02228, 0.02286, 0.02372, 0.02285, 0.02195, 0.02251, 0.02292, 0.02278, 0.02298, 0.02247, 0.02293, 0.02269, 0.02272, 0.02289, 0.0229, 0.0226, 0.02277, 0.02291, 0.02243, 0.02298, 0.02242, 0.02233, 0.02273, 0.0224, 0.02231, 0.02213, 0.02282, 0.02271, 0.02257, 0.02245, 0.02266, 0.02226, 0.02234, 0.02242, 0.02287, 0.02231, 0.02272, 0.02271, 0.02261, 0.02279, 0.02239, 0.02238, 0.02237, 0.02245, 0.02246, 0.023, 0.02279, 0.02277, 0.02299, 0.02326, 0.0223, 0.02341, 0.02259, 0.02308, 0.02252, 0.02308, 0.02263, 0.02343, 0.02234, 0.02287, 0.02253, 0.02261, 0.02291, 0.02258, 0.02266, 0.02272, 0.02323, 0.02251, 0.02228, 0.0226, 0.02245, 0.02282, 0.02319, 0.02275, 0.02246, 0.02327, 0.02259, 0.02253, 0.0224, 0.01758, 0.02244, 0.02255, 0.02222, 0.02295, 0.02246, 0.02236, 0.02202, 0.02348, 0.02237, 0.02232, 0.02231, 0.02262, 0.02284, 0.02278, 0.02292, 0.02249, 0.02264, 0.02288, 0.02264, 0.02232, 0.02331, 0.02235, 0.02266, 0.02272, 0.02229, 0.02285, 0.02276, 0.02283, 0.02355, 0.02243, 0.02224, 0.02272, 0.02285, 0.02224, 0.02355, 0.02275, 0.02246, 0.02254, 0.02335, 0.02272, 0.02208, 0.02249, 0.02229, 0.02237, 0.02251, 0.0228, 0.02259, 0.02238, 0.02269, 0.02278, 0.02234, 0.02262, 0.02237, 0.02265, 0.02234, 0.0239, 0.02204, 0.02217, 0.02222, 0.02262, 0.02231, 0.02208, 0.02252, 0.02267, 0.02293, 0.02253, 0.02228, 0.02237, 0.02246, 0.02294, 0.02246, 0.02182, 0.0225, 0.02229, 0.02265, 0.02222, 0.02222, 0.02264, 0.02241, 0.02246, 0.02208, 0.02243, 0.0227, 0.02237, 0.02231, 0.02228, 0.02312, 0.02228, 0.02236, 0.02245, 0.02239, 0.02316, 0.02216, 0.02227, 0.02241, 0.0226, 0.02206, 0.02266, 0.0223, 0.02225, 0.02286, 0.0223, 0.02201, 0.02235, 0.02378, 0.02224, 0.02326, 0.02229, 0.02293, 0.02211, 0.02198, 0.02233, 0.0224, 0.02212, 0.02248, 0.02253, 0.02253, 0.02258, 0.02203, 0.02237, 0.02274, 0.0222, 0.02237, 0.02238, 0.02242, 0.02229, 0.02263, 0.02196, 0.02243, 0.02239, 0.02243, 0.02221, 0.02264, 0.02264, 0.02249, 0.02235, 0.0226, 0.02289, 0.02232, 0.0227, 0.02252, 0.02225, 0.02254, 0.02223, 0.02268, 0.02244, 0.02292, 0.02284, 0.02271, 0.02275, 0.02258, 0.02303, 0.02263, 0.02297, 0.02275, 0.0227, 0.023, 0.02298, 0.02297, 0.02199, 0.02326, 0.02298, 0.02263, 0.02262, 0.02296, 0.02268, 0.0225, 0.02268, 0.02273, 0.02239, 0.02231, 0.02302, 0.02284, 0.02258, 0.02376, 0.02298, 0.02258, 0.02269, 0.02282, 0.02248, 0.02296, 0.02259, 0.02303, 0.02252, 0.02322, 0.02265, 0.0226, 0.02282, 0.0227, 0.02325, 0.02263, 0.02282, 0.02297, 0.02259, 0.02313, 0.02262, 0.02287, 0.02288, 0.02356]}, "forward-send-time": 
{"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [3.00337, 0.00014, 0.00014, 0.00014, 0.00014, 0.00015, 0.00014, 0.00014, 0.00015, 0.00015, 0.00014, 0.00015, 0.00014, 0.00014, 0.00016, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00015, 0.00015, 0.00014, 0.00014, 0.00014, 0.00015, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00015, 0.00015, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00013, 0.00014, 0.00014, 0.00014, 0.00014, 0.00013, 0.00017, 0.00014, 0.00015, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00015, 0.00013, 0.00014, 0.00014, 0.00013, 0.00014, 0.00015, 0.00014, 0.00013, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00013, 0.00013, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00013, 0.00014, 0.00014, 0.00014, 0.00013, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00015, 0.00014, 0.00013, 0.00014, 0.00015, 0.00014, 0.00014, 0.00014, 0.00014, 0.00013, 0.00014, 0.00013, 0.00013, 0.00014, 0.00014, 0.00014, 0.00013, 0.00015, 0.00013, 0.00014, 0.00014, 0.00015, 0.00014, 0.00013, 0.00014, 0.00015, 0.00014, 0.00016, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00013, 0.00014, 0.00014, 0.00013, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00013, 0.00013, 0.00014, 0.00015, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00013, 0.00014, 0.00014, 0.00014, 0.00013, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00013, 0.00015, 0.00015, 0.00014, 0.00016, 0.00013, 0.00016, 0.00014, 0.00015, 0.00013, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00015, 0.00014, 0.00014, 0.00015, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00016, 0.00014, 0.00014, 0.00015, 0.00014, 0.00015, 0.00014, 0.00018, 0.00014, 0.00015, 0.00014, 0.00015, 0.00015, 0.00014, 0.00014, 0.00015, 0.00013, 0.00014, 0.00015, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00015, 0.00015, 0.00015, 0.00014, 0.00014, 0.00015, 0.00014, 0.00015, 0.00014, 0.00015, 0.00014, 0.00014, 0.00017, 0.00014, 0.00014, 0.00014, 0.00015, 0.00015, 0.00015, 0.00014, 0.00017, 0.00014, 0.00015, 0.00014, 0.00014, 0.00013, 0.00015, 0.00014, 0.00014, 0.00015, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00015, 0.00014, 0.00018, 0.00014, 0.00015, 0.00015, 0.00014, 0.00014, 0.00014, 0.00014, 0.00015, 0.00015, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00013, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00015, 0.00014, 0.00015, 0.00013, 0.00014, 0.00015, 0.00016, 0.00014, 0.00014, 0.00015, 0.00014, 0.00014, 0.00014, 0.00014, 0.00015, 0.00013, 0.00014, 0.00014, 0.00014, 0.00013, 0.00014, 0.00014, 0.00014, 0.00015, 0.00014, 0.00014, 0.00014, 0.00015, 0.00014, 0.00015, 0.00014, 0.00015, 0.00014, 0.00014, 0.00015, 0.00014, 0.00014, 0.00014, 0.00015, 0.00015, 0.00014, 0.00014, 0.00014, 0.00015, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00015, 0.00014, 0.00014, 0.00014, 0.00015, 0.00014, 0.00015, 0.00014, 0.00014, 0.00014, 0.00015, 0.00014, 0.00014, 0.00015, 0.00014, 0.00014, 0.00014, 0.00014, 0.00015, 0.00014, 0.00015, 0.00014, 0.00015, 0.00014, 0.00014, 0.00015, 0.00015, 0.00015, 0.00014, 0.00014, 0.00015, 0.00014, 0.00014, 0.00014, 0.00014, 0.00016, 0.00014, 0.00014, 0.00015, 0.00015, 0.00015, 0.00015, 0.00014, 0.00014, 0.00014, 0.00015, 0.00015, 0.00014, 0.00016, 0.00014, 0.00014, 0.00015, 0.00014, 0.00014, 0.00015, 0.00015, 0.00014, 0.00015, 0.00014, 0.00015, 
0.00014, 0.00014, 0.00015, 0.00014, 0.00014, 0.00015, 0.00015, 0.00014, 0.00016, 0.00014, 0.00015, 0.00015, 0.00015]}, "backward-recv-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.02248, 0.02331, 0.02263, 0.02336, 0.02223, 0.02299, 0.02211, 0.02247, 0.0226, 0.02292, 0.02307, 0.02276, 0.02341, 0.02329, 0.02311, 0.02274, 0.02235, 0.0235, 0.02241, 0.02254, 0.0226, 0.02238, 0.02202, 0.02262, 0.02257, 0.02202, 0.02244, 0.02212, 0.02257, 0.02222, 0.02301, 0.02231, 0.02146, 0.02328, 0.0228, 0.02276, 0.02277, 0.02305, 0.02315, 0.02206, 0.02273, 0.02196, 0.02292, 0.0229, 0.02318, 0.02404, 0.02342, 0.02372, 0.024, 0.02283, 0.02293, 0.02329, 0.02241, 0.02288, 0.02249, 0.02209, 0.0225, 0.02317, 0.02289, 0.02337, 0.02275, 0.02241, 0.02374, 0.02164, 0.02208, 0.02228, 0.02281, 0.02282, 0.02272, 0.0226, 0.0227, 0.02228, 0.02281, 0.02266, 0.02389, 0.02245, 0.02241, 0.02233, 0.02295, 0.02231, 0.0221, 0.02223, 0.0226, 0.02234, 0.02195, 0.02202, 0.02245, 0.0226, 0.02275, 0.02248, 0.0222, 0.02241, 0.02244, 0.02231, 0.02257, 0.02222, 0.02266, 0.02423, 0.02272, 0.02227, 0.02299, 0.02249, 0.0224, 0.02471, 0.02315, 0.02261, 0.02228, 0.02296, 0.02277, 0.02251, 0.02275, 0.02249, 0.02349, 0.022, 0.02327, 0.0234, 0.02263, 0.02233, 0.02301, 0.02227, 0.02246, 0.02257, 0.02278, 0.02253, 0.02246, 0.02297, 0.02258, 0.02373, 0.02268, 0.02299, 0.02323, 0.02295, 0.02269, 0.02271, 0.02329, 0.02248, 0.02289, 0.02291, 0.02254, 0.02282, 0.02401, 0.02262, 0.02444, 0.02261, 0.0226, 0.02263, 0.02259, 0.02307, 0.02224, 0.02211, 0.02289, 0.02273, 0.02385, 0.02337, 0.02258, 0.02316, 0.02269, 0.02287, 0.02301, 0.0225, 0.02248, 0.02339, 0.02296, 0.02226, 0.02308, 0.02301, 0.02193, 0.02223, 0.02389, 0.02273, 0.02314, 0.0224, 0.02271, 0.02292, 0.0234, 0.02311, 0.02278, 0.02281, 0.02287, 0.02271, 0.02258, 0.02224, 0.02289, 0.02216, 0.02306, 0.02215, 0.02293, 0.02325, 0.02272, 0.02257, 0.02265, 0.02257, 0.02237, 0.02338, 0.02396, 0.02264, 0.02255, 0.02263, 0.02261, 0.02319, 0.02273, 0.0227, 0.02359, 0.02237, 0.02352, 0.02453, 0.02244, 0.02254, 0.02341, 0.02295, 0.02318, 0.02233, 0.02248, 0.02304, 0.02424, 0.02304, 0.02275, 0.02374, 0.02258, 0.02316, 0.02275, 0.02259, 0.02278, 0.02276, 0.02303, 0.02314, 0.02359, 0.02289, 0.02295, 0.02301, 0.02271, 0.02295, 0.02286, 0.02295, 0.02288, 0.02247, 0.02599, 0.02329, 0.02375, 0.02231, 0.0227, 0.0222, 0.02287, 0.02291, 0.02232, 0.02287, 0.02269, 0.0222, 0.02306, 0.02281, 0.0228, 0.02143, 0.02285, 0.02337, 0.02236, 0.02228, 0.02243, 0.02313, 0.02393, 0.02356, 0.02319, 0.02319, 0.02354, 0.02282, 0.02254, 0.02335, 0.02225, 0.02305, 0.0231, 0.02313, 0.02277, 0.02351, 0.02342, 0.02326, 0.02253, 0.02222, 0.02252, 0.02264, 0.02318, 0.02321, 0.02292, 0.02334, 0.02285, 0.02282, 0.02307, 0.02259, 0.02166, 0.02265, 0.02214, 0.02373, 0.02309, 0.0232, 0.02261, 0.02274, 0.02256, 0.02221, 0.02164, 0.02324, 0.02299, 0.02313, 0.02404, 0.02301, 0.02264, 0.02252, 0.02325, 0.02343, 0.02291, 0.02247, 0.0231, 0.02252, 0.02239, 0.02337, 0.02232, 0.02332, 0.02306, 0.02293, 0.02287, 0.02295, 0.02297, 0.02351, 0.02268, 0.02263, 0.02425, 0.02263, 0.02361, 0.023, 0.02223, 0.02273, 0.02318, 0.02333, 0.0232, 0.02407, 0.02312, 0.0227, 0.02288, 0.02285, 0.02227, 0.0233, 0.02303, 0.02288, 0.0233, 0.0231, 0.02299, 0.02245, 0.02284, 0.02224, 0.02277, 0.02352, 0.02304, 0.02289, 0.02369, 0.02293, 0.02308, 0.02248, 0.02362, 0.02358, 0.02328, 0.02302, 0.0234, 0.02273, 0.02296, 0.02329, 0.0228, 0.0234, 0.02231, 0.02262, 0.02265, 0.02299, 0.02199, 0.02303, 0.02291, 0.02278, 0.02341, 0.0232, 0.02291, 0.02339, 
0.02355, 0.02363, 0.02324, 0.02236, 0.023, 0.02327, 0.02343, 0.02262, 0.02317, 0.02371, 0.02282, 0.02307, 0.0239, 0.02366, 0.02297, 0.02286, 0.02285, 0.0232, 0.02342, 0.02385, 0.02348, 0.02254, 0.02321, 0.02256]}, "backward-send-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.00014, 0.00018, 0.00017, 0.00019, 0.00013, 0.00016, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00016, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00017, 0.00015, 0.00016, 0.00015, 0.00015, 0.00017, 0.00015, 0.00015, 0.00015, 0.00016, 0.00015, 0.00015, 0.00015, 0.00015, 0.00016, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00016, 0.00015, 0.00015, 0.00015, 0.00016, 0.00015, 0.00017, 0.00016, 0.00015, 0.00015, 0.00016, 0.00014, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00019, 0.00015, 0.00015, 0.00017, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00014, 0.00012, 0.00013, 0.00013, 0.00013, 0.00012, 0.00013, 0.00016, 0.00017, 0.00016, 0.00012, 0.00016, 0.00012, 0.00012, 0.00013, 0.00013, 0.00016, 0.00013, 0.00013, 0.00014, 0.00013, 0.00012, 0.00012, 0.00012, 0.00014, 0.00013, 0.00012, 0.00013, 0.00012, 0.00012, 0.00013, 0.00017, 0.00014, 0.00017, 0.00013, 0.00013, 0.00013, 0.00019, 0.00014, 0.00014, 0.00013, 0.00018, 0.00013, 0.00014, 0.00013, 0.00016, 0.00015, 0.00012, 0.00013, 0.00013, 0.00013, 0.00013, 0.00012, 0.00014, 0.00015, 0.00013, 0.00014, 0.00012, 0.00013, 0.00013, 0.00012, 0.00016, 0.00017, 0.00013, 0.00014, 0.00013, 0.00015, 0.00013, 0.00013, 0.00015, 0.00016, 0.00013, 0.00013, 0.00013, 0.00015, 0.00013, 0.00016, 0.00015, 0.00013, 0.00013, 0.00013, 0.00013, 0.00017, 0.00015, 0.00017, 0.00014, 0.00013, 0.00014, 0.00013, 0.00014, 0.00013, 0.00013, 0.00015, 0.00014, 0.00013, 0.00015, 0.00014, 0.00012, 0.00014, 0.00013, 0.00016, 0.00015, 0.00015, 0.00013, 0.00013, 0.00013, 0.00013, 0.00016, 0.00012, 0.00013, 0.00015, 0.00013, 0.00015, 0.00014, 0.00016, 0.00013, 0.00013, 0.00015, 0.00016, 0.00012, 0.00016, 0.00013, 0.00013, 0.00013, 0.00013, 0.00013, 0.00012, 0.00013, 0.00019, 0.00013, 0.00013, 0.00014, 0.00015, 0.00014, 0.00013, 0.00014, 0.00016, 0.00014, 0.00013, 0.00013, 0.00013, 0.00013, 0.00015, 0.00013, 0.00013, 0.00014, 0.00014, 0.00013, 0.00016, 0.00013, 0.00018, 0.00012, 0.00014, 0.00013, 0.00013, 0.00012, 0.00013, 0.00013, 0.00014, 0.00016, 0.00012, 0.00012, 0.00013, 0.00014, 0.00012, 0.00018, 0.00013, 0.00013, 0.00013, 0.00014, 0.00015, 0.00014, 0.00014, 0.00012, 0.00013, 0.00013, 0.00014, 0.00014, 0.00015, 0.00015, 0.00013, 0.00013, 0.00014, 0.00013, 0.00012, 0.00013, 0.00012, 0.00015, 0.00013, 0.00013, 0.00014, 0.00015, 0.00012, 0.00013, 0.00013, 0.00014, 0.00013, 0.00012, 0.00017, 0.00013, 0.00015, 0.00017, 0.00013, 0.00014, 0.00016, 0.00012, 0.00014, 0.00013, 0.00014, 0.00013, 0.00015, 0.00015, 0.00016, 0.00017, 0.00013, 0.00018, 0.00014, 0.00013, 0.00013, 0.00013, 0.00012, 0.00013, 0.00013, 0.00014, 0.00016, 0.00014, 0.00013, 0.00014, 0.00012, 0.00013, 0.00012, 0.00016, 0.00012, 0.00015, 0.00013, 0.00013, 0.00013, 0.00012, 0.00016, 0.00017, 0.00013, 0.00013, 0.00013, 0.00014, 0.00014, 0.00013, 0.00013, 0.00013, 0.00013, 0.00012, 0.00013, 0.00014, 0.00012, 0.00013, 0.00013, 0.00012, 0.00013, 0.00014, 0.00015, 0.00013, 0.00013, 0.00013, 0.00017, 0.00014, 0.00014, 0.00016, 0.00013, 0.00015, 0.00014, 0.00017, 0.00016, 0.00014, 0.00014, 0.00013, 0.00015, 0.00012, 0.00013, 0.00012, 0.00013, 0.00014, 0.00013, 0.00014, 0.00013, 0.00013, 0.00013, 0.00013, 0.00013, 0.00013, 0.00013, 0.00014, 0.00012, 0.00013, 0.00015, 0.00014, 0.00012, 
0.00012, 0.00013, 0.00013, 0.00013, 0.00014, 0.00012, 0.00012, 0.00012, 0.00013, 0.00013, 0.00012, 0.00014, 0.00013, 0.00013, 0.00013, 0.00012, 0.00015, 0.00016, 0.00013, 0.00013, 0.00014, 0.00014, 0.00017, 0.00012, 0.00015, 0.00016, 0.00016, 0.00013, 0.00015, 0.00014, 0.00013, 0.00013, 0.00012, 0.00012, 0.00017, 0.00013, 0.00013, 0.00012, 0.00012]}, "forward-send-backward-recv-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [7.29163, 0.07663, 0.08035, 0.06332, 0.06621, 0.06965, 0.06672, 0.06872, 0.07455, 0.0683, 0.06975, 0.07264, 0.07308, 0.06869, 0.0749, 0.06785, 0.06696, 0.07011, 0.07008, 0.06771, 0.06763, 0.06853, 0.06929, 0.06793, 0.0646, 0.06794, 0.06582, 0.06618, 0.07898, 0.06585, 0.0677, 0.06681, 0.07017, 0.06602, 0.06883, 0.06722, 0.06997, 0.06853, 0.07057, 0.06872, 0.06884, 0.06699, 0.06869, 0.07012, 0.06782, 0.06999, 0.06845, 0.06563, 0.07187, 0.06575, 0.06637, 0.06468, 0.06438, 0.06646, 0.06395, 0.06524, 0.08025, 0.06764, 0.06976, 0.06968, 0.06431, 0.06784, 0.06839, 0.06965, 0.06878, 0.06848, 0.06691, 0.06998, 0.07092, 0.06857, 0.0693, 0.06815, 0.07095, 0.07046, 0.07279, 0.07009, 0.07045, 0.07242, 0.06971, 0.06878, 0.0711, 0.06854, 0.0703, 0.07136, 0.07206, 0.19699, 0.06856, 0.07017, 0.0772, 0.07413, 0.06965, 0.06662, 0.06863, 0.07002, 0.06852, 0.06895, 0.06723, 0.06766, 0.06739, 0.07615, 0.06865, 0.0659, 0.07051, 0.0678, 0.06754, 0.06717, 0.07145, 0.07015, 0.06808, 0.06744, 0.06521, 0.06518, 0.06265, 0.06299, 0.06279, 0.06454, 0.07004, 0.06844, 0.06842, 0.06744, 0.06305, 0.06615, 0.07084, 0.06889, 0.06934, 0.0652, 0.07021, 0.0665, 0.06497, 0.06458, 0.06483, 0.0654, 0.0651, 0.06488, 0.06369, 0.06434, 0.06672, 0.06482, 0.06827, 0.06829, 0.0643, 0.06825, 0.06762, 0.06752, 0.06536, 0.06267, 0.06412, 0.06238, 0.0644, 0.06315, 0.06427, 0.06278, 0.06772, 0.06453, 0.06547, 0.06433, 0.06477, 0.06262, 0.06246, 0.0656, 0.06412, 0.06447, 0.06356, 0.06614, 0.0655, 0.06558, 0.06542, 0.06499, 0.06312, 0.06403, 0.06715, 0.06427, 0.06479, 0.06361, 0.06722, 0.06583, 0.06476, 0.06651, 0.06877, 0.06755, 0.06567, 0.06624, 0.06526, 0.06717, 0.06755, 0.06946, 0.06655, 0.06526, 0.06418, 0.06359, 0.06533, 0.06548, 0.06698, 0.06537, 0.06464, 0.07565, 0.06673, 0.06462, 0.06523, 0.06525, 0.05829, 0.06037, 0.06399, 0.06429, 0.06234, 0.06138, 0.06591, 0.06529, 0.06565, 0.06508, 0.0686, 0.06838, 0.12228, 0.06666, 0.06636, 0.0641, 0.06601, 0.06468, 0.06395, 0.06568, 0.06779, 0.06425, 0.06928, 0.06612, 0.06928, 0.0652, 0.06359, 0.06153, 0.06449, 0.06439, 0.06432, 0.06445, 0.06351, 0.06481, 0.06503, 0.06334, 0.0646, 0.06418, 0.06493, 0.06414, 0.06257, 0.06426, 0.06752, 0.06251, 0.06434, 0.06117, 0.06509, 0.06177, 0.06484, 0.06385, 0.06538, 0.06711, 0.0659, 0.06606, 0.06549, 0.06518, 0.06537, 0.06313, 0.0654, 0.0676, 0.06603, 0.06663, 0.06705, 0.06676, 0.0651, 0.0677, 0.06421, 0.06506, 0.06513, 0.06577, 0.06915, 0.06804, 0.06617, 0.06569, 0.06722, 0.06636, 0.06674, 0.06574, 0.06698, 0.06664, 0.06663, 0.06459, 0.06384, 0.06515, 0.06699, 0.06757, 0.06645, 0.06668, 0.0657, 0.06812, 0.06673, 0.06651, 0.06468, 0.06953, 0.06688, 0.06585, 0.06531, 0.06508, 0.06559, 0.06487, 0.0647, 0.06539, 0.06861, 0.06738, 0.06026, 0.06597, 0.06493, 0.06467, 0.06738, 0.06641, 0.06506, 0.0673, 0.06795, 0.06714, 0.06848, 0.06828, 0.07103, 0.0742, 0.06691, 0.06638, 0.06521, 0.06791, 0.06493, 0.06647, 0.06851, 0.06674, 0.06949, 0.18067, 0.06896, 0.0653, 0.06795, 0.06966, 0.06981, 0.0677, 0.06607, 0.06924, 0.06499, 0.06831, 0.06832, 0.06949, 0.07135, 0.06537, 0.07037, 0.06461, 0.06603, 0.06572, 0.06904, 0.06866, 
0.06911, 0.06296, 0.0684, 0.06727, 0.06737, 0.069, 0.06738, 0.07025, 0.06407, 0.06509, 0.06963, 0.06441, 0.07069, 0.07222, 0.07463, 0.07367, 0.07032, 0.07129, 0.07156, 0.07253, 0.06858, 0.06926, 0.06916, 0.06788, 0.06771, 0.06859, 0.06745, 0.07278, 0.06943, 0.06671, 0.0691, 0.06585, 0.06975, 0.07019, 0.07413, 0.0711, 0.07228, 0.07684, 0.07091, 0.0736, 0.07134, 0.07497, 0.07213, 0.06976, 0.07166, 0.0746, 0.0763, 0.06965, 0.07059, 0.07384, 0.07021, 0.07072]}, "backward-send-forward-recv-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [2.84189, 0.0034, 0.00335, 0.0028, 0.00275, 0.0029, 0.00298, 0.00297, 0.00304, 0.00306, 0.00309, 0.00308, 0.00301, 0.00302, 0.00299, 0.00294, 0.003, 0.00307, 0.0031, 0.00304, 0.00303, 0.00294, 0.00305, 0.00298, 0.00301, 0.00306, 0.0029, 0.00302, 0.00303, 0.0031, 0.00306, 0.00304, 0.00303, 0.00301, 0.00294, 0.00305, 0.00312, 0.00303, 0.00301, 0.00328, 0.00302, 0.00288, 0.00306, 0.00304, 0.00304, 0.00303, 0.00299, 0.00297, 0.003, 0.00305, 0.00302, 0.00306, 0.00303, 0.00307, 0.00305, 0.00294, 0.00385, 0.00305, 0.00293, 0.00307, 0.00295, 0.003, 0.00297, 0.00308, 0.00305, 0.00303, 0.00302, 0.00254, 0.00275, 0.00284, 0.00252, 0.00253, 0.00257, 0.00262, 0.00255, 0.00266, 0.00264, 0.0026, 0.00255, 0.00265, 0.00267, 0.00266, 0.00269, 0.0026, 0.00263, 0.00301, 0.00264, 0.00265, 0.00269, 0.00261, 0.00267, 0.00257, 0.00268, 0.0027, 0.00261, 0.00268, 0.00261, 0.00264, 0.00255, 0.00261, 0.00281, 0.00269, 0.00271, 0.00271, 0.00264, 0.00265, 0.00268, 0.0026, 0.00262, 0.00283, 0.00271, 0.00272, 0.00266, 0.00257, 0.00253, 0.00256, 0.00276, 0.00272, 0.00264, 0.00283, 0.00271, 0.00262, 0.00269, 0.00277, 0.00266, 0.0026, 0.00277, 0.00282, 0.00271, 0.00264, 0.00273, 0.00268, 0.00264, 0.00266, 0.0027, 0.00274, 0.00274, 0.0027, 0.00271, 0.00273, 0.00279, 0.0027, 0.00276, 0.00265, 0.0028, 0.00278, 0.00273, 0.00287, 0.00273, 0.00277, 0.00273, 0.00265, 0.00272, 0.00267, 0.00277, 0.00265, 0.00267, 0.0027, 0.00268, 0.00269, 0.00264, 0.00278, 0.00271, 0.00267, 0.00258, 0.00265, 0.00262, 0.00273, 0.00273, 0.00285, 0.00277, 0.00264, 0.00285, 0.00276, 0.00269, 0.00275, 0.00339, 0.00271, 0.00288, 0.00276, 0.00282, 0.00266, 0.00281, 0.00268, 0.00277, 0.00269, 0.00271, 0.0028, 0.00273, 0.00293, 0.00264, 0.00265, 0.00285, 0.0026, 0.00269, 0.00287, 0.00272, 0.00278, 0.0028, 0.00271, 0.00259, 0.00259, 0.00273, 0.00266, 0.0027, 0.00278, 0.00275, 0.0029, 0.00268, 0.00277, 0.0027, 0.00273, 0.00744, 0.00272, 0.00261, 0.00274, 0.00281, 0.00282, 0.00277, 0.00264, 0.00277, 0.00268, 0.00266, 0.00256, 0.00267, 0.00276, 0.00287, 0.00271, 0.00271, 0.00265, 0.00268, 0.00304, 0.00294, 0.00305, 0.0029, 0.00293, 0.00278, 0.00294, 0.00291, 0.00285, 0.00291, 0.00286, 0.00284, 0.00295, 0.0029, 0.0029, 0.00287, 0.00287, 0.0029, 0.00282, 0.00289, 0.0028, 0.0029, 0.00288, 0.0028, 0.00266, 0.0026, 0.00273, 0.00266, 0.00275, 0.00276, 0.00275, 0.00283, 0.0027, 0.00268, 0.00279, 0.00265, 0.00277, 0.00279, 0.00278, 0.00276, 0.00273, 0.00266, 0.00264, 0.00265, 0.00264, 0.00268, 0.00279, 0.00284, 0.00276, 0.00269, 0.00277, 0.00277, 0.00268, 0.00268, 0.00266, 0.00263, 0.00274, 0.0026, 0.00268, 0.00269, 0.00259, 0.00258, 0.00283, 0.00267, 0.00256, 0.00279, 0.0026, 0.00276, 0.00258, 0.00269, 0.00264, 0.00266, 0.00272, 0.10829, 0.00271, 0.00273, 0.00261, 0.00278, 0.00265, 0.00268, 0.00259, 0.00272, 0.00286, 0.00273, 0.00271, 0.00286, 0.00269, 0.00267, 0.0027, 0.00281, 0.0027, 0.00267, 0.00273, 0.0027, 0.00257, 0.0026, 0.00298, 0.0026, 0.00269, 0.00264, 0.00279, 0.00281, 0.00269, 0.0031, 0.0027, 0.0027, 
0.00273, 0.0028, 0.00277, 0.00279, 0.00274, 0.00279, 0.00256, 0.00277, 0.00273, 0.00275, 0.00268, 0.00277, 0.00282, 0.0028, 0.00268, 0.00285, 0.00263, 0.00275, 0.00272, 0.0027, 0.00272, 0.00269, 0.00263, 0.00272, 0.00262, 0.00268, 0.0027, 0.00275, 0.0027, 0.00256, 0.00261, 0.00265, 0.00271, 0.00266, 0.00266, 0.00275, 0.00281, 0.00274, 0.00263, 0.00267, 0.00277, 0.00271, 0.00263, 0.00267, 0.00269, 0.00285, 0.00267, 0.00275, 0.00276, 0.00277, 0.0026, 0.00277, 0.0027, 0.00279, 0.00284, 0.00284, 0.0028, 0.00331, 0.00286, 0.0027, 0.00271, 0.00257, 0.00255]}, "layernorm-grads-all-reduce-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [4e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 2e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 3e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 2e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05]}, "embedding-grads-all-reduce-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, 
"values": [0.00071, 0.00045, 0.00045, 0.00046, 0.00046, 0.00046, 0.00046, 0.00045, 0.00045, 0.00045, 0.00046, 0.00045, 0.00047, 0.00045, 0.00045, 0.00046, 0.00045, 0.00046, 0.00049, 0.00046, 0.00046, 0.00046, 0.00045, 0.00045, 0.00045, 0.00046, 0.00045, 0.00045, 0.00046, 0.00045, 0.00045, 0.00046, 0.00046, 0.00045, 0.00046, 0.00046, 0.00047, 0.00045, 0.00046, 0.00046, 0.00045, 0.00047, 0.00045, 0.00046, 0.00046, 0.00046, 0.00048, 0.00046, 0.00048, 0.00045, 0.00046, 0.00048, 0.00045, 0.00045, 0.00046, 0.00046, 0.00046, 0.00047, 0.00048, 0.00046, 0.00045, 0.00045, 0.00046, 0.00046, 0.00045, 0.00046, 0.00046, 0.00047, 0.00044, 0.00044, 0.00045, 0.00045, 0.00047, 0.00045, 0.00044, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00047, 0.00045, 0.00045, 0.00045, 0.00046, 0.00045, 0.00045, 0.00047, 0.00045, 0.00045, 0.00049, 0.00045, 0.00046, 0.00044, 0.00046, 0.00047, 0.00045, 0.00045, 0.00045, 0.00046, 0.00047, 0.00045, 0.00044, 0.00045, 0.00044, 0.00045, 0.00045, 0.00045, 0.00046, 0.00045, 0.00045, 0.00045, 0.00045, 0.00044, 0.00081, 0.00046, 0.00046, 0.00047, 0.00045, 0.00045, 0.00045, 0.00045, 0.00046, 0.00048, 0.00047, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00044, 0.00046, 0.00045, 0.00045, 0.00047, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00044, 0.00045, 0.00044, 0.00045, 0.00045, 0.00047, 0.00045, 0.00045, 0.00044, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00044, 0.00046, 0.00047, 0.00046, 0.00047, 0.00045, 0.00046, 0.00044, 0.00045, 0.00045, 0.00045, 0.00044, 0.00046, 0.00045, 0.00046, 0.00045, 0.00046, 0.00045, 0.00045, 0.00045, 0.00045, 0.00046, 0.00045, 0.00045, 0.00045, 0.00046, 0.00046, 0.00045, 0.00045, 0.00047, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00047, 0.00049, 0.00047, 0.00045, 0.00045, 0.00049, 0.00045, 0.00049, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00046, 0.00045, 0.00045, 0.00049, 0.00046, 0.00046, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00048, 0.00045, 0.00046, 0.00046, 0.00044, 0.00045, 0.00045, 0.00044, 0.00046, 0.00045, 0.00044, 0.00048, 0.00045, 0.00044, 0.00045, 0.00045, 0.00045, 0.00045, 0.00047, 0.00046, 0.00048, 0.00045, 0.00045, 0.00045, 0.00047, 0.00045, 0.00046, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00051, 0.00049, 0.00045, 0.00046, 0.00047, 0.00046, 0.00045, 0.00045, 0.00045, 0.00046, 0.00047, 0.00049, 0.0005, 0.00046, 0.00045, 0.00047, 0.00046, 0.00045, 0.00045, 0.00049, 0.00045, 0.00049, 0.00045, 0.00045, 0.00046, 0.00045, 0.0005, 0.00045, 0.00046, 0.00044, 0.00045, 0.00049, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00046, 0.00046, 0.00045, 0.00045, 0.00045, 0.00045, 0.00049, 0.00046, 0.00048, 0.00047, 0.00045, 0.00045, 0.00046, 0.00048, 0.00045, 0.00046, 0.00045, 0.00045, 0.00047, 0.00046, 0.00048, 0.00048, 0.00048, 0.00048, 0.00045, 0.00045, 0.00048, 0.00047, 0.00045, 0.00048, 0.00045, 0.00046, 0.00045, 0.00046, 0.00045, 0.00047, 0.00047, 0.00046, 0.00046, 0.00047, 0.00047, 0.00044, 0.00045, 0.00045, 0.00048, 0.00048, 0.00048, 0.00045, 0.00045, 0.00046, 0.00045, 0.00048, 0.00048, 0.00045, 0.00045, 0.00045, 0.00046, 0.00044, 0.00045, 0.00045, 0.00049, 0.00045, 0.00045, 0.00045, 0.00045, 0.00048, 0.00045, 0.00046, 0.00049, 0.00046, 0.00046, 0.00044, 0.00048, 0.00045, 0.00045, 0.00045, 0.00046, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00048, 0.00047, 0.00049, 0.00045, 0.00045, 0.00053, 0.00045, 0.00045, 0.00045, 0.00045, 0.00046, 0.00044, 0.00049, 0.00045, 
0.00044, 0.00048, 0.00045, 0.00045, 0.00045, 0.00045]}, "all-grads-sync-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [6.13385, 0.00147, 0.00148, 0.00147, 0.00149, 0.00151, 0.00148, 0.00148, 0.00147, 0.00149, 0.00149, 0.00147, 0.00149, 0.00149, 0.00147, 0.00147, 0.00147, 0.00149, 0.00149, 0.00148, 0.00148, 0.00148, 0.0015, 0.0015, 0.00147, 0.00148, 0.00149, 0.00148, 0.00148, 0.00148, 0.00147, 0.00148, 0.00149, 0.00149, 0.00148, 0.00148, 0.00149, 0.00147, 0.00148, 0.00148, 0.00147, 0.00147, 0.00148, 0.00149, 0.00148, 0.00148, 0.00149, 0.00148, 0.00149, 0.00148, 0.00147, 0.00147, 0.00147, 0.00148, 0.00148, 0.00149, 0.00148, 0.00149, 0.00147, 0.00147, 0.00149, 0.00148, 0.00148, 0.00149, 0.0015, 0.00148, 0.00148, 0.00149, 0.00148, 0.00149, 0.00148, 0.00147, 0.00149, 0.00149, 0.00148, 0.00146, 0.00147, 0.00148, 0.00147, 0.00148, 0.00149, 0.00147, 0.00146, 0.00148, 0.00148, 0.00147, 0.00149, 0.00148, 0.00149, 0.0015, 0.00148, 0.00147, 0.00147, 0.00147, 0.00149, 0.00149, 0.00148, 0.00148, 0.00148, 0.00148, 0.00147, 0.00149, 0.00148, 0.00148, 0.00149, 0.00148, 0.00147, 0.00147, 0.00147, 0.00148, 0.00149, 0.00147, 0.00148, 0.00148, 0.00147, 0.00149, 0.00147, 0.00147, 0.00149, 0.00149, 0.00146, 0.00149, 0.00147, 0.00149, 0.00149, 0.00148, 0.00147, 0.00148, 0.00148, 0.00148, 0.00149, 0.00148, 0.00147, 0.00149, 0.00151, 0.00147, 0.00148, 0.00147, 0.00148, 0.00148, 0.00147, 0.00147, 0.0015, 0.00149, 0.00148, 0.00147, 0.00148, 0.00147, 0.00148, 0.00148, 0.00147, 0.0015, 0.00147, 0.00147, 0.00147, 0.00148, 0.0015, 0.00148, 0.00148, 0.00147, 0.00148, 0.00149, 0.00149, 0.00148, 0.00148, 0.00148, 0.00148, 0.00147, 0.00149, 0.00149, 0.00149, 0.00147, 0.00147, 0.00148, 0.00147, 0.00147, 0.00147, 0.00148, 0.00146, 0.00148, 0.00147, 0.00149, 0.00147, 0.00149, 0.00149, 0.00147, 0.00147, 0.00148, 0.00147, 0.00148, 0.00148, 0.00148, 0.00148, 0.00149, 0.00147, 0.00149, 0.00148, 0.00148, 0.00148, 0.00149, 0.0015, 0.00148, 0.00149, 0.00148, 0.00149, 0.00149, 0.00149, 0.00148, 0.00148, 0.00149, 0.00149, 0.0015, 0.00149, 0.00148, 0.00149, 0.00149, 0.00149, 0.00151, 0.00148, 0.0015, 0.00148, 0.00149, 0.00149, 0.00149, 0.00149, 0.00149, 0.00149, 0.0015, 0.0015, 0.0015, 0.00149, 0.0015, 0.00149, 0.00149, 0.00147, 0.00148, 0.00149, 0.0015, 0.0015, 0.00149, 0.00147, 0.00149, 0.0015, 0.00149, 0.00148, 0.00148, 0.00148, 0.00148, 0.00148, 0.0015, 0.0015, 0.0015, 0.00148, 0.00149, 0.00149, 0.00149, 0.00149, 0.00148, 0.0015, 0.00149, 0.00148, 0.00151, 0.00149, 0.00148, 0.00149, 0.00147, 0.00147, 0.00154, 0.00149, 0.00147, 0.00148, 0.0015, 0.00149, 0.00152, 0.00148, 0.00148, 0.00148, 0.00148, 0.00149, 0.00148, 0.00151, 0.00147, 0.00148, 0.00151, 0.0015, 0.00149, 0.00147, 0.00148, 0.00149, 0.00149, 0.00151, 0.00148, 0.00149, 0.00149, 0.00149, 0.00147, 0.00148, 0.00148, 0.00147, 0.00148, 0.00148, 0.00149, 0.00148, 0.00148, 0.00149, 0.00148, 0.00148, 0.00152, 0.00149, 0.0015, 0.00148, 0.00148, 0.00147, 0.00148, 0.00149, 0.00149, 0.00147, 0.00149, 0.00151, 0.00147, 0.00148, 0.00148, 0.00149, 0.00147, 0.0015, 0.00149, 0.00149, 0.00148, 0.00149, 0.00149, 0.00149, 0.00149, 0.00148, 0.00148, 0.00148, 0.00148, 0.00149, 0.00149, 0.00149, 0.00149, 0.00148, 0.00149, 0.00149, 0.00149, 0.00148, 0.0015, 0.00148, 0.00151, 0.00148, 0.00151, 0.00147, 0.00147, 0.00149, 0.00148, 0.00148, 0.00148, 0.00148, 0.00147, 0.00149, 0.00149, 0.00149, 0.00148, 0.00149, 0.0015, 0.00148, 0.00148, 0.00149, 0.00148, 0.00148, 0.00149, 0.00148, 0.00149, 0.0015, 0.00147, 0.00149, 0.00148, 0.00149, 0.00149, 0.00148, 0.00147, 0.00149, 0.0015, 
0.0015, 0.00149, 0.00148, 0.00147, 0.00149, 0.00147, 0.0015, 0.00149, 0.00149, 0.00149, 0.0015, 0.00148, 0.00149, 0.00149, 0.0015, 0.00148, 0.00148, 0.00148]}, "optimizer-copy-to-main-grad-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.00022, 0.00015, 0.00014, 0.00013, 0.00013, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00013, 0.00012, 0.00013, 0.00013, 0.00012, 0.00014, 0.00013, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00014, 0.00014, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00014, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00013, 0.00013, 0.00013, 0.00013, 0.00012, 0.00012, 0.00012, 0.00013, 0.00013, 0.00014, 0.00013, 0.00012, 0.00014, 0.00013, 0.00012, 0.00013, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00014, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00014, 0.00012, 0.00013, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00013, 0.00012, 0.00013, 0.00013, 0.00013, 0.00012, 0.00012, 0.00012, 0.00014, 0.00012, 0.00013, 0.00012, 0.00012, 0.00013, 0.00014, 0.00012, 0.00012, 0.00013, 0.00012, 0.00014, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00015, 0.00014, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00013, 0.00013, 0.00012, 0.00012, 0.00012, 0.00015, 0.00013, 0.00014, 0.00014, 0.00012, 0.00014, 0.00013, 0.00012, 0.00012, 0.00014, 0.00013, 0.00012, 0.00013, 0.00012, 0.00014, 0.00012, 0.00014, 0.00014, 0.00012, 0.00012, 0.00014, 0.00013, 0.00014, 0.00012, 0.00014, 0.00014, 0.00014, 0.00013, 0.00014, 0.00012, 0.00013, 0.00014, 0.00012, 0.00014, 0.00013, 0.00014, 0.00012, 0.00014, 0.00012, 0.00014, 0.00012, 0.00012, 0.00012, 0.00012, 0.00014, 0.00012, 0.00013, 0.00014, 0.00012, 0.00014, 0.00012, 0.00013, 0.00013, 0.00014, 0.00012, 0.00015, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00012, 0.00012, 0.00013, 0.00013, 0.00013, 0.00013, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00013, 0.00013, 0.00013, 0.00014, 0.00013, 0.00013, 0.00013, 0.00013, 0.00015, 0.00013, 0.00014, 0.00013, 0.00013, 0.00013, 0.00013, 0.00013, 0.00013, 0.00014, 0.00012, 0.00013, 0.00013, 0.00015, 0.00013, 0.00013, 0.00013, 0.00013, 0.00014, 0.00013, 0.00013, 0.00014, 0.00014, 0.00013, 0.00014, 0.00012, 0.00014, 0.00013, 0.00013, 0.00013, 0.00013, 0.00013, 0.00013, 0.00013, 0.00015, 0.00014, 0.00014, 0.00013, 0.00012, 0.00014, 0.00013, 0.00013, 0.00013, 0.00014, 0.00015, 0.00015, 0.00014, 0.00013, 0.00013, 0.00013, 0.00013, 0.00013, 0.00015, 0.00014, 0.00015, 0.00013, 0.00013, 0.00014, 0.00013, 0.00014, 0.00014, 0.00013, 0.00013, 0.00013, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00013, 0.00013, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00013, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00017, 0.00014, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00013, 0.00013, 0.00013, 
0.00012, 0.00013, 0.00012, 0.00012, 0.00013, 0.00013, 0.00013, 0.00013, 0.00013, 0.00013, 0.00012, 0.00012, 0.00012, 0.00013, 0.00013, 0.00013, 0.00013, 0.00013, 0.00013, 0.00012, 0.00012, 0.00013, 0.00013, 0.00012, 0.00012, 0.00014, 0.00013, 0.00012, 0.00013, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012]}, "optimizer-clip-main-grad-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [2.11156, 0.00067, 0.00064, 0.00065, 0.00062, 0.00063, 0.00062, 0.00063, 0.00062, 0.00062, 0.00062, 0.00062, 0.00062, 0.00062, 0.00067, 0.00062, 0.00063, 0.00063, 0.00063, 0.00063, 0.00062, 0.00062, 0.00061, 0.00062, 0.00062, 0.00062, 0.00064, 0.00064, 0.00064, 0.00063, 0.00062, 0.00062, 0.00063, 0.00062, 0.00062, 0.00066, 0.00062, 0.00062, 0.00063, 0.00063, 0.00063, 0.00062, 0.00062, 0.00062, 0.00062, 0.00065, 0.00062, 0.00064, 0.00066, 0.00062, 0.00062, 0.00063, 0.00062, 0.00062, 0.00065, 0.00065, 0.00064, 0.00063, 0.00062, 0.00064, 0.00063, 0.00062, 0.00067, 0.00062, 0.00062, 0.00063, 0.00062, 0.00062, 0.00063, 0.00064, 0.00062, 0.00062, 0.00062, 0.00064, 0.00062, 0.00062, 0.00063, 0.00062, 0.00062, 0.00062, 0.00064, 0.00063, 0.00064, 0.00063, 0.00066, 0.00062, 0.00062, 0.00062, 0.00061, 0.00062, 0.00062, 0.00062, 0.00062, 0.00062, 0.00062, 0.00061, 0.00062, 0.00071, 0.00046, 0.00069, 0.00062, 0.00068, 0.00062, 0.00062, 0.00045, 0.00062, 0.00062, 0.00063, 0.00062, 0.00062, 0.0005, 0.00048, 0.00062, 0.00062, 0.00062, 0.00062, 0.00048, 0.00062, 0.00062, 0.00064, 0.00047, 0.00062, 0.00066, 0.00062, 0.00062, 0.00062, 0.00062, 0.00064, 0.00064, 0.00062, 0.00046, 0.00062, 0.00062, 0.00062, 0.00065, 0.00062, 0.00062, 0.00062, 0.00062, 0.00062, 0.00062, 0.00062, 0.00062, 0.00062, 0.00067, 0.00064, 0.00061, 0.00063, 0.00064, 0.00061, 0.00064, 0.00062, 0.00062, 0.00062, 0.00047, 0.00062, 0.00062, 0.00062, 0.00062, 0.00064, 0.00061, 0.00064, 0.00064, 0.00062, 0.00063, 0.00064, 0.00067, 0.00064, 0.00062, 0.00064, 0.00063, 0.00062, 0.00064, 0.00063, 0.00062, 0.00065, 0.00064, 0.00064, 0.00064, 0.00063, 0.00064, 0.00063, 0.00065, 0.00062, 0.00063, 0.00062, 0.00065, 0.00062, 0.00061, 0.00063, 0.00061, 0.00062, 0.00066, 0.00062, 0.00065, 0.00062, 0.00061, 0.00063, 0.00063, 0.00062, 0.00069, 0.00066, 0.00066, 0.00067, 0.00067, 0.00071, 0.00067, 0.00067, 0.00065, 0.00065, 0.00067, 0.00067, 0.00066, 0.00066, 0.00067, 0.00067, 0.00071, 0.00066, 0.00066, 0.00066, 0.00066, 0.00067, 0.00067, 0.00067, 0.00066, 0.00068, 0.00066, 0.00067, 0.00065, 0.00066, 0.00066, 0.00065, 0.00069, 0.00067, 0.00066, 0.00066, 0.00068, 0.00065, 0.00064, 0.00065, 0.00067, 0.00065, 0.00066, 0.00066, 0.00067, 0.00066, 0.00066, 0.00067, 0.00066, 0.00067, 0.00067, 0.00067, 0.00066, 0.00067, 0.00067, 0.00067, 0.00066, 0.00066, 0.00066, 0.00066, 0.00066, 0.00073, 0.00069, 0.00066, 0.00065, 0.00064, 0.00067, 0.00066, 0.00067, 0.00066, 0.00073, 0.00067, 0.00067, 0.00067, 0.00067, 0.00066, 0.00068, 0.00065, 0.00065, 0.00067, 0.00067, 0.00066, 0.00066, 0.00067, 0.00067, 0.00067, 0.00067, 0.00066, 0.00064, 0.00066, 0.00067, 0.00064, 0.00064, 0.00064, 0.00064, 0.00064, 0.00064, 0.00064, 0.00063, 0.00064, 0.00064, 0.00064, 0.00064, 0.00066, 0.00065, 0.00064, 0.00064, 0.00064, 0.00064, 0.00063, 0.00064, 0.00064, 0.00065, 0.00065, 0.00064, 0.00073, 0.00064, 0.00063, 0.00064, 0.00063, 0.00064, 0.00064, 0.00064, 0.00064, 0.00064, 0.00064, 0.00066, 0.00065, 0.00064, 0.00063, 0.00063, 0.00064, 0.00065, 0.00065, 0.00065, 0.00065, 0.00063, 0.00064, 0.00063, 0.00063, 0.00064, 0.00064, 0.00065, 0.00064, 0.00063, 0.00063, 0.00065, 0.00063, 
0.00064, 0.00063, 0.00064, 0.00063, 0.00066, 0.00063, 0.00065, 0.00064, 0.00063, 0.00064, 0.00063, 0.00064, 0.00064, 0.00064, 0.00066, 0.00066, 0.00065, 0.00064, 0.00066, 0.00065, 0.00065, 0.00065, 0.00066, 0.00065, 0.00065, 0.00065, 0.00066, 0.00066, 0.00064, 0.00063, 0.00065, 0.00065, 0.00066, 0.00064, 0.00066, 0.00065, 0.00066, 0.00067, 0.00066, 0.00066, 0.00065, 0.00066, 0.00065, 0.00068, 0.00066, 0.00066, 0.00065, 0.00063, 0.00064, 0.00063, 0.00063, 0.00064]}, "optimizer-count-zeros-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.00352, 0.00261, 0.00262, 0.00279, 0.00266, 0.00279, 0.00264, 0.00264, 0.00265, 0.00263, 0.00263, 0.00263, 0.00266, 0.00265, 0.00265, 0.00266, 0.00262, 0.00265, 0.00264, 0.00267, 0.00262, 0.00264, 0.00263, 0.00264, 0.00265, 0.00263, 0.00264, 0.00266, 0.00265, 0.00262, 0.00263, 0.00265, 0.00266, 0.00263, 0.00264, 0.00264, 0.00264, 0.00264, 0.00264, 0.00265, 0.00265, 0.00264, 0.00265, 0.00266, 0.00264, 0.00316, 0.00266, 0.00263, 0.00279, 0.0027, 0.00263, 0.00263, 0.00267, 0.00263, 0.00264, 0.00264, 0.00265, 0.00262, 0.00265, 0.00265, 0.00264, 0.00266, 0.00277, 0.00265, 0.00266, 0.00266, 0.00265, 0.00265, 0.00264, 0.00266, 0.00267, 0.00263, 0.00263, 0.00266, 0.00265, 0.00263, 0.00263, 0.00265, 0.00263, 0.00265, 0.00293, 0.00263, 0.00273, 0.00264, 0.00285, 0.00263, 0.00265, 0.00265, 0.00265, 0.00263, 0.00264, 0.00265, 0.00264, 0.00263, 0.00263, 0.00265, 0.00262, 0.00298, 0.00265, 0.0031, 0.00263, 0.00312, 0.00264, 0.00267, 0.00263, 0.00296, 0.00265, 0.00262, 0.00266, 0.00263, 0.00298, 0.00266, 0.00265, 0.00263, 0.00276, 0.00265, 0.00266, 0.00264, 0.00264, 0.00266, 0.00264, 0.00265, 0.00268, 0.00265, 0.00264, 0.00264, 0.00263, 0.00266, 0.00264, 0.00265, 0.00264, 0.00264, 0.00263, 0.00262, 0.00284, 0.00263, 0.00263, 0.00265, 0.00265, 0.00264, 0.00263, 0.00263, 0.00264, 0.00265, 0.00298, 0.00264, 0.00263, 0.00266, 0.00264, 0.00265, 0.00264, 0.00264, 0.00267, 0.00264, 0.00265, 0.00262, 0.00264, 0.00271, 0.00266, 0.00266, 0.00265, 0.00266, 0.00267, 0.00268, 0.00263, 0.00265, 0.00282, 0.00266, 0.0027, 0.00265, 0.00266, 0.00265, 0.00264, 0.00267, 0.00269, 0.00278, 0.00264, 0.00268, 0.00264, 0.00265, 0.00265, 0.00267, 0.00267, 0.00265, 0.00265, 0.00265, 0.00267, 0.00265, 0.00266, 0.00264, 0.00265, 0.00263, 0.00265, 0.00265, 0.00267, 0.00267, 0.00263, 0.00264, 0.00264, 0.00265, 0.00262, 0.00264, 0.00266, 0.00263, 0.00267, 0.00264, 0.00264, 0.00264, 0.00266, 0.00265, 0.00266, 0.00264, 0.00264, 0.00267, 0.00265, 0.00262, 0.00266, 0.00265, 0.00267, 0.00266, 0.00267, 0.00295, 0.00267, 0.00268, 0.00263, 0.00265, 0.00265, 0.00263, 0.00266, 0.00299, 0.00264, 0.00267, 0.00262, 0.00269, 0.00265, 0.00264, 0.00265, 0.00263, 0.00265, 0.00265, 0.00286, 0.00266, 0.00266, 0.00264, 0.00264, 0.00265, 0.00264, 0.00266, 0.00266, 0.00267, 0.00264, 0.00265, 0.00265, 0.00265, 0.00266, 0.00264, 0.00268, 0.00264, 0.00262, 0.00267, 0.00263, 0.00312, 0.00265, 0.00265, 0.00264, 0.00263, 0.00265, 0.00265, 0.00264, 0.00266, 0.00268, 0.00264, 0.00266, 0.00263, 0.00267, 0.00265, 0.00263, 0.00266, 0.0027, 0.00266, 0.00263, 0.00264, 0.00276, 0.00265, 0.00266, 0.00264, 0.00264, 0.00264, 0.00302, 0.00265, 0.00265, 0.00269, 0.00264, 0.00263, 0.00266, 0.00264, 0.00267, 0.00263, 0.00264, 0.00265, 0.00266, 0.00264, 0.00265, 0.00265, 0.00265, 0.00267, 0.00261, 0.00262, 0.00266, 0.00263, 0.00265, 0.00266, 0.00265, 0.00262, 0.00266, 0.00267, 0.00262, 0.00266, 0.00265, 0.00264, 0.00263, 0.00265, 0.00263, 0.00268, 0.00282, 0.00266, 0.00264, 0.00264, 0.00262, 0.00266, 0.00265, 0.00266, 
0.00264, 0.00276, 0.00264, 0.00264, 0.00265, 0.00263, 0.00265, 0.00265, 0.00266, 0.00265, 0.00265, 0.00264, 0.00262, 0.00264, 0.00264, 0.00265, 0.00265, 0.00266, 0.00267, 0.00266, 0.00268, 0.00265, 0.00275, 0.00263, 0.00275, 0.00263, 0.00265, 0.00264, 0.00265, 0.00264, 0.00265, 0.00264, 0.00266, 0.00269, 0.00266, 0.00264, 0.00263, 0.00266, 0.00267, 0.00266, 0.00266, 0.00268, 0.00267, 0.00265, 0.00265, 0.00266, 0.00265, 0.00265, 0.00263, 0.00266, 0.00264, 0.00268, 0.00266, 0.00263, 0.00268, 0.00265, 0.00265, 0.00278, 0.0027, 0.00264, 0.00264, 0.00263, 0.00265, 0.00266, 0.00265, 0.00269, 0.00264, 0.00265]}, "optimizer-inner-step-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.0024, 0.00067, 0.00066, 0.00066, 0.00066, 0.00067, 0.00067, 0.00067, 0.00066, 0.00066, 0.00066, 0.00067, 0.00066, 0.00067, 0.00066, 0.00066, 0.00066, 0.00066, 0.00066, 0.00066, 0.00066, 0.00066, 0.00066, 0.00066, 0.00066, 0.00066, 0.00067, 0.00066, 0.00066, 0.00066, 0.00065, 0.00066, 0.00066, 0.00066, 0.00066, 0.00066, 0.00066, 0.00066, 0.00065, 0.00066, 0.00066, 0.00065, 0.00066, 0.00067, 0.00066, 0.00067, 0.00065, 0.00065, 0.00066, 0.0007, 0.00065, 0.00066, 0.00065, 0.00066, 0.00066, 0.00067, 0.00066, 0.00067, 0.00067, 0.00066, 0.00065, 0.00066, 0.00066, 0.00066, 0.00066, 0.00066, 0.00065, 0.00065, 0.00066, 0.00066, 0.00065, 0.00065, 0.00065, 0.00066, 0.00065, 0.00065, 0.00065, 0.00065, 0.00066, 0.00065, 0.00067, 0.00065, 0.00066, 0.00065, 0.00066, 0.00066, 0.00065, 0.00066, 0.00066, 0.00065, 0.00065, 0.00065, 0.00065, 0.00065, 0.00065, 0.00065, 0.00065, 0.00066, 0.00066, 0.00066, 0.00066, 0.00066, 0.00065, 0.00065, 0.00065, 0.00067, 0.00065, 0.00065, 0.00065, 0.00065, 0.00066, 0.00065, 0.00066, 0.00065, 0.00066, 0.00065, 0.00066, 0.00066, 0.00065, 0.00065, 0.00065, 0.00065, 0.00065, 0.00065, 0.00065, 0.00065, 0.00065, 0.00069, 0.00066, 0.00065, 0.00065, 0.00065, 0.00065, 0.00066, 0.00066, 0.00065, 0.00066, 0.00065, 0.00065, 0.00066, 0.00065, 0.00065, 0.00065, 0.00066, 0.00067, 0.00066, 0.00065, 0.00066, 0.00066, 0.00065, 0.00066, 0.00065, 0.00066, 0.00065, 0.00065, 0.00065, 0.00065, 0.00068, 0.00065, 0.00066, 0.00065, 0.00066, 0.00066, 0.00066, 0.00065, 0.00065, 0.00066, 0.00066, 0.00066, 0.00066, 0.00066, 0.00065, 0.00065, 0.00066, 0.00065, 0.00066, 0.00065, 0.00065, 0.00066, 0.00065, 0.00066, 0.00066, 0.00065, 0.00065, 0.00066, 0.00065, 0.00065, 0.00066, 0.00065, 0.00065, 0.00066, 0.00066, 0.00066, 0.00065, 0.00066, 0.00066, 0.00065, 0.00065, 0.00067, 0.00066, 0.00069, 0.00068, 0.00069, 0.00069, 0.00068, 0.0007, 0.00069, 0.00069, 0.00067, 0.00067, 0.00068, 0.00068, 0.00068, 0.00068, 0.00069, 0.00068, 0.00069, 0.00068, 0.00068, 0.00069, 0.00091, 0.00068, 0.00068, 0.00069, 0.00068, 0.00068, 0.00068, 0.00068, 0.00068, 0.00069, 0.00068, 0.00068, 0.00068, 0.00067, 0.00068, 0.00067, 0.00068, 0.00067, 0.00067, 0.00068, 0.00071, 0.00068, 0.00068, 0.00068, 0.00068, 0.00069, 0.00068, 0.00067, 0.00068, 0.00067, 0.0007, 0.00069, 0.00067, 0.00069, 0.00068, 0.00068, 0.00068, 0.00068, 0.00068, 0.00068, 0.00067, 0.00069, 0.00068, 0.00068, 0.00068, 0.00067, 0.00068, 0.00068, 0.00067, 0.00068, 0.00068, 0.00068, 0.00068, 0.00068, 0.00067, 0.00068, 0.00067, 0.00067, 0.00068, 0.00067, 0.00068, 0.00068, 0.00069, 0.00068, 0.00069, 0.00068, 0.00068, 0.00068, 0.00067, 0.00068, 0.00068, 0.00067, 0.00067, 0.00066, 0.00067, 0.00067, 0.00067, 0.00067, 0.00067, 0.00066, 0.00066, 0.00068, 0.00067, 0.00067, 0.00067, 0.00066, 0.00067, 0.00066, 0.00067, 0.00067, 0.00067, 0.00066, 0.00067, 0.00067, 0.00067, 0.00066, 
0.00067, 0.00067, 0.00067, 0.00068, 0.00067, 0.00067, 0.00066, 0.00067, 0.00066, 0.00067, 0.00067, 0.00067, 0.00067, 0.00067, 0.00066, 0.00066, 0.00067, 0.00068, 0.00066, 0.00067, 0.00067, 0.00067, 0.00068, 0.00067, 0.00067, 0.00067, 0.00067, 0.00067, 0.00068, 0.00067, 0.00068, 0.00067, 0.00066, 0.00066, 0.00066, 0.00066, 0.00066, 0.00066, 0.00067, 0.00067, 0.00067, 0.00067, 0.00066, 0.00066, 0.00067, 0.00067, 0.00067, 0.00067, 0.00068, 0.00066, 0.00067, 0.00067, 0.00067, 0.00067, 0.00067, 0.00067, 0.00067, 0.00067, 0.00067, 0.00067, 0.00066, 0.00066, 0.00067, 0.00067, 0.00068, 0.00067, 0.00067, 0.00068, 0.00068, 0.00067, 0.00067, 0.00067, 0.00067, 0.00067, 0.00068, 0.00067, 0.00069, 0.00067, 0.00067, 0.00066, 0.00067, 0.00066, 0.00067, 0.00066]}, "optimizer-copy-main-to-model-params-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.0006, 0.00055, 0.00055, 0.00054, 0.00054, 0.00053, 0.00053, 0.00053, 0.00054, 0.00052, 0.00052, 0.00052, 0.00052, 0.00053, 0.00053, 0.00053, 0.00053, 0.00053, 0.00053, 0.00053, 0.00053, 0.00053, 0.00053, 0.00053, 0.00053, 0.00052, 0.00053, 0.00053, 0.00053, 0.00053, 0.00053, 0.00053, 0.00053, 0.00053, 0.00053, 0.00053, 0.00053, 0.00053, 0.00053, 0.00053, 0.00053, 0.00053, 0.00053, 0.00053, 0.00053, 0.00053, 0.00052, 0.00052, 0.00053, 0.00061, 0.00052, 0.00053, 0.00053, 0.00053, 0.00053, 0.00053, 0.00053, 0.00053, 0.00052, 0.00053, 0.00053, 0.00053, 0.00054, 0.00052, 0.00053, 0.00053, 0.00053, 0.00052, 0.00052, 0.00053, 0.00052, 0.00052, 0.00052, 0.00053, 0.00052, 0.00052, 0.00052, 0.00052, 0.00052, 0.00052, 0.00053, 0.00053, 0.00053, 0.00052, 0.00052, 0.00053, 0.00052, 0.00052, 0.00052, 0.00052, 0.00052, 0.00052, 0.00052, 0.00052, 0.00052, 0.00052, 0.00052, 0.00053, 0.00052, 0.00053, 0.00052, 0.00053, 0.00052, 0.00052, 0.00052, 0.00053, 0.00053, 0.00052, 0.00052, 0.00053, 0.00056, 0.00053, 0.00052, 0.00052, 0.00052, 0.00052, 0.00052, 0.00052, 0.00052, 0.00052, 0.00052, 0.00052, 0.00053, 0.00052, 0.00052, 0.00052, 0.00052, 0.00053, 0.00053, 0.00053, 0.00052, 0.00052, 0.00052, 0.00052, 0.00052, 0.00052, 0.00052, 0.00052, 0.00053, 0.00052, 0.00055, 0.00052, 0.00052, 0.00052, 0.00053, 0.00053, 0.00052, 0.00053, 0.00052, 0.00052, 0.00052, 0.00052, 0.00052, 0.00052, 0.00053, 0.00053, 0.00052, 0.00052, 0.00052, 0.00052, 0.00052, 0.00053, 0.00053, 0.00052, 0.00052, 0.00052, 0.00053, 0.00053, 0.00052, 0.00052, 0.00052, 0.00052, 0.00052, 0.00053, 0.00052, 0.00054, 0.00053, 0.00052, 0.00052, 0.00052, 0.00053, 0.00052, 0.00052, 0.00052, 0.00053, 0.00052, 0.00053, 0.00052, 0.00052, 0.00053, 0.00052, 0.00052, 0.00053, 0.00052, 0.00052, 0.00053, 0.00053, 0.00052, 0.00052, 0.00052, 0.00053, 0.00054, 0.00054, 0.00054, 0.00053, 0.00054, 0.00053, 0.00054, 0.00053, 0.00054, 0.00053, 0.00054, 0.00053, 0.00053, 0.00054, 0.00053, 0.00053, 0.00053, 0.00053, 0.00054, 0.00054, 0.00054, 0.00054, 0.00054, 0.00053, 0.00054, 0.00054, 0.00054, 0.00053, 0.00056, 0.00054, 0.00054, 0.00053, 0.00054, 0.00054, 0.00053, 0.00054, 0.00053, 0.00054, 0.00054, 0.00054, 0.00053, 0.00053, 0.00053, 0.00053, 0.00054, 0.00053, 0.00054, 0.00053, 0.00053, 0.00053, 0.00055, 0.00053, 0.00054, 0.00054, 0.00054, 0.00053, 0.00054, 0.00054, 0.00054, 0.00053, 0.00055, 0.00053, 0.00054, 0.00053, 0.00053, 0.00054, 0.00055, 0.00053, 0.00055, 0.00054, 0.00054, 0.00054, 0.00054, 0.00053, 0.00054, 0.00054, 0.00054, 0.00054, 0.00055, 0.00054, 0.00054, 0.00054, 0.00054, 0.00054, 0.00054, 0.00054, 0.00054, 0.00053, 0.00054, 0.00054, 0.00054, 0.00053, 0.00054, 0.00053, 0.00054, 0.00053, 0.00054, 
0.00053, 0.00053, 0.00053, 0.00054, 0.00053, 0.00053, 0.00053, 0.00054, 0.00053, 0.00053, 0.00053, 0.00053, 0.00054, 0.00053, 0.00053, 0.00053, 0.00053, 0.00054, 0.00053, 0.00053, 0.00053, 0.00053, 0.00053, 0.00053, 0.00054, 0.00054, 0.00053, 0.00053, 0.00054, 0.00054, 0.00054, 0.00054, 0.00054, 0.00054, 0.00053, 0.00053, 0.00053, 0.00053, 0.00053, 0.00054, 0.00054, 0.00053, 0.00053, 0.00055, 0.00053, 0.00054, 0.00053, 0.00054, 0.00054, 0.00053, 0.00053, 0.00054, 0.00053, 0.00053, 0.00054, 0.00054, 0.00054, 0.00054, 0.00053, 0.00053, 0.00054, 0.00053, 0.00053, 0.00054, 0.00054, 0.00054, 0.00054, 0.00054, 0.00053, 0.00054, 0.00053, 0.00054, 0.00053, 0.00054, 0.00053, 0.00053, 0.00054, 0.00054, 0.00053, 0.00054, 0.00054, 0.00054, 0.00053, 0.00053, 0.00053, 0.00053, 0.00053, 0.00053, 0.00054, 0.00053, 0.00054, 0.00053, 0.00053, 0.00054, 0.00053, 0.00054, 0.00053, 0.00053, 0.00053, 0.00053, 0.00054, 0.0006]}, "optimizer-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [2.12049, 0.00501, 0.00496, 0.00513, 0.00494, 0.00512, 0.00493, 0.00495, 0.00494, 0.00491, 0.00493, 0.00491, 0.00494, 0.00492, 0.00498, 0.00492, 0.0049, 0.00495, 0.00492, 0.00497, 0.00492, 0.00491, 0.00492, 0.00492, 0.00492, 0.00491, 0.00496, 0.00498, 0.00494, 0.00491, 0.0049, 0.00492, 0.00494, 0.00492, 0.00491, 0.00497, 0.00492, 0.00491, 0.00492, 0.00493, 0.00493, 0.00491, 0.00492, 0.00494, 0.00492, 0.00556, 0.00493, 0.00491, 0.00512, 0.00512, 0.00492, 0.00493, 0.00494, 0.0049, 0.00494, 0.00495, 0.00496, 0.00491, 0.00491, 0.00496, 0.00492, 0.00493, 0.00512, 0.00493, 0.00493, 0.00494, 0.00491, 0.0049, 0.00491, 0.00496, 0.00492, 0.0049, 0.00489, 0.00495, 0.00491, 0.00488, 0.00493, 0.00491, 0.0049, 0.0049, 0.00526, 0.00491, 0.00503, 0.0049, 0.00519, 0.00488, 0.00492, 0.00491, 0.0049, 0.00491, 0.00489, 0.00491, 0.0049, 0.00487, 0.00489, 0.0049, 0.00489, 0.00539, 0.00473, 0.00548, 0.00489, 0.00551, 0.0049, 0.00493, 0.00471, 0.00529, 0.00491, 0.0049, 0.00491, 0.00489, 0.00522, 0.00479, 0.00492, 0.00492, 0.00503, 0.0049, 0.0048, 0.0049, 0.00492, 0.00494, 0.00475, 0.0049, 0.00498, 0.0049, 0.0049, 0.00489, 0.0049, 0.00536, 0.00494, 0.00492, 0.00474, 0.00491, 0.0049, 0.00491, 0.00516, 0.00489, 0.00491, 0.0049, 0.00492, 0.00493, 0.00506, 0.00489, 0.00489, 0.00491, 0.00534, 0.00497, 0.00488, 0.00496, 0.00493, 0.00489, 0.00494, 0.0049, 0.00493, 0.00492, 0.00478, 0.00489, 0.0049, 0.00501, 0.00493, 0.00496, 0.0049, 0.00496, 0.00496, 0.00496, 0.00492, 0.00494, 0.00516, 0.00496, 0.00497, 0.00495, 0.00494, 0.00494, 0.00493, 0.00496, 0.00494, 0.0051, 0.00495, 0.00495, 0.00493, 0.00492, 0.00495, 0.00493, 0.00498, 0.00491, 0.00494, 0.00492, 0.00496, 0.00491, 0.00491, 0.00493, 0.00492, 0.0049, 0.005, 0.00491, 0.00498, 0.00494, 0.00489, 0.00494, 0.00496, 0.00491, 0.00501, 0.00504, 0.00502, 0.00501, 0.00506, 0.00508, 0.00502, 0.00501, 0.00497, 0.00496, 0.005, 0.005, 0.00498, 0.00504, 0.00502, 0.00497, 0.00511, 0.00499, 0.00502, 0.00502, 0.00535, 0.00532, 0.00503, 0.00507, 0.005, 0.00501, 0.005, 0.00499, 0.00499, 0.00538, 0.00498, 0.00502, 0.00499, 0.00505, 0.00503, 0.00497, 0.00504, 0.00493, 0.00495, 0.00499, 0.00529, 0.00499, 0.00499, 0.00502, 0.00499, 0.00504, 0.00497, 0.00502, 0.005, 0.00501, 0.00503, 0.00504, 0.00496, 0.00502, 0.00502, 0.00501, 0.00503, 0.005, 0.00501, 0.00502, 0.00495, 0.00563, 0.00504, 0.005, 0.00496, 0.00494, 0.00501, 0.005, 0.00499, 0.0054, 0.00512, 0.00507, 0.00502, 0.005, 0.00501, 0.005, 0.00499, 0.00498, 0.00504, 0.00503, 0.00499, 0.00501, 0.00511, 0.00502, 0.00506, 0.00502, 0.00501, 0.00499, 0.00535, 
0.00498, 0.00501, 0.00499, 0.00494, 0.00493, 0.00496, 0.00494, 0.00496, 0.00495, 0.00495, 0.00494, 0.00498, 0.00495, 0.00498, 0.00498, 0.00495, 0.005, 0.00492, 0.00493, 0.00494, 0.00492, 0.00498, 0.00494, 0.00496, 0.00495, 0.00497, 0.00506, 0.00494, 0.00497, 0.00498, 0.00495, 0.00494, 0.00495, 0.00497, 0.005, 0.00512, 0.00495, 0.00495, 0.00497, 0.00493, 0.00495, 0.00494, 0.00498, 0.00495, 0.00509, 0.005, 0.00498, 0.00493, 0.00494, 0.00496, 0.00495, 0.00497, 0.00495, 0.00495, 0.00496, 0.00491, 0.00494, 0.00498, 0.00494, 0.00494, 0.00495, 0.00496, 0.00495, 0.00501, 0.00495, 0.00508, 0.00493, 0.00505, 0.00493, 0.00494, 0.00495, 0.00495, 0.00496, 0.00501, 0.00497, 0.00499, 0.00499, 0.00499, 0.00495, 0.00494, 0.00498, 0.00498, 0.00498, 0.00497, 0.00499, 0.00499, 0.00497, 0.00494, 0.00495, 0.00497, 0.00497, 0.00496, 0.00496, 0.00496, 0.00501, 0.00501, 0.00497, 0.00503, 0.00498, 0.00498, 0.0051, 0.00507, 0.005, 0.00498, 0.00497, 0.00499, 0.00495, 0.00494, 0.00496, 0.00495, 0.00502]}, "learning-rate": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 
8e-05, 8e-05, 8e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05]}, "learning-rate vs samples": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05]}, "batch-size": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 
128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0]}, "batch-size vs samples": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 
128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0]}, "lm loss": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [10.85966, 10.87073, 10.85528, 10.80344, 10.64111, 10.62649, 10.41586, 10.12808, 9.92567, 9.82477, 9.56932, 9.84031, 9.86916, 9.61422, 9.77599, 9.50086, 9.45226, 9.6411, 9.38013, 9.32634, 9.2385, 9.14186, 9.17287, 8.9927, 9.18814, 9.05768, 9.15476, 9.16458, 9.29864, 8.98678, 8.93067, 9.0473, 9.04611, 8.65648, 8.71651, 8.75511, 8.6848, 8.73632, 8.66102, 8.76482, 8.66202, 8.84911, 8.83074, 8.49813, 8.38745, 8.42847, 8.49038, 8.38199, 8.43014, 8.57752, 8.36366, 8.18998, 8.22416, 8.21877, 8.26315, 7.90938, 8.09005, 7.88773, 8.24, 8.22485, 7.99867, 7.95704, 7.91177, 7.73255, 7.73299, 7.63614, 7.50837, 7.90027, 7.69288, 7.44749, 7.73489, 7.76278, 7.53675, 7.29662, 7.44913, 7.33262, 7.46188, 7.22442, 7.63668, 7.27892, 7.3525, 7.21173, 7.21816, 7.422, 7.17639, 7.28501, 7.00259, 7.00597, 7.03995, 7.14192, 6.82608, 6.98941, 7.09192, 7.00491, 6.87719, 6.75925, 6.994, 7.05741, 6.70391, 6.57997, 6.72686, 6.74254, 6.73498, 6.73924, 6.65693, 6.40819, 6.63945, 6.61998, 6.44777, 6.63026, 6.7458, 6.60872, 6.72566, 6.6941, 6.62478, 6.5113, 6.60016, 6.40683, 6.66647, 6.25038, 6.25487, 6.30344, 6.39244, 6.35319, 6.45279, 6.29501, 6.34432, 6.24122, 6.20479, 6.40226, 6.3298, 6.33253, 6.17365, 6.1703, 6.25122, 6.39707, 6.21313, 6.16095, 6.19193, 6.12904, 6.07716, 6.08434, 6.27156, 6.42116, 6.27092, 6.31502, 6.1099, 6.19051, 6.01202, 6.04186, 5.96572, 6.2566, 6.1994, 5.97238, 5.79066, 6.13517, 5.8567, 6.11381, 5.79621, 6.16806, 6.15725, 6.09481, 5.94172, 6.12313, 5.95406, 6.20205, 5.90266, 5.80426, 5.78673, 5.69691, 6.02057, 6.00205, 6.07073, 5.89354, 6.04415, 5.97229, 5.99763, 5.99201, 5.9504, 5.83989, 5.95152, 5.61741, 5.70128, 5.88995, 5.84414, 5.86222, 5.76021, 
5.83835, 5.72362, 5.56328, 5.72206, 5.62699, 5.83296, 5.60473, 5.71241, 5.71399, 5.89863, 5.64481, 5.85045, 5.74116, 5.86786, 5.33069, 5.89739, 5.87147, 5.85621, 5.41402, 5.40885, 5.6244, 5.5909, 5.48288, 5.57328, 5.66993, 5.47325, 5.74532, 5.50733, 5.58951, 5.62335, 5.61873, 5.50712, 5.61686, 5.67259, 5.68325, 5.58652, 5.65724, 5.37154, 5.68206, 5.62545, 5.42293, 5.5898, 5.63487, 5.55215, 5.34318, 5.53918, 5.48775, 5.48384, 5.38046, 5.5524, 5.6054, 5.39011, 5.52269, 5.48564, 5.33339, 5.50751, 5.41235, 5.44463, 5.32284, 5.07354, 5.47834, 5.57158, 5.71691, 5.41899, 5.60533, 5.64283, 5.2342, 5.27417, 5.39872, 5.39954, 5.33267, 5.50546, 5.18598, 5.3031, 5.25146, 5.37886, 5.25856, 5.45542, 5.53656, 5.3141, 5.4389, 5.34171, 5.07715, 5.31356, 5.26151, 5.30932, 5.1132, 5.27888, 5.26913, 5.47802, 5.16411, 5.27179, 5.21046, 5.36047, 4.98558, 4.92161, 5.33001, 5.39104, 5.23106, 5.32226, 5.1108, 5.16307, 5.26011, 5.06878, 5.26621, 5.0712, 5.34447, 5.24947, 5.15197, 5.24511, 5.04213, 5.3173, 5.05677, 5.03031, 5.14366, 5.11315, 5.27152, 5.15384, 5.27818, 5.09471, 5.09718, 5.25022, 5.32221, 5.25368, 5.19177, 5.14141, 5.29041, 4.95105, 5.2074, 5.08987, 5.30215, 5.17471, 5.18799, 5.1137, 4.98327, 4.99184, 5.2222, 5.31185, 5.09737, 5.05507, 4.91447, 5.12386, 5.11467, 4.92535, 5.33586, 5.02667, 5.10506, 5.16491, 5.00221, 5.06296, 5.06915, 4.9949, 5.07922, 5.16029, 4.97927, 5.18201, 4.92792, 4.92204, 5.06399, 4.99471, 4.90735, 4.77765, 4.94535, 5.11795, 5.01969, 5.02225, 5.33057, 4.96058, 4.9931, 5.0457, 4.81181, 4.74328, 4.99687, 5.0383, 4.87423, 4.95276, 5.04325, 5.02264, 4.81956, 4.89599, 4.90754, 4.8294, 4.74438, 5.01179, 4.75262, 5.2095, 4.78557, 4.99344, 4.73813, 4.78739, 4.82401, 4.64885, 4.65631, 4.84474, 4.80822, 4.80327, 4.92878, 4.88473, 4.93264, 4.7706, 4.88531, 4.73767, 4.91524, 4.95719, 4.87814, 4.70608, 4.7878, 4.89822, 4.71172, 4.87123, 4.69258, 4.69633, 4.64631]}, "lm loss vs samples": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [10.85966, 10.87073, 10.85528, 10.80344, 10.64111, 10.62649, 10.41586, 10.12808, 9.92567, 9.82477, 9.56932, 9.84031, 9.86916, 9.61422, 9.77599, 9.50086, 9.45226, 9.6411, 9.38013, 9.32634, 9.2385, 9.14186, 9.17287, 8.9927, 9.18814, 9.05768, 9.15476, 9.16458, 9.29864, 8.98678, 8.93067, 9.0473, 9.04611, 8.65648, 8.71651, 8.75511, 8.6848, 8.73632, 8.66102, 8.76482, 8.66202, 8.84911, 8.83074, 8.49813, 8.38745, 8.42847, 8.49038, 8.38199, 8.43014, 8.57752, 8.36366, 8.18998, 8.22416, 8.21877, 8.26315, 7.90938, 8.09005, 7.88773, 8.24, 8.22485, 7.99867, 7.95704, 7.91177, 7.73255, 7.73299, 7.63614, 7.50837, 7.90027, 7.69288, 7.44749, 7.73489, 7.76278, 7.53675, 7.29662, 7.44913, 7.33262, 7.46188, 7.22442, 7.63668, 7.27892, 7.3525, 7.21173, 7.21816, 7.422, 7.17639, 7.28501, 7.00259, 7.00597, 7.03995, 7.14192, 6.82608, 6.98941, 7.09192, 7.00491, 6.87719, 6.75925, 6.994, 7.05741, 6.70391, 6.57997, 6.72686, 6.74254, 6.73498, 6.73924, 6.65693, 6.40819, 6.63945, 6.61998, 6.44777, 6.63026, 6.7458, 6.60872, 6.72566, 6.6941, 6.62478, 6.5113, 6.60016, 6.40683, 6.66647, 6.25038, 6.25487, 6.30344, 6.39244, 6.35319, 6.45279, 6.29501, 6.34432, 6.24122, 6.20479, 6.40226, 6.3298, 6.33253, 6.17365, 6.1703, 6.25122, 6.39707, 6.21313, 6.16095, 6.19193, 6.12904, 6.07716, 6.08434, 6.27156, 6.42116, 6.27092, 6.31502, 6.1099, 6.19051, 6.01202, 6.04186, 5.96572, 6.2566, 6.1994, 5.97238, 5.79066, 6.13517, 5.8567, 6.11381, 5.79621, 6.16806, 6.15725, 6.09481, 5.94172, 6.12313, 5.95406, 6.20205, 5.90266, 5.80426, 5.78673, 5.69691, 6.02057, 6.00205, 6.07073, 5.89354, 6.04415, 5.97229, 
5.99763, 5.99201, 5.9504, 5.83989, 5.95152, 5.61741, 5.70128, 5.88995, 5.84414, 5.86222, 5.76021, 5.83835, 5.72362, 5.56328, 5.72206, 5.62699, 5.83296, 5.60473, 5.71241, 5.71399, 5.89863, 5.64481, 5.85045, 5.74116, 5.86786, 5.33069, 5.89739, 5.87147, 5.85621, 5.41402, 5.40885, 5.6244, 5.5909, 5.48288, 5.57328, 5.66993, 5.47325, 5.74532, 5.50733, 5.58951, 5.62335, 5.61873, 5.50712, 5.61686, 5.67259, 5.68325, 5.58652, 5.65724, 5.37154, 5.68206, 5.62545, 5.42293, 5.5898, 5.63487, 5.55215, 5.34318, 5.53918, 5.48775, 5.48384, 5.38046, 5.5524, 5.6054, 5.39011, 5.52269, 5.48564, 5.33339, 5.50751, 5.41235, 5.44463, 5.32284, 5.07354, 5.47834, 5.57158, 5.71691, 5.41899, 5.60533, 5.64283, 5.2342, 5.27417, 5.39872, 5.39954, 5.33267, 5.50546, 5.18598, 5.3031, 5.25146, 5.37886, 5.25856, 5.45542, 5.53656, 5.3141, 5.4389, 5.34171, 5.07715, 5.31356, 5.26151, 5.30932, 5.1132, 5.27888, 5.26913, 5.47802, 5.16411, 5.27179, 5.21046, 5.36047, 4.98558, 4.92161, 5.33001, 5.39104, 5.23106, 5.32226, 5.1108, 5.16307, 5.26011, 5.06878, 5.26621, 5.0712, 5.34447, 5.24947, 5.15197, 5.24511, 5.04213, 5.3173, 5.05677, 5.03031, 5.14366, 5.11315, 5.27152, 5.15384, 5.27818, 5.09471, 5.09718, 5.25022, 5.32221, 5.25368, 5.19177, 5.14141, 5.29041, 4.95105, 5.2074, 5.08987, 5.30215, 5.17471, 5.18799, 5.1137, 4.98327, 4.99184, 5.2222, 5.31185, 5.09737, 5.05507, 4.91447, 5.12386, 5.11467, 4.92535, 5.33586, 5.02667, 5.10506, 5.16491, 5.00221, 5.06296, 5.06915, 4.9949, 5.07922, 5.16029, 4.97927, 5.18201, 4.92792, 4.92204, 5.06399, 4.99471, 4.90735, 4.77765, 4.94535, 5.11795, 5.01969, 5.02225, 5.33057, 4.96058, 4.9931, 5.0457, 4.81181, 4.74328, 4.99687, 5.0383, 4.87423, 4.95276, 5.04325, 5.02264, 4.81956, 4.89599, 4.90754, 4.8294, 4.74438, 5.01179, 4.75262, 5.2095, 4.78557, 4.99344, 4.73813, 4.78739, 4.82401, 4.64885, 4.65631, 4.84474, 4.80822, 4.80327, 4.92878, 4.88473, 4.93264, 4.7706, 4.88531, 4.73767, 4.91524, 4.95719, 4.87814, 4.70608, 4.7878, 4.89822, 4.71172, 4.87123, 4.69258, 4.69633, 4.64631]}, "loss-scale": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 
1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]}, "loss-scale vs samples": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]}, "grad-norm": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [13.56517, 13.52183, 13.82389, 12.68199, 12.11513, 9.42628, 6.78009, 6.96682, 6.03524, 4.63457, 4.1513, 2.87067, 2.35463, 2.3279, 2.02459, 2.22441, 2.16108, 1.87618, 2.21105, 2.06296, 2.12729, 2.152, 2.00687, 2.2248, 1.98285, 2.1147, 1.92124, 1.92395, 1.94527, 2.15653, 2.0865, 1.94545, 1.87214, 2.15774, 2.14492, 2.10813, 1.99702, 1.84398, 1.93326, 1.73194, 2.15655, 1.83365, 1.74796, 1.87637, 1.87935, 1.82812, 1.70882, 1.75031, 1.75541, 1.56033, 1.72362, 1.80715, 1.77318, 1.81611, 1.66844, 1.80559, 1.7625, 1.84598, 1.62632, 1.48661, 1.64786, 1.45473, 1.77763, 1.80854, 1.64942, 1.65627, 1.70353, 1.60171, 1.44031, 1.72339, 1.43433, 1.37767, 1.68581, 1.37671, 1.40648, 1.61691, 1.50881, 1.38382, 1.44532, 1.27357, 1.36667, 1.33118, 1.30365, 1.39513, 1.39043, 1.4631, 1.55974, 1.45774, 1.22995, 1.11972, 1.09726, 1.20059, 1.10224, 1.31175, 1.01034, 
1.30362, 1.38885, 1.05046, 0.94787, 1.76252, 1.11012, 1.2148, 1.71468, 1.62278, 0.95552, 1.16789, 1.17655, 1.03922, 1.21282, 1.1032, 0.98669, 0.95678, 1.1193, 1.05737, 1.01498, 1.16799, 0.97578, 1.42941, 1.13594, 1.05985, 0.9398, 1.10182, 1.02064, 1.3517, 1.44708, 2.04415, 1.69036, 1.40806, 1.38738, 1.3424, 0.99552, 1.67778, 1.38915, 1.16703, 1.21285, 1.27027, 1.08112, 1.56529, 1.11243, 1.55047, 1.88478, 1.49661, 1.24747, 1.30858, 1.0413, 1.79193, 1.1894, 1.10832, 1.14553, 1.37473, 1.12916, 1.19043, 1.55147, 1.14787, 0.9831, 1.97748, 1.30968, 1.75548, 1.42903, 1.47772, 1.63806, 1.08487, 1.3989, 1.02365, 1.24838, 1.43469, 1.42662, 1.30881, 1.20964, 1.49347, 1.21919, 1.05332, 1.18399, 1.38555, 1.13727, 1.36432, 1.2528, 1.17022, 1.32348, 1.07935, 1.19539, 1.48684, 1.19029, 1.2198, 1.81559, 1.52452, 1.79334, 1.66013, 1.20616, 1.67532, 1.19437, 1.28, 1.33364, 1.69679, 1.53842, 1.37202, 1.34387, 1.37081, 1.28649, 1.5618, 1.03326, 1.39685, 1.27238, 1.20598, 1.32922, 1.41054, 1.32813, 1.46075, 1.18533, 1.18314, 1.37783, 1.39264, 1.2322, 1.35301, 1.51994, 1.29479, 1.54145, 1.57876, 1.23038, 1.67935, 1.59903, 1.7688, 1.38891, 1.39714, 1.41056, 1.56263, 1.84649, 1.31226, 2.25632, 1.5966, 1.20159, 1.49708, 1.73963, 1.47932, 1.74434, 1.84578, 1.28148, 1.58712, 1.57826, 1.14575, 1.37743, 1.14726, 1.36495, 1.54092, 1.1998, 1.83908, 1.60608, 1.22735, 1.39352, 1.48052, 1.44922, 1.5986, 1.86828, 1.2133, 1.28534, 1.44591, 1.40707, 1.6217, 1.68123, 1.16996, 1.40545, 1.79994, 1.32408, 1.35454, 1.82216, 1.50619, 1.25331, 1.36593, 1.33067, 1.20379, 1.1715, 1.34612, 1.23828, 1.2249, 1.23199, 1.50931, 1.24187, 1.31666, 1.33544, 1.15247, 1.35164, 1.31814, 1.51121, 1.22179, 1.26518, 1.48248, 1.47105, 2.08081, 1.48841, 1.53234, 1.46321, 1.4755, 1.16048, 1.44268, 1.5642, 1.52523, 1.38495, 1.80119, 1.63483, 1.41261, 1.60553, 1.28802, 1.15347, 1.54912, 1.53753, 1.36296, 1.66631, 1.63888, 1.24348, 1.42956, 1.32686, 1.487, 1.7063, 1.383, 1.67566, 1.4665, 1.41433, 1.44807, 1.36307, 1.13744, 1.63129, 1.56395, 1.59787, 1.49857, 1.45091, 1.60777, 1.36633, 1.34096, 1.63579, 1.34741, 1.48819, 1.66258, 1.532, 1.46235, 1.36272, 1.36735, 1.33239, 1.3176, 1.2966, 1.56971, 1.31551, 1.50053, 1.27598, 1.29926, 1.5045, 1.39074, 1.41138, 1.40198, 1.46432, 1.38696, 1.52639, 1.55526, 1.4432, 1.27923, 1.48503, 1.17404, 1.20825, 1.60545, 1.81024, 1.35059, 1.28697, 1.50174, 1.46699, 1.33784, 1.08159, 1.61115, 1.46019, 1.37898, 1.35614, 1.65157, 1.46597, 1.60688, 1.72399, 1.30124, 1.44364, 1.32297, 1.13212, 1.45342, 1.38164, 1.21948, 1.26404, 1.33477, 1.30704, 1.51357, 1.26848, 1.55252, 1.33368, 1.41811, 1.47778, 1.31706, 1.20105, 1.48475, 1.28543, 1.46568, 1.42638, 1.25259, 1.60254, 1.36812, 1.3586, 1.15672]}, "grad-norm vs samples": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [13.56517, 13.52183, 13.82389, 12.68199, 12.11513, 9.42628, 6.78009, 6.96682, 6.03524, 4.63457, 4.1513, 2.87067, 2.35463, 2.3279, 2.02459, 2.22441, 2.16108, 1.87618, 2.21105, 2.06296, 2.12729, 2.152, 2.00687, 2.2248, 1.98285, 2.1147, 1.92124, 1.92395, 1.94527, 2.15653, 2.0865, 1.94545, 1.87214, 2.15774, 2.14492, 2.10813, 1.99702, 1.84398, 1.93326, 1.73194, 2.15655, 1.83365, 1.74796, 1.87637, 1.87935, 1.82812, 1.70882, 1.75031, 1.75541, 1.56033, 1.72362, 1.80715, 1.77318, 1.81611, 1.66844, 1.80559, 1.7625, 1.84598, 1.62632, 1.48661, 1.64786, 1.45473, 1.77763, 1.80854, 1.64942, 1.65627, 1.70353, 1.60171, 1.44031, 1.72339, 1.43433, 1.37767, 1.68581, 1.37671, 1.40648, 1.61691, 1.50881, 1.38382, 1.44532, 1.27357, 1.36667, 1.33118, 1.30365, 1.39513, 
1.39043, 1.4631, 1.55974, 1.45774, 1.22995, 1.11972, 1.09726, 1.20059, 1.10224, 1.31175, 1.01034, 1.30362, 1.38885, 1.05046, 0.94787, 1.76252, 1.11012, 1.2148, 1.71468, 1.62278, 0.95552, 1.16789, 1.17655, 1.03922, 1.21282, 1.1032, 0.98669, 0.95678, 1.1193, 1.05737, 1.01498, 1.16799, 0.97578, 1.42941, 1.13594, 1.05985, 0.9398, 1.10182, 1.02064, 1.3517, 1.44708, 2.04415, 1.69036, 1.40806, 1.38738, 1.3424, 0.99552, 1.67778, 1.38915, 1.16703, 1.21285, 1.27027, 1.08112, 1.56529, 1.11243, 1.55047, 1.88478, 1.49661, 1.24747, 1.30858, 1.0413, 1.79193, 1.1894, 1.10832, 1.14553, 1.37473, 1.12916, 1.19043, 1.55147, 1.14787, 0.9831, 1.97748, 1.30968, 1.75548, 1.42903, 1.47772, 1.63806, 1.08487, 1.3989, 1.02365, 1.24838, 1.43469, 1.42662, 1.30881, 1.20964, 1.49347, 1.21919, 1.05332, 1.18399, 1.38555, 1.13727, 1.36432, 1.2528, 1.17022, 1.32348, 1.07935, 1.19539, 1.48684, 1.19029, 1.2198, 1.81559, 1.52452, 1.79334, 1.66013, 1.20616, 1.67532, 1.19437, 1.28, 1.33364, 1.69679, 1.53842, 1.37202, 1.34387, 1.37081, 1.28649, 1.5618, 1.03326, 1.39685, 1.27238, 1.20598, 1.32922, 1.41054, 1.32813, 1.46075, 1.18533, 1.18314, 1.37783, 1.39264, 1.2322, 1.35301, 1.51994, 1.29479, 1.54145, 1.57876, 1.23038, 1.67935, 1.59903, 1.7688, 1.38891, 1.39714, 1.41056, 1.56263, 1.84649, 1.31226, 2.25632, 1.5966, 1.20159, 1.49708, 1.73963, 1.47932, 1.74434, 1.84578, 1.28148, 1.58712, 1.57826, 1.14575, 1.37743, 1.14726, 1.36495, 1.54092, 1.1998, 1.83908, 1.60608, 1.22735, 1.39352, 1.48052, 1.44922, 1.5986, 1.86828, 1.2133, 1.28534, 1.44591, 1.40707, 1.6217, 1.68123, 1.16996, 1.40545, 1.79994, 1.32408, 1.35454, 1.82216, 1.50619, 1.25331, 1.36593, 1.33067, 1.20379, 1.1715, 1.34612, 1.23828, 1.2249, 1.23199, 1.50931, 1.24187, 1.31666, 1.33544, 1.15247, 1.35164, 1.31814, 1.51121, 1.22179, 1.26518, 1.48248, 1.47105, 2.08081, 1.48841, 1.53234, 1.46321, 1.4755, 1.16048, 1.44268, 1.5642, 1.52523, 1.38495, 1.80119, 1.63483, 1.41261, 1.60553, 1.28802, 1.15347, 1.54912, 1.53753, 1.36296, 1.66631, 1.63888, 1.24348, 1.42956, 1.32686, 1.487, 1.7063, 1.383, 1.67566, 1.4665, 1.41433, 1.44807, 1.36307, 1.13744, 1.63129, 1.56395, 1.59787, 1.49857, 1.45091, 1.60777, 1.36633, 1.34096, 1.63579, 1.34741, 1.48819, 1.66258, 1.532, 1.46235, 1.36272, 1.36735, 1.33239, 1.3176, 1.2966, 1.56971, 1.31551, 1.50053, 1.27598, 1.29926, 1.5045, 1.39074, 1.41138, 1.40198, 1.46432, 1.38696, 1.52639, 1.55526, 1.4432, 1.27923, 1.48503, 1.17404, 1.20825, 1.60545, 1.81024, 1.35059, 1.28697, 1.50174, 1.46699, 1.33784, 1.08159, 1.61115, 1.46019, 1.37898, 1.35614, 1.65157, 1.46597, 1.60688, 1.72399, 1.30124, 1.44364, 1.32297, 1.13212, 1.45342, 1.38164, 1.21948, 1.26404, 1.33477, 1.30704, 1.51357, 1.26848, 1.55252, 1.33368, 1.41811, 1.47778, 1.31706, 1.20105, 1.48475, 1.28543, 1.46568, 1.42638, 1.25259, 1.60254, 1.36812, 1.3586, 1.15672]}, "num-zeros": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [78.0, 71.0, 69.0, 77.0, 83.0, 93.0, 106.0, 92.0, 92.0, 132.0, 100.0, 151.0, 124.0, 174.0, 156.0, 150.0, 169.0, 195.0, 167.0, 147.0, 152.0, 152.0, 200.0, 189.0, 169.0, 153.0, 197.0, 164.0, 147.0, 172.0, 144.0, 157.0, 169.0, 165.0, 146.0, 179.0, 172.0, 212.0, 186.0, 196.0, 171.0, 138.0, 152.0, 197.0, 156.0, 167.0, 212.0, 178.0, 187.0, 180.0, 190.0, 159.0, 176.0, 163.0, 179.0, 191.0, 150.0, 150.0, 227.0, 225.0, 197.0, 184.0, 184.0, 199.0, 214.0, 235.0, 186.0, 197.0, 214.0, 222.0, 193.0, 241.0, 159.0, 264.0, 193.0, 187.0, 201.0, 208.0, 227.0, 223.0, 225.0, 212.0, 231.0, 219.0, 202.0, 196.0, 178.0, 182.0, 185.0, 210.0, 201.0, 198.0, 213.0, 214.0, 205.0, 161.0, 
183.0, 193.0, 198.0, 178.0, 190.0, 166.0, 137.0, 154.0, 183.0, 150.0, 165.0, 166.0, 127.0, 174.0, 160.0, 171.0, 188.0, 172.0, 159.0, 152.0, 151.0, 127.0, 137.0, 145.0, 172.0, 135.0, 151.0, 158.0, 141.0, 113.0, 114.0, 93.0, 113.0, 128.0, 148.0, 125.0, 114.0, 127.0, 121.0, 117.0, 146.0, 116.0, 148.0, 137.0, 108.0, 114.0, 129.0, 141.0, 130.0, 107.0, 113.0, 126.0, 130.0, 102.0, 127.0, 110.0, 108.0, 109.0, 112.0, 65.0, 98.0, 84.0, 105.0, 108.0, 95.0, 135.0, 103.0, 123.0, 101.0, 102.0, 101.0, 117.0, 109.0, 106.0, 123.0, 114.0, 102.0, 88.0, 131.0, 104.0, 116.0, 108.0, 142.0, 118.0, 121.0, 115.0, 118.0, 115.0, 106.0, 119.0, 105.0, 84.0, 106.0, 91.0, 120.0, 114.0, 140.0, 96.0, 85.0, 100.0, 114.0, 103.0, 153.0, 88.0, 120.0, 96.0, 122.0, 111.0, 89.0, 107.0, 111.0, 97.0, 128.0, 103.0, 123.0, 90.0, 94.0, 82.0, 100.0, 109.0, 112.0, 104.0, 119.0, 90.0, 77.0, 114.0, 82.0, 103.0, 104.0, 104.0, 97.0, 127.0, 67.0, 99.0, 126.0, 90.0, 84.0, 109.0, 94.0, 97.0, 107.0, 113.0, 127.0, 100.0, 115.0, 102.0, 96.0, 116.0, 125.0, 102.0, 91.0, 126.0, 114.0, 101.0, 113.0, 110.0, 96.0, 126.0, 121.0, 99.0, 104.0, 108.0, 86.0, 143.0, 120.0, 83.0, 115.0, 92.0, 73.0, 113.0, 117.0, 111.0, 93.0, 106.0, 131.0, 93.0, 121.0, 109.0, 108.0, 115.0, 117.0, 116.0, 105.0, 110.0, 103.0, 112.0, 85.0, 118.0, 126.0, 119.0, 120.0, 104.0, 112.0, 111.0, 108.0, 107.0, 126.0, 123.0, 100.0, 81.0, 101.0, 106.0, 93.0, 109.0, 104.0, 131.0, 134.0, 98.0, 105.0, 129.0, 83.0, 87.0, 128.0, 116.0, 114.0, 111.0, 94.0, 114.0, 91.0, 97.0, 93.0, 116.0, 135.0, 122.0, 111.0, 126.0, 107.0, 107.0, 101.0, 82.0, 120.0, 142.0, 124.0, 120.0, 124.0, 122.0, 97.0, 96.0, 107.0, 102.0, 123.0, 115.0, 126.0, 116.0, 122.0, 115.0, 107.0, 111.0, 95.0, 93.0, 113.0, 117.0, 101.0, 110.0, 126.0, 113.0, 112.0, 127.0, 138.0, 118.0, 133.0, 94.0, 105.0, 119.0, 121.0, 122.0, 102.0, 98.0, 119.0, 103.0, 108.0, 134.0, 116.0, 107.0, 105.0, 99.0, 99.0, 117.0, 106.0, 133.0, 108.0, 110.0, 99.0, 140.0, 107.0, 104.0, 114.0, 112.0, 117.0, 106.0, 105.0, 92.0, 111.0, 99.0, 124.0, 101.0, 102.0, 144.0, 129.0, 122.0, 110.0, 116.0, 123.0, 136.0]}, "num-zeros vs samples": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [78.0, 71.0, 69.0, 77.0, 83.0, 93.0, 106.0, 92.0, 92.0, 132.0, 100.0, 151.0, 124.0, 174.0, 156.0, 150.0, 169.0, 195.0, 167.0, 147.0, 152.0, 152.0, 200.0, 189.0, 169.0, 153.0, 197.0, 164.0, 147.0, 172.0, 144.0, 157.0, 169.0, 165.0, 146.0, 179.0, 172.0, 212.0, 186.0, 196.0, 171.0, 138.0, 152.0, 197.0, 156.0, 167.0, 212.0, 178.0, 187.0, 180.0, 190.0, 159.0, 176.0, 163.0, 179.0, 191.0, 150.0, 150.0, 227.0, 225.0, 197.0, 184.0, 184.0, 199.0, 214.0, 235.0, 186.0, 197.0, 214.0, 222.0, 193.0, 241.0, 159.0, 264.0, 193.0, 187.0, 201.0, 208.0, 227.0, 223.0, 225.0, 212.0, 231.0, 219.0, 202.0, 196.0, 178.0, 182.0, 185.0, 210.0, 201.0, 198.0, 213.0, 214.0, 205.0, 161.0, 183.0, 193.0, 198.0, 178.0, 190.0, 166.0, 137.0, 154.0, 183.0, 150.0, 165.0, 166.0, 127.0, 174.0, 160.0, 171.0, 188.0, 172.0, 159.0, 152.0, 151.0, 127.0, 137.0, 145.0, 172.0, 135.0, 151.0, 158.0, 141.0, 113.0, 114.0, 93.0, 113.0, 128.0, 148.0, 125.0, 114.0, 127.0, 121.0, 117.0, 146.0, 116.0, 148.0, 137.0, 108.0, 114.0, 129.0, 141.0, 130.0, 107.0, 113.0, 126.0, 130.0, 102.0, 127.0, 110.0, 108.0, 109.0, 112.0, 65.0, 98.0, 84.0, 105.0, 108.0, 95.0, 135.0, 103.0, 123.0, 101.0, 102.0, 101.0, 117.0, 109.0, 106.0, 123.0, 114.0, 102.0, 88.0, 131.0, 104.0, 116.0, 108.0, 142.0, 118.0, 121.0, 115.0, 118.0, 115.0, 106.0, 119.0, 105.0, 84.0, 106.0, 91.0, 120.0, 114.0, 140.0, 96.0, 85.0, 100.0, 114.0, 103.0, 153.0, 88.0, 120.0, 
96.0, 122.0, 111.0, 89.0, 107.0, 111.0, 97.0, 128.0, 103.0, 123.0, 90.0, 94.0, 82.0, 100.0, 109.0, 112.0, 104.0, 119.0, 90.0, 77.0, 114.0, 82.0, 103.0, 104.0, 104.0, 97.0, 127.0, 67.0, 99.0, 126.0, 90.0, 84.0, 109.0, 94.0, 97.0, 107.0, 113.0, 127.0, 100.0, 115.0, 102.0, 96.0, 116.0, 125.0, 102.0, 91.0, 126.0, 114.0, 101.0, 113.0, 110.0, 96.0, 126.0, 121.0, 99.0, 104.0, 108.0, 86.0, 143.0, 120.0, 83.0, 115.0, 92.0, 73.0, 113.0, 117.0, 111.0, 93.0, 106.0, 131.0, 93.0, 121.0, 109.0, 108.0, 115.0, 117.0, 116.0, 105.0, 110.0, 103.0, 112.0, 85.0, 118.0, 126.0, 119.0, 120.0, 104.0, 112.0, 111.0, 108.0, 107.0, 126.0, 123.0, 100.0, 81.0, 101.0, 106.0, 93.0, 109.0, 104.0, 131.0, 134.0, 98.0, 105.0, 129.0, 83.0, 87.0, 128.0, 116.0, 114.0, 111.0, 94.0, 114.0, 91.0, 97.0, 93.0, 116.0, 135.0, 122.0, 111.0, 126.0, 107.0, 107.0, 101.0, 82.0, 120.0, 142.0, 124.0, 120.0, 124.0, 122.0, 97.0, 96.0, 107.0, 102.0, 123.0, 115.0, 126.0, 116.0, 122.0, 115.0, 107.0, 111.0, 95.0, 93.0, 113.0, 117.0, 101.0, 110.0, 126.0, 113.0, 112.0, 127.0, 138.0, 118.0, 133.0, 94.0, 105.0, 119.0, 121.0, 122.0, 102.0, 98.0, 119.0, 103.0, 108.0, 134.0, 116.0, 107.0, 105.0, 99.0, 99.0, 117.0, 106.0, 133.0, 108.0, 110.0, 99.0, 140.0, 107.0, 104.0, 114.0, 112.0, 117.0, 106.0, 105.0, 92.0, 111.0, 99.0, 124.0, 101.0, 102.0, 144.0, 129.0, 122.0, 110.0, 116.0, 123.0, 136.0]}, "params-norm": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [179.94354, 179.94354, 179.94354, 179.94353, 179.94351, 179.94351, 179.9435, 179.94337, 179.94319, 179.94301, 179.94168, 179.94092, 179.94034, 179.9382, 179.93718, 179.93637, 179.93611, 179.93633, 179.93683, 179.93695, 179.93684, 179.93649, 179.9361, 179.93663, 179.93771, 179.93913, 179.94032, 179.94113, 179.94214, 179.94365, 179.94586, 179.94824, 179.95052, 179.95296, 179.95572, 179.95921, 179.96291, 179.96681, 179.97093, 179.97545, 179.98062, 179.98616, 179.99197, 179.99846, 180.00552, 180.01314, 180.02119, 180.03004, 180.0396, 180.05011, 180.06131, 180.07315, 180.08542, 180.0985, 180.11215, 180.12645, 180.14087, 180.15598, 180.17198, 180.18895, 180.20711, 180.22621, 180.24666, 180.26831, 180.28981, 180.31268, 180.33565, 180.35945, 180.38472, 180.41133, 180.43765, 180.46451, 180.49187, 180.51939, 180.54758, 180.57634, 180.60477, 180.63396, 180.66389, 180.69472, 180.72603, 180.7572, 180.78957, 180.823, 180.85631, 180.88991, 180.92371, 180.95706, 180.99092, 181.02626, 181.06326, 181.10162, 181.1391, 181.17641, 181.21402, 181.25211, 181.28955, 181.32634, 181.36447, 181.40189, 181.4381, 181.47331, 181.50807, 181.54071, 181.57346, 181.60866, 181.64577, 181.68417, 181.72168, 181.75914, 181.79767, 181.83748, 181.87747, 181.91742, 181.95695, 181.99832, 182.03812, 182.07738, 182.11449, 182.15204, 182.19035, 182.22978, 182.2695, 182.31001, 182.34891, 182.38696, 182.42218, 182.45525, 182.48941, 182.52226, 182.55621, 182.58896, 182.62086, 182.65288, 182.68657, 182.72272, 182.76212, 182.80115, 182.83951, 182.87524, 182.90919, 182.94313, 182.97842, 183.01477, 183.0529, 183.09117, 183.127, 183.16306, 183.20122, 183.24178, 183.28111, 183.32036, 183.35971, 183.3998, 183.43983, 183.47787, 183.51186, 183.54558, 183.57816, 183.6123, 183.64774, 183.68333, 183.72012, 183.75874, 183.79793, 183.83867, 183.87993, 183.92157, 183.96465, 184.00539, 184.04436, 184.0843, 184.12569, 184.16653, 184.20705, 184.24741, 184.28691, 184.32756, 184.36906, 184.41148, 184.45378, 184.4951, 184.53712, 184.57993, 184.62045, 184.65775, 184.69293, 184.72659, 184.76007, 184.79503, 184.83018, 184.86899, 184.90979, 184.95056, 
184.99091, 185.03053, 185.07204, 185.11502, 185.15868, 185.20329, 185.24709, 185.29115, 185.33409, 185.37717, 185.4185, 185.45804, 185.49718, 185.53632, 185.57599, 185.61728, 185.65776, 185.69963, 185.74083, 185.78281, 185.82603, 185.86871, 185.91023, 185.94936, 185.98782, 186.0262, 186.06454, 186.10416, 186.14491, 186.1852, 186.2245, 186.26433, 186.30334, 186.34256, 186.38142, 186.41753, 186.45586, 186.49515, 186.5363, 186.57649, 186.61508, 186.65221, 186.6895, 186.72816, 186.76711, 186.80779, 186.84801, 186.88885, 186.93158, 186.97491, 187.01726, 187.06096, 187.10196, 187.14183, 187.18462, 187.22882, 187.27315, 187.31848, 187.36339, 187.40767, 187.45337, 187.49886, 187.54268, 187.58609, 187.62961, 187.67044, 187.71268, 187.75528, 187.79819, 187.84183, 187.88416, 187.92462, 187.96719, 188.0098, 188.0549, 188.10202, 188.14798, 188.19414, 188.23969, 188.28632, 188.33499, 188.38423, 188.43146, 188.47794, 188.52431, 188.57013, 188.61865, 188.66565, 188.71187, 188.75861, 188.80621, 188.85393, 188.90173, 188.94839, 188.99448, 189.04036, 189.08531, 189.13077, 189.17767, 189.22517, 189.27315, 189.32074, 189.36909, 189.41704, 189.46393, 189.5119, 189.5609, 189.61021, 189.66124, 189.71246, 189.76324, 189.81259, 189.86185, 189.91013, 189.96013, 190.0108, 190.061, 190.11232, 190.1635, 190.21367, 190.2627, 190.31346, 190.36389, 190.41492, 190.46727, 190.51939, 190.57338, 190.62749, 190.68044, 190.73311, 190.78491, 190.83577, 190.8877, 190.93848, 190.98965, 191.04053, 191.09221, 191.1438, 191.19595, 191.24683, 191.29836, 191.35121, 191.40576, 191.45865, 191.51144, 191.56329, 191.61534, 191.66661, 191.71944, 191.77365, 191.82733, 191.88013, 191.93358, 191.98837, 192.04231, 192.09724, 192.15228, 192.20715, 192.26242, 192.32021, 192.37662, 192.4319, 192.48772, 192.54413, 192.59987, 192.65529, 192.71152, 192.76802, 192.82562, 192.88312, 192.94026, 192.99599, 193.05467, 193.11278, 193.17015, 193.22783, 193.28326, 193.33839, 193.39395, 193.44897, 193.50545, 193.563, 193.61928, 193.67555, 193.73364, 193.79195, 193.85016, 193.90939, 193.96805, 194.02667, 194.08534, 194.14226, 194.20026, 194.25986, 194.32065, 194.38155, 194.44293, 194.50323, 194.56407, 194.62587, 194.68752, 194.74759, 194.80595, 194.86389, 194.92307, 194.98349]}, "params-norm vs samples": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [179.94354, 179.94354, 179.94354, 179.94353, 179.94351, 179.94351, 179.9435, 179.94337, 179.94319, 179.94301, 179.94168, 179.94092, 179.94034, 179.9382, 179.93718, 179.93637, 179.93611, 179.93633, 179.93683, 179.93695, 179.93684, 179.93649, 179.9361, 179.93663, 179.93771, 179.93913, 179.94032, 179.94113, 179.94214, 179.94365, 179.94586, 179.94824, 179.95052, 179.95296, 179.95572, 179.95921, 179.96291, 179.96681, 179.97093, 179.97545, 179.98062, 179.98616, 179.99197, 179.99846, 180.00552, 180.01314, 180.02119, 180.03004, 180.0396, 180.05011, 180.06131, 180.07315, 180.08542, 180.0985, 180.11215, 180.12645, 180.14087, 180.15598, 180.17198, 180.18895, 180.20711, 180.22621, 180.24666, 180.26831, 180.28981, 180.31268, 180.33565, 180.35945, 180.38472, 180.41133, 180.43765, 180.46451, 180.49187, 180.51939, 180.54758, 180.57634, 180.60477, 180.63396, 180.66389, 180.69472, 180.72603, 180.7572, 180.78957, 180.823, 180.85631, 180.88991, 180.92371, 180.95706, 180.99092, 181.02626, 181.06326, 181.10162, 181.1391, 181.17641, 181.21402, 181.25211, 181.28955, 181.32634, 181.36447, 181.40189, 181.4381, 181.47331, 181.50807, 181.54071, 181.57346, 181.60866, 181.64577, 181.68417, 181.72168, 181.75914, 181.79767, 
181.83748, 181.87747, 181.91742, 181.95695, 181.99832, 182.03812, 182.07738, 182.11449, 182.15204, 182.19035, 182.22978, 182.2695, 182.31001, 182.34891, 182.38696, 182.42218, 182.45525, 182.48941, 182.52226, 182.55621, 182.58896, 182.62086, 182.65288, 182.68657, 182.72272, 182.76212, 182.80115, 182.83951, 182.87524, 182.90919, 182.94313, 182.97842, 183.01477, 183.0529, 183.09117, 183.127, 183.16306, 183.20122, 183.24178, 183.28111, 183.32036, 183.35971, 183.3998, 183.43983, 183.47787, 183.51186, 183.54558, 183.57816, 183.6123, 183.64774, 183.68333, 183.72012, 183.75874, 183.79793, 183.83867, 183.87993, 183.92157, 183.96465, 184.00539, 184.04436, 184.0843, 184.12569, 184.16653, 184.20705, 184.24741, 184.28691, 184.32756, 184.36906, 184.41148, 184.45378, 184.4951, 184.53712, 184.57993, 184.62045, 184.65775, 184.69293, 184.72659, 184.76007, 184.79503, 184.83018, 184.86899, 184.90979, 184.95056, 184.99091, 185.03053, 185.07204, 185.11502, 185.15868, 185.20329, 185.24709, 185.29115, 185.33409, 185.37717, 185.4185, 185.45804, 185.49718, 185.53632, 185.57599, 185.61728, 185.65776, 185.69963, 185.74083, 185.78281, 185.82603, 185.86871, 185.91023, 185.94936, 185.98782, 186.0262, 186.06454, 186.10416, 186.14491, 186.1852, 186.2245, 186.26433, 186.30334, 186.34256, 186.38142, 186.41753, 186.45586, 186.49515, 186.5363, 186.57649, 186.61508, 186.65221, 186.6895, 186.72816, 186.76711, 186.80779, 186.84801, 186.88885, 186.93158, 186.97491, 187.01726, 187.06096, 187.10196, 187.14183, 187.18462, 187.22882, 187.27315, 187.31848, 187.36339, 187.40767, 187.45337, 187.49886, 187.54268, 187.58609, 187.62961, 187.67044, 187.71268, 187.75528, 187.79819, 187.84183, 187.88416, 187.92462, 187.96719, 188.0098, 188.0549, 188.10202, 188.14798, 188.19414, 188.23969, 188.28632, 188.33499, 188.38423, 188.43146, 188.47794, 188.52431, 188.57013, 188.61865, 188.66565, 188.71187, 188.75861, 188.80621, 188.85393, 188.90173, 188.94839, 188.99448, 189.04036, 189.08531, 189.13077, 189.17767, 189.22517, 189.27315, 189.32074, 189.36909, 189.41704, 189.46393, 189.5119, 189.5609, 189.61021, 189.66124, 189.71246, 189.76324, 189.81259, 189.86185, 189.91013, 189.96013, 190.0108, 190.061, 190.11232, 190.1635, 190.21367, 190.2627, 190.31346, 190.36389, 190.41492, 190.46727, 190.51939, 190.57338, 190.62749, 190.68044, 190.73311, 190.78491, 190.83577, 190.8877, 190.93848, 190.98965, 191.04053, 191.09221, 191.1438, 191.19595, 191.24683, 191.29836, 191.35121, 191.40576, 191.45865, 191.51144, 191.56329, 191.61534, 191.66661, 191.71944, 191.77365, 191.82733, 191.88013, 191.93358, 191.98837, 192.04231, 192.09724, 192.15228, 192.20715, 192.26242, 192.32021, 192.37662, 192.4319, 192.48772, 192.54413, 192.59987, 192.65529, 192.71152, 192.76802, 192.82562, 192.88312, 192.94026, 192.99599, 193.05467, 193.11278, 193.17015, 193.22783, 193.28326, 193.33839, 193.39395, 193.44897, 193.50545, 193.563, 193.61928, 193.67555, 193.73364, 193.79195, 193.85016, 193.90939, 193.96805, 194.02667, 194.08534, 194.14226, 194.20026, 194.25986, 194.32065, 194.38155, 194.44293, 194.50323, 194.56407, 194.62587, 194.68752, 194.74759, 194.80595, 194.86389, 194.92307, 194.98349]}, "iteration-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [23.29918, 0.71187, 0.71207, 0.69449, 0.69446, 0.69443, 0.6988, 0.69196, 0.7146, 0.69983, 0.70196, 0.70471, 0.70358, 0.70105, 0.71451, 0.69917, 0.69866, 0.69442, 0.6948, 0.69086, 0.69495, 0.68836, 0.69965, 0.69226, 0.69484, 0.69875, 0.70073, 0.70246, 0.72083, 0.7009, 0.70048, 0.7008, 0.70366, 0.69412, 0.70178, 
0.69908, 0.70543, 0.69424, 0.70464, 0.69955, 0.70803, 0.69841, 0.70257, 0.70418, 0.70875, 0.715, 0.70906, 0.70541, 0.71931, 0.7041, 0.70223, 0.70658, 0.69701, 0.69756, 0.69594, 0.70155, 0.70926, 0.70288, 0.6981, 0.70914, 0.69799, 0.70314, 0.70633, 0.70075, 0.70007, 0.70459, 0.70195, 0.69392, 0.7045, 0.70374, 0.70075, 0.69331, 0.69436, 0.6955, 0.70291, 0.69782, 0.70126, 0.70025, 0.70132, 0.7027, 0.70476, 0.70307, 0.69742, 0.69952, 0.69723, 0.8289, 0.70367, 0.7045, 0.70784, 0.71072, 0.70676, 0.70275, 0.70232, 0.70275, 0.70734, 0.70267, 0.70508, 0.70045, 0.70283, 0.71431, 0.708, 0.70934, 0.70749, 0.71204, 0.70839, 0.70834, 0.70947, 0.70787, 0.70812, 0.70457, 0.70563, 0.69994, 0.70262, 0.69627, 0.69863, 0.69913, 0.71178, 0.71423, 0.70926, 0.70785, 0.70607, 0.70391, 0.71582, 0.71055, 0.71123, 0.70438, 0.71121, 0.71074, 0.70765, 0.70483, 0.70686, 0.71125, 0.70564, 0.70533, 0.7078, 0.70873, 0.70986, 0.70805, 0.70797, 0.71206, 0.70956, 0.70912, 0.71021, 0.70934, 0.70819, 0.70233, 0.70414, 0.70448, 0.70564, 0.7015, 0.70586, 0.70217, 0.7129, 0.70787, 0.7092, 0.71158, 0.7112, 0.71167, 0.70869, 0.70914, 0.70573, 0.7106, 0.70502, 0.70709, 0.70454, 0.70862, 0.70342, 0.70716, 0.70517, 0.70888, 0.71242, 0.71066, 0.71063, 0.70907, 0.71159, 0.71233, 0.7117, 0.7115, 0.70892, 0.71015, 0.71212, 0.70842, 0.70856, 0.71199, 0.71305, 0.71701, 0.71312, 0.71367, 0.71284, 0.70741, 0.70964, 0.70851, 0.71466, 0.70509, 0.72116, 0.72852, 0.71403, 0.70864, 0.70955, 0.7163, 0.6926, 0.70139, 0.71844, 0.70855, 0.71025, 0.71363, 0.7113, 0.7081, 0.71651, 0.71161, 0.7088, 0.70621, 0.76558, 0.71366, 0.71465, 0.70832, 0.71501, 0.71439, 0.70996, 0.71112, 0.71318, 0.71005, 0.71114, 0.70462, 0.71021, 0.71174, 0.71118, 0.70552, 0.70941, 0.71352, 0.70296, 0.7077, 0.71087, 0.70967, 0.71319, 0.70487, 0.71314, 0.71027, 0.71726, 0.70291, 0.70583, 0.70043, 0.71003, 0.70162, 0.71159, 0.70538, 0.70772, 0.7058, 0.70393, 0.70436, 0.70523, 0.7076, 0.70951, 0.7073, 0.70677, 0.70977, 0.70523, 0.70814, 0.70619, 0.71387, 0.71394, 0.71664, 0.709, 0.70954, 0.71091, 0.71119, 0.7066, 0.71015, 0.71379, 0.70807, 0.7089, 0.70687, 0.70782, 0.70284, 0.7093, 0.70472, 0.70627, 0.70878, 0.7131, 0.71354, 0.70817, 0.7085, 0.70989, 0.7104, 0.70981, 0.70998, 0.70926, 0.70687, 0.71184, 0.7147, 0.71202, 0.70554, 0.70696, 0.71095, 0.7109, 0.70487, 0.7074, 0.70395, 0.70783, 0.70406, 0.71161, 0.70987, 0.70579, 0.70936, 0.81441, 0.70896, 0.70653, 0.70759, 0.71046, 0.70652, 0.70807, 0.70162, 0.70833, 0.70934, 0.70659, 0.71222, 0.71582, 0.71966, 0.71029, 0.70866, 0.70674, 0.71991, 0.7103, 0.70757, 0.71472, 0.70914, 0.71354, 0.8287, 0.71145, 0.70825, 0.71369, 0.71612, 0.71567, 0.71261, 0.71066, 0.70918, 0.70607, 0.70956, 0.72641, 0.7127, 0.71743, 0.70933, 0.71054, 0.70211, 0.7054, 0.70442, 0.712, 0.71222, 0.71615, 0.71003, 0.71338, 0.71009, 0.71334, 0.71107, 0.71501, 0.71714, 0.70686, 0.70974, 0.71546, 0.70423, 0.71293, 0.71055, 0.71309, 0.71563, 0.71163, 0.71034, 0.71044, 0.71, 0.70833, 0.71033, 0.70852, 0.7031, 0.71412, 0.70792, 0.71185, 0.70919, 0.7121, 0.70689, 0.71208, 0.70677, 0.7134, 0.71312, 0.71483, 0.71357, 0.71752, 0.7209, 0.71431, 0.71061, 0.71548, 0.7187, 0.71617, 0.71164, 0.71417, 0.71386, 0.71464, 0.71363, 0.71829, 0.72097, 0.71465, 0.7123]}, "lm loss validation": {"start_step": 0, "end_step": 4, "step_interval": 5, "values": [5.60433]}, "lm loss validation vs samples": {"start_step": 0, "end_step": 4, "step_interval": 5, "values": [5.60433]}, "lm loss validation ppl": {"start_step": 0, "end_step": 4, "step_interval": 5, "values": [271.59912]}, "lm loss 
validation ppl vs samples": {"start_step": 0, "end_step": 4, "step_interval": 5, "values": [271.59912]}} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp1_pp2_fp8_pp/golden_values_lts.json b/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp1_pp2_fp8_pp/golden_values_lts.json new file mode 100644 index 0000000000..e59a5682c9 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp1_pp2_fp8_pp/golden_values_lts.json @@ -0,0 +1 @@ +{"forward-backward-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [21.16929, 0.69842, 0.69865, 0.68092, 0.68114, 0.68076, 0.68553, 0.6784, 0.70132, 0.68656, 0.68867, 0.69143, 0.69023, 0.68774, 0.70094, 0.68596, 0.68549, 0.6811, 0.68151, 0.67743, 0.6818, 0.67512, 0.68645, 0.67903, 0.68158, 0.68543, 0.68715, 0.68897, 0.70747, 0.68759, 0.68732, 0.68723, 0.69033, 0.68094, 0.68856, 0.6856, 0.69221, 0.68087, 0.69125, 0.68605, 0.69475, 0.68504, 0.6893, 0.69096, 0.69541, 0.70004, 0.69576, 0.69211, 0.70539, 0.69068, 0.68902, 0.69335, 0.68369, 0.68436, 0.68239, 0.68834, 0.6958, 0.68962, 0.68485, 0.69578, 0.6843, 0.68984, 0.69245, 0.68747, 0.68675, 0.69129, 0.68873, 0.68069, 0.69138, 0.69036, 0.68756, 0.68003, 0.68118, 0.68219, 0.68967, 0.68462, 0.68795, 0.68699, 0.6881, 0.6895, 0.6908, 0.68981, 0.68371, 0.68631, 0.68376, 0.81573, 0.69039, 0.69127, 0.69453, 0.69743, 0.69357, 0.68918, 0.68915, 0.68957, 0.69407, 0.68945, 0.69186, 0.68603, 0.68977, 0.70044, 0.69469, 0.69533, 0.69415, 0.69884, 0.69538, 0.69372, 0.69623, 0.69454, 0.6948, 0.69135, 0.69206, 0.68673, 0.68936, 0.68303, 0.68538, 0.68582, 0.69851, 0.70083, 0.69592, 0.69452, 0.69303, 0.69071, 0.70246, 0.6973, 0.69795, 0.69114, 0.69795, 0.69698, 0.69429, 0.69158, 0.69376, 0.69794, 0.69244, 0.69205, 0.69394, 0.69551, 0.69657, 0.69487, 0.69462, 0.69874, 0.69622, 0.69596, 0.69702, 0.69605, 0.69381, 0.68895, 0.69096, 0.69099, 0.69224, 0.68822, 0.69238, 0.68894, 0.69956, 0.69462, 0.69596, 0.69826, 0.69791, 0.69829, 0.69528, 0.69581, 0.69246, 0.69712, 0.69164, 0.69373, 0.69112, 0.69522, 0.68973, 0.69375, 0.69191, 0.69554, 0.69908, 0.69725, 0.69744, 0.69566, 0.69832, 0.69791, 0.69806, 0.69817, 0.69569, 0.69697, 0.69849, 0.69511, 0.69491, 0.69873, 0.69972, 0.70371, 0.69973, 0.70041, 0.69955, 0.69404, 0.69642, 0.69525, 0.70125, 0.69189, 0.70768, 0.71527, 0.70077, 0.69532, 0.6961, 0.7031, 0.67909, 0.68793, 0.70461, 0.69523, 0.69673, 0.70017, 0.69796, 0.69461, 0.70307, 0.69829, 0.69545, 0.69288, 0.75214, 0.70015, 0.70134, 0.69495, 0.70155, 0.70094, 0.69651, 0.69772, 0.69954, 0.69592, 0.6977, 0.69059, 0.69677, 0.69829, 0.69779, 0.69192, 0.69617, 0.69978, 0.68964, 0.69432, 0.69761, 0.69629, 0.69975, 0.69141, 0.69977, 0.69704, 0.70403, 0.68958, 0.69117, 0.68705, 0.69675, 0.68817, 0.69828, 0.69189, 0.69446, 0.6924, 0.69063, 0.691, 0.69163, 0.69402, 0.69605, 0.69383, 0.69327, 0.69636, 0.69175, 0.69468, 0.69281, 0.70044, 0.70067, 0.7016, 0.69557, 0.69614, 0.69761, 0.69793, 0.69322, 0.69689, 0.70043, 0.69446, 0.69543, 0.69346, 0.69441, 0.68931, 0.69592, 0.6914, 0.6929, 0.69539, 0.69954, 0.69999, 0.69447, 0.69508, 0.69638, 0.69699, 0.69614, 0.69655, 0.6957, 0.69348, 0.698, 0.70136, 0.69861, 0.69224, 0.69369, 0.69763, 0.69759, 0.69166, 0.69413, 0.69071, 0.69463, 0.69072, 0.69754, 0.69663, 0.69249, 0.69603, 0.80113, 0.69556, 0.69325, 0.69439, 0.69712, 0.69274, 0.69473, 0.68837, 0.69493, 0.69602, 0.69314, 0.69884, 0.70264, 0.70625, 0.69696, 0.69541, 0.69344, 0.70656, 0.69704, 
0.69417, 0.70121, 0.69558, 0.7002, 0.815, 0.69817, 0.69499, 0.70038, 0.70281, 0.70226, 0.69884, 0.69724, 0.69581, 0.69287, 0.69618, 0.71318, 0.69943, 0.70407, 0.69607, 0.69718, 0.68881, 0.69211, 0.69118, 0.69873, 0.69888, 0.70284, 0.6967, 0.70012, 0.69679, 0.69994, 0.69768, 0.7015, 0.70388, 0.69342, 0.69641, 0.70208, 0.6909, 0.69959, 0.69723, 0.69969, 0.70232, 0.69828, 0.697, 0.69714, 0.69676, 0.69506, 0.69683, 0.69519, 0.68973, 0.70075, 0.69457, 0.69842, 0.69584, 0.69872, 0.69358, 0.69875, 0.69346, 0.70004, 0.69971, 0.70151, 0.70016, 0.70414, 0.70754, 0.70082, 0.69723, 0.70207, 0.70466, 0.70276, 0.69824, 0.70085, 0.70049, 0.70134, 0.70037, 0.705, 0.70761, 0.70114, 0.69824]}, "forward-compute-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [4.92979, 0.36862, 0.36896, 0.35994, 0.3634, 0.36131, 0.37528, 0.36745, 0.39414, 0.37596, 0.3798, 0.38001, 0.38263, 0.37794, 0.39251, 0.3769, 0.37612, 0.3675, 0.37072, 0.36701, 0.37163, 0.36679, 0.37704, 0.36833, 0.37308, 0.37264, 0.37893, 0.37759, 0.39953, 0.37377, 0.37903, 0.37511, 0.37891, 0.37243, 0.38146, 0.37534, 0.38244, 0.37164, 0.38228, 0.37646, 0.38605, 0.37539, 0.38035, 0.38244, 0.38642, 0.3893, 0.38511, 0.3827, 0.39156, 0.3782, 0.37799, 0.38401, 0.37401, 0.37169, 0.37072, 0.37641, 0.38295, 0.38051, 0.37444, 0.38482, 0.37469, 0.38129, 0.38054, 0.37571, 0.37578, 0.37992, 0.37782, 0.37386, 0.3813, 0.38374, 0.3775, 0.37428, 0.37254, 0.37234, 0.37719, 0.37627, 0.37853, 0.37526, 0.38087, 0.38099, 0.38071, 0.38191, 0.37329, 0.3773, 0.3734, 0.5018, 0.38253, 0.38164, 0.38606, 0.38733, 0.38592, 0.38071, 0.37964, 0.37907, 0.38532, 0.37904, 0.38222, 0.37656, 0.38031, 0.38646, 0.38574, 0.38602, 0.37899, 0.38893, 0.38764, 0.38446, 0.38488, 0.38659, 0.38646, 0.38256, 0.38198, 0.37894, 0.38195, 0.37524, 0.37462, 0.37752, 0.38757, 0.39104, 0.38931, 0.38235, 0.38351, 0.38268, 0.39375, 0.3868, 0.38798, 0.38182, 0.39008, 0.38803, 0.38668, 0.38465, 0.38639, 0.38737, 0.38331, 0.37911, 0.38492, 0.38652, 0.38697, 0.38654, 0.38596, 0.39074, 0.38492, 0.38717, 0.38731, 0.38942, 0.386, 0.38148, 0.38444, 0.38374, 0.38416, 0.37792, 0.37748, 0.37957, 0.39104, 0.38581, 0.38566, 0.38678, 0.38966, 0.38882, 0.38683, 0.38264, 0.38507, 0.38712, 0.38306, 0.38289, 0.38103, 0.38363, 0.37743, 0.37875, 0.37956, 0.38316, 0.3891, 0.38796, 0.38596, 0.38565, 0.38554, 0.38556, 0.38505, 0.38092, 0.38387, 0.38393, 0.38859, 0.37887, 0.38497, 0.38623, 0.39043, 0.39246, 0.38914, 0.38962, 0.38901, 0.38336, 0.38644, 0.38387, 0.38958, 0.38133, 0.39066, 0.39461, 0.39129, 0.38237, 0.3862, 0.39181, 0.37212, 0.37912, 0.39389, 0.384, 0.38439, 0.38586, 0.38505, 0.38157, 0.38622, 0.38765, 0.38617, 0.38274, 0.44388, 0.39087, 0.3907, 0.38612, 0.38867, 0.39114, 0.38539, 0.38934, 0.38921, 0.38784, 0.38206, 0.38157, 0.38685, 0.39031, 0.38789, 0.38326, 0.38644, 0.38897, 0.38075, 0.3856, 0.38903, 0.3866, 0.38941, 0.37995, 0.38647, 0.388, 0.3933, 0.38074, 0.38111, 0.37964, 0.38635, 0.37942, 0.38546, 0.38117, 0.38291, 0.38281, 0.38246, 0.38276, 0.38171, 0.382, 0.3865, 0.37957, 0.3856, 0.38543, 0.38204, 0.38551, 0.38485, 0.39262, 0.39183, 0.38966, 0.38778, 0.38805, 0.3857, 0.3903, 0.38332, 0.38621, 0.38966, 0.38839, 0.3794, 0.38725, 0.38481, 0.38106, 0.38522, 0.3806, 0.38384, 0.38521, 0.38656, 0.39255, 0.38382, 0.38686, 0.38703, 0.38844, 0.38459, 0.38745, 0.38311, 0.38465, 0.38785, 0.39146, 0.38846, 0.38178, 0.38121, 0.38932, 0.38613, 0.38272, 0.38328, 0.38309, 0.38433, 0.38086, 0.38574, 0.38715, 0.38325, 0.38613, 0.4565, 0.38631, 0.38538, 0.38553, 0.38639, 0.38282, 0.38384, 0.37918, 
0.38658, 0.38666, 0.38487, 0.39121, 0.3908, 0.39786, 0.3849, 0.38844, 0.38522, 0.394, 0.38769, 0.38524, 0.39367, 0.38775, 0.39338, 0.50382, 0.39159, 0.38743, 0.39102, 0.39523, 0.39356, 0.39205, 0.38578, 0.38801, 0.38304, 0.38678, 0.3987, 0.39171, 0.39597, 0.38708, 0.3908, 0.38146, 0.38222, 0.38202, 0.39012, 0.39068, 0.39269, 0.38682, 0.39099, 0.38924, 0.39219, 0.38971, 0.39066, 0.39542, 0.38474, 0.38829, 0.39181, 0.38288, 0.38918, 0.3886, 0.39087, 0.39457, 0.3877, 0.3877, 0.38997, 0.39047, 0.38458, 0.38887, 0.3875, 0.38266, 0.38907, 0.38748, 0.38772, 0.387, 0.38822, 0.38247, 0.39155, 0.38528, 0.39151, 0.39019, 0.39332, 0.39078, 0.3911, 0.39847, 0.3899, 0.39043, 0.39299, 0.39763, 0.39582, 0.39107, 0.39252, 0.39507, 0.39717, 0.3953, 0.40187, 0.40236, 0.39559, 0.39145]}, "backward-compute-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [2.8012, 0.29387, 0.2986, 0.28406, 0.28522, 0.28969, 0.29061, 0.28796, 0.29063, 0.28667, 0.29358, 0.29506, 0.2922, 0.2852, 0.28989, 0.28483, 0.28642, 0.28342, 0.28232, 0.28136, 0.28422, 0.28036, 0.28492, 0.28314, 0.281, 0.28245, 0.28442, 0.28445, 0.28814, 0.28551, 0.2857, 0.28486, 0.28705, 0.28407, 0.28536, 0.28489, 0.28989, 0.28255, 0.28845, 0.28647, 0.28944, 0.28337, 0.28838, 0.28849, 0.2897, 0.29269, 0.28788, 0.28852, 0.29394, 0.28953, 0.28786, 0.28768, 0.28428, 0.28563, 0.28458, 0.28775, 0.29324, 0.28892, 0.28616, 0.29034, 0.28456, 0.28682, 0.28841, 0.28729, 0.28425, 0.28778, 0.28741, 0.2839, 0.28832, 0.28804, 0.2861, 0.28333, 0.28362, 0.28274, 0.28476, 0.28495, 0.28365, 0.28409, 0.28405, 0.28625, 0.28429, 0.28647, 0.28314, 0.28367, 0.28409, 0.28622, 0.28505, 0.28438, 0.28134, 0.28462, 0.28536, 0.28398, 0.28654, 0.2869, 0.28809, 0.28601, 0.28761, 0.28425, 0.28676, 0.2862, 0.28997, 0.28934, 0.28731, 0.29342, 0.28795, 0.28707, 0.2867, 0.28661, 0.28811, 0.28616, 0.28592, 0.28428, 0.28508, 0.28396, 0.28659, 0.28265, 0.28697, 0.2894, 0.28687, 0.28772, 0.28913, 0.28621, 0.29195, 0.28847, 0.29125, 0.28862, 0.29011, 0.29025, 0.28931, 0.28814, 0.28955, 0.2908, 0.28871, 0.28801, 0.28793, 0.28964, 0.29306, 0.29007, 0.28963, 0.29251, 0.29069, 0.29194, 0.28984, 0.29084, 0.28995, 0.28615, 0.28778, 0.28795, 0.2882, 0.28737, 0.2876, 0.28691, 0.29135, 0.28807, 0.28993, 0.29202, 0.29116, 0.29034, 0.28863, 0.29346, 0.29111, 0.29416, 0.29263, 0.293, 0.29317, 0.2931, 0.28845, 0.288, 0.28664, 0.28885, 0.29051, 0.28976, 0.28937, 0.29252, 0.29727, 0.29583, 0.29602, 0.29658, 0.2931, 0.29603, 0.29621, 0.29395, 0.29259, 0.29542, 0.29412, 0.29939, 0.29634, 0.2902, 0.29267, 0.28896, 0.2887, 0.28951, 0.29196, 0.29075, 0.29727, 0.30019, 0.29535, 0.2896, 0.28882, 0.29318, 0.28687, 0.28581, 0.29387, 0.28979, 0.28852, 0.29025, 0.28988, 0.28996, 0.2906, 0.29127, 0.29091, 0.29027, 0.34386, 0.29092, 0.29145, 0.28886, 0.29332, 0.29127, 0.29064, 0.29054, 0.29117, 0.28886, 0.28689, 0.28524, 0.29113, 0.29077, 0.28956, 0.28788, 0.28875, 0.29066, 0.28696, 0.28828, 0.28986, 0.28975, 0.29179, 0.28765, 0.29054, 0.29018, 0.29236, 0.28513, 0.28796, 0.28625, 0.28988, 0.28486, 0.2901, 0.28715, 0.28807, 0.29103, 0.28636, 0.28731, 0.28709, 0.2878, 0.28863, 0.28922, 0.28858, 0.28861, 0.28721, 0.28911, 0.28891, 0.29009, 0.29181, 0.29183, 0.2921, 0.28906, 0.29246, 0.29132, 0.28922, 0.29183, 0.29154, 0.29016, 0.29033, 0.29069, 0.28941, 0.28627, 0.28999, 0.28617, 0.28792, 0.2909, 0.29099, 0.29284, 0.29202, 0.28998, 0.29186, 0.29297, 0.29177, 0.2896, 0.29112, 0.28824, 0.29124, 0.29518, 0.29288, 0.28876, 0.29026, 0.29318, 0.2932, 0.2894, 0.28931, 0.28848, 0.28934, 0.28881, 0.29144, 0.28798, 
0.28986, 0.29212, 0.28958, 0.2898, 0.28969, 0.2893, 0.29213, 0.29, 0.29098, 0.29085, 0.29077, 0.29035, 0.29027, 0.29142, 0.29441, 0.29571, 0.29203, 0.29018, 0.29127, 0.29433, 0.29091, 0.28877, 0.29354, 0.29063, 0.29084, 0.29118, 0.29114, 0.29201, 0.29191, 0.29316, 0.29428, 0.29139, 0.29115, 0.29268, 0.28887, 0.29386, 0.29765, 0.29295, 0.29535, 0.29245, 0.29159, 0.28784, 0.29096, 0.28864, 0.2923, 0.29471, 0.29453, 0.2914, 0.29447, 0.29151, 0.29226, 0.29155, 0.29343, 0.29271, 0.28917, 0.29026, 0.2943, 0.28854, 0.29114, 0.29123, 0.2918, 0.29223, 0.29626, 0.29746, 0.29042, 0.29175, 0.29069, 0.29, 0.2892, 0.28808, 0.29535, 0.28977, 0.29205, 0.29056, 0.29189, 0.2899, 0.28981, 0.2895, 0.2929, 0.29123, 0.29288, 0.29252, 0.29518, 0.29616, 0.29356, 0.29361, 0.29532, 0.29564, 0.29465, 0.29223, 0.29483, 0.29279, 0.29075, 0.29144, 0.29105, 0.29375, 0.28857, 0.288]}, "batch-generator-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.30565, 0.00631, 0.0066, 0.00601, 0.00609, 0.00586, 0.00613, 0.00583, 0.00602, 0.00583, 0.00598, 0.00604, 0.00582, 0.00568, 0.00583, 0.0058, 0.00563, 0.00578, 0.00557, 0.0058, 0.00592, 0.00586, 0.0058, 0.00562, 0.00562, 0.00571, 0.00557, 0.00573, 0.00596, 0.00583, 0.00566, 0.00601, 0.00607, 0.00572, 0.00607, 0.00595, 0.00598, 0.00592, 0.00585, 0.00609, 0.00585, 0.0059, 0.00582, 0.00578, 0.00588, 0.00604, 0.00563, 0.00593, 0.00592, 0.00559, 0.00549, 0.00584, 0.00593, 0.00559, 0.00713, 0.00734, 0.00689, 0.00723, 0.00685, 0.00763, 0.00701, 0.00722, 0.0072, 0.00755, 0.00717, 0.00727, 0.00721, 0.00707, 0.00703, 0.00729, 0.00703, 0.00682, 0.00659, 0.00573, 0.00594, 0.00596, 0.00621, 0.00602, 0.00602, 0.00599, 0.00597, 0.00616, 0.0059, 0.00598, 0.00575, 0.00606, 0.00592, 0.00596, 0.00602, 0.00605, 0.00587, 0.00585, 0.00596, 0.00675, 0.00617, 0.0062, 0.00592, 0.00581, 0.00613, 0.00611, 0.00624, 0.00629, 0.00603, 0.00622, 0.00608, 0.00595, 0.00632, 0.00599, 0.00611, 0.00597, 0.00588, 0.00587, 0.0057, 0.00574, 0.00589, 0.00569, 0.00565, 0.00566, 0.0061, 0.00592, 0.00603, 0.00553, 0.00587, 0.00577, 0.00567, 0.00584, 0.00581, 0.00607, 0.00583, 0.00565, 0.00581, 0.0058, 0.00582, 0.00595, 0.0057, 0.00596, 0.00605, 0.00582, 0.00559, 0.00575, 0.00572, 0.00562, 0.00565, 0.00583, 0.00603, 0.00568, 0.00564, 0.00603, 0.00593, 0.0059, 0.00581, 0.0055, 0.00598, 0.00604, 0.00607, 0.00585, 0.00585, 0.00603, 0.00588, 0.00599, 0.00567, 0.00593, 0.00614, 0.0058, 0.00592, 0.00575, 0.00581, 0.00624, 0.00582, 0.00616, 0.00572, 0.00591, 0.0061, 0.00614, 0.00597, 0.00606, 0.00588, 0.00578, 0.00631, 0.00589, 0.00584, 0.00574, 0.00613, 0.00566, 0.0061, 0.00599, 0.0059, 0.00589, 0.00595, 0.00596, 0.00595, 0.00595, 0.00613, 0.00585, 0.00569, 0.00609, 0.00603, 0.00615, 0.00617, 0.00606, 0.06212, 0.00708, 0.00731, 0.00708, 0.00688, 0.0068, 0.00715, 0.00694, 0.00689, 0.00682, 0.00592, 0.00599, 0.00671, 0.00709, 0.00695, 0.00727, 0.00736, 0.00727, 0.00737, 0.00678, 0.00708, 0.00694, 0.00721, 0.00727, 0.00742, 0.00681, 0.00707, 0.00694, 0.00708, 0.00695, 0.00706, 0.00698, 0.00707, 0.0067, 0.00718, 0.00733, 0.00718, 0.00687, 0.00725, 0.00712, 0.00718, 0.00685, 0.00603, 0.00744, 0.00676, 0.00683, 0.00724, 0.00706, 0.00733, 0.00734, 0.00681, 0.00744, 0.00713, 0.00687, 0.00667, 0.00687, 0.00723, 0.00685, 0.00677, 0.00724, 0.00676, 0.00673, 0.0071, 0.00721, 0.00713, 0.00707, 0.00719, 0.00656, 0.00681, 0.0069, 0.00711, 0.00704, 0.00728, 0.00686, 0.00705, 0.00647, 0.00678, 0.00724, 0.00671, 0.00729, 0.00729, 0.00693, 0.00727, 0.00705, 0.0073, 0.0069, 0.00703, 0.00703, 0.00673, 0.00641, 0.00649, 
0.0059, 0.00591, 0.00589, 0.00611, 0.00602, 0.00581, 0.00591, 0.006, 0.00615, 0.00591, 0.00611, 0.00606, 0.00605, 0.00645, 0.00595, 0.00594, 0.00596, 0.006, 0.00598, 0.00594, 0.00601, 0.00655, 0.00617, 0.00603, 0.0059, 0.00628, 0.00583, 0.00608, 0.00585, 0.00604, 0.00603, 0.00594, 0.00582, 0.00576, 0.00596, 0.00605, 0.00641, 0.00601, 0.00602, 0.0061, 0.00618, 0.00595, 0.00602, 0.00597, 0.00581, 0.00598, 0.00598, 0.00614, 0.00599, 0.00582, 0.00612, 0.00597, 0.00575, 0.00572, 0.00623, 0.00601, 0.00597, 0.00619, 0.00626, 0.00606, 0.00592, 0.00607, 0.00584, 0.00593, 0.00602, 0.00617, 0.00621, 0.00612, 0.00602, 0.00597, 0.00594, 0.00615, 0.00599, 0.00604, 0.00617, 0.00631, 0.00558, 0.00552, 0.0057, 0.00568, 0.00594, 0.00614, 0.00588, 0.006, 0.00605, 0.00607, 0.00624, 0.00636, 0.00582, 0.00604, 0.00595, 0.0061, 0.00615, 0.00599, 0.00599, 0.00621, 0.00604, 0.00599, 0.00599, 0.00589, 0.00621, 0.00584, 0.00586, 0.00593, 0.00614, 0.00623, 0.00591, 0.00632, 0.00604]}, "forward-recv-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [4.95821, 0.02363, 0.0227, 0.02332, 0.02256, 0.02319, 0.0228, 0.02261, 0.0228, 0.02242, 0.02284, 0.02259, 0.02245, 0.02309, 0.02332, 0.02185, 0.02227, 0.02241, 0.02251, 0.02246, 0.02257, 0.02259, 0.02212, 0.02254, 0.02299, 0.02339, 0.02258, 0.02339, 0.02279, 0.02234, 0.0221, 0.02333, 0.02239, 0.02203, 0.02184, 0.02211, 0.02224, 0.022, 0.0223, 0.02282, 0.02196, 0.02285, 0.02194, 0.02233, 0.02238, 0.0221, 0.02287, 0.02259, 0.02353, 0.02258, 0.02174, 0.02244, 0.02248, 0.02249, 0.02286, 0.02274, 0.02231, 0.02301, 0.02252, 0.02226, 0.02309, 0.0226, 0.02248, 0.02257, 0.02247, 0.02239, 0.02245, 0.02239, 0.02245, 0.02226, 0.02251, 0.02235, 0.02229, 0.02229, 0.02224, 0.02218, 0.02269, 0.02222, 0.02297, 0.0233, 0.02355, 0.02353, 0.02351, 0.02353, 0.0231, 0.02266, 0.02205, 0.02248, 0.02239, 0.02243, 0.02337, 0.02243, 0.02265, 0.02251, 0.0227, 0.02251, 0.02262, 0.0223, 0.02239, 0.02302, 0.02253, 0.0224, 0.02341, 0.02267, 0.02201, 0.02288, 0.02223, 0.02234, 0.02247, 0.02274, 0.0227, 0.02223, 0.02278, 0.02249, 0.02233, 0.02353, 0.02284, 0.02293, 0.02146, 0.02395, 0.02287, 0.02228, 0.02286, 0.02372, 0.02285, 0.02195, 0.02251, 0.02292, 0.02278, 0.02298, 0.02247, 0.02293, 0.02269, 0.02272, 0.02289, 0.0229, 0.0226, 0.02277, 0.02291, 0.02243, 0.02298, 0.02242, 0.02233, 0.02273, 0.0224, 0.02231, 0.02213, 0.02282, 0.02271, 0.02257, 0.02245, 0.02266, 0.02226, 0.02234, 0.02242, 0.02287, 0.02231, 0.02272, 0.02271, 0.02261, 0.02279, 0.02239, 0.02238, 0.02237, 0.02245, 0.02246, 0.023, 0.02279, 0.02277, 0.02299, 0.02326, 0.0223, 0.02341, 0.02259, 0.02308, 0.02252, 0.02308, 0.02263, 0.02343, 0.02234, 0.02287, 0.02253, 0.02261, 0.02291, 0.02258, 0.02266, 0.02272, 0.02323, 0.02251, 0.02228, 0.0226, 0.02245, 0.02282, 0.02319, 0.02275, 0.02246, 0.02327, 0.02259, 0.02253, 0.0224, 0.01758, 0.02244, 0.02255, 0.02222, 0.02295, 0.02246, 0.02236, 0.02202, 0.02348, 0.02237, 0.02232, 0.02231, 0.02262, 0.02284, 0.02278, 0.02292, 0.02249, 0.02264, 0.02288, 0.02264, 0.02232, 0.02331, 0.02235, 0.02266, 0.02272, 0.02229, 0.02285, 0.02276, 0.02283, 0.02355, 0.02243, 0.02224, 0.02272, 0.02285, 0.02224, 0.02355, 0.02275, 0.02246, 0.02254, 0.02335, 0.02272, 0.02208, 0.02249, 0.02229, 0.02237, 0.02251, 0.0228, 0.02259, 0.02238, 0.02269, 0.02278, 0.02234, 0.02262, 0.02237, 0.02265, 0.02234, 0.0239, 0.02204, 0.02217, 0.02222, 0.02262, 0.02231, 0.02208, 0.02252, 0.02267, 0.02293, 0.02253, 0.02228, 0.02237, 0.02246, 0.02294, 0.02246, 0.02182, 0.0225, 0.02229, 0.02265, 0.02222, 0.02222, 0.02264, 0.02241, 
0.02246, 0.02208, 0.02243, 0.0227, 0.02237, 0.02231, 0.02228, 0.02312, 0.02228, 0.02236, 0.02245, 0.02239, 0.02316, 0.02216, 0.02227, 0.02241, 0.0226, 0.02206, 0.02266, 0.0223, 0.02225, 0.02286, 0.0223, 0.02201, 0.02235, 0.02378, 0.02224, 0.02326, 0.02229, 0.02293, 0.02211, 0.02198, 0.02233, 0.0224, 0.02212, 0.02248, 0.02253, 0.02253, 0.02258, 0.02203, 0.02237, 0.02274, 0.0222, 0.02237, 0.02238, 0.02242, 0.02229, 0.02263, 0.02196, 0.02243, 0.02239, 0.02243, 0.02221, 0.02264, 0.02264, 0.02249, 0.02235, 0.0226, 0.02289, 0.02232, 0.0227, 0.02252, 0.02225, 0.02254, 0.02223, 0.02268, 0.02244, 0.02292, 0.02284, 0.02271, 0.02275, 0.02258, 0.02303, 0.02263, 0.02297, 0.02275, 0.0227, 0.023, 0.02298, 0.02297, 0.02199, 0.02326, 0.02298, 0.02263, 0.02262, 0.02296, 0.02268, 0.0225, 0.02268, 0.02273, 0.02239, 0.02231, 0.02302, 0.02284, 0.02258, 0.02376, 0.02298, 0.02258, 0.02269, 0.02282, 0.02248, 0.02296, 0.02259, 0.02303, 0.02252, 0.02322, 0.02265, 0.0226, 0.02282, 0.0227, 0.02325, 0.02263, 0.02282, 0.02297, 0.02259, 0.02313, 0.02262, 0.02287, 0.02288, 0.02356]}, "forward-send-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [3.00337, 0.00014, 0.00014, 0.00014, 0.00014, 0.00015, 0.00014, 0.00014, 0.00015, 0.00015, 0.00014, 0.00015, 0.00014, 0.00014, 0.00016, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00015, 0.00015, 0.00014, 0.00014, 0.00014, 0.00015, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00015, 0.00015, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00013, 0.00014, 0.00014, 0.00014, 0.00014, 0.00013, 0.00017, 0.00014, 0.00015, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00015, 0.00013, 0.00014, 0.00014, 0.00013, 0.00014, 0.00015, 0.00014, 0.00013, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00013, 0.00013, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00013, 0.00014, 0.00014, 0.00014, 0.00013, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00015, 0.00014, 0.00013, 0.00014, 0.00015, 0.00014, 0.00014, 0.00014, 0.00014, 0.00013, 0.00014, 0.00013, 0.00013, 0.00014, 0.00014, 0.00014, 0.00013, 0.00015, 0.00013, 0.00014, 0.00014, 0.00015, 0.00014, 0.00013, 0.00014, 0.00015, 0.00014, 0.00016, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00013, 0.00014, 0.00014, 0.00013, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00013, 0.00013, 0.00014, 0.00015, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00013, 0.00014, 0.00014, 0.00014, 0.00013, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00013, 0.00015, 0.00015, 0.00014, 0.00016, 0.00013, 0.00016, 0.00014, 0.00015, 0.00013, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00015, 0.00014, 0.00014, 0.00015, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00016, 0.00014, 0.00014, 0.00015, 0.00014, 0.00015, 0.00014, 0.00018, 0.00014, 0.00015, 0.00014, 0.00015, 0.00015, 0.00014, 0.00014, 0.00015, 0.00013, 0.00014, 0.00015, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00015, 0.00015, 0.00015, 0.00014, 0.00014, 0.00015, 0.00014, 0.00015, 0.00014, 0.00015, 0.00014, 0.00014, 0.00017, 0.00014, 0.00014, 0.00014, 0.00015, 0.00015, 0.00015, 0.00014, 0.00017, 0.00014, 0.00015, 0.00014, 0.00014, 0.00013, 0.00015, 0.00014, 0.00014, 0.00015, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00015, 0.00014, 0.00018, 0.00014, 0.00015, 0.00015, 0.00014, 0.00014, 0.00014, 0.00014, 0.00015, 0.00015, 
0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00013, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00015, 0.00014, 0.00015, 0.00013, 0.00014, 0.00015, 0.00016, 0.00014, 0.00014, 0.00015, 0.00014, 0.00014, 0.00014, 0.00014, 0.00015, 0.00013, 0.00014, 0.00014, 0.00014, 0.00013, 0.00014, 0.00014, 0.00014, 0.00015, 0.00014, 0.00014, 0.00014, 0.00015, 0.00014, 0.00015, 0.00014, 0.00015, 0.00014, 0.00014, 0.00015, 0.00014, 0.00014, 0.00014, 0.00015, 0.00015, 0.00014, 0.00014, 0.00014, 0.00015, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00015, 0.00014, 0.00014, 0.00014, 0.00015, 0.00014, 0.00015, 0.00014, 0.00014, 0.00014, 0.00015, 0.00014, 0.00014, 0.00015, 0.00014, 0.00014, 0.00014, 0.00014, 0.00015, 0.00014, 0.00015, 0.00014, 0.00015, 0.00014, 0.00014, 0.00015, 0.00015, 0.00015, 0.00014, 0.00014, 0.00015, 0.00014, 0.00014, 0.00014, 0.00014, 0.00016, 0.00014, 0.00014, 0.00015, 0.00015, 0.00015, 0.00015, 0.00014, 0.00014, 0.00014, 0.00015, 0.00015, 0.00014, 0.00016, 0.00014, 0.00014, 0.00015, 0.00014, 0.00014, 0.00015, 0.00015, 0.00014, 0.00015, 0.00014, 0.00015, 0.00014, 0.00014, 0.00015, 0.00014, 0.00014, 0.00015, 0.00015, 0.00014, 0.00016, 0.00014, 0.00015, 0.00015, 0.00015]}, "backward-recv-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.02248, 0.02331, 0.02263, 0.02336, 0.02223, 0.02299, 0.02211, 0.02247, 0.0226, 0.02292, 0.02307, 0.02276, 0.02341, 0.02329, 0.02311, 0.02274, 0.02235, 0.0235, 0.02241, 0.02254, 0.0226, 0.02238, 0.02202, 0.02262, 0.02257, 0.02202, 0.02244, 0.02212, 0.02257, 0.02222, 0.02301, 0.02231, 0.02146, 0.02328, 0.0228, 0.02276, 0.02277, 0.02305, 0.02315, 0.02206, 0.02273, 0.02196, 0.02292, 0.0229, 0.02318, 0.02404, 0.02342, 0.02372, 0.024, 0.02283, 0.02293, 0.02329, 0.02241, 0.02288, 0.02249, 0.02209, 0.0225, 0.02317, 0.02289, 0.02337, 0.02275, 0.02241, 0.02374, 0.02164, 0.02208, 0.02228, 0.02281, 0.02282, 0.02272, 0.0226, 0.0227, 0.02228, 0.02281, 0.02266, 0.02389, 0.02245, 0.02241, 0.02233, 0.02295, 0.02231, 0.0221, 0.02223, 0.0226, 0.02234, 0.02195, 0.02202, 0.02245, 0.0226, 0.02275, 0.02248, 0.0222, 0.02241, 0.02244, 0.02231, 0.02257, 0.02222, 0.02266, 0.02423, 0.02272, 0.02227, 0.02299, 0.02249, 0.0224, 0.02471, 0.02315, 0.02261, 0.02228, 0.02296, 0.02277, 0.02251, 0.02275, 0.02249, 0.02349, 0.022, 0.02327, 0.0234, 0.02263, 0.02233, 0.02301, 0.02227, 0.02246, 0.02257, 0.02278, 0.02253, 0.02246, 0.02297, 0.02258, 0.02373, 0.02268, 0.02299, 0.02323, 0.02295, 0.02269, 0.02271, 0.02329, 0.02248, 0.02289, 0.02291, 0.02254, 0.02282, 0.02401, 0.02262, 0.02444, 0.02261, 0.0226, 0.02263, 0.02259, 0.02307, 0.02224, 0.02211, 0.02289, 0.02273, 0.02385, 0.02337, 0.02258, 0.02316, 0.02269, 0.02287, 0.02301, 0.0225, 0.02248, 0.02339, 0.02296, 0.02226, 0.02308, 0.02301, 0.02193, 0.02223, 0.02389, 0.02273, 0.02314, 0.0224, 0.02271, 0.02292, 0.0234, 0.02311, 0.02278, 0.02281, 0.02287, 0.02271, 0.02258, 0.02224, 0.02289, 0.02216, 0.02306, 0.02215, 0.02293, 0.02325, 0.02272, 0.02257, 0.02265, 0.02257, 0.02237, 0.02338, 0.02396, 0.02264, 0.02255, 0.02263, 0.02261, 0.02319, 0.02273, 0.0227, 0.02359, 0.02237, 0.02352, 0.02453, 0.02244, 0.02254, 0.02341, 0.02295, 0.02318, 0.02233, 0.02248, 0.02304, 0.02424, 0.02304, 0.02275, 0.02374, 0.02258, 0.02316, 0.02275, 0.02259, 0.02278, 0.02276, 0.02303, 0.02314, 0.02359, 0.02289, 0.02295, 0.02301, 0.02271, 0.02295, 0.02286, 0.02295, 0.02288, 0.02247, 0.02599, 0.02329, 0.02375, 0.02231, 0.0227, 0.0222, 0.02287, 0.02291, 0.02232, 0.02287, 0.02269, 0.0222, 0.02306, 0.02281, 0.0228, 0.02143, 0.02285, 
0.02337, 0.02236, 0.02228, 0.02243, 0.02313, 0.02393, 0.02356, 0.02319, 0.02319, 0.02354, 0.02282, 0.02254, 0.02335, 0.02225, 0.02305, 0.0231, 0.02313, 0.02277, 0.02351, 0.02342, 0.02326, 0.02253, 0.02222, 0.02252, 0.02264, 0.02318, 0.02321, 0.02292, 0.02334, 0.02285, 0.02282, 0.02307, 0.02259, 0.02166, 0.02265, 0.02214, 0.02373, 0.02309, 0.0232, 0.02261, 0.02274, 0.02256, 0.02221, 0.02164, 0.02324, 0.02299, 0.02313, 0.02404, 0.02301, 0.02264, 0.02252, 0.02325, 0.02343, 0.02291, 0.02247, 0.0231, 0.02252, 0.02239, 0.02337, 0.02232, 0.02332, 0.02306, 0.02293, 0.02287, 0.02295, 0.02297, 0.02351, 0.02268, 0.02263, 0.02425, 0.02263, 0.02361, 0.023, 0.02223, 0.02273, 0.02318, 0.02333, 0.0232, 0.02407, 0.02312, 0.0227, 0.02288, 0.02285, 0.02227, 0.0233, 0.02303, 0.02288, 0.0233, 0.0231, 0.02299, 0.02245, 0.02284, 0.02224, 0.02277, 0.02352, 0.02304, 0.02289, 0.02369, 0.02293, 0.02308, 0.02248, 0.02362, 0.02358, 0.02328, 0.02302, 0.0234, 0.02273, 0.02296, 0.02329, 0.0228, 0.0234, 0.02231, 0.02262, 0.02265, 0.02299, 0.02199, 0.02303, 0.02291, 0.02278, 0.02341, 0.0232, 0.02291, 0.02339, 0.02355, 0.02363, 0.02324, 0.02236, 0.023, 0.02327, 0.02343, 0.02262, 0.02317, 0.02371, 0.02282, 0.02307, 0.0239, 0.02366, 0.02297, 0.02286, 0.02285, 0.0232, 0.02342, 0.02385, 0.02348, 0.02254, 0.02321, 0.02256]}, "backward-send-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.00014, 0.00018, 0.00017, 0.00019, 0.00013, 0.00016, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00016, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00017, 0.00015, 0.00016, 0.00015, 0.00015, 0.00017, 0.00015, 0.00015, 0.00015, 0.00016, 0.00015, 0.00015, 0.00015, 0.00015, 0.00016, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00016, 0.00015, 0.00015, 0.00015, 0.00016, 0.00015, 0.00017, 0.00016, 0.00015, 0.00015, 0.00016, 0.00014, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00019, 0.00015, 0.00015, 0.00017, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00014, 0.00012, 0.00013, 0.00013, 0.00013, 0.00012, 0.00013, 0.00016, 0.00017, 0.00016, 0.00012, 0.00016, 0.00012, 0.00012, 0.00013, 0.00013, 0.00016, 0.00013, 0.00013, 0.00014, 0.00013, 0.00012, 0.00012, 0.00012, 0.00014, 0.00013, 0.00012, 0.00013, 0.00012, 0.00012, 0.00013, 0.00017, 0.00014, 0.00017, 0.00013, 0.00013, 0.00013, 0.00019, 0.00014, 0.00014, 0.00013, 0.00018, 0.00013, 0.00014, 0.00013, 0.00016, 0.00015, 0.00012, 0.00013, 0.00013, 0.00013, 0.00013, 0.00012, 0.00014, 0.00015, 0.00013, 0.00014, 0.00012, 0.00013, 0.00013, 0.00012, 0.00016, 0.00017, 0.00013, 0.00014, 0.00013, 0.00015, 0.00013, 0.00013, 0.00015, 0.00016, 0.00013, 0.00013, 0.00013, 0.00015, 0.00013, 0.00016, 0.00015, 0.00013, 0.00013, 0.00013, 0.00013, 0.00017, 0.00015, 0.00017, 0.00014, 0.00013, 0.00014, 0.00013, 0.00014, 0.00013, 0.00013, 0.00015, 0.00014, 0.00013, 0.00015, 0.00014, 0.00012, 0.00014, 0.00013, 0.00016, 0.00015, 0.00015, 0.00013, 0.00013, 0.00013, 0.00013, 0.00016, 0.00012, 0.00013, 0.00015, 0.00013, 0.00015, 0.00014, 0.00016, 0.00013, 0.00013, 0.00015, 0.00016, 0.00012, 0.00016, 0.00013, 0.00013, 0.00013, 0.00013, 0.00013, 0.00012, 0.00013, 0.00019, 0.00013, 0.00013, 0.00014, 0.00015, 0.00014, 0.00013, 0.00014, 0.00016, 0.00014, 0.00013, 0.00013, 0.00013, 0.00013, 0.00015, 0.00013, 0.00013, 0.00014, 0.00014, 0.00013, 0.00016, 0.00013, 0.00018, 0.00012, 0.00014, 0.00013, 0.00013, 0.00012, 0.00013, 0.00013, 0.00014, 0.00016, 0.00012, 0.00012, 0.00013, 0.00014, 0.00012, 0.00018, 0.00013, 0.00013, 0.00013, 0.00014, 0.00015, 0.00014, 0.00014, 
0.00012, 0.00013, 0.00013, 0.00014, 0.00014, 0.00015, 0.00015, 0.00013, 0.00013, 0.00014, 0.00013, 0.00012, 0.00013, 0.00012, 0.00015, 0.00013, 0.00013, 0.00014, 0.00015, 0.00012, 0.00013, 0.00013, 0.00014, 0.00013, 0.00012, 0.00017, 0.00013, 0.00015, 0.00017, 0.00013, 0.00014, 0.00016, 0.00012, 0.00014, 0.00013, 0.00014, 0.00013, 0.00015, 0.00015, 0.00016, 0.00017, 0.00013, 0.00018, 0.00014, 0.00013, 0.00013, 0.00013, 0.00012, 0.00013, 0.00013, 0.00014, 0.00016, 0.00014, 0.00013, 0.00014, 0.00012, 0.00013, 0.00012, 0.00016, 0.00012, 0.00015, 0.00013, 0.00013, 0.00013, 0.00012, 0.00016, 0.00017, 0.00013, 0.00013, 0.00013, 0.00014, 0.00014, 0.00013, 0.00013, 0.00013, 0.00013, 0.00012, 0.00013, 0.00014, 0.00012, 0.00013, 0.00013, 0.00012, 0.00013, 0.00014, 0.00015, 0.00013, 0.00013, 0.00013, 0.00017, 0.00014, 0.00014, 0.00016, 0.00013, 0.00015, 0.00014, 0.00017, 0.00016, 0.00014, 0.00014, 0.00013, 0.00015, 0.00012, 0.00013, 0.00012, 0.00013, 0.00014, 0.00013, 0.00014, 0.00013, 0.00013, 0.00013, 0.00013, 0.00013, 0.00013, 0.00013, 0.00014, 0.00012, 0.00013, 0.00015, 0.00014, 0.00012, 0.00012, 0.00013, 0.00013, 0.00013, 0.00014, 0.00012, 0.00012, 0.00012, 0.00013, 0.00013, 0.00012, 0.00014, 0.00013, 0.00013, 0.00013, 0.00012, 0.00015, 0.00016, 0.00013, 0.00013, 0.00014, 0.00014, 0.00017, 0.00012, 0.00015, 0.00016, 0.00016, 0.00013, 0.00015, 0.00014, 0.00013, 0.00013, 0.00012, 0.00012, 0.00017, 0.00013, 0.00013, 0.00012, 0.00012]}, "forward-send-backward-recv-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [7.29163, 0.07663, 0.08035, 0.06332, 0.06621, 0.06965, 0.06672, 0.06872, 0.07455, 0.0683, 0.06975, 0.07264, 0.07308, 0.06869, 0.0749, 0.06785, 0.06696, 0.07011, 0.07008, 0.06771, 0.06763, 0.06853, 0.06929, 0.06793, 0.0646, 0.06794, 0.06582, 0.06618, 0.07898, 0.06585, 0.0677, 0.06681, 0.07017, 0.06602, 0.06883, 0.06722, 0.06997, 0.06853, 0.07057, 0.06872, 0.06884, 0.06699, 0.06869, 0.07012, 0.06782, 0.06999, 0.06845, 0.06563, 0.07187, 0.06575, 0.06637, 0.06468, 0.06438, 0.06646, 0.06395, 0.06524, 0.08025, 0.06764, 0.06976, 0.06968, 0.06431, 0.06784, 0.06839, 0.06965, 0.06878, 0.06848, 0.06691, 0.06998, 0.07092, 0.06857, 0.0693, 0.06815, 0.07095, 0.07046, 0.07279, 0.07009, 0.07045, 0.07242, 0.06971, 0.06878, 0.0711, 0.06854, 0.0703, 0.07136, 0.07206, 0.19699, 0.06856, 0.07017, 0.0772, 0.07413, 0.06965, 0.06662, 0.06863, 0.07002, 0.06852, 0.06895, 0.06723, 0.06766, 0.06739, 0.07615, 0.06865, 0.0659, 0.07051, 0.0678, 0.06754, 0.06717, 0.07145, 0.07015, 0.06808, 0.06744, 0.06521, 0.06518, 0.06265, 0.06299, 0.06279, 0.06454, 0.07004, 0.06844, 0.06842, 0.06744, 0.06305, 0.06615, 0.07084, 0.06889, 0.06934, 0.0652, 0.07021, 0.0665, 0.06497, 0.06458, 0.06483, 0.0654, 0.0651, 0.06488, 0.06369, 0.06434, 0.06672, 0.06482, 0.06827, 0.06829, 0.0643, 0.06825, 0.06762, 0.06752, 0.06536, 0.06267, 0.06412, 0.06238, 0.0644, 0.06315, 0.06427, 0.06278, 0.06772, 0.06453, 0.06547, 0.06433, 0.06477, 0.06262, 0.06246, 0.0656, 0.06412, 0.06447, 0.06356, 0.06614, 0.0655, 0.06558, 0.06542, 0.06499, 0.06312, 0.06403, 0.06715, 0.06427, 0.06479, 0.06361, 0.06722, 0.06583, 0.06476, 0.06651, 0.06877, 0.06755, 0.06567, 0.06624, 0.06526, 0.06717, 0.06755, 0.06946, 0.06655, 0.06526, 0.06418, 0.06359, 0.06533, 0.06548, 0.06698, 0.06537, 0.06464, 0.07565, 0.06673, 0.06462, 0.06523, 0.06525, 0.05829, 0.06037, 0.06399, 0.06429, 0.06234, 0.06138, 0.06591, 0.06529, 0.06565, 0.06508, 0.0686, 0.06838, 0.12228, 0.06666, 0.06636, 0.0641, 0.06601, 0.06468, 0.06395, 0.06568, 0.06779, 0.06425, 0.06928, 0.06612, 
0.06928, 0.0652, 0.06359, 0.06153, 0.06449, 0.06439, 0.06432, 0.06445, 0.06351, 0.06481, 0.06503, 0.06334, 0.0646, 0.06418, 0.06493, 0.06414, 0.06257, 0.06426, 0.06752, 0.06251, 0.06434, 0.06117, 0.06509, 0.06177, 0.06484, 0.06385, 0.06538, 0.06711, 0.0659, 0.06606, 0.06549, 0.06518, 0.06537, 0.06313, 0.0654, 0.0676, 0.06603, 0.06663, 0.06705, 0.06676, 0.0651, 0.0677, 0.06421, 0.06506, 0.06513, 0.06577, 0.06915, 0.06804, 0.06617, 0.06569, 0.06722, 0.06636, 0.06674, 0.06574, 0.06698, 0.06664, 0.06663, 0.06459, 0.06384, 0.06515, 0.06699, 0.06757, 0.06645, 0.06668, 0.0657, 0.06812, 0.06673, 0.06651, 0.06468, 0.06953, 0.06688, 0.06585, 0.06531, 0.06508, 0.06559, 0.06487, 0.0647, 0.06539, 0.06861, 0.06738, 0.06026, 0.06597, 0.06493, 0.06467, 0.06738, 0.06641, 0.06506, 0.0673, 0.06795, 0.06714, 0.06848, 0.06828, 0.07103, 0.0742, 0.06691, 0.06638, 0.06521, 0.06791, 0.06493, 0.06647, 0.06851, 0.06674, 0.06949, 0.18067, 0.06896, 0.0653, 0.06795, 0.06966, 0.06981, 0.0677, 0.06607, 0.06924, 0.06499, 0.06831, 0.06832, 0.06949, 0.07135, 0.06537, 0.07037, 0.06461, 0.06603, 0.06572, 0.06904, 0.06866, 0.06911, 0.06296, 0.0684, 0.06727, 0.06737, 0.069, 0.06738, 0.07025, 0.06407, 0.06509, 0.06963, 0.06441, 0.07069, 0.07222, 0.07463, 0.07367, 0.07032, 0.07129, 0.07156, 0.07253, 0.06858, 0.06926, 0.06916, 0.06788, 0.06771, 0.06859, 0.06745, 0.07278, 0.06943, 0.06671, 0.0691, 0.06585, 0.06975, 0.07019, 0.07413, 0.0711, 0.07228, 0.07684, 0.07091, 0.0736, 0.07134, 0.07497, 0.07213, 0.06976, 0.07166, 0.0746, 0.0763, 0.06965, 0.07059, 0.07384, 0.07021, 0.07072]}, "backward-send-forward-recv-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [2.84189, 0.0034, 0.00335, 0.0028, 0.00275, 0.0029, 0.00298, 0.00297, 0.00304, 0.00306, 0.00309, 0.00308, 0.00301, 0.00302, 0.00299, 0.00294, 0.003, 0.00307, 0.0031, 0.00304, 0.00303, 0.00294, 0.00305, 0.00298, 0.00301, 0.00306, 0.0029, 0.00302, 0.00303, 0.0031, 0.00306, 0.00304, 0.00303, 0.00301, 0.00294, 0.00305, 0.00312, 0.00303, 0.00301, 0.00328, 0.00302, 0.00288, 0.00306, 0.00304, 0.00304, 0.00303, 0.00299, 0.00297, 0.003, 0.00305, 0.00302, 0.00306, 0.00303, 0.00307, 0.00305, 0.00294, 0.00385, 0.00305, 0.00293, 0.00307, 0.00295, 0.003, 0.00297, 0.00308, 0.00305, 0.00303, 0.00302, 0.00254, 0.00275, 0.00284, 0.00252, 0.00253, 0.00257, 0.00262, 0.00255, 0.00266, 0.00264, 0.0026, 0.00255, 0.00265, 0.00267, 0.00266, 0.00269, 0.0026, 0.00263, 0.00301, 0.00264, 0.00265, 0.00269, 0.00261, 0.00267, 0.00257, 0.00268, 0.0027, 0.00261, 0.00268, 0.00261, 0.00264, 0.00255, 0.00261, 0.00281, 0.00269, 0.00271, 0.00271, 0.00264, 0.00265, 0.00268, 0.0026, 0.00262, 0.00283, 0.00271, 0.00272, 0.00266, 0.00257, 0.00253, 0.00256, 0.00276, 0.00272, 0.00264, 0.00283, 0.00271, 0.00262, 0.00269, 0.00277, 0.00266, 0.0026, 0.00277, 0.00282, 0.00271, 0.00264, 0.00273, 0.00268, 0.00264, 0.00266, 0.0027, 0.00274, 0.00274, 0.0027, 0.00271, 0.00273, 0.00279, 0.0027, 0.00276, 0.00265, 0.0028, 0.00278, 0.00273, 0.00287, 0.00273, 0.00277, 0.00273, 0.00265, 0.00272, 0.00267, 0.00277, 0.00265, 0.00267, 0.0027, 0.00268, 0.00269, 0.00264, 0.00278, 0.00271, 0.00267, 0.00258, 0.00265, 0.00262, 0.00273, 0.00273, 0.00285, 0.00277, 0.00264, 0.00285, 0.00276, 0.00269, 0.00275, 0.00339, 0.00271, 0.00288, 0.00276, 0.00282, 0.00266, 0.00281, 0.00268, 0.00277, 0.00269, 0.00271, 0.0028, 0.00273, 0.00293, 0.00264, 0.00265, 0.00285, 0.0026, 0.00269, 0.00287, 0.00272, 0.00278, 0.0028, 0.00271, 0.00259, 0.00259, 0.00273, 0.00266, 0.0027, 0.00278, 0.00275, 0.0029, 0.00268, 0.00277, 0.0027, 0.00273, 
0.00744, 0.00272, 0.00261, 0.00274, 0.00281, 0.00282, 0.00277, 0.00264, 0.00277, 0.00268, 0.00266, 0.00256, 0.00267, 0.00276, 0.00287, 0.00271, 0.00271, 0.00265, 0.00268, 0.00304, 0.00294, 0.00305, 0.0029, 0.00293, 0.00278, 0.00294, 0.00291, 0.00285, 0.00291, 0.00286, 0.00284, 0.00295, 0.0029, 0.0029, 0.00287, 0.00287, 0.0029, 0.00282, 0.00289, 0.0028, 0.0029, 0.00288, 0.0028, 0.00266, 0.0026, 0.00273, 0.00266, 0.00275, 0.00276, 0.00275, 0.00283, 0.0027, 0.00268, 0.00279, 0.00265, 0.00277, 0.00279, 0.00278, 0.00276, 0.00273, 0.00266, 0.00264, 0.00265, 0.00264, 0.00268, 0.00279, 0.00284, 0.00276, 0.00269, 0.00277, 0.00277, 0.00268, 0.00268, 0.00266, 0.00263, 0.00274, 0.0026, 0.00268, 0.00269, 0.00259, 0.00258, 0.00283, 0.00267, 0.00256, 0.00279, 0.0026, 0.00276, 0.00258, 0.00269, 0.00264, 0.00266, 0.00272, 0.10829, 0.00271, 0.00273, 0.00261, 0.00278, 0.00265, 0.00268, 0.00259, 0.00272, 0.00286, 0.00273, 0.00271, 0.00286, 0.00269, 0.00267, 0.0027, 0.00281, 0.0027, 0.00267, 0.00273, 0.0027, 0.00257, 0.0026, 0.00298, 0.0026, 0.00269, 0.00264, 0.00279, 0.00281, 0.00269, 0.0031, 0.0027, 0.0027, 0.00273, 0.0028, 0.00277, 0.00279, 0.00274, 0.00279, 0.00256, 0.00277, 0.00273, 0.00275, 0.00268, 0.00277, 0.00282, 0.0028, 0.00268, 0.00285, 0.00263, 0.00275, 0.00272, 0.0027, 0.00272, 0.00269, 0.00263, 0.00272, 0.00262, 0.00268, 0.0027, 0.00275, 0.0027, 0.00256, 0.00261, 0.00265, 0.00271, 0.00266, 0.00266, 0.00275, 0.00281, 0.00274, 0.00263, 0.00267, 0.00277, 0.00271, 0.00263, 0.00267, 0.00269, 0.00285, 0.00267, 0.00275, 0.00276, 0.00277, 0.0026, 0.00277, 0.0027, 0.00279, 0.00284, 0.00284, 0.0028, 0.00331, 0.00286, 0.0027, 0.00271, 0.00257, 0.00255]}, "layernorm-grads-all-reduce-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [4e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 2e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 3e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 
1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 2e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05]}, "embedding-grads-all-reduce-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.00071, 0.00045, 0.00045, 0.00046, 0.00046, 0.00046, 0.00046, 0.00045, 0.00045, 0.00045, 0.00046, 0.00045, 0.00047, 0.00045, 0.00045, 0.00046, 0.00045, 0.00046, 0.00049, 0.00046, 0.00046, 0.00046, 0.00045, 0.00045, 0.00045, 0.00046, 0.00045, 0.00045, 0.00046, 0.00045, 0.00045, 0.00046, 0.00046, 0.00045, 0.00046, 0.00046, 0.00047, 0.00045, 0.00046, 0.00046, 0.00045, 0.00047, 0.00045, 0.00046, 0.00046, 0.00046, 0.00048, 0.00046, 0.00048, 0.00045, 0.00046, 0.00048, 0.00045, 0.00045, 0.00046, 0.00046, 0.00046, 0.00047, 0.00048, 0.00046, 0.00045, 0.00045, 0.00046, 0.00046, 0.00045, 0.00046, 0.00046, 0.00047, 0.00044, 0.00044, 0.00045, 0.00045, 0.00047, 0.00045, 0.00044, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00047, 0.00045, 0.00045, 0.00045, 0.00046, 0.00045, 0.00045, 0.00047, 0.00045, 0.00045, 0.00049, 0.00045, 0.00046, 0.00044, 0.00046, 0.00047, 0.00045, 0.00045, 0.00045, 0.00046, 0.00047, 0.00045, 0.00044, 0.00045, 0.00044, 0.00045, 0.00045, 0.00045, 0.00046, 0.00045, 0.00045, 0.00045, 0.00045, 0.00044, 0.00081, 0.00046, 0.00046, 0.00047, 0.00045, 0.00045, 0.00045, 0.00045, 0.00046, 0.00048, 0.00047, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00044, 0.00046, 0.00045, 0.00045, 0.00047, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00044, 0.00045, 0.00044, 0.00045, 0.00045, 0.00047, 0.00045, 0.00045, 0.00044, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00044, 0.00046, 0.00047, 0.00046, 0.00047, 0.00045, 0.00046, 0.00044, 0.00045, 0.00045, 0.00045, 0.00044, 0.00046, 0.00045, 0.00046, 0.00045, 0.00046, 0.00045, 0.00045, 0.00045, 0.00045, 0.00046, 0.00045, 0.00045, 0.00045, 0.00046, 0.00046, 0.00045, 0.00045, 0.00047, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00047, 0.00049, 0.00047, 0.00045, 0.00045, 0.00049, 0.00045, 0.00049, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00046, 0.00045, 0.00045, 0.00049, 0.00046, 0.00046, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00048, 0.00045, 0.00046, 0.00046, 0.00044, 0.00045, 0.00045, 0.00044, 0.00046, 0.00045, 0.00044, 0.00048, 0.00045, 0.00044, 0.00045, 0.00045, 0.00045, 0.00045, 0.00047, 0.00046, 0.00048, 0.00045, 0.00045, 0.00045, 0.00047, 0.00045, 0.00046, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00051, 0.00049, 0.00045, 0.00046, 0.00047, 0.00046, 0.00045, 0.00045, 0.00045, 
0.00046, 0.00047, 0.00049, 0.0005, 0.00046, 0.00045, 0.00047, 0.00046, 0.00045, 0.00045, 0.00049, 0.00045, 0.00049, 0.00045, 0.00045, 0.00046, 0.00045, 0.0005, 0.00045, 0.00046, 0.00044, 0.00045, 0.00049, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00046, 0.00046, 0.00045, 0.00045, 0.00045, 0.00045, 0.00049, 0.00046, 0.00048, 0.00047, 0.00045, 0.00045, 0.00046, 0.00048, 0.00045, 0.00046, 0.00045, 0.00045, 0.00047, 0.00046, 0.00048, 0.00048, 0.00048, 0.00048, 0.00045, 0.00045, 0.00048, 0.00047, 0.00045, 0.00048, 0.00045, 0.00046, 0.00045, 0.00046, 0.00045, 0.00047, 0.00047, 0.00046, 0.00046, 0.00047, 0.00047, 0.00044, 0.00045, 0.00045, 0.00048, 0.00048, 0.00048, 0.00045, 0.00045, 0.00046, 0.00045, 0.00048, 0.00048, 0.00045, 0.00045, 0.00045, 0.00046, 0.00044, 0.00045, 0.00045, 0.00049, 0.00045, 0.00045, 0.00045, 0.00045, 0.00048, 0.00045, 0.00046, 0.00049, 0.00046, 0.00046, 0.00044, 0.00048, 0.00045, 0.00045, 0.00045, 0.00046, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00048, 0.00047, 0.00049, 0.00045, 0.00045, 0.00053, 0.00045, 0.00045, 0.00045, 0.00045, 0.00046, 0.00044, 0.00049, 0.00045, 0.00044, 0.00048, 0.00045, 0.00045, 0.00045, 0.00045]}, "all-grads-sync-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [6.13385, 0.00147, 0.00148, 0.00147, 0.00149, 0.00151, 0.00148, 0.00148, 0.00147, 0.00149, 0.00149, 0.00147, 0.00149, 0.00149, 0.00147, 0.00147, 0.00147, 0.00149, 0.00149, 0.00148, 0.00148, 0.00148, 0.0015, 0.0015, 0.00147, 0.00148, 0.00149, 0.00148, 0.00148, 0.00148, 0.00147, 0.00148, 0.00149, 0.00149, 0.00148, 0.00148, 0.00149, 0.00147, 0.00148, 0.00148, 0.00147, 0.00147, 0.00148, 0.00149, 0.00148, 0.00148, 0.00149, 0.00148, 0.00149, 0.00148, 0.00147, 0.00147, 0.00147, 0.00148, 0.00148, 0.00149, 0.00148, 0.00149, 0.00147, 0.00147, 0.00149, 0.00148, 0.00148, 0.00149, 0.0015, 0.00148, 0.00148, 0.00149, 0.00148, 0.00149, 0.00148, 0.00147, 0.00149, 0.00149, 0.00148, 0.00146, 0.00147, 0.00148, 0.00147, 0.00148, 0.00149, 0.00147, 0.00146, 0.00148, 0.00148, 0.00147, 0.00149, 0.00148, 0.00149, 0.0015, 0.00148, 0.00147, 0.00147, 0.00147, 0.00149, 0.00149, 0.00148, 0.00148, 0.00148, 0.00148, 0.00147, 0.00149, 0.00148, 0.00148, 0.00149, 0.00148, 0.00147, 0.00147, 0.00147, 0.00148, 0.00149, 0.00147, 0.00148, 0.00148, 0.00147, 0.00149, 0.00147, 0.00147, 0.00149, 0.00149, 0.00146, 0.00149, 0.00147, 0.00149, 0.00149, 0.00148, 0.00147, 0.00148, 0.00148, 0.00148, 0.00149, 0.00148, 0.00147, 0.00149, 0.00151, 0.00147, 0.00148, 0.00147, 0.00148, 0.00148, 0.00147, 0.00147, 0.0015, 0.00149, 0.00148, 0.00147, 0.00148, 0.00147, 0.00148, 0.00148, 0.00147, 0.0015, 0.00147, 0.00147, 0.00147, 0.00148, 0.0015, 0.00148, 0.00148, 0.00147, 0.00148, 0.00149, 0.00149, 0.00148, 0.00148, 0.00148, 0.00148, 0.00147, 0.00149, 0.00149, 0.00149, 0.00147, 0.00147, 0.00148, 0.00147, 0.00147, 0.00147, 0.00148, 0.00146, 0.00148, 0.00147, 0.00149, 0.00147, 0.00149, 0.00149, 0.00147, 0.00147, 0.00148, 0.00147, 0.00148, 0.00148, 0.00148, 0.00148, 0.00149, 0.00147, 0.00149, 0.00148, 0.00148, 0.00148, 0.00149, 0.0015, 0.00148, 0.00149, 0.00148, 0.00149, 0.00149, 0.00149, 0.00148, 0.00148, 0.00149, 0.00149, 0.0015, 0.00149, 0.00148, 0.00149, 0.00149, 0.00149, 0.00151, 0.00148, 0.0015, 0.00148, 0.00149, 0.00149, 0.00149, 0.00149, 0.00149, 0.00149, 0.0015, 0.0015, 0.0015, 0.00149, 0.0015, 0.00149, 0.00149, 0.00147, 0.00148, 0.00149, 0.0015, 0.0015, 0.00149, 0.00147, 0.00149, 0.0015, 0.00149, 0.00148, 0.00148, 0.00148, 0.00148, 0.00148, 0.0015, 0.0015, 0.0015, 0.00148, 0.00149, 0.00149, 0.00149, 0.00149, 
0.00148, 0.0015, 0.00149, 0.00148, 0.00151, 0.00149, 0.00148, 0.00149, 0.00147, 0.00147, 0.00154, 0.00149, 0.00147, 0.00148, 0.0015, 0.00149, 0.00152, 0.00148, 0.00148, 0.00148, 0.00148, 0.00149, 0.00148, 0.00151, 0.00147, 0.00148, 0.00151, 0.0015, 0.00149, 0.00147, 0.00148, 0.00149, 0.00149, 0.00151, 0.00148, 0.00149, 0.00149, 0.00149, 0.00147, 0.00148, 0.00148, 0.00147, 0.00148, 0.00148, 0.00149, 0.00148, 0.00148, 0.00149, 0.00148, 0.00148, 0.00152, 0.00149, 0.0015, 0.00148, 0.00148, 0.00147, 0.00148, 0.00149, 0.00149, 0.00147, 0.00149, 0.00151, 0.00147, 0.00148, 0.00148, 0.00149, 0.00147, 0.0015, 0.00149, 0.00149, 0.00148, 0.00149, 0.00149, 0.00149, 0.00149, 0.00148, 0.00148, 0.00148, 0.00148, 0.00149, 0.00149, 0.00149, 0.00149, 0.00148, 0.00149, 0.00149, 0.00149, 0.00148, 0.0015, 0.00148, 0.00151, 0.00148, 0.00151, 0.00147, 0.00147, 0.00149, 0.00148, 0.00148, 0.00148, 0.00148, 0.00147, 0.00149, 0.00149, 0.00149, 0.00148, 0.00149, 0.0015, 0.00148, 0.00148, 0.00149, 0.00148, 0.00148, 0.00149, 0.00148, 0.00149, 0.0015, 0.00147, 0.00149, 0.00148, 0.00149, 0.00149, 0.00148, 0.00147, 0.00149, 0.0015, 0.0015, 0.00149, 0.00148, 0.00147, 0.00149, 0.00147, 0.0015, 0.00149, 0.00149, 0.00149, 0.0015, 0.00148, 0.00149, 0.00149, 0.0015, 0.00148, 0.00148, 0.00148]}, "optimizer-copy-to-main-grad-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.00022, 0.00015, 0.00014, 0.00013, 0.00013, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00013, 0.00012, 0.00013, 0.00013, 0.00012, 0.00014, 0.00013, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00014, 0.00014, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00014, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00013, 0.00013, 0.00013, 0.00013, 0.00012, 0.00012, 0.00012, 0.00013, 0.00013, 0.00014, 0.00013, 0.00012, 0.00014, 0.00013, 0.00012, 0.00013, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00014, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00014, 0.00012, 0.00013, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00013, 0.00012, 0.00013, 0.00013, 0.00013, 0.00012, 0.00012, 0.00012, 0.00014, 0.00012, 0.00013, 0.00012, 0.00012, 0.00013, 0.00014, 0.00012, 0.00012, 0.00013, 0.00012, 0.00014, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00015, 0.00014, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00013, 0.00013, 0.00012, 0.00012, 0.00012, 0.00015, 0.00013, 0.00014, 0.00014, 0.00012, 0.00014, 0.00013, 0.00012, 0.00012, 0.00014, 0.00013, 0.00012, 0.00013, 0.00012, 0.00014, 0.00012, 0.00014, 0.00014, 0.00012, 0.00012, 0.00014, 0.00013, 0.00014, 0.00012, 0.00014, 0.00014, 0.00014, 0.00013, 0.00014, 0.00012, 0.00013, 0.00014, 0.00012, 0.00014, 0.00013, 0.00014, 0.00012, 0.00014, 0.00012, 0.00014, 0.00012, 0.00012, 0.00012, 0.00012, 0.00014, 0.00012, 0.00013, 0.00014, 0.00012, 0.00014, 0.00012, 0.00013, 0.00013, 0.00014, 0.00012, 0.00015, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00012, 0.00012, 0.00013, 0.00013, 0.00013, 0.00013, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00013, 0.00013, 0.00013, 0.00014, 0.00013, 0.00013, 0.00013, 0.00013, 0.00015, 0.00013, 0.00014, 0.00013, 0.00013, 0.00013, 0.00013, 0.00013, 0.00013, 0.00014, 0.00012, 0.00013, 0.00013, 0.00015, 
0.00013, 0.00013, 0.00013, 0.00013, 0.00014, 0.00013, 0.00013, 0.00014, 0.00014, 0.00013, 0.00014, 0.00012, 0.00014, 0.00013, 0.00013, 0.00013, 0.00013, 0.00013, 0.00013, 0.00013, 0.00015, 0.00014, 0.00014, 0.00013, 0.00012, 0.00014, 0.00013, 0.00013, 0.00013, 0.00014, 0.00015, 0.00015, 0.00014, 0.00013, 0.00013, 0.00013, 0.00013, 0.00013, 0.00015, 0.00014, 0.00015, 0.00013, 0.00013, 0.00014, 0.00013, 0.00014, 0.00014, 0.00013, 0.00013, 0.00013, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00013, 0.00013, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00013, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00017, 0.00014, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00013, 0.00013, 0.00013, 0.00012, 0.00013, 0.00012, 0.00012, 0.00013, 0.00013, 0.00013, 0.00013, 0.00013, 0.00013, 0.00012, 0.00012, 0.00012, 0.00013, 0.00013, 0.00013, 0.00013, 0.00013, 0.00013, 0.00012, 0.00012, 0.00013, 0.00013, 0.00012, 0.00012, 0.00014, 0.00013, 0.00012, 0.00013, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012]}, "optimizer-clip-main-grad-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [2.11156, 0.00067, 0.00064, 0.00065, 0.00062, 0.00063, 0.00062, 0.00063, 0.00062, 0.00062, 0.00062, 0.00062, 0.00062, 0.00062, 0.00067, 0.00062, 0.00063, 0.00063, 0.00063, 0.00063, 0.00062, 0.00062, 0.00061, 0.00062, 0.00062, 0.00062, 0.00064, 0.00064, 0.00064, 0.00063, 0.00062, 0.00062, 0.00063, 0.00062, 0.00062, 0.00066, 0.00062, 0.00062, 0.00063, 0.00063, 0.00063, 0.00062, 0.00062, 0.00062, 0.00062, 0.00065, 0.00062, 0.00064, 0.00066, 0.00062, 0.00062, 0.00063, 0.00062, 0.00062, 0.00065, 0.00065, 0.00064, 0.00063, 0.00062, 0.00064, 0.00063, 0.00062, 0.00067, 0.00062, 0.00062, 0.00063, 0.00062, 0.00062, 0.00063, 0.00064, 0.00062, 0.00062, 0.00062, 0.00064, 0.00062, 0.00062, 0.00063, 0.00062, 0.00062, 0.00062, 0.00064, 0.00063, 0.00064, 0.00063, 0.00066, 0.00062, 0.00062, 0.00062, 0.00061, 0.00062, 0.00062, 0.00062, 0.00062, 0.00062, 0.00062, 0.00061, 0.00062, 0.00071, 0.00046, 0.00069, 0.00062, 0.00068, 0.00062, 0.00062, 0.00045, 0.00062, 0.00062, 0.00063, 0.00062, 0.00062, 0.0005, 0.00048, 0.00062, 0.00062, 0.00062, 0.00062, 0.00048, 0.00062, 0.00062, 0.00064, 0.00047, 0.00062, 0.00066, 0.00062, 0.00062, 0.00062, 0.00062, 0.00064, 0.00064, 0.00062, 0.00046, 0.00062, 0.00062, 0.00062, 0.00065, 0.00062, 0.00062, 0.00062, 0.00062, 0.00062, 0.00062, 0.00062, 0.00062, 0.00062, 0.00067, 0.00064, 0.00061, 0.00063, 0.00064, 0.00061, 0.00064, 0.00062, 0.00062, 0.00062, 0.00047, 0.00062, 0.00062, 0.00062, 0.00062, 0.00064, 0.00061, 0.00064, 0.00064, 0.00062, 0.00063, 0.00064, 0.00067, 0.00064, 0.00062, 0.00064, 0.00063, 0.00062, 0.00064, 0.00063, 0.00062, 0.00065, 0.00064, 0.00064, 0.00064, 0.00063, 0.00064, 0.00063, 0.00065, 0.00062, 0.00063, 0.00062, 0.00065, 0.00062, 0.00061, 0.00063, 0.00061, 0.00062, 0.00066, 0.00062, 0.00065, 0.00062, 0.00061, 0.00063, 0.00063, 0.00062, 0.00069, 0.00066, 0.00066, 0.00067, 0.00067, 0.00071, 0.00067, 0.00067, 0.00065, 0.00065, 0.00067, 0.00067, 0.00066, 0.00066, 0.00067, 0.00067, 0.00071, 0.00066, 0.00066, 0.00066, 0.00066, 0.00067, 0.00067, 0.00067, 
0.00066, 0.00068, 0.00066, 0.00067, 0.00065, 0.00066, 0.00066, 0.00065, 0.00069, 0.00067, 0.00066, 0.00066, 0.00068, 0.00065, 0.00064, 0.00065, 0.00067, 0.00065, 0.00066, 0.00066, 0.00067, 0.00066, 0.00066, 0.00067, 0.00066, 0.00067, 0.00067, 0.00067, 0.00066, 0.00067, 0.00067, 0.00067, 0.00066, 0.00066, 0.00066, 0.00066, 0.00066, 0.00073, 0.00069, 0.00066, 0.00065, 0.00064, 0.00067, 0.00066, 0.00067, 0.00066, 0.00073, 0.00067, 0.00067, 0.00067, 0.00067, 0.00066, 0.00068, 0.00065, 0.00065, 0.00067, 0.00067, 0.00066, 0.00066, 0.00067, 0.00067, 0.00067, 0.00067, 0.00066, 0.00064, 0.00066, 0.00067, 0.00064, 0.00064, 0.00064, 0.00064, 0.00064, 0.00064, 0.00064, 0.00063, 0.00064, 0.00064, 0.00064, 0.00064, 0.00066, 0.00065, 0.00064, 0.00064, 0.00064, 0.00064, 0.00063, 0.00064, 0.00064, 0.00065, 0.00065, 0.00064, 0.00073, 0.00064, 0.00063, 0.00064, 0.00063, 0.00064, 0.00064, 0.00064, 0.00064, 0.00064, 0.00064, 0.00066, 0.00065, 0.00064, 0.00063, 0.00063, 0.00064, 0.00065, 0.00065, 0.00065, 0.00065, 0.00063, 0.00064, 0.00063, 0.00063, 0.00064, 0.00064, 0.00065, 0.00064, 0.00063, 0.00063, 0.00065, 0.00063, 0.00064, 0.00063, 0.00064, 0.00063, 0.00066, 0.00063, 0.00065, 0.00064, 0.00063, 0.00064, 0.00063, 0.00064, 0.00064, 0.00064, 0.00066, 0.00066, 0.00065, 0.00064, 0.00066, 0.00065, 0.00065, 0.00065, 0.00066, 0.00065, 0.00065, 0.00065, 0.00066, 0.00066, 0.00064, 0.00063, 0.00065, 0.00065, 0.00066, 0.00064, 0.00066, 0.00065, 0.00066, 0.00067, 0.00066, 0.00066, 0.00065, 0.00066, 0.00065, 0.00068, 0.00066, 0.00066, 0.00065, 0.00063, 0.00064, 0.00063, 0.00063, 0.00064]}, "optimizer-count-zeros-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.00352, 0.00261, 0.00262, 0.00279, 0.00266, 0.00279, 0.00264, 0.00264, 0.00265, 0.00263, 0.00263, 0.00263, 0.00266, 0.00265, 0.00265, 0.00266, 0.00262, 0.00265, 0.00264, 0.00267, 0.00262, 0.00264, 0.00263, 0.00264, 0.00265, 0.00263, 0.00264, 0.00266, 0.00265, 0.00262, 0.00263, 0.00265, 0.00266, 0.00263, 0.00264, 0.00264, 0.00264, 0.00264, 0.00264, 0.00265, 0.00265, 0.00264, 0.00265, 0.00266, 0.00264, 0.00316, 0.00266, 0.00263, 0.00279, 0.0027, 0.00263, 0.00263, 0.00267, 0.00263, 0.00264, 0.00264, 0.00265, 0.00262, 0.00265, 0.00265, 0.00264, 0.00266, 0.00277, 0.00265, 0.00266, 0.00266, 0.00265, 0.00265, 0.00264, 0.00266, 0.00267, 0.00263, 0.00263, 0.00266, 0.00265, 0.00263, 0.00263, 0.00265, 0.00263, 0.00265, 0.00293, 0.00263, 0.00273, 0.00264, 0.00285, 0.00263, 0.00265, 0.00265, 0.00265, 0.00263, 0.00264, 0.00265, 0.00264, 0.00263, 0.00263, 0.00265, 0.00262, 0.00298, 0.00265, 0.0031, 0.00263, 0.00312, 0.00264, 0.00267, 0.00263, 0.00296, 0.00265, 0.00262, 0.00266, 0.00263, 0.00298, 0.00266, 0.00265, 0.00263, 0.00276, 0.00265, 0.00266, 0.00264, 0.00264, 0.00266, 0.00264, 0.00265, 0.00268, 0.00265, 0.00264, 0.00264, 0.00263, 0.00266, 0.00264, 0.00265, 0.00264, 0.00264, 0.00263, 0.00262, 0.00284, 0.00263, 0.00263, 0.00265, 0.00265, 0.00264, 0.00263, 0.00263, 0.00264, 0.00265, 0.00298, 0.00264, 0.00263, 0.00266, 0.00264, 0.00265, 0.00264, 0.00264, 0.00267, 0.00264, 0.00265, 0.00262, 0.00264, 0.00271, 0.00266, 0.00266, 0.00265, 0.00266, 0.00267, 0.00268, 0.00263, 0.00265, 0.00282, 0.00266, 0.0027, 0.00265, 0.00266, 0.00265, 0.00264, 0.00267, 0.00269, 0.00278, 0.00264, 0.00268, 0.00264, 0.00265, 0.00265, 0.00267, 0.00267, 0.00265, 0.00265, 0.00265, 0.00267, 0.00265, 0.00266, 0.00264, 0.00265, 0.00263, 0.00265, 0.00265, 0.00267, 0.00267, 0.00263, 0.00264, 0.00264, 0.00265, 0.00262, 0.00264, 0.00266, 0.00263, 0.00267, 0.00264, 0.00264, 0.00264, 
0.00266, 0.00265, 0.00266, 0.00264, 0.00264, 0.00267, 0.00265, 0.00262, 0.00266, 0.00265, 0.00267, 0.00266, 0.00267, 0.00295, 0.00267, 0.00268, 0.00263, 0.00265, 0.00265, 0.00263, 0.00266, 0.00299, 0.00264, 0.00267, 0.00262, 0.00269, 0.00265, 0.00264, 0.00265, 0.00263, 0.00265, 0.00265, 0.00286, 0.00266, 0.00266, 0.00264, 0.00264, 0.00265, 0.00264, 0.00266, 0.00266, 0.00267, 0.00264, 0.00265, 0.00265, 0.00265, 0.00266, 0.00264, 0.00268, 0.00264, 0.00262, 0.00267, 0.00263, 0.00312, 0.00265, 0.00265, 0.00264, 0.00263, 0.00265, 0.00265, 0.00264, 0.00266, 0.00268, 0.00264, 0.00266, 0.00263, 0.00267, 0.00265, 0.00263, 0.00266, 0.0027, 0.00266, 0.00263, 0.00264, 0.00276, 0.00265, 0.00266, 0.00264, 0.00264, 0.00264, 0.00302, 0.00265, 0.00265, 0.00269, 0.00264, 0.00263, 0.00266, 0.00264, 0.00267, 0.00263, 0.00264, 0.00265, 0.00266, 0.00264, 0.00265, 0.00265, 0.00265, 0.00267, 0.00261, 0.00262, 0.00266, 0.00263, 0.00265, 0.00266, 0.00265, 0.00262, 0.00266, 0.00267, 0.00262, 0.00266, 0.00265, 0.00264, 0.00263, 0.00265, 0.00263, 0.00268, 0.00282, 0.00266, 0.00264, 0.00264, 0.00262, 0.00266, 0.00265, 0.00266, 0.00264, 0.00276, 0.00264, 0.00264, 0.00265, 0.00263, 0.00265, 0.00265, 0.00266, 0.00265, 0.00265, 0.00264, 0.00262, 0.00264, 0.00264, 0.00265, 0.00265, 0.00266, 0.00267, 0.00266, 0.00268, 0.00265, 0.00275, 0.00263, 0.00275, 0.00263, 0.00265, 0.00264, 0.00265, 0.00264, 0.00265, 0.00264, 0.00266, 0.00269, 0.00266, 0.00264, 0.00263, 0.00266, 0.00267, 0.00266, 0.00266, 0.00268, 0.00267, 0.00265, 0.00265, 0.00266, 0.00265, 0.00265, 0.00263, 0.00266, 0.00264, 0.00268, 0.00266, 0.00263, 0.00268, 0.00265, 0.00265, 0.00278, 0.0027, 0.00264, 0.00264, 0.00263, 0.00265, 0.00266, 0.00265, 0.00269, 0.00264, 0.00265]}, "optimizer-inner-step-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.0024, 0.00067, 0.00066, 0.00066, 0.00066, 0.00067, 0.00067, 0.00067, 0.00066, 0.00066, 0.00066, 0.00067, 0.00066, 0.00067, 0.00066, 0.00066, 0.00066, 0.00066, 0.00066, 0.00066, 0.00066, 0.00066, 0.00066, 0.00066, 0.00066, 0.00066, 0.00067, 0.00066, 0.00066, 0.00066, 0.00065, 0.00066, 0.00066, 0.00066, 0.00066, 0.00066, 0.00066, 0.00066, 0.00065, 0.00066, 0.00066, 0.00065, 0.00066, 0.00067, 0.00066, 0.00067, 0.00065, 0.00065, 0.00066, 0.0007, 0.00065, 0.00066, 0.00065, 0.00066, 0.00066, 0.00067, 0.00066, 0.00067, 0.00067, 0.00066, 0.00065, 0.00066, 0.00066, 0.00066, 0.00066, 0.00066, 0.00065, 0.00065, 0.00066, 0.00066, 0.00065, 0.00065, 0.00065, 0.00066, 0.00065, 0.00065, 0.00065, 0.00065, 0.00066, 0.00065, 0.00067, 0.00065, 0.00066, 0.00065, 0.00066, 0.00066, 0.00065, 0.00066, 0.00066, 0.00065, 0.00065, 0.00065, 0.00065, 0.00065, 0.00065, 0.00065, 0.00065, 0.00066, 0.00066, 0.00066, 0.00066, 0.00066, 0.00065, 0.00065, 0.00065, 0.00067, 0.00065, 0.00065, 0.00065, 0.00065, 0.00066, 0.00065, 0.00066, 0.00065, 0.00066, 0.00065, 0.00066, 0.00066, 0.00065, 0.00065, 0.00065, 0.00065, 0.00065, 0.00065, 0.00065, 0.00065, 0.00065, 0.00069, 0.00066, 0.00065, 0.00065, 0.00065, 0.00065, 0.00066, 0.00066, 0.00065, 0.00066, 0.00065, 0.00065, 0.00066, 0.00065, 0.00065, 0.00065, 0.00066, 0.00067, 0.00066, 0.00065, 0.00066, 0.00066, 0.00065, 0.00066, 0.00065, 0.00066, 0.00065, 0.00065, 0.00065, 0.00065, 0.00068, 0.00065, 0.00066, 0.00065, 0.00066, 0.00066, 0.00066, 0.00065, 0.00065, 0.00066, 0.00066, 0.00066, 0.00066, 0.00066, 0.00065, 0.00065, 0.00066, 0.00065, 0.00066, 0.00065, 0.00065, 0.00066, 0.00065, 0.00066, 0.00066, 0.00065, 0.00065, 0.00066, 0.00065, 0.00065, 0.00066, 0.00065, 0.00065, 0.00066, 0.00066, 
0.00066, 0.00065, 0.00066, 0.00066, 0.00065, 0.00065, 0.00067, 0.00066, 0.00069, 0.00068, 0.00069, 0.00069, 0.00068, 0.0007, 0.00069, 0.00069, 0.00067, 0.00067, 0.00068, 0.00068, 0.00068, 0.00068, 0.00069, 0.00068, 0.00069, 0.00068, 0.00068, 0.00069, 0.00091, 0.00068, 0.00068, 0.00069, 0.00068, 0.00068, 0.00068, 0.00068, 0.00068, 0.00069, 0.00068, 0.00068, 0.00068, 0.00067, 0.00068, 0.00067, 0.00068, 0.00067, 0.00067, 0.00068, 0.00071, 0.00068, 0.00068, 0.00068, 0.00068, 0.00069, 0.00068, 0.00067, 0.00068, 0.00067, 0.0007, 0.00069, 0.00067, 0.00069, 0.00068, 0.00068, 0.00068, 0.00068, 0.00068, 0.00068, 0.00067, 0.00069, 0.00068, 0.00068, 0.00068, 0.00067, 0.00068, 0.00068, 0.00067, 0.00068, 0.00068, 0.00068, 0.00068, 0.00068, 0.00067, 0.00068, 0.00067, 0.00067, 0.00068, 0.00067, 0.00068, 0.00068, 0.00069, 0.00068, 0.00069, 0.00068, 0.00068, 0.00068, 0.00067, 0.00068, 0.00068, 0.00067, 0.00067, 0.00066, 0.00067, 0.00067, 0.00067, 0.00067, 0.00067, 0.00066, 0.00066, 0.00068, 0.00067, 0.00067, 0.00067, 0.00066, 0.00067, 0.00066, 0.00067, 0.00067, 0.00067, 0.00066, 0.00067, 0.00067, 0.00067, 0.00066, 0.00067, 0.00067, 0.00067, 0.00068, 0.00067, 0.00067, 0.00066, 0.00067, 0.00066, 0.00067, 0.00067, 0.00067, 0.00067, 0.00067, 0.00066, 0.00066, 0.00067, 0.00068, 0.00066, 0.00067, 0.00067, 0.00067, 0.00068, 0.00067, 0.00067, 0.00067, 0.00067, 0.00067, 0.00068, 0.00067, 0.00068, 0.00067, 0.00066, 0.00066, 0.00066, 0.00066, 0.00066, 0.00066, 0.00067, 0.00067, 0.00067, 0.00067, 0.00066, 0.00066, 0.00067, 0.00067, 0.00067, 0.00067, 0.00068, 0.00066, 0.00067, 0.00067, 0.00067, 0.00067, 0.00067, 0.00067, 0.00067, 0.00067, 0.00067, 0.00067, 0.00066, 0.00066, 0.00067, 0.00067, 0.00068, 0.00067, 0.00067, 0.00068, 0.00068, 0.00067, 0.00067, 0.00067, 0.00067, 0.00067, 0.00068, 0.00067, 0.00069, 0.00067, 0.00067, 0.00066, 0.00067, 0.00066, 0.00067, 0.00066]}, "optimizer-copy-main-to-model-params-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.0006, 0.00055, 0.00055, 0.00054, 0.00054, 0.00053, 0.00053, 0.00053, 0.00054, 0.00052, 0.00052, 0.00052, 0.00052, 0.00053, 0.00053, 0.00053, 0.00053, 0.00053, 0.00053, 0.00053, 0.00053, 0.00053, 0.00053, 0.00053, 0.00053, 0.00052, 0.00053, 0.00053, 0.00053, 0.00053, 0.00053, 0.00053, 0.00053, 0.00053, 0.00053, 0.00053, 0.00053, 0.00053, 0.00053, 0.00053, 0.00053, 0.00053, 0.00053, 0.00053, 0.00053, 0.00053, 0.00052, 0.00052, 0.00053, 0.00061, 0.00052, 0.00053, 0.00053, 0.00053, 0.00053, 0.00053, 0.00053, 0.00053, 0.00052, 0.00053, 0.00053, 0.00053, 0.00054, 0.00052, 0.00053, 0.00053, 0.00053, 0.00052, 0.00052, 0.00053, 0.00052, 0.00052, 0.00052, 0.00053, 0.00052, 0.00052, 0.00052, 0.00052, 0.00052, 0.00052, 0.00053, 0.00053, 0.00053, 0.00052, 0.00052, 0.00053, 0.00052, 0.00052, 0.00052, 0.00052, 0.00052, 0.00052, 0.00052, 0.00052, 0.00052, 0.00052, 0.00052, 0.00053, 0.00052, 0.00053, 0.00052, 0.00053, 0.00052, 0.00052, 0.00052, 0.00053, 0.00053, 0.00052, 0.00052, 0.00053, 0.00056, 0.00053, 0.00052, 0.00052, 0.00052, 0.00052, 0.00052, 0.00052, 0.00052, 0.00052, 0.00052, 0.00052, 0.00053, 0.00052, 0.00052, 0.00052, 0.00052, 0.00053, 0.00053, 0.00053, 0.00052, 0.00052, 0.00052, 0.00052, 0.00052, 0.00052, 0.00052, 0.00052, 0.00053, 0.00052, 0.00055, 0.00052, 0.00052, 0.00052, 0.00053, 0.00053, 0.00052, 0.00053, 0.00052, 0.00052, 0.00052, 0.00052, 0.00052, 0.00052, 0.00053, 0.00053, 0.00052, 0.00052, 0.00052, 0.00052, 0.00052, 0.00053, 0.00053, 0.00052, 0.00052, 0.00052, 0.00053, 0.00053, 0.00052, 0.00052, 0.00052, 0.00052, 0.00052, 0.00053, 
0.00052, 0.00054, 0.00053, 0.00052, 0.00052, 0.00052, 0.00053, 0.00052, 0.00052, 0.00052, 0.00053, 0.00052, 0.00053, 0.00052, 0.00052, 0.00053, 0.00052, 0.00052, 0.00053, 0.00052, 0.00052, 0.00053, 0.00053, 0.00052, 0.00052, 0.00052, 0.00053, 0.00054, 0.00054, 0.00054, 0.00053, 0.00054, 0.00053, 0.00054, 0.00053, 0.00054, 0.00053, 0.00054, 0.00053, 0.00053, 0.00054, 0.00053, 0.00053, 0.00053, 0.00053, 0.00054, 0.00054, 0.00054, 0.00054, 0.00054, 0.00053, 0.00054, 0.00054, 0.00054, 0.00053, 0.00056, 0.00054, 0.00054, 0.00053, 0.00054, 0.00054, 0.00053, 0.00054, 0.00053, 0.00054, 0.00054, 0.00054, 0.00053, 0.00053, 0.00053, 0.00053, 0.00054, 0.00053, 0.00054, 0.00053, 0.00053, 0.00053, 0.00055, 0.00053, 0.00054, 0.00054, 0.00054, 0.00053, 0.00054, 0.00054, 0.00054, 0.00053, 0.00055, 0.00053, 0.00054, 0.00053, 0.00053, 0.00054, 0.00055, 0.00053, 0.00055, 0.00054, 0.00054, 0.00054, 0.00054, 0.00053, 0.00054, 0.00054, 0.00054, 0.00054, 0.00055, 0.00054, 0.00054, 0.00054, 0.00054, 0.00054, 0.00054, 0.00054, 0.00054, 0.00053, 0.00054, 0.00054, 0.00054, 0.00053, 0.00054, 0.00053, 0.00054, 0.00053, 0.00054, 0.00053, 0.00053, 0.00053, 0.00054, 0.00053, 0.00053, 0.00053, 0.00054, 0.00053, 0.00053, 0.00053, 0.00053, 0.00054, 0.00053, 0.00053, 0.00053, 0.00053, 0.00054, 0.00053, 0.00053, 0.00053, 0.00053, 0.00053, 0.00053, 0.00054, 0.00054, 0.00053, 0.00053, 0.00054, 0.00054, 0.00054, 0.00054, 0.00054, 0.00054, 0.00053, 0.00053, 0.00053, 0.00053, 0.00053, 0.00054, 0.00054, 0.00053, 0.00053, 0.00055, 0.00053, 0.00054, 0.00053, 0.00054, 0.00054, 0.00053, 0.00053, 0.00054, 0.00053, 0.00053, 0.00054, 0.00054, 0.00054, 0.00054, 0.00053, 0.00053, 0.00054, 0.00053, 0.00053, 0.00054, 0.00054, 0.00054, 0.00054, 0.00054, 0.00053, 0.00054, 0.00053, 0.00054, 0.00053, 0.00054, 0.00053, 0.00053, 0.00054, 0.00054, 0.00053, 0.00054, 0.00054, 0.00054, 0.00053, 0.00053, 0.00053, 0.00053, 0.00053, 0.00053, 0.00054, 0.00053, 0.00054, 0.00053, 0.00053, 0.00054, 0.00053, 0.00054, 0.00053, 0.00053, 0.00053, 0.00053, 0.00054, 0.0006]}, "optimizer-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [2.12049, 0.00501, 0.00496, 0.00513, 0.00494, 0.00512, 0.00493, 0.00495, 0.00494, 0.00491, 0.00493, 0.00491, 0.00494, 0.00492, 0.00498, 0.00492, 0.0049, 0.00495, 0.00492, 0.00497, 0.00492, 0.00491, 0.00492, 0.00492, 0.00492, 0.00491, 0.00496, 0.00498, 0.00494, 0.00491, 0.0049, 0.00492, 0.00494, 0.00492, 0.00491, 0.00497, 0.00492, 0.00491, 0.00492, 0.00493, 0.00493, 0.00491, 0.00492, 0.00494, 0.00492, 0.00556, 0.00493, 0.00491, 0.00512, 0.00512, 0.00492, 0.00493, 0.00494, 0.0049, 0.00494, 0.00495, 0.00496, 0.00491, 0.00491, 0.00496, 0.00492, 0.00493, 0.00512, 0.00493, 0.00493, 0.00494, 0.00491, 0.0049, 0.00491, 0.00496, 0.00492, 0.0049, 0.00489, 0.00495, 0.00491, 0.00488, 0.00493, 0.00491, 0.0049, 0.0049, 0.00526, 0.00491, 0.00503, 0.0049, 0.00519, 0.00488, 0.00492, 0.00491, 0.0049, 0.00491, 0.00489, 0.00491, 0.0049, 0.00487, 0.00489, 0.0049, 0.00489, 0.00539, 0.00473, 0.00548, 0.00489, 0.00551, 0.0049, 0.00493, 0.00471, 0.00529, 0.00491, 0.0049, 0.00491, 0.00489, 0.00522, 0.00479, 0.00492, 0.00492, 0.00503, 0.0049, 0.0048, 0.0049, 0.00492, 0.00494, 0.00475, 0.0049, 0.00498, 0.0049, 0.0049, 0.00489, 0.0049, 0.00536, 0.00494, 0.00492, 0.00474, 0.00491, 0.0049, 0.00491, 0.00516, 0.00489, 0.00491, 0.0049, 0.00492, 0.00493, 0.00506, 0.00489, 0.00489, 0.00491, 0.00534, 0.00497, 0.00488, 0.00496, 0.00493, 0.00489, 0.00494, 0.0049, 0.00493, 0.00492, 0.00478, 0.00489, 0.0049, 0.00501, 0.00493, 0.00496, 0.0049, 0.00496, 
0.00496, 0.00496, 0.00492, 0.00494, 0.00516, 0.00496, 0.00497, 0.00495, 0.00494, 0.00494, 0.00493, 0.00496, 0.00494, 0.0051, 0.00495, 0.00495, 0.00493, 0.00492, 0.00495, 0.00493, 0.00498, 0.00491, 0.00494, 0.00492, 0.00496, 0.00491, 0.00491, 0.00493, 0.00492, 0.0049, 0.005, 0.00491, 0.00498, 0.00494, 0.00489, 0.00494, 0.00496, 0.00491, 0.00501, 0.00504, 0.00502, 0.00501, 0.00506, 0.00508, 0.00502, 0.00501, 0.00497, 0.00496, 0.005, 0.005, 0.00498, 0.00504, 0.00502, 0.00497, 0.00511, 0.00499, 0.00502, 0.00502, 0.00535, 0.00532, 0.00503, 0.00507, 0.005, 0.00501, 0.005, 0.00499, 0.00499, 0.00538, 0.00498, 0.00502, 0.00499, 0.00505, 0.00503, 0.00497, 0.00504, 0.00493, 0.00495, 0.00499, 0.00529, 0.00499, 0.00499, 0.00502, 0.00499, 0.00504, 0.00497, 0.00502, 0.005, 0.00501, 0.00503, 0.00504, 0.00496, 0.00502, 0.00502, 0.00501, 0.00503, 0.005, 0.00501, 0.00502, 0.00495, 0.00563, 0.00504, 0.005, 0.00496, 0.00494, 0.00501, 0.005, 0.00499, 0.0054, 0.00512, 0.00507, 0.00502, 0.005, 0.00501, 0.005, 0.00499, 0.00498, 0.00504, 0.00503, 0.00499, 0.00501, 0.00511, 0.00502, 0.00506, 0.00502, 0.00501, 0.00499, 0.00535, 0.00498, 0.00501, 0.00499, 0.00494, 0.00493, 0.00496, 0.00494, 0.00496, 0.00495, 0.00495, 0.00494, 0.00498, 0.00495, 0.00498, 0.00498, 0.00495, 0.005, 0.00492, 0.00493, 0.00494, 0.00492, 0.00498, 0.00494, 0.00496, 0.00495, 0.00497, 0.00506, 0.00494, 0.00497, 0.00498, 0.00495, 0.00494, 0.00495, 0.00497, 0.005, 0.00512, 0.00495, 0.00495, 0.00497, 0.00493, 0.00495, 0.00494, 0.00498, 0.00495, 0.00509, 0.005, 0.00498, 0.00493, 0.00494, 0.00496, 0.00495, 0.00497, 0.00495, 0.00495, 0.00496, 0.00491, 0.00494, 0.00498, 0.00494, 0.00494, 0.00495, 0.00496, 0.00495, 0.00501, 0.00495, 0.00508, 0.00493, 0.00505, 0.00493, 0.00494, 0.00495, 0.00495, 0.00496, 0.00501, 0.00497, 0.00499, 0.00499, 0.00499, 0.00495, 0.00494, 0.00498, 0.00498, 0.00498, 0.00497, 0.00499, 0.00499, 0.00497, 0.00494, 0.00495, 0.00497, 0.00497, 0.00496, 0.00496, 0.00496, 0.00501, 0.00501, 0.00497, 0.00503, 0.00498, 0.00498, 0.0051, 0.00507, 0.005, 0.00498, 0.00497, 0.00499, 0.00495, 0.00494, 0.00496, 0.00495, 0.00502]}, "learning-rate": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 
5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05]}, "learning-rate vs samples": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 
7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05]}, "batch-size": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 
128.0, 128.0, 128.0, 128.0]}, "batch-size vs samples": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0]}, "lm loss": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [10.85966, 10.87073, 10.85528, 10.80344, 10.64111, 10.62649, 10.41586, 10.12808, 9.92567, 9.82477, 9.56932, 9.84031, 9.86916, 9.61422, 9.77599, 9.50086, 9.45226, 9.6411, 9.38013, 9.32634, 9.2385, 9.14186, 9.17287, 8.9927, 9.18814, 9.05768, 9.15476, 9.16458, 9.29864, 8.98678, 8.93067, 9.0473, 9.04611, 8.65648, 8.71651, 8.75511, 8.6848, 8.73632, 8.66102, 8.76482, 8.66202, 8.84911, 8.83074, 8.49813, 8.38745, 8.42847, 8.49038, 8.38199, 8.43014, 8.57752, 8.36366, 8.18998, 8.22416, 8.21877, 8.26315, 7.90938, 8.09005, 7.88773, 8.24, 8.22485, 7.99867, 
7.95704, 7.91177, 7.73255, 7.73299, 7.63614, 7.50837, 7.90027, 7.69288, 7.44749, 7.73489, 7.76278, 7.53675, 7.29662, 7.44913, 7.33262, 7.46188, 7.22442, 7.63668, 7.27892, 7.3525, 7.21173, 7.21816, 7.422, 7.17639, 7.28501, 7.00259, 7.00597, 7.03995, 7.14192, 6.82608, 6.98941, 7.09192, 7.00491, 6.87719, 6.75925, 6.994, 7.05741, 6.70391, 6.57997, 6.72686, 6.74254, 6.73498, 6.73924, 6.65693, 6.40819, 6.63945, 6.61998, 6.44777, 6.63026, 6.7458, 6.60872, 6.72566, 6.6941, 6.62478, 6.5113, 6.60016, 6.40683, 6.66647, 6.25038, 6.25487, 6.30344, 6.39244, 6.35319, 6.45279, 6.29501, 6.34432, 6.24122, 6.20479, 6.40226, 6.3298, 6.33253, 6.17365, 6.1703, 6.25122, 6.39707, 6.21313, 6.16095, 6.19193, 6.12904, 6.07716, 6.08434, 6.27156, 6.42116, 6.27092, 6.31502, 6.1099, 6.19051, 6.01202, 6.04186, 5.96572, 6.2566, 6.1994, 5.97238, 5.79066, 6.13517, 5.8567, 6.11381, 5.79621, 6.16806, 6.15725, 6.09481, 5.94172, 6.12313, 5.95406, 6.20205, 5.90266, 5.80426, 5.78673, 5.69691, 6.02057, 6.00205, 6.07073, 5.89354, 6.04415, 5.97229, 5.99763, 5.99201, 5.9504, 5.83989, 5.95152, 5.61741, 5.70128, 5.88995, 5.84414, 5.86222, 5.76021, 5.83835, 5.72362, 5.56328, 5.72206, 5.62699, 5.83296, 5.60473, 5.71241, 5.71399, 5.89863, 5.64481, 5.85045, 5.74116, 5.86786, 5.33069, 5.89739, 5.87147, 5.85621, 5.41402, 5.40885, 5.6244, 5.5909, 5.48288, 5.57328, 5.66993, 5.47325, 5.74532, 5.50733, 5.58951, 5.62335, 5.61873, 5.50712, 5.61686, 5.67259, 5.68325, 5.58652, 5.65724, 5.37154, 5.68206, 5.62545, 5.42293, 5.5898, 5.63487, 5.55215, 5.34318, 5.53918, 5.48775, 5.48384, 5.38046, 5.5524, 5.6054, 5.39011, 5.52269, 5.48564, 5.33339, 5.50751, 5.41235, 5.44463, 5.32284, 5.07354, 5.47834, 5.57158, 5.71691, 5.41899, 5.60533, 5.64283, 5.2342, 5.27417, 5.39872, 5.39954, 5.33267, 5.50546, 5.18598, 5.3031, 5.25146, 5.37886, 5.25856, 5.45542, 5.53656, 5.3141, 5.4389, 5.34171, 5.07715, 5.31356, 5.26151, 5.30932, 5.1132, 5.27888, 5.26913, 5.47802, 5.16411, 5.27179, 5.21046, 5.36047, 4.98558, 4.92161, 5.33001, 5.39104, 5.23106, 5.32226, 5.1108, 5.16307, 5.26011, 5.06878, 5.26621, 5.0712, 5.34447, 5.24947, 5.15197, 5.24511, 5.04213, 5.3173, 5.05677, 5.03031, 5.14366, 5.11315, 5.27152, 5.15384, 5.27818, 5.09471, 5.09718, 5.25022, 5.32221, 5.25368, 5.19177, 5.14141, 5.29041, 4.95105, 5.2074, 5.08987, 5.30215, 5.17471, 5.18799, 5.1137, 4.98327, 4.99184, 5.2222, 5.31185, 5.09737, 5.05507, 4.91447, 5.12386, 5.11467, 4.92535, 5.33586, 5.02667, 5.10506, 5.16491, 5.00221, 5.06296, 5.06915, 4.9949, 5.07922, 5.16029, 4.97927, 5.18201, 4.92792, 4.92204, 5.06399, 4.99471, 4.90735, 4.77765, 4.94535, 5.11795, 5.01969, 5.02225, 5.33057, 4.96058, 4.9931, 5.0457, 4.81181, 4.74328, 4.99687, 5.0383, 4.87423, 4.95276, 5.04325, 5.02264, 4.81956, 4.89599, 4.90754, 4.8294, 4.74438, 5.01179, 4.75262, 5.2095, 4.78557, 4.99344, 4.73813, 4.78739, 4.82401, 4.64885, 4.65631, 4.84474, 4.80822, 4.80327, 4.92878, 4.88473, 4.93264, 4.7706, 4.88531, 4.73767, 4.91524, 4.95719, 4.87814, 4.70608, 4.7878, 4.89822, 4.71172, 4.87123, 4.69258, 4.69633, 4.64631]}, "lm loss vs samples": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [10.85966, 10.87073, 10.85528, 10.80344, 10.64111, 10.62649, 10.41586, 10.12808, 9.92567, 9.82477, 9.56932, 9.84031, 9.86916, 9.61422, 9.77599, 9.50086, 9.45226, 9.6411, 9.38013, 9.32634, 9.2385, 9.14186, 9.17287, 8.9927, 9.18814, 9.05768, 9.15476, 9.16458, 9.29864, 8.98678, 8.93067, 9.0473, 9.04611, 8.65648, 8.71651, 8.75511, 8.6848, 8.73632, 8.66102, 8.76482, 8.66202, 8.84911, 8.83074, 8.49813, 8.38745, 8.42847, 8.49038, 8.38199, 8.43014, 
8.57752, 8.36366, 8.18998, 8.22416, 8.21877, 8.26315, 7.90938, 8.09005, 7.88773, 8.24, 8.22485, 7.99867, 7.95704, 7.91177, 7.73255, 7.73299, 7.63614, 7.50837, 7.90027, 7.69288, 7.44749, 7.73489, 7.76278, 7.53675, 7.29662, 7.44913, 7.33262, 7.46188, 7.22442, 7.63668, 7.27892, 7.3525, 7.21173, 7.21816, 7.422, 7.17639, 7.28501, 7.00259, 7.00597, 7.03995, 7.14192, 6.82608, 6.98941, 7.09192, 7.00491, 6.87719, 6.75925, 6.994, 7.05741, 6.70391, 6.57997, 6.72686, 6.74254, 6.73498, 6.73924, 6.65693, 6.40819, 6.63945, 6.61998, 6.44777, 6.63026, 6.7458, 6.60872, 6.72566, 6.6941, 6.62478, 6.5113, 6.60016, 6.40683, 6.66647, 6.25038, 6.25487, 6.30344, 6.39244, 6.35319, 6.45279, 6.29501, 6.34432, 6.24122, 6.20479, 6.40226, 6.3298, 6.33253, 6.17365, 6.1703, 6.25122, 6.39707, 6.21313, 6.16095, 6.19193, 6.12904, 6.07716, 6.08434, 6.27156, 6.42116, 6.27092, 6.31502, 6.1099, 6.19051, 6.01202, 6.04186, 5.96572, 6.2566, 6.1994, 5.97238, 5.79066, 6.13517, 5.8567, 6.11381, 5.79621, 6.16806, 6.15725, 6.09481, 5.94172, 6.12313, 5.95406, 6.20205, 5.90266, 5.80426, 5.78673, 5.69691, 6.02057, 6.00205, 6.07073, 5.89354, 6.04415, 5.97229, 5.99763, 5.99201, 5.9504, 5.83989, 5.95152, 5.61741, 5.70128, 5.88995, 5.84414, 5.86222, 5.76021, 5.83835, 5.72362, 5.56328, 5.72206, 5.62699, 5.83296, 5.60473, 5.71241, 5.71399, 5.89863, 5.64481, 5.85045, 5.74116, 5.86786, 5.33069, 5.89739, 5.87147, 5.85621, 5.41402, 5.40885, 5.6244, 5.5909, 5.48288, 5.57328, 5.66993, 5.47325, 5.74532, 5.50733, 5.58951, 5.62335, 5.61873, 5.50712, 5.61686, 5.67259, 5.68325, 5.58652, 5.65724, 5.37154, 5.68206, 5.62545, 5.42293, 5.5898, 5.63487, 5.55215, 5.34318, 5.53918, 5.48775, 5.48384, 5.38046, 5.5524, 5.6054, 5.39011, 5.52269, 5.48564, 5.33339, 5.50751, 5.41235, 5.44463, 5.32284, 5.07354, 5.47834, 5.57158, 5.71691, 5.41899, 5.60533, 5.64283, 5.2342, 5.27417, 5.39872, 5.39954, 5.33267, 5.50546, 5.18598, 5.3031, 5.25146, 5.37886, 5.25856, 5.45542, 5.53656, 5.3141, 5.4389, 5.34171, 5.07715, 5.31356, 5.26151, 5.30932, 5.1132, 5.27888, 5.26913, 5.47802, 5.16411, 5.27179, 5.21046, 5.36047, 4.98558, 4.92161, 5.33001, 5.39104, 5.23106, 5.32226, 5.1108, 5.16307, 5.26011, 5.06878, 5.26621, 5.0712, 5.34447, 5.24947, 5.15197, 5.24511, 5.04213, 5.3173, 5.05677, 5.03031, 5.14366, 5.11315, 5.27152, 5.15384, 5.27818, 5.09471, 5.09718, 5.25022, 5.32221, 5.25368, 5.19177, 5.14141, 5.29041, 4.95105, 5.2074, 5.08987, 5.30215, 5.17471, 5.18799, 5.1137, 4.98327, 4.99184, 5.2222, 5.31185, 5.09737, 5.05507, 4.91447, 5.12386, 5.11467, 4.92535, 5.33586, 5.02667, 5.10506, 5.16491, 5.00221, 5.06296, 5.06915, 4.9949, 5.07922, 5.16029, 4.97927, 5.18201, 4.92792, 4.92204, 5.06399, 4.99471, 4.90735, 4.77765, 4.94535, 5.11795, 5.01969, 5.02225, 5.33057, 4.96058, 4.9931, 5.0457, 4.81181, 4.74328, 4.99687, 5.0383, 4.87423, 4.95276, 5.04325, 5.02264, 4.81956, 4.89599, 4.90754, 4.8294, 4.74438, 5.01179, 4.75262, 5.2095, 4.78557, 4.99344, 4.73813, 4.78739, 4.82401, 4.64885, 4.65631, 4.84474, 4.80822, 4.80327, 4.92878, 4.88473, 4.93264, 4.7706, 4.88531, 4.73767, 4.91524, 4.95719, 4.87814, 4.70608, 4.7878, 4.89822, 4.71172, 4.87123, 4.69258, 4.69633, 4.64631]}, "loss-scale": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 
1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]}, "loss-scale vs samples": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 
1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]}, "grad-norm": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [13.56517, 13.52183, 13.82389, 12.68199, 12.11513, 9.42628, 6.78009, 6.96682, 6.03524, 4.63457, 4.1513, 2.87067, 2.35463, 2.3279, 2.02459, 2.22441, 2.16108, 1.87618, 2.21105, 2.06296, 2.12729, 2.152, 2.00687, 2.2248, 1.98285, 2.1147, 1.92124, 1.92395, 1.94527, 2.15653, 2.0865, 1.94545, 1.87214, 2.15774, 2.14492, 2.10813, 1.99702, 1.84398, 1.93326, 1.73194, 2.15655, 1.83365, 1.74796, 1.87637, 1.87935, 1.82812, 1.70882, 1.75031, 1.75541, 1.56033, 1.72362, 1.80715, 1.77318, 1.81611, 1.66844, 1.80559, 1.7625, 1.84598, 1.62632, 1.48661, 1.64786, 1.45473, 1.77763, 1.80854, 1.64942, 1.65627, 1.70353, 1.60171, 1.44031, 1.72339, 1.43433, 1.37767, 1.68581, 1.37671, 1.40648, 1.61691, 1.50881, 1.38382, 1.44532, 1.27357, 1.36667, 1.33118, 1.30365, 1.39513, 1.39043, 1.4631, 1.55974, 1.45774, 1.22995, 1.11972, 1.09726, 1.20059, 1.10224, 1.31175, 1.01034, 1.30362, 1.38885, 1.05046, 0.94787, 1.76252, 1.11012, 1.2148, 1.71468, 1.62278, 0.95552, 1.16789, 1.17655, 1.03922, 1.21282, 1.1032, 0.98669, 0.95678, 1.1193, 1.05737, 1.01498, 1.16799, 0.97578, 1.42941, 1.13594, 1.05985, 0.9398, 1.10182, 1.02064, 1.3517, 1.44708, 2.04415, 1.69036, 1.40806, 1.38738, 1.3424, 0.99552, 1.67778, 1.38915, 1.16703, 1.21285, 1.27027, 1.08112, 1.56529, 1.11243, 1.55047, 1.88478, 1.49661, 1.24747, 1.30858, 1.0413, 1.79193, 1.1894, 1.10832, 1.14553, 1.37473, 1.12916, 1.19043, 1.55147, 1.14787, 0.9831, 1.97748, 1.30968, 1.75548, 1.42903, 1.47772, 1.63806, 1.08487, 1.3989, 1.02365, 1.24838, 1.43469, 1.42662, 1.30881, 1.20964, 1.49347, 1.21919, 1.05332, 1.18399, 1.38555, 1.13727, 1.36432, 1.2528, 1.17022, 1.32348, 1.07935, 1.19539, 1.48684, 1.19029, 1.2198, 1.81559, 1.52452, 1.79334, 1.66013, 1.20616, 1.67532, 1.19437, 1.28, 1.33364, 1.69679, 1.53842, 1.37202, 1.34387, 1.37081, 1.28649, 1.5618, 1.03326, 1.39685, 1.27238, 1.20598, 1.32922, 1.41054, 1.32813, 1.46075, 1.18533, 1.18314, 1.37783, 1.39264, 1.2322, 1.35301, 1.51994, 1.29479, 1.54145, 1.57876, 1.23038, 1.67935, 1.59903, 1.7688, 1.38891, 1.39714, 1.41056, 1.56263, 1.84649, 1.31226, 2.25632, 1.5966, 1.20159, 1.49708, 1.73963, 1.47932, 1.74434, 1.84578, 1.28148, 1.58712, 1.57826, 1.14575, 1.37743, 1.14726, 1.36495, 1.54092, 1.1998, 1.83908, 1.60608, 1.22735, 1.39352, 1.48052, 1.44922, 1.5986, 1.86828, 1.2133, 1.28534, 1.44591, 1.40707, 1.6217, 1.68123, 1.16996, 1.40545, 1.79994, 1.32408, 1.35454, 1.82216, 1.50619, 1.25331, 1.36593, 1.33067, 1.20379, 1.1715, 1.34612, 1.23828, 1.2249, 1.23199, 1.50931, 1.24187, 1.31666, 1.33544, 1.15247, 1.35164, 1.31814, 1.51121, 1.22179, 1.26518, 1.48248, 1.47105, 2.08081, 1.48841, 1.53234, 1.46321, 1.4755, 1.16048, 1.44268, 1.5642, 1.52523, 1.38495, 1.80119, 1.63483, 1.41261, 1.60553, 1.28802, 1.15347, 1.54912, 1.53753, 1.36296, 1.66631, 1.63888, 1.24348, 1.42956, 1.32686, 1.487, 1.7063, 1.383, 1.67566, 1.4665, 1.41433, 1.44807, 1.36307, 1.13744, 1.63129, 1.56395, 1.59787, 1.49857, 1.45091, 1.60777, 1.36633, 1.34096, 1.63579, 1.34741, 1.48819, 1.66258, 1.532, 1.46235, 1.36272, 1.36735, 1.33239, 1.3176, 1.2966, 1.56971, 1.31551, 1.50053, 1.27598, 1.29926, 1.5045, 1.39074, 1.41138, 1.40198, 1.46432, 1.38696, 1.52639, 1.55526, 1.4432, 1.27923, 1.48503, 1.17404, 1.20825, 1.60545, 1.81024, 1.35059, 1.28697, 1.50174, 1.46699, 1.33784, 1.08159, 1.61115, 1.46019, 1.37898, 1.35614, 1.65157, 
1.46597, 1.60688, 1.72399, 1.30124, 1.44364, 1.32297, 1.13212, 1.45342, 1.38164, 1.21948, 1.26404, 1.33477, 1.30704, 1.51357, 1.26848, 1.55252, 1.33368, 1.41811, 1.47778, 1.31706, 1.20105, 1.48475, 1.28543, 1.46568, 1.42638, 1.25259, 1.60254, 1.36812, 1.3586, 1.15672]}, "grad-norm vs samples": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [13.56517, 13.52183, 13.82389, 12.68199, 12.11513, 9.42628, 6.78009, 6.96682, 6.03524, 4.63457, 4.1513, 2.87067, 2.35463, 2.3279, 2.02459, 2.22441, 2.16108, 1.87618, 2.21105, 2.06296, 2.12729, 2.152, 2.00687, 2.2248, 1.98285, 2.1147, 1.92124, 1.92395, 1.94527, 2.15653, 2.0865, 1.94545, 1.87214, 2.15774, 2.14492, 2.10813, 1.99702, 1.84398, 1.93326, 1.73194, 2.15655, 1.83365, 1.74796, 1.87637, 1.87935, 1.82812, 1.70882, 1.75031, 1.75541, 1.56033, 1.72362, 1.80715, 1.77318, 1.81611, 1.66844, 1.80559, 1.7625, 1.84598, 1.62632, 1.48661, 1.64786, 1.45473, 1.77763, 1.80854, 1.64942, 1.65627, 1.70353, 1.60171, 1.44031, 1.72339, 1.43433, 1.37767, 1.68581, 1.37671, 1.40648, 1.61691, 1.50881, 1.38382, 1.44532, 1.27357, 1.36667, 1.33118, 1.30365, 1.39513, 1.39043, 1.4631, 1.55974, 1.45774, 1.22995, 1.11972, 1.09726, 1.20059, 1.10224, 1.31175, 1.01034, 1.30362, 1.38885, 1.05046, 0.94787, 1.76252, 1.11012, 1.2148, 1.71468, 1.62278, 0.95552, 1.16789, 1.17655, 1.03922, 1.21282, 1.1032, 0.98669, 0.95678, 1.1193, 1.05737, 1.01498, 1.16799, 0.97578, 1.42941, 1.13594, 1.05985, 0.9398, 1.10182, 1.02064, 1.3517, 1.44708, 2.04415, 1.69036, 1.40806, 1.38738, 1.3424, 0.99552, 1.67778, 1.38915, 1.16703, 1.21285, 1.27027, 1.08112, 1.56529, 1.11243, 1.55047, 1.88478, 1.49661, 1.24747, 1.30858, 1.0413, 1.79193, 1.1894, 1.10832, 1.14553, 1.37473, 1.12916, 1.19043, 1.55147, 1.14787, 0.9831, 1.97748, 1.30968, 1.75548, 1.42903, 1.47772, 1.63806, 1.08487, 1.3989, 1.02365, 1.24838, 1.43469, 1.42662, 1.30881, 1.20964, 1.49347, 1.21919, 1.05332, 1.18399, 1.38555, 1.13727, 1.36432, 1.2528, 1.17022, 1.32348, 1.07935, 1.19539, 1.48684, 1.19029, 1.2198, 1.81559, 1.52452, 1.79334, 1.66013, 1.20616, 1.67532, 1.19437, 1.28, 1.33364, 1.69679, 1.53842, 1.37202, 1.34387, 1.37081, 1.28649, 1.5618, 1.03326, 1.39685, 1.27238, 1.20598, 1.32922, 1.41054, 1.32813, 1.46075, 1.18533, 1.18314, 1.37783, 1.39264, 1.2322, 1.35301, 1.51994, 1.29479, 1.54145, 1.57876, 1.23038, 1.67935, 1.59903, 1.7688, 1.38891, 1.39714, 1.41056, 1.56263, 1.84649, 1.31226, 2.25632, 1.5966, 1.20159, 1.49708, 1.73963, 1.47932, 1.74434, 1.84578, 1.28148, 1.58712, 1.57826, 1.14575, 1.37743, 1.14726, 1.36495, 1.54092, 1.1998, 1.83908, 1.60608, 1.22735, 1.39352, 1.48052, 1.44922, 1.5986, 1.86828, 1.2133, 1.28534, 1.44591, 1.40707, 1.6217, 1.68123, 1.16996, 1.40545, 1.79994, 1.32408, 1.35454, 1.82216, 1.50619, 1.25331, 1.36593, 1.33067, 1.20379, 1.1715, 1.34612, 1.23828, 1.2249, 1.23199, 1.50931, 1.24187, 1.31666, 1.33544, 1.15247, 1.35164, 1.31814, 1.51121, 1.22179, 1.26518, 1.48248, 1.47105, 2.08081, 1.48841, 1.53234, 1.46321, 1.4755, 1.16048, 1.44268, 1.5642, 1.52523, 1.38495, 1.80119, 1.63483, 1.41261, 1.60553, 1.28802, 1.15347, 1.54912, 1.53753, 1.36296, 1.66631, 1.63888, 1.24348, 1.42956, 1.32686, 1.487, 1.7063, 1.383, 1.67566, 1.4665, 1.41433, 1.44807, 1.36307, 1.13744, 1.63129, 1.56395, 1.59787, 1.49857, 1.45091, 1.60777, 1.36633, 1.34096, 1.63579, 1.34741, 1.48819, 1.66258, 1.532, 1.46235, 1.36272, 1.36735, 1.33239, 1.3176, 1.2966, 1.56971, 1.31551, 1.50053, 1.27598, 1.29926, 1.5045, 1.39074, 1.41138, 1.40198, 1.46432, 1.38696, 1.52639, 1.55526, 1.4432, 1.27923, 1.48503, 1.17404, 1.20825, 1.60545, 1.81024, 
1.35059, 1.28697, 1.50174, 1.46699, 1.33784, 1.08159, 1.61115, 1.46019, 1.37898, 1.35614, 1.65157, 1.46597, 1.60688, 1.72399, 1.30124, 1.44364, 1.32297, 1.13212, 1.45342, 1.38164, 1.21948, 1.26404, 1.33477, 1.30704, 1.51357, 1.26848, 1.55252, 1.33368, 1.41811, 1.47778, 1.31706, 1.20105, 1.48475, 1.28543, 1.46568, 1.42638, 1.25259, 1.60254, 1.36812, 1.3586, 1.15672]}, "num-zeros": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [78.0, 71.0, 69.0, 77.0, 83.0, 93.0, 106.0, 92.0, 92.0, 132.0, 100.0, 151.0, 124.0, 174.0, 156.0, 150.0, 169.0, 195.0, 167.0, 147.0, 152.0, 152.0, 200.0, 189.0, 169.0, 153.0, 197.0, 164.0, 147.0, 172.0, 144.0, 157.0, 169.0, 165.0, 146.0, 179.0, 172.0, 212.0, 186.0, 196.0, 171.0, 138.0, 152.0, 197.0, 156.0, 167.0, 212.0, 178.0, 187.0, 180.0, 190.0, 159.0, 176.0, 163.0, 179.0, 191.0, 150.0, 150.0, 227.0, 225.0, 197.0, 184.0, 184.0, 199.0, 214.0, 235.0, 186.0, 197.0, 214.0, 222.0, 193.0, 241.0, 159.0, 264.0, 193.0, 187.0, 201.0, 208.0, 227.0, 223.0, 225.0, 212.0, 231.0, 219.0, 202.0, 196.0, 178.0, 182.0, 185.0, 210.0, 201.0, 198.0, 213.0, 214.0, 205.0, 161.0, 183.0, 193.0, 198.0, 178.0, 190.0, 166.0, 137.0, 154.0, 183.0, 150.0, 165.0, 166.0, 127.0, 174.0, 160.0, 171.0, 188.0, 172.0, 159.0, 152.0, 151.0, 127.0, 137.0, 145.0, 172.0, 135.0, 151.0, 158.0, 141.0, 113.0, 114.0, 93.0, 113.0, 128.0, 148.0, 125.0, 114.0, 127.0, 121.0, 117.0, 146.0, 116.0, 148.0, 137.0, 108.0, 114.0, 129.0, 141.0, 130.0, 107.0, 113.0, 126.0, 130.0, 102.0, 127.0, 110.0, 108.0, 109.0, 112.0, 65.0, 98.0, 84.0, 105.0, 108.0, 95.0, 135.0, 103.0, 123.0, 101.0, 102.0, 101.0, 117.0, 109.0, 106.0, 123.0, 114.0, 102.0, 88.0, 131.0, 104.0, 116.0, 108.0, 142.0, 118.0, 121.0, 115.0, 118.0, 115.0, 106.0, 119.0, 105.0, 84.0, 106.0, 91.0, 120.0, 114.0, 140.0, 96.0, 85.0, 100.0, 114.0, 103.0, 153.0, 88.0, 120.0, 96.0, 122.0, 111.0, 89.0, 107.0, 111.0, 97.0, 128.0, 103.0, 123.0, 90.0, 94.0, 82.0, 100.0, 109.0, 112.0, 104.0, 119.0, 90.0, 77.0, 114.0, 82.0, 103.0, 104.0, 104.0, 97.0, 127.0, 67.0, 99.0, 126.0, 90.0, 84.0, 109.0, 94.0, 97.0, 107.0, 113.0, 127.0, 100.0, 115.0, 102.0, 96.0, 116.0, 125.0, 102.0, 91.0, 126.0, 114.0, 101.0, 113.0, 110.0, 96.0, 126.0, 121.0, 99.0, 104.0, 108.0, 86.0, 143.0, 120.0, 83.0, 115.0, 92.0, 73.0, 113.0, 117.0, 111.0, 93.0, 106.0, 131.0, 93.0, 121.0, 109.0, 108.0, 115.0, 117.0, 116.0, 105.0, 110.0, 103.0, 112.0, 85.0, 118.0, 126.0, 119.0, 120.0, 104.0, 112.0, 111.0, 108.0, 107.0, 126.0, 123.0, 100.0, 81.0, 101.0, 106.0, 93.0, 109.0, 104.0, 131.0, 134.0, 98.0, 105.0, 129.0, 83.0, 87.0, 128.0, 116.0, 114.0, 111.0, 94.0, 114.0, 91.0, 97.0, 93.0, 116.0, 135.0, 122.0, 111.0, 126.0, 107.0, 107.0, 101.0, 82.0, 120.0, 142.0, 124.0, 120.0, 124.0, 122.0, 97.0, 96.0, 107.0, 102.0, 123.0, 115.0, 126.0, 116.0, 122.0, 115.0, 107.0, 111.0, 95.0, 93.0, 113.0, 117.0, 101.0, 110.0, 126.0, 113.0, 112.0, 127.0, 138.0, 118.0, 133.0, 94.0, 105.0, 119.0, 121.0, 122.0, 102.0, 98.0, 119.0, 103.0, 108.0, 134.0, 116.0, 107.0, 105.0, 99.0, 99.0, 117.0, 106.0, 133.0, 108.0, 110.0, 99.0, 140.0, 107.0, 104.0, 114.0, 112.0, 117.0, 106.0, 105.0, 92.0, 111.0, 99.0, 124.0, 101.0, 102.0, 144.0, 129.0, 122.0, 110.0, 116.0, 123.0, 136.0]}, "num-zeros vs samples": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [78.0, 71.0, 69.0, 77.0, 83.0, 93.0, 106.0, 92.0, 92.0, 132.0, 100.0, 151.0, 124.0, 174.0, 156.0, 150.0, 169.0, 195.0, 167.0, 147.0, 152.0, 152.0, 200.0, 189.0, 169.0, 153.0, 197.0, 164.0, 147.0, 172.0, 144.0, 157.0, 169.0, 165.0, 146.0, 179.0, 172.0, 212.0, 186.0, 196.0, 171.0, 
138.0, 152.0, 197.0, 156.0, 167.0, 212.0, 178.0, 187.0, 180.0, 190.0, 159.0, 176.0, 163.0, 179.0, 191.0, 150.0, 150.0, 227.0, 225.0, 197.0, 184.0, 184.0, 199.0, 214.0, 235.0, 186.0, 197.0, 214.0, 222.0, 193.0, 241.0, 159.0, 264.0, 193.0, 187.0, 201.0, 208.0, 227.0, 223.0, 225.0, 212.0, 231.0, 219.0, 202.0, 196.0, 178.0, 182.0, 185.0, 210.0, 201.0, 198.0, 213.0, 214.0, 205.0, 161.0, 183.0, 193.0, 198.0, 178.0, 190.0, 166.0, 137.0, 154.0, 183.0, 150.0, 165.0, 166.0, 127.0, 174.0, 160.0, 171.0, 188.0, 172.0, 159.0, 152.0, 151.0, 127.0, 137.0, 145.0, 172.0, 135.0, 151.0, 158.0, 141.0, 113.0, 114.0, 93.0, 113.0, 128.0, 148.0, 125.0, 114.0, 127.0, 121.0, 117.0, 146.0, 116.0, 148.0, 137.0, 108.0, 114.0, 129.0, 141.0, 130.0, 107.0, 113.0, 126.0, 130.0, 102.0, 127.0, 110.0, 108.0, 109.0, 112.0, 65.0, 98.0, 84.0, 105.0, 108.0, 95.0, 135.0, 103.0, 123.0, 101.0, 102.0, 101.0, 117.0, 109.0, 106.0, 123.0, 114.0, 102.0, 88.0, 131.0, 104.0, 116.0, 108.0, 142.0, 118.0, 121.0, 115.0, 118.0, 115.0, 106.0, 119.0, 105.0, 84.0, 106.0, 91.0, 120.0, 114.0, 140.0, 96.0, 85.0, 100.0, 114.0, 103.0, 153.0, 88.0, 120.0, 96.0, 122.0, 111.0, 89.0, 107.0, 111.0, 97.0, 128.0, 103.0, 123.0, 90.0, 94.0, 82.0, 100.0, 109.0, 112.0, 104.0, 119.0, 90.0, 77.0, 114.0, 82.0, 103.0, 104.0, 104.0, 97.0, 127.0, 67.0, 99.0, 126.0, 90.0, 84.0, 109.0, 94.0, 97.0, 107.0, 113.0, 127.0, 100.0, 115.0, 102.0, 96.0, 116.0, 125.0, 102.0, 91.0, 126.0, 114.0, 101.0, 113.0, 110.0, 96.0, 126.0, 121.0, 99.0, 104.0, 108.0, 86.0, 143.0, 120.0, 83.0, 115.0, 92.0, 73.0, 113.0, 117.0, 111.0, 93.0, 106.0, 131.0, 93.0, 121.0, 109.0, 108.0, 115.0, 117.0, 116.0, 105.0, 110.0, 103.0, 112.0, 85.0, 118.0, 126.0, 119.0, 120.0, 104.0, 112.0, 111.0, 108.0, 107.0, 126.0, 123.0, 100.0, 81.0, 101.0, 106.0, 93.0, 109.0, 104.0, 131.0, 134.0, 98.0, 105.0, 129.0, 83.0, 87.0, 128.0, 116.0, 114.0, 111.0, 94.0, 114.0, 91.0, 97.0, 93.0, 116.0, 135.0, 122.0, 111.0, 126.0, 107.0, 107.0, 101.0, 82.0, 120.0, 142.0, 124.0, 120.0, 124.0, 122.0, 97.0, 96.0, 107.0, 102.0, 123.0, 115.0, 126.0, 116.0, 122.0, 115.0, 107.0, 111.0, 95.0, 93.0, 113.0, 117.0, 101.0, 110.0, 126.0, 113.0, 112.0, 127.0, 138.0, 118.0, 133.0, 94.0, 105.0, 119.0, 121.0, 122.0, 102.0, 98.0, 119.0, 103.0, 108.0, 134.0, 116.0, 107.0, 105.0, 99.0, 99.0, 117.0, 106.0, 133.0, 108.0, 110.0, 99.0, 140.0, 107.0, 104.0, 114.0, 112.0, 117.0, 106.0, 105.0, 92.0, 111.0, 99.0, 124.0, 101.0, 102.0, 144.0, 129.0, 122.0, 110.0, 116.0, 123.0, 136.0]}, "params-norm": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [179.94354, 179.94354, 179.94354, 179.94353, 179.94351, 179.94351, 179.9435, 179.94337, 179.94319, 179.94301, 179.94168, 179.94092, 179.94034, 179.9382, 179.93718, 179.93637, 179.93611, 179.93633, 179.93683, 179.93695, 179.93684, 179.93649, 179.9361, 179.93663, 179.93771, 179.93913, 179.94032, 179.94113, 179.94214, 179.94365, 179.94586, 179.94824, 179.95052, 179.95296, 179.95572, 179.95921, 179.96291, 179.96681, 179.97093, 179.97545, 179.98062, 179.98616, 179.99197, 179.99846, 180.00552, 180.01314, 180.02119, 180.03004, 180.0396, 180.05011, 180.06131, 180.07315, 180.08542, 180.0985, 180.11215, 180.12645, 180.14087, 180.15598, 180.17198, 180.18895, 180.20711, 180.22621, 180.24666, 180.26831, 180.28981, 180.31268, 180.33565, 180.35945, 180.38472, 180.41133, 180.43765, 180.46451, 180.49187, 180.51939, 180.54758, 180.57634, 180.60477, 180.63396, 180.66389, 180.69472, 180.72603, 180.7572, 180.78957, 180.823, 180.85631, 180.88991, 180.92371, 180.95706, 180.99092, 181.02626, 181.06326, 181.10162, 181.1391, 
181.17641, 181.21402, 181.25211, 181.28955, 181.32634, 181.36447, 181.40189, 181.4381, 181.47331, 181.50807, 181.54071, 181.57346, 181.60866, 181.64577, 181.68417, 181.72168, 181.75914, 181.79767, 181.83748, 181.87747, 181.91742, 181.95695, 181.99832, 182.03812, 182.07738, 182.11449, 182.15204, 182.19035, 182.22978, 182.2695, 182.31001, 182.34891, 182.38696, 182.42218, 182.45525, 182.48941, 182.52226, 182.55621, 182.58896, 182.62086, 182.65288, 182.68657, 182.72272, 182.76212, 182.80115, 182.83951, 182.87524, 182.90919, 182.94313, 182.97842, 183.01477, 183.0529, 183.09117, 183.127, 183.16306, 183.20122, 183.24178, 183.28111, 183.32036, 183.35971, 183.3998, 183.43983, 183.47787, 183.51186, 183.54558, 183.57816, 183.6123, 183.64774, 183.68333, 183.72012, 183.75874, 183.79793, 183.83867, 183.87993, 183.92157, 183.96465, 184.00539, 184.04436, 184.0843, 184.12569, 184.16653, 184.20705, 184.24741, 184.28691, 184.32756, 184.36906, 184.41148, 184.45378, 184.4951, 184.53712, 184.57993, 184.62045, 184.65775, 184.69293, 184.72659, 184.76007, 184.79503, 184.83018, 184.86899, 184.90979, 184.95056, 184.99091, 185.03053, 185.07204, 185.11502, 185.15868, 185.20329, 185.24709, 185.29115, 185.33409, 185.37717, 185.4185, 185.45804, 185.49718, 185.53632, 185.57599, 185.61728, 185.65776, 185.69963, 185.74083, 185.78281, 185.82603, 185.86871, 185.91023, 185.94936, 185.98782, 186.0262, 186.06454, 186.10416, 186.14491, 186.1852, 186.2245, 186.26433, 186.30334, 186.34256, 186.38142, 186.41753, 186.45586, 186.49515, 186.5363, 186.57649, 186.61508, 186.65221, 186.6895, 186.72816, 186.76711, 186.80779, 186.84801, 186.88885, 186.93158, 186.97491, 187.01726, 187.06096, 187.10196, 187.14183, 187.18462, 187.22882, 187.27315, 187.31848, 187.36339, 187.40767, 187.45337, 187.49886, 187.54268, 187.58609, 187.62961, 187.67044, 187.71268, 187.75528, 187.79819, 187.84183, 187.88416, 187.92462, 187.96719, 188.0098, 188.0549, 188.10202, 188.14798, 188.19414, 188.23969, 188.28632, 188.33499, 188.38423, 188.43146, 188.47794, 188.52431, 188.57013, 188.61865, 188.66565, 188.71187, 188.75861, 188.80621, 188.85393, 188.90173, 188.94839, 188.99448, 189.04036, 189.08531, 189.13077, 189.17767, 189.22517, 189.27315, 189.32074, 189.36909, 189.41704, 189.46393, 189.5119, 189.5609, 189.61021, 189.66124, 189.71246, 189.76324, 189.81259, 189.86185, 189.91013, 189.96013, 190.0108, 190.061, 190.11232, 190.1635, 190.21367, 190.2627, 190.31346, 190.36389, 190.41492, 190.46727, 190.51939, 190.57338, 190.62749, 190.68044, 190.73311, 190.78491, 190.83577, 190.8877, 190.93848, 190.98965, 191.04053, 191.09221, 191.1438, 191.19595, 191.24683, 191.29836, 191.35121, 191.40576, 191.45865, 191.51144, 191.56329, 191.61534, 191.66661, 191.71944, 191.77365, 191.82733, 191.88013, 191.93358, 191.98837, 192.04231, 192.09724, 192.15228, 192.20715, 192.26242, 192.32021, 192.37662, 192.4319, 192.48772, 192.54413, 192.59987, 192.65529, 192.71152, 192.76802, 192.82562, 192.88312, 192.94026, 192.99599, 193.05467, 193.11278, 193.17015, 193.22783, 193.28326, 193.33839, 193.39395, 193.44897, 193.50545, 193.563, 193.61928, 193.67555, 193.73364, 193.79195, 193.85016, 193.90939, 193.96805, 194.02667, 194.08534, 194.14226, 194.20026, 194.25986, 194.32065, 194.38155, 194.44293, 194.50323, 194.56407, 194.62587, 194.68752, 194.74759, 194.80595, 194.86389, 194.92307, 194.98349]}, "params-norm vs samples": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [179.94354, 179.94354, 179.94354, 179.94353, 179.94351, 179.94351, 179.9435, 179.94337, 179.94319, 179.94301, 
179.94168, 179.94092, 179.94034, 179.9382, 179.93718, 179.93637, 179.93611, 179.93633, 179.93683, 179.93695, 179.93684, 179.93649, 179.9361, 179.93663, 179.93771, 179.93913, 179.94032, 179.94113, 179.94214, 179.94365, 179.94586, 179.94824, 179.95052, 179.95296, 179.95572, 179.95921, 179.96291, 179.96681, 179.97093, 179.97545, 179.98062, 179.98616, 179.99197, 179.99846, 180.00552, 180.01314, 180.02119, 180.03004, 180.0396, 180.05011, 180.06131, 180.07315, 180.08542, 180.0985, 180.11215, 180.12645, 180.14087, 180.15598, 180.17198, 180.18895, 180.20711, 180.22621, 180.24666, 180.26831, 180.28981, 180.31268, 180.33565, 180.35945, 180.38472, 180.41133, 180.43765, 180.46451, 180.49187, 180.51939, 180.54758, 180.57634, 180.60477, 180.63396, 180.66389, 180.69472, 180.72603, 180.7572, 180.78957, 180.823, 180.85631, 180.88991, 180.92371, 180.95706, 180.99092, 181.02626, 181.06326, 181.10162, 181.1391, 181.17641, 181.21402, 181.25211, 181.28955, 181.32634, 181.36447, 181.40189, 181.4381, 181.47331, 181.50807, 181.54071, 181.57346, 181.60866, 181.64577, 181.68417, 181.72168, 181.75914, 181.79767, 181.83748, 181.87747, 181.91742, 181.95695, 181.99832, 182.03812, 182.07738, 182.11449, 182.15204, 182.19035, 182.22978, 182.2695, 182.31001, 182.34891, 182.38696, 182.42218, 182.45525, 182.48941, 182.52226, 182.55621, 182.58896, 182.62086, 182.65288, 182.68657, 182.72272, 182.76212, 182.80115, 182.83951, 182.87524, 182.90919, 182.94313, 182.97842, 183.01477, 183.0529, 183.09117, 183.127, 183.16306, 183.20122, 183.24178, 183.28111, 183.32036, 183.35971, 183.3998, 183.43983, 183.47787, 183.51186, 183.54558, 183.57816, 183.6123, 183.64774, 183.68333, 183.72012, 183.75874, 183.79793, 183.83867, 183.87993, 183.92157, 183.96465, 184.00539, 184.04436, 184.0843, 184.12569, 184.16653, 184.20705, 184.24741, 184.28691, 184.32756, 184.36906, 184.41148, 184.45378, 184.4951, 184.53712, 184.57993, 184.62045, 184.65775, 184.69293, 184.72659, 184.76007, 184.79503, 184.83018, 184.86899, 184.90979, 184.95056, 184.99091, 185.03053, 185.07204, 185.11502, 185.15868, 185.20329, 185.24709, 185.29115, 185.33409, 185.37717, 185.4185, 185.45804, 185.49718, 185.53632, 185.57599, 185.61728, 185.65776, 185.69963, 185.74083, 185.78281, 185.82603, 185.86871, 185.91023, 185.94936, 185.98782, 186.0262, 186.06454, 186.10416, 186.14491, 186.1852, 186.2245, 186.26433, 186.30334, 186.34256, 186.38142, 186.41753, 186.45586, 186.49515, 186.5363, 186.57649, 186.61508, 186.65221, 186.6895, 186.72816, 186.76711, 186.80779, 186.84801, 186.88885, 186.93158, 186.97491, 187.01726, 187.06096, 187.10196, 187.14183, 187.18462, 187.22882, 187.27315, 187.31848, 187.36339, 187.40767, 187.45337, 187.49886, 187.54268, 187.58609, 187.62961, 187.67044, 187.71268, 187.75528, 187.79819, 187.84183, 187.88416, 187.92462, 187.96719, 188.0098, 188.0549, 188.10202, 188.14798, 188.19414, 188.23969, 188.28632, 188.33499, 188.38423, 188.43146, 188.47794, 188.52431, 188.57013, 188.61865, 188.66565, 188.71187, 188.75861, 188.80621, 188.85393, 188.90173, 188.94839, 188.99448, 189.04036, 189.08531, 189.13077, 189.17767, 189.22517, 189.27315, 189.32074, 189.36909, 189.41704, 189.46393, 189.5119, 189.5609, 189.61021, 189.66124, 189.71246, 189.76324, 189.81259, 189.86185, 189.91013, 189.96013, 190.0108, 190.061, 190.11232, 190.1635, 190.21367, 190.2627, 190.31346, 190.36389, 190.41492, 190.46727, 190.51939, 190.57338, 190.62749, 190.68044, 190.73311, 190.78491, 190.83577, 190.8877, 190.93848, 190.98965, 191.04053, 191.09221, 191.1438, 191.19595, 191.24683, 191.29836, 191.35121, 
191.40576, 191.45865, 191.51144, 191.56329, 191.61534, 191.66661, 191.71944, 191.77365, 191.82733, 191.88013, 191.93358, 191.98837, 192.04231, 192.09724, 192.15228, 192.20715, 192.26242, 192.32021, 192.37662, 192.4319, 192.48772, 192.54413, 192.59987, 192.65529, 192.71152, 192.76802, 192.82562, 192.88312, 192.94026, 192.99599, 193.05467, 193.11278, 193.17015, 193.22783, 193.28326, 193.33839, 193.39395, 193.44897, 193.50545, 193.563, 193.61928, 193.67555, 193.73364, 193.79195, 193.85016, 193.90939, 193.96805, 194.02667, 194.08534, 194.14226, 194.20026, 194.25986, 194.32065, 194.38155, 194.44293, 194.50323, 194.56407, 194.62587, 194.68752, 194.74759, 194.80595, 194.86389, 194.92307, 194.98349]}, "iteration-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [23.29918, 0.71187, 0.71207, 0.69449, 0.69446, 0.69443, 0.6988, 0.69196, 0.7146, 0.69983, 0.70196, 0.70471, 0.70358, 0.70105, 0.71451, 0.69917, 0.69866, 0.69442, 0.6948, 0.69086, 0.69495, 0.68836, 0.69965, 0.69226, 0.69484, 0.69875, 0.70073, 0.70246, 0.72083, 0.7009, 0.70048, 0.7008, 0.70366, 0.69412, 0.70178, 0.69908, 0.70543, 0.69424, 0.70464, 0.69955, 0.70803, 0.69841, 0.70257, 0.70418, 0.70875, 0.715, 0.70906, 0.70541, 0.71931, 0.7041, 0.70223, 0.70658, 0.69701, 0.69756, 0.69594, 0.70155, 0.70926, 0.70288, 0.6981, 0.70914, 0.69799, 0.70314, 0.70633, 0.70075, 0.70007, 0.70459, 0.70195, 0.69392, 0.7045, 0.70374, 0.70075, 0.69331, 0.69436, 0.6955, 0.70291, 0.69782, 0.70126, 0.70025, 0.70132, 0.7027, 0.70476, 0.70307, 0.69742, 0.69952, 0.69723, 0.8289, 0.70367, 0.7045, 0.70784, 0.71072, 0.70676, 0.70275, 0.70232, 0.70275, 0.70734, 0.70267, 0.70508, 0.70045, 0.70283, 0.71431, 0.708, 0.70934, 0.70749, 0.71204, 0.70839, 0.70834, 0.70947, 0.70787, 0.70812, 0.70457, 0.70563, 0.69994, 0.70262, 0.69627, 0.69863, 0.69913, 0.71178, 0.71423, 0.70926, 0.70785, 0.70607, 0.70391, 0.71582, 0.71055, 0.71123, 0.70438, 0.71121, 0.71074, 0.70765, 0.70483, 0.70686, 0.71125, 0.70564, 0.70533, 0.7078, 0.70873, 0.70986, 0.70805, 0.70797, 0.71206, 0.70956, 0.70912, 0.71021, 0.70934, 0.70819, 0.70233, 0.70414, 0.70448, 0.70564, 0.7015, 0.70586, 0.70217, 0.7129, 0.70787, 0.7092, 0.71158, 0.7112, 0.71167, 0.70869, 0.70914, 0.70573, 0.7106, 0.70502, 0.70709, 0.70454, 0.70862, 0.70342, 0.70716, 0.70517, 0.70888, 0.71242, 0.71066, 0.71063, 0.70907, 0.71159, 0.71233, 0.7117, 0.7115, 0.70892, 0.71015, 0.71212, 0.70842, 0.70856, 0.71199, 0.71305, 0.71701, 0.71312, 0.71367, 0.71284, 0.70741, 0.70964, 0.70851, 0.71466, 0.70509, 0.72116, 0.72852, 0.71403, 0.70864, 0.70955, 0.7163, 0.6926, 0.70139, 0.71844, 0.70855, 0.71025, 0.71363, 0.7113, 0.7081, 0.71651, 0.71161, 0.7088, 0.70621, 0.76558, 0.71366, 0.71465, 0.70832, 0.71501, 0.71439, 0.70996, 0.71112, 0.71318, 0.71005, 0.71114, 0.70462, 0.71021, 0.71174, 0.71118, 0.70552, 0.70941, 0.71352, 0.70296, 0.7077, 0.71087, 0.70967, 0.71319, 0.70487, 0.71314, 0.71027, 0.71726, 0.70291, 0.70583, 0.70043, 0.71003, 0.70162, 0.71159, 0.70538, 0.70772, 0.7058, 0.70393, 0.70436, 0.70523, 0.7076, 0.70951, 0.7073, 0.70677, 0.70977, 0.70523, 0.70814, 0.70619, 0.71387, 0.71394, 0.71664, 0.709, 0.70954, 0.71091, 0.71119, 0.7066, 0.71015, 0.71379, 0.70807, 0.7089, 0.70687, 0.70782, 0.70284, 0.7093, 0.70472, 0.70627, 0.70878, 0.7131, 0.71354, 0.70817, 0.7085, 0.70989, 0.7104, 0.70981, 0.70998, 0.70926, 0.70687, 0.71184, 0.7147, 0.71202, 0.70554, 0.70696, 0.71095, 0.7109, 0.70487, 0.7074, 0.70395, 0.70783, 0.70406, 0.71161, 0.70987, 0.70579, 0.70936, 0.81441, 0.70896, 0.70653, 0.70759, 0.71046, 0.70652, 0.70807, 0.70162, 
0.70833, 0.70934, 0.70659, 0.71222, 0.71582, 0.71966, 0.71029, 0.70866, 0.70674, 0.71991, 0.7103, 0.70757, 0.71472, 0.70914, 0.71354, 0.8287, 0.71145, 0.70825, 0.71369, 0.71612, 0.71567, 0.71261, 0.71066, 0.70918, 0.70607, 0.70956, 0.72641, 0.7127, 0.71743, 0.70933, 0.71054, 0.70211, 0.7054, 0.70442, 0.712, 0.71222, 0.71615, 0.71003, 0.71338, 0.71009, 0.71334, 0.71107, 0.71501, 0.71714, 0.70686, 0.70974, 0.71546, 0.70423, 0.71293, 0.71055, 0.71309, 0.71563, 0.71163, 0.71034, 0.71044, 0.71, 0.70833, 0.71033, 0.70852, 0.7031, 0.71412, 0.70792, 0.71185, 0.70919, 0.7121, 0.70689, 0.71208, 0.70677, 0.7134, 0.71312, 0.71483, 0.71357, 0.71752, 0.7209, 0.71431, 0.71061, 0.71548, 0.7187, 0.71617, 0.71164, 0.71417, 0.71386, 0.71464, 0.71363, 0.71829, 0.72097, 0.71465, 0.7123]}, "lm loss validation": {"start_step": 0, "end_step": 4, "step_interval": 5, "values": [5.60433]}, "lm loss validation vs samples": {"start_step": 0, "end_step": 4, "step_interval": 5, "values": [5.60433]}, "lm loss validation ppl": {"start_step": 0, "end_step": 4, "step_interval": 5, "values": [271.59912]}, "lm loss validation ppl vs samples": {"start_step": 0, "end_step": 4, "step_interval": 5, "values": [271.59912]}} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp1_pp2_fp8_pp/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp1_pp2_fp8_pp/model_config.yaml new file mode 100644 index 0000000000..48acb1e697 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp1_pp2_fp8_pp/model_config.yaml @@ -0,0 +1,52 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NVTE_FUSED_ATTN: 0 + NCCL_ALGO: Tree + CUBLAS_WORKSPACE_CONFIG: :4096:8 +MODEL_ARGS: + --num-layers: 12 + --hidden-size: 512 + --num-attention-heads: 8 + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --tensorboard-dir: ${TENSORBOARD_PATH} + --micro-batch-size: 2 + --global-batch-size: 128 + --seq-length: 1024 + --max-position-embeddings: 1024 + --train-iters: 2000 + --timing-log-level: 2 + --lr-decay-iters: 320000 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --vocab-file: ${DATA_PATH}/bpe/vocab.json + --merge-file: ${DATA_PATH}/bpe/merges.txt + --split: 949,50,1 + --distributed-backend: nccl + --lr: 0.00015 + --lr-decay-style: cosine + --min-lr: 1.0e-5 + --weight-decay: 1e-2 + --clip-grad: 1.0 + --lr-warmup-fraction: .01 + --log-interval: 1 + --save-interval: 10000 + --eval-interval: 1000 + --eval-iters: 10 + --transformer-impl: transformer_engine + --tensor-model-parallel-size: 1 + --pipeline-model-parallel-size: 2 + --deterministic-mode: true + --no-gradient-accumulation-fusion: true + --fp8-format: hybrid + --fp8-amax-history-len: 1024 + --fp8-amax-compute-algo: max + --attention-softmax-in-fp32: true + --ckpt-format: torch_dist + --data-cache-path: ${DATA_CACHE_PATH} + --bf16: true +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_fp8_tp_pp/golden_values_dev.json b/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_fp8_tp_pp/golden_values_dev.json new file mode 100644 index 0000000000..d314392934 --- /dev/null +++ 
b/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_fp8_tp_pp/golden_values_dev.json @@ -0,0 +1 @@ +{"forward-backward-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [20.88514, 1.46887, 1.45698, 1.45724, 1.47204, 1.4532, 1.46049, 1.46232, 1.46114, 1.45572, 1.45278, 1.45251, 1.4606, 1.45971, 1.45327, 1.45649, 1.45387, 1.44992, 1.45853, 1.46565, 1.45437, 1.4525, 1.45638, 1.45952, 1.45173, 1.46389, 1.45431, 1.45274, 1.4583, 1.45541, 1.44989, 1.45048, 1.44894, 1.45131, 1.45345, 1.44108, 1.44133, 1.44014, 1.45925, 1.44689, 1.44677, 1.45727, 1.45173, 1.45401, 1.46616, 1.45271, 1.45499, 1.46938, 1.4604, 1.4635, 1.4619, 1.46438, 1.45747, 1.46752, 1.45729, 1.46194, 1.46122, 1.46137, 1.46148, 1.46024, 1.45382, 1.46877, 1.45937, 1.46525, 1.46624, 1.46409, 1.4727, 1.46116, 1.46451, 1.4659, 1.45827, 1.45377, 1.47607, 1.46536, 1.45984, 1.46776, 1.47935, 1.47512, 1.47012, 1.47272, 1.47499, 1.47329, 1.4585, 1.45704, 1.4555, 1.46025, 1.46072, 1.45592, 1.45507, 1.45416, 1.45424, 1.46471, 1.45308, 1.45358, 1.45797, 1.46272, 1.45587, 1.47021, 1.47373, 1.47488, 1.45879, 1.45526, 1.46684, 1.45424, 1.46048, 1.45539, 1.45476, 1.46257, 1.46204, 1.4552, 1.46046, 1.45792, 1.45501, 1.46191, 1.47519, 1.45861, 1.46195, 1.4555, 1.46541, 1.45771, 1.45708, 1.46256, 1.46253, 1.45733, 1.46154, 1.46224, 1.45714, 1.46628, 1.462, 1.46251, 1.46041, 1.45921, 1.45844, 1.46129, 1.45453, 1.45615, 1.45383, 1.45915, 1.45368, 1.46097, 1.4609, 1.4519, 1.46109, 1.45906, 1.45677, 1.46323, 1.45746, 1.45755, 1.46188, 1.45867, 1.45807, 1.45578, 1.46681, 1.46385, 1.46569, 1.4551, 1.46369, 1.45943, 1.45524, 1.45829, 1.45857, 1.45785, 1.45457, 1.44886, 1.45654, 1.4591, 1.4583, 1.46482, 1.45668, 1.45572, 1.45853, 1.46203, 1.46116, 1.45964, 1.4598, 1.46157, 1.46339, 1.45804, 1.46302, 1.4604, 1.4681, 1.4619, 1.46043, 1.46458, 1.44955, 1.45921, 1.46214, 1.45918, 1.45767, 1.45627, 1.45501, 1.46271, 1.46011, 1.45047, 1.45537, 1.45774, 1.45791, 1.45844, 1.45736, 1.45685, 1.44897, 1.46515, 1.44824, 1.4544, 1.46501, 1.45918, 1.45782, 1.45713, 1.45546, 1.4536, 1.46366, 1.45823, 1.45916, 1.45823, 1.45337, 1.46118, 1.46699, 1.4587, 1.46699, 1.47055, 1.46344, 1.46652, 1.46046, 1.46265, 1.46449, 1.46285, 1.46692, 1.45814, 1.45886, 1.46803, 1.46061, 1.45819, 1.4648, 1.46266, 1.46133, 1.46278, 1.4587, 1.46188, 1.46627, 1.45851, 1.45538, 1.46707, 1.4652, 1.45779, 1.46235, 1.45952, 1.56522, 1.45535, 1.46212, 1.53267, 1.46331, 1.56631, 1.46611, 1.4675, 1.46789, 1.46422, 1.46465, 1.46332, 1.46526, 1.46728, 1.46084, 1.46879, 1.4673, 1.46097, 1.4632, 1.46893, 1.46312, 1.47082, 1.47286, 1.46203, 1.46457, 1.46392, 1.47428, 1.46372, 1.46741, 1.46293, 1.46502, 1.46743, 1.46135, 1.45986, 1.46485, 1.45803, 1.46118, 1.46355, 1.46477, 1.4597, 1.46145, 1.46577, 1.46316, 1.46246, 1.45852, 1.46444, 1.46127, 1.46343, 1.46846, 1.46172, 1.4611, 1.46651, 1.46449, 1.45901, 1.46118, 1.46452, 1.47046, 1.46733, 1.46134, 1.4708, 1.46233, 1.46381, 1.46441, 1.47211, 1.46336, 1.46499, 1.45935, 1.46955, 1.46104, 1.46986, 1.47015, 1.46324, 1.46425, 1.46739, 1.46074, 1.46764, 1.46483, 1.46352, 1.46907, 1.4704, 1.47514, 1.4677, 1.47074, 1.46865, 1.4746, 1.47247, 1.47112, 1.47411, 1.47813, 1.47421, 1.46569, 1.46574, 1.47004, 1.46433, 1.45849, 1.46834, 1.47747, 1.46919, 1.47242, 1.46719, 1.45884, 1.462, 1.45808, 1.46357, 1.46256, 1.4583, 1.53085, 1.46007, 1.56675, 1.46277, 1.46292, 1.54903, 1.46448, 1.46847, 1.46708, 1.47477, 1.46444, 1.46433, 1.46714, 1.46403, 1.46557, 1.4607, 1.4618, 1.4615, 1.45857, 1.46496, 1.46801, 1.46664, 
1.45296, 1.45665, 1.46006, 1.46236, 1.46106, 1.4622, 1.46573, 1.46166, 1.45667, 1.4563, 1.46152, 1.45678, 1.45303, 1.46242, 1.46316, 1.46041, 1.4655, 1.45096, 1.45962, 1.46428, 1.45196, 1.46789, 1.45986, 1.45627, 1.46454, 1.46424]}, "forward-compute-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [6.36252, 0.75642, 0.75338, 0.74782, 0.75864, 0.75119, 0.75271, 0.75652, 0.75238, 0.74967, 0.74518, 0.74699, 0.74982, 0.74683, 0.74477, 0.74825, 0.75424, 0.74304, 0.74908, 0.74831, 0.74285, 0.74505, 0.75194, 0.75268, 0.74597, 0.75419, 0.74822, 0.74832, 0.75308, 0.7494, 0.74312, 0.74787, 0.74249, 0.74586, 0.74659, 0.74391, 0.7376, 0.74214, 0.75476, 0.74522, 0.74687, 0.75765, 0.7462, 0.75118, 0.75883, 0.7495, 0.7508, 0.75734, 0.7532, 0.75555, 0.75913, 0.75728, 0.75891, 0.75923, 0.75304, 0.75387, 0.75689, 0.75658, 0.76074, 0.76432, 0.75769, 0.76347, 0.75739, 0.7616, 0.76613, 0.76452, 0.76556, 0.76205, 0.76331, 0.76266, 0.7584, 0.75596, 0.77338, 0.76537, 0.75847, 0.77247, 0.7698, 0.76711, 0.76502, 0.76683, 0.76807, 0.76879, 0.75959, 0.75609, 0.7542, 0.75889, 0.7586, 0.75685, 0.75677, 0.7569, 0.75222, 0.75781, 0.74463, 0.74619, 0.75051, 0.75082, 0.74909, 0.7631, 0.75774, 0.76204, 0.75145, 0.745, 0.75456, 0.75, 0.75135, 0.75247, 0.74698, 0.7545, 0.75599, 0.74765, 0.75411, 0.75279, 0.74869, 0.75208, 0.75762, 0.74974, 0.75249, 0.74767, 0.75172, 0.74899, 0.751, 0.74685, 0.75057, 0.75145, 0.7525, 0.75608, 0.74708, 0.75458, 0.7537, 0.74712, 0.75411, 0.7543, 0.74836, 0.74769, 0.74953, 0.75136, 0.75937, 0.76403, 0.75925, 0.76123, 0.76488, 0.75935, 0.76327, 0.7569, 0.75895, 0.76622, 0.76412, 0.75914, 0.76039, 0.76442, 0.76455, 0.76016, 0.76196, 0.76613, 0.76729, 0.75679, 0.75985, 0.75945, 0.76323, 0.7635, 0.75457, 0.75811, 0.75642, 0.74425, 0.74872, 0.75503, 0.74958, 0.75606, 0.7608, 0.75663, 0.75567, 0.76176, 0.76045, 0.76145, 0.76278, 0.76702, 0.76166, 0.75954, 0.76405, 0.76075, 0.76028, 0.75744, 0.76195, 0.75996, 0.76397, 0.76843, 0.76911, 0.76882, 0.76899, 0.76126, 0.76583, 0.77184, 0.76598, 0.76126, 0.76043, 0.75584, 0.7596, 0.7606, 0.75826, 0.75896, 0.75754, 0.76441, 0.75157, 0.75476, 0.76479, 0.75674, 0.75885, 0.75822, 0.75074, 0.75763, 0.76244, 0.75885, 0.75847, 0.7616, 0.75912, 0.76519, 0.75935, 0.75886, 0.75905, 0.76846, 0.7612, 0.7615, 0.76008, 0.76429, 0.75844, 0.75869, 0.76255, 0.76097, 0.75995, 0.76319, 0.76129, 0.76036, 0.76016, 0.76111, 0.76323, 0.76537, 0.759, 0.7601, 0.76445, 0.75571, 0.75685, 0.76075, 0.75723, 0.75653, 0.75845, 0.75674, 0.86396, 0.75777, 0.76008, 0.79802, 0.76226, 0.86191, 0.76011, 0.76317, 0.76386, 0.7605, 0.76066, 0.76276, 0.76322, 0.7613, 0.7592, 0.762, 0.76075, 0.75635, 0.75896, 0.7677, 0.7624, 0.76381, 0.76676, 0.75786, 0.75925, 0.76099, 0.76684, 0.7623, 0.76206, 0.76286, 0.76089, 0.75817, 0.75534, 0.75831, 0.76571, 0.76592, 0.76306, 0.76728, 0.76327, 0.76387, 0.7666, 0.76417, 0.7663, 0.7669, 0.76023, 0.76799, 0.76358, 0.76252, 0.76815, 0.76889, 0.76519, 0.77456, 0.76596, 0.76411, 0.76815, 0.77016, 0.77392, 0.76784, 0.76277, 0.77204, 0.76778, 0.7655, 0.76653, 0.76663, 0.7655, 0.76981, 0.76378, 0.76855, 0.76427, 0.77286, 0.76279, 0.75723, 0.75876, 0.76093, 0.75608, 0.76062, 0.75705, 0.75985, 0.76693, 0.76742, 0.77256, 0.76978, 0.76789, 0.76969, 0.76933, 0.77265, 0.76608, 0.76739, 0.77128, 0.76748, 0.75765, 0.75397, 0.76206, 0.75882, 0.75813, 0.76547, 0.77479, 0.76791, 0.77465, 0.76715, 0.75994, 0.76202, 0.75688, 0.75371, 0.75879, 0.75648, 0.78313, 0.75471, 0.85298, 0.75745, 0.75629, 0.79889, 0.75755, 0.7675, 0.76401, 0.77476, 0.7623, 0.76426, 
0.77061, 0.76259, 0.76592, 0.76419, 0.76322, 0.76581, 0.76288, 0.76458, 0.76887, 0.76604, 0.7592, 0.7636, 0.76038, 0.76398, 0.76433, 0.76564, 0.7642, 0.76491, 0.76122, 0.76383, 0.76659, 0.76312, 0.76135, 0.76522, 0.76474, 0.76522, 0.76449, 0.75942, 0.76396, 0.76563, 0.75814, 0.76753, 0.76464, 0.7621, 0.77007, 0.76728]}, "backward-compute-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [4.28133, 0.68196, 0.6748, 0.67881, 0.68478, 0.67217, 0.67802, 0.67659, 0.67892, 0.67668, 0.67659, 0.67465, 0.67463, 0.67462, 0.67762, 0.67642, 0.6769, 0.67572, 0.67809, 0.68097, 0.67934, 0.67704, 0.67406, 0.67837, 0.6757, 0.67949, 0.67968, 0.6787, 0.67717, 0.68038, 0.67537, 0.67968, 0.67434, 0.67314, 0.67835, 0.66827, 0.67483, 0.66865, 0.67777, 0.67612, 0.66888, 0.68034, 0.67914, 0.67754, 0.686, 0.67891, 0.6825, 0.69249, 0.68805, 0.68071, 0.6807, 0.68401, 0.68197, 0.68831, 0.67921, 0.68344, 0.68292, 0.68269, 0.67859, 0.67491, 0.67595, 0.68683, 0.68164, 0.68009, 0.68194, 0.68378, 0.68844, 0.68048, 0.67795, 0.68343, 0.6796, 0.67682, 0.6863, 0.68552, 0.67712, 0.67901, 0.6881, 0.68205, 0.67931, 0.68414, 0.68584, 0.68259, 0.67712, 0.67748, 0.67636, 0.67686, 0.67957, 0.67669, 0.67544, 0.67461, 0.67469, 0.68134, 0.68, 0.67587, 0.68021, 0.68045, 0.67544, 0.67937, 0.68676, 0.68585, 0.67936, 0.68061, 0.68245, 0.67815, 0.67775, 0.6759, 0.67787, 0.68054, 0.6803, 0.67305, 0.67653, 0.67563, 0.67417, 0.68429, 0.68658, 0.67537, 0.68025, 0.6803, 0.68056, 0.6828, 0.68066, 0.68532, 0.67902, 0.67418, 0.68192, 0.6772, 0.6791, 0.68139, 0.68311, 0.68253, 0.67839, 0.67915, 0.67948, 0.68314, 0.67734, 0.67756, 0.67316, 0.67604, 0.6758, 0.67978, 0.67641, 0.67242, 0.67813, 0.67872, 0.6783, 0.67885, 0.67431, 0.67749, 0.67801, 0.6758, 0.67622, 0.67701, 0.68426, 0.6762, 0.67926, 0.67417, 0.68505, 0.67444, 0.67174, 0.67764, 0.67913, 0.67644, 0.67728, 0.67567, 0.67951, 0.67766, 0.67997, 0.68347, 0.67314, 0.66987, 0.67882, 0.67735, 0.67469, 0.67484, 0.67452, 0.67036, 0.67219, 0.66928, 0.67596, 0.68103, 0.68041, 0.67951, 0.67362, 0.6784, 0.6726, 0.67127, 0.67283, 0.67413, 0.67371, 0.67426, 0.67198, 0.67275, 0.67579, 0.66994, 0.67168, 0.6776, 0.67237, 0.67165, 0.67104, 0.67192, 0.67427, 0.67627, 0.66668, 0.66922, 0.67584, 0.67473, 0.6708, 0.67557, 0.67335, 0.67079, 0.67545, 0.67499, 0.67953, 0.67406, 0.67059, 0.67194, 0.67815, 0.67685, 0.67968, 0.67768, 0.67845, 0.68065, 0.67662, 0.67606, 0.68139, 0.67895, 0.67961, 0.67462, 0.67355, 0.68106, 0.67561, 0.67393, 0.67793, 0.67786, 0.6746, 0.67779, 0.67398, 0.67743, 0.67735, 0.67743, 0.67124, 0.68018, 0.68312, 0.67575, 0.67441, 0.67795, 0.77498, 0.67162, 0.6764, 0.67127, 0.67597, 0.68008, 0.68042, 0.67905, 0.68174, 0.67734, 0.68026, 0.6787, 0.67714, 0.682, 0.67394, 0.68013, 0.68188, 0.67889, 0.67722, 0.67427, 0.67656, 0.68229, 0.68021, 0.6768, 0.68025, 0.67886, 0.68439, 0.67958, 0.6764, 0.67518, 0.67551, 0.68714, 0.67915, 0.67531, 0.67638, 0.674, 0.67847, 0.67644, 0.67977, 0.674, 0.67593, 0.68097, 0.67926, 0.67773, 0.67609, 0.6796, 0.67785, 0.67882, 0.67923, 0.6747, 0.67544, 0.67361, 0.68038, 0.67547, 0.67624, 0.67248, 0.67952, 0.68043, 0.67937, 0.67985, 0.67588, 0.68025, 0.67916, 0.68539, 0.67959, 0.67855, 0.67714, 0.68454, 0.67696, 0.67981, 0.683, 0.68247, 0.6825, 0.68134, 0.67836, 0.68273, 0.68212, 0.68044, 0.67659, 0.67798, 0.67887, 0.67623, 0.67774, 0.67659, 0.67891, 0.67811, 0.68204, 0.68313, 0.68107, 0.68061, 0.68094, 0.68548, 0.68238, 0.67942, 0.67349, 0.67874, 0.67949, 0.67779, 0.67431, 0.67512, 0.67432, 0.67473, 0.67593, 0.68238, 0.67917, 0.67651, 0.68094, 
0.67897, 0.68533, 0.67806, 0.68435, 0.68504, 0.682, 0.68404, 0.68368, 0.68461, 0.68091, 0.6825, 0.67628, 0.68089, 0.6828, 0.67779, 0.67875, 0.67869, 0.67726, 0.67954, 0.68441, 0.67716, 0.67303, 0.67398, 0.67541, 0.6785, 0.67881, 0.67645, 0.68188, 0.67884, 0.67565, 0.67403, 0.67785, 0.67584, 0.67366, 0.67828, 0.67909, 0.67494, 0.68175, 0.67414, 0.67764, 0.68174, 0.67366, 0.68332, 0.67954, 0.67548, 0.67937, 0.67851]}, "batch-generator-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [2.31358, 0.01342, 0.01402, 0.01374, 0.01299, 0.01268, 0.01392, 0.01354, 0.01304, 0.01288, 0.01303, 0.01298, 0.01232, 0.01255, 0.01299, 0.01326, 0.01362, 0.0129, 0.01443, 0.01263, 0.01254, 0.01285, 0.01249, 0.01344, 0.01424, 0.01237, 0.01372, 0.01224, 0.013, 0.01253, 0.01341, 0.01286, 0.01401, 0.01393, 0.01367, 0.01532, 0.01387, 0.01392, 0.01291, 0.01426, 0.0158, 0.01586, 0.01402, 0.01614, 0.01699, 0.0155, 0.01558, 0.01634, 0.01595, 0.01549, 0.01633, 0.01561, 0.01611, 0.01605, 0.01621, 0.01402, 0.01567, 0.01545, 0.0163, 0.01651, 0.01564, 0.01603, 0.01693, 0.01689, 0.01357, 0.0139, 0.01398, 0.01321, 0.0147, 0.01234, 0.01211, 0.01284, 0.01261, 0.01263, 0.01246, 0.01271, 0.01272, 0.01352, 0.01254, 0.01474, 0.01286, 0.01466, 0.01388, 0.01269, 0.01267, 0.01231, 0.01228, 0.01211, 0.01249, 0.01199, 0.01406, 0.01239, 0.012, 0.01243, 0.01264, 0.01202, 0.01259, 0.01295, 0.01265, 0.01251, 0.01294, 0.01235, 0.01204, 0.01263, 0.01427, 0.01248, 0.01231, 0.01225, 0.01258, 0.01178, 0.01262, 0.01236, 0.01219, 0.01244, 0.01253, 0.01287, 0.01341, 0.01255, 0.01211, 0.01241, 0.01252, 0.01245, 0.01248, 0.01249, 0.01246, 0.01257, 0.01439, 0.01257, 0.01277, 0.01231, 0.01239, 0.01246, 0.01285, 0.01264, 0.01226, 0.01308, 0.01475, 0.01426, 0.01226, 0.01234, 0.0128, 0.01255, 0.01327, 0.01286, 0.01198, 0.0126, 0.01182, 0.01221, 0.01291, 0.01266, 0.0138, 0.01491, 0.01556, 0.01521, 0.01547, 0.01523, 0.01535, 0.01539, 0.01545, 0.01502, 0.01553, 0.01548, 0.01523, 0.0158, 0.0149, 0.01554, 0.01524, 0.01563, 0.01495, 0.01509, 0.01539, 0.01542, 0.01541, 0.01496, 0.0133, 0.01391, 0.01409, 0.01274, 0.01438, 0.01341, 0.01299, 0.01457, 0.0135, 0.01472, 0.01228, 0.01294, 0.01287, 0.01243, 0.01296, 0.01232, 0.0131, 0.01254, 0.01253, 0.01203, 0.01548, 0.01457, 0.01673, 0.01491, 0.01608, 0.01713, 0.20109, 0.01559, 0.01542, 0.01587, 0.01537, 0.01617, 0.01548, 0.01476, 0.01531, 0.01468, 0.01359, 0.01328, 0.01334, 0.01271, 0.01326, 0.01281, 0.01274, 0.01235, 0.01343, 0.01378, 0.01234, 0.01331, 0.01322, 0.01409, 0.01395, 0.01384, 0.01454, 0.01599, 0.01706, 0.01595, 0.01555, 0.01494, 0.01652, 0.01668, 0.01556, 0.01656, 0.01651, 0.01523, 0.01549, 0.01748, 0.0151, 0.01561, 0.01593, 0.01703, 0.01695, 0.01519, 0.11815, 0.01383, 0.01413, 0.01352, 0.0127, 0.01447, 0.01336, 0.0136, 0.0135, 0.01283, 0.01313, 0.01327, 0.01457, 0.0137, 0.01312, 0.01422, 0.01356, 0.01359, 0.01298, 0.01365, 0.01348, 0.01345, 0.01333, 0.01313, 0.01267, 0.01374, 0.01318, 0.01263, 0.01428, 0.01505, 0.01249, 0.01321, 0.01297, 0.01239, 0.01264, 0.01257, 0.01217, 0.0122, 0.0122, 0.01198, 0.0127, 0.01478, 0.01247, 0.01244, 0.01216, 0.0125, 0.01376, 0.01279, 0.01258, 0.01297, 0.01503, 0.01572, 0.01498, 0.01367, 0.01289, 0.01246, 0.01343, 0.01425, 0.01243, 0.01244, 0.0128, 0.01271, 0.01294, 0.01314, 0.01241, 0.01281, 0.01413, 0.01267, 0.01236, 0.01278, 0.01212, 0.01253, 0.01258, 0.01307, 0.0136, 0.01249, 0.0128, 0.01213, 0.01404, 0.01391, 0.01279, 0.0132, 0.01312, 0.01257, 0.01296, 0.01486, 0.01348, 0.01408, 0.01312, 0.01352, 0.01264, 0.01361, 0.01373, 0.01287, 0.01447, 
0.01273, 0.0134, 0.01256, 0.01471, 0.01292, 0.01296, 0.01556, 0.01269, 0.01275, 0.01262, 0.01243, 0.01254, 0.01292, 0.01389, 0.01214, 0.01259, 0.01322, 0.01252, 0.01284, 0.01326, 0.01406, 0.01221, 0.01209, 0.01445, 0.01235, 0.01243, 0.01521, 0.01303, 0.01308, 0.01361, 0.01255, 0.01227, 0.01283, 0.01623, 0.01515, 0.01582, 0.01716, 0.01637, 0.01737, 0.01732, 0.01611, 0.01683, 0.01561, 0.01502, 0.01608, 0.015, 0.01699, 0.017, 0.0159, 0.01671, 0.016, 0.01726, 0.01765, 0.01553, 0.01619, 0.01499, 0.01559, 0.01568, 0.01579]}, "forward-recv-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [5.69523, 0.02394, 0.02348, 0.02329, 0.02364, 0.02293, 0.02376, 0.0234, 0.02371, 0.02468, 0.02324, 0.02396, 0.02501, 0.0256, 0.02468, 0.02408, 0.02484, 0.02364, 0.02322, 0.02328, 0.02362, 0.02407, 0.02284, 0.02422, 0.02402, 0.02397, 0.0233, 0.02317, 0.0238, 0.02388, 0.02326, 0.02363, 0.02416, 0.02354, 0.02309, 0.02365, 0.02345, 0.02308, 0.02317, 0.02313, 0.02335, 0.023, 0.02326, 0.0233, 0.0238, 0.02375, 0.02493, 0.02394, 0.02412, 0.0238, 0.02339, 0.02351, 0.02335, 0.0266, 0.0234, 0.02405, 0.02373, 0.0237, 0.02385, 0.02378, 0.02359, 0.02689, 0.02333, 0.02338, 0.02322, 0.02354, 0.0233, 0.02329, 0.02452, 0.02693, 0.02345, 0.02326, 0.02375, 0.02341, 0.02388, 0.0233, 0.02333, 0.02476, 0.02365, 0.0236, 0.02356, 0.02344, 0.02363, 0.02334, 0.0233, 0.02313, 0.02387, 0.02342, 0.02362, 0.02319, 0.02461, 0.02359, 0.0234, 0.02397, 0.02524, 0.02331, 0.02386, 0.02533, 0.02416, 0.02445, 0.02309, 0.02381, 0.02352, 0.02393, 0.02341, 0.02313, 0.02371, 0.02364, 0.02387, 0.02355, 0.02449, 0.02408, 0.02363, 0.02317, 0.02331, 0.0239, 0.02385, 0.0235, 0.02309, 0.0239, 0.02371, 0.0232, 0.0236, 0.0237, 0.0241, 0.02434, 0.02347, 0.02522, 0.02461, 0.02418, 0.02376, 0.02318, 0.02386, 0.02379, 0.02334, 0.02333, 0.02452, 0.02365, 0.02364, 0.02368, 0.02399, 0.02426, 0.02355, 0.02382, 0.02423, 0.02653, 0.02379, 0.02327, 0.02414, 0.02462, 0.02631, 0.02476, 0.02402, 0.02578, 0.02427, 0.02403, 0.02365, 0.02467, 0.02569, 0.02364, 0.02413, 0.02503, 0.02507, 0.02438, 0.02416, 0.02449, 0.02518, 0.02522, 0.02409, 0.02476, 0.02466, 0.02482, 0.02437, 0.02418, 0.0241, 0.02501, 0.02478, 0.02401, 0.02483, 0.02545, 0.02468, 0.02391, 0.02507, 0.02466, 0.02414, 0.02353, 0.0242, 0.02477, 0.02356, 0.02431, 0.02316, 0.02439, 0.02399, 0.02385, 0.02354, 0.02465, 0.02547, 0.02508, 0.02419, 0.02477, 0.01768, 0.02429, 0.02356, 0.02577, 0.02434, 0.02473, 0.02445, 0.02378, 0.02439, 0.02389, 0.02352, 0.02408, 0.02328, 0.02452, 0.02367, 0.02386, 0.02413, 0.02431, 0.02462, 0.02369, 0.02376, 0.02491, 0.02439, 0.02403, 0.02377, 0.02464, 0.02435, 0.02348, 0.02371, 0.0252, 0.02368, 0.02387, 0.02399, 0.02427, 0.02729, 0.02472, 0.02405, 0.02401, 0.02437, 0.02492, 0.02402, 0.02449, 0.02457, 0.02418, 0.02405, 0.02463, 0.02494, 0.02411, 0.02427, 0.02434, 0.02507, 0.02381, 0.02365, 0.02529, 0.02396, 0.02466, 0.0235, 0.02361, 0.02374, 0.02465, 0.02472, 0.02388, 0.02377, 0.02493, 0.02356, 0.02375, 0.024, 0.02421, 0.02437, 0.02348, 0.02314, 0.02411, 0.02461, 0.02389, 0.0247, 0.02407, 0.0246, 0.02474, 0.02412, 0.02434, 0.02469, 0.02369, 0.02397, 0.02513, 0.02411, 0.02363, 0.02383, 0.02511, 0.02474, 0.02401, 0.02392, 0.0241, 0.02386, 0.02404, 0.02408, 0.02406, 0.02452, 0.02544, 0.02797, 0.0258, 0.02429, 0.02521, 0.02549, 0.02471, 0.02437, 0.02521, 0.02445, 0.0245, 0.0237, 0.02743, 0.02449, 0.02397, 0.02369, 0.02461, 0.02423, 0.02547, 0.02366, 0.02466, 0.02473, 0.02447, 0.02511, 0.02472, 0.02518, 0.02397, 0.02404, 0.02493, 0.02555, 0.02496, 0.02436, 0.02395, 
0.02507, 0.02456, 0.0243, 0.02385, 0.02539, 0.02483, 0.02431, 0.02399, 0.02469, 0.0254, 0.02512, 0.03429, 0.0364, 0.03571, 0.03561, 0.03474, 0.02415, 0.02604, 0.02499, 0.02494, 0.0246, 0.02567, 0.02501, 0.02468, 0.02397, 0.02793, 0.02468, 0.02491, 0.02539, 0.02409, 0.02475, 0.02441, 0.02562, 0.02394, 0.02557, 0.02449, 0.02381, 0.02425, 0.02474, 0.02431, 0.02389, 0.02357, 0.02526, 0.0266, 0.02574, 0.02347, 0.02485, 0.02498, 0.02413, 0.02387, 0.02515, 0.02481, 0.02439, 0.02404, 0.02457, 0.02585, 0.02502, 0.02382, 0.02429, 0.02509, 0.02444, 0.02418, 0.02439, 0.02469, 0.0242, 0.0249, 0.02556, 0.0254, 0.02589, 0.02426]}, "forward-send-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [2.90859, 0.00013, 0.00013, 0.00012, 0.00012, 0.00011, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00013, 0.00012, 0.00012, 0.00014, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00011, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00041, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00013, 0.00011, 0.00013, 0.00011, 0.00012, 0.00011, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00011, 0.00012, 0.00012, 0.00011, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00011, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00011, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00014, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00013, 0.00012, 0.00013, 0.00012, 0.00013, 0.00012, 0.00012, 0.00013, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00013, 0.00012, 0.00014, 0.00012, 0.00013, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00011, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00013, 0.00013, 0.00012, 0.00012, 0.00013, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00013, 0.00014, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00013, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00014, 0.00012, 0.00014, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00013, 0.00013, 0.00013, 0.00012, 0.00013, 0.00013, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00013, 0.00012, 0.00014, 0.00012, 0.00011, 0.00012, 0.00013, 
0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00013, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00013, 0.00013, 0.00014, 0.00017, 0.00016, 0.00012, 0.00017, 0.00011, 0.00012, 0.00012, 0.00012, 0.00014, 0.00013, 0.00012, 0.00013, 0.00012, 0.00013, 0.00013, 0.00014, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00011, 0.00013, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00013, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00011, 0.00012, 0.00012, 0.00013, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00011, 0.00012, 0.00013, 0.00013]}, "backward-recv-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.02368, 0.02348, 0.02394, 0.02364, 0.02449, 0.02409, 0.02505, 0.02374, 0.02528, 0.0259, 0.02358, 0.0242, 0.02637, 0.02354, 0.0251, 0.02307, 0.02342, 0.02386, 0.02487, 0.02353, 0.02241, 0.02358, 0.02336, 0.02385, 0.02423, 0.02362, 0.02431, 0.02368, 0.02447, 0.02388, 0.02278, 0.02395, 0.02289, 0.02372, 0.0236, 0.02367, 0.02368, 0.02432, 0.02399, 0.02338, 0.02355, 0.02343, 0.02344, 0.02565, 0.02464, 0.02367, 0.02563, 0.02365, 0.02498, 0.02382, 0.02437, 0.02419, 0.02505, 0.02388, 0.02389, 0.02396, 0.02377, 0.02399, 0.02396, 0.02304, 0.02377, 0.02724, 0.02399, 0.02408, 0.02416, 0.02465, 0.02583, 0.02394, 0.02408, 0.02617, 0.02288, 0.02529, 0.0259, 0.02468, 0.02405, 0.02424, 0.02366, 0.02431, 0.02501, 0.02416, 0.02392, 0.02398, 0.02395, 0.02361, 0.02493, 0.02419, 0.02355, 0.02345, 0.02429, 0.02305, 0.02433, 0.02418, 0.02434, 0.02361, 0.02432, 0.02418, 0.0234, 0.02415, 0.02349, 0.02463, 0.02416, 0.02344, 0.02561, 0.02358, 0.02435, 0.024, 0.02522, 0.02503, 0.02562, 0.02467, 0.02425, 0.02421, 0.02382, 0.0242, 0.02401, 0.02416, 0.02588, 0.0247, 0.02434, 0.02473, 0.02524, 0.02511, 0.02494, 0.02375, 0.02595, 0.02432, 0.02337, 0.02414, 0.02486, 0.0245, 0.02433, 0.02431, 0.02365, 0.02411, 0.02342, 0.02427, 0.02467, 0.02469, 0.02352, 0.02452, 0.02337, 0.02463, 0.02478, 0.02463, 0.02462, 0.02668, 0.02409, 0.02498, 0.02302, 0.02351, 0.02626, 0.02404, 0.02319, 0.02423, 0.02437, 0.02371, 0.02423, 0.02372, 0.02372, 0.02417, 0.02394, 0.02401, 0.02428, 0.02406, 0.02443, 0.02396, 0.02341, 0.02439, 0.02392, 0.02389, 0.02372, 0.02654, 0.02468, 0.02413, 0.02396, 0.02411, 0.02434, 0.02436, 0.02416, 0.02432, 0.02413, 0.02462, 0.0275, 0.02423, 0.02396, 0.027, 0.02446, 0.02452, 0.025, 0.02481, 0.02389, 0.02952, 0.02408, 0.02468, 0.02725, 0.02317, 0.02402, 0.02623, 0.02326, 0.02418, 0.0249, 0.0242, 0.02443, 0.02409, 0.0256, 0.02406, 0.02355, 0.02409, 0.02372, 0.02539, 0.02507, 0.02461, 0.02483, 0.02426, 0.02423, 0.02431, 0.02427, 0.02447, 0.02382, 0.02564, 0.02441, 0.02556, 0.02403, 0.02573, 0.02428, 0.02401, 0.02513, 0.02382, 0.02364, 0.02454, 0.02477, 0.02397, 0.0253, 0.02422, 0.02361, 0.02617, 0.02493, 0.02542, 0.0241, 0.02392, 0.02412, 0.02369, 0.02392, 0.02434, 0.02381, 0.02437, 0.02629, 0.02397, 0.0244, 0.02457, 0.02396, 0.02392, 0.02359, 0.02513, 0.02438, 0.02434, 0.02525, 0.02462, 0.02406, 0.02675, 0.0243, 0.02493, 0.02442, 0.02465, 0.02474, 0.02404, 0.02508, 0.02549, 0.02338, 0.02287, 0.02444, 0.02513, 0.02493, 0.02474, 0.0248, 0.02431, 0.0245, 0.02863, 0.02409, 0.02427, 0.02391, 0.02367, 0.02441, 0.02399, 0.02425, 0.02368, 0.0241, 0.02393, 0.02417, 0.02474, 0.02369, 0.02638, 0.02436, 0.02611, 0.02434, 0.02576, 0.02383, 0.02442, 0.02353, 0.02419, 0.02477, 0.02466, 
0.02579, 0.02455, 0.0242, 0.02475, 0.02338, 0.02403, 0.02538, 0.02364, 0.02364, 0.02423, 0.02324, 0.02408, 0.02434, 0.02456, 0.0243, 0.02403, 0.02448, 0.02338, 0.02413, 0.02447, 0.02323, 0.02365, 0.02506, 0.02554, 0.02565, 0.02416, 0.025, 0.02532, 0.02482, 0.02683, 0.02458, 0.02498, 0.02491, 0.02422, 0.0243, 0.02428, 0.02417, 0.02376, 0.02431, 0.02339, 0.02362, 0.02365, 0.02371, 0.02421, 0.02393, 0.02386, 0.02374, 0.0249, 0.02454, 0.02401, 0.02418, 0.02411, 0.02461, 0.02418, 0.02303, 0.02369, 0.02384, 0.02685, 0.02364, 0.02436, 0.02417, 0.02486, 0.02423, 0.02448, 0.02462, 0.02366, 0.02415, 0.02421, 0.0243, 0.02378, 0.02574, 0.02403, 0.02374, 0.02434, 0.02432, 0.02579, 0.02343, 0.02354, 0.02396, 0.02392, 0.02373, 0.02416, 0.02348, 0.02355, 0.02427, 0.0252, 0.02486, 0.02405, 0.02393, 0.0234, 0.02443, 0.02418, 0.02422, 0.02504, 0.02408, 0.0243, 0.02762, 0.02382]}, "backward-send-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.00016, 0.00016, 0.00019, 0.00016, 0.00017, 0.00018, 0.00016, 0.00016, 0.00019, 0.00016, 0.00017, 0.00016, 0.00016, 0.00016, 0.00018, 0.00018, 0.00016, 0.00016, 0.00019, 0.00016, 0.00018, 0.00019, 0.00018, 0.00015, 0.00016, 0.00016, 0.00016, 0.00016, 0.00016, 0.00016, 0.00016, 0.00017, 0.00018, 0.00016, 0.00016, 0.00016, 0.00016, 0.00017, 0.00018, 0.00019, 0.00018, 0.00018, 0.00018, 0.00017, 0.00018, 0.00016, 0.00016, 0.00016, 0.00017, 0.00016, 0.00017, 0.00019, 0.00016, 0.00017, 0.00017, 0.00016, 0.00019, 0.00018, 0.00018, 0.00016, 0.00018, 0.00016, 0.00017, 0.00016, 0.00018, 0.00016, 0.00019, 0.00018, 0.00018, 0.00016, 0.00018, 0.00016, 0.00016, 0.00016, 0.00018, 0.00016, 0.00017, 0.00016, 0.00018, 0.00018, 0.00018, 0.00017, 0.00017, 0.00017, 0.00018, 0.00016, 0.00018, 0.00018, 0.00019, 0.00016, 0.00018, 0.00016, 0.00016, 0.00016, 0.00016, 0.00016, 0.00019, 0.00016, 0.00018, 0.00016, 0.00017, 0.00016, 0.00018, 0.00016, 0.00018, 0.00016, 0.00017, 0.00032, 0.00018, 0.00016, 0.00016, 0.00016, 0.00018, 0.00018, 0.00018, 0.00019, 0.00016, 0.00016, 0.00016, 0.00016, 0.00018, 0.00019, 0.00016, 0.00016, 0.00016, 0.00017, 0.00019, 0.00018, 0.00016, 0.00018, 0.00018, 0.00016, 0.00018, 0.00017, 0.00016, 0.00016, 0.00018, 0.00016, 0.00018, 0.00017, 0.00016, 0.00017, 0.00025, 0.00016, 0.00016, 0.00018, 0.00016, 0.00016, 0.00016, 0.00018, 0.00016, 0.00016, 0.00016, 0.00016, 0.00018, 0.00017, 0.00019, 0.00016, 0.00016, 0.00018, 0.00018, 0.00016, 0.00016, 0.00016, 0.00016, 0.00016, 0.00016, 0.00031, 0.00016, 0.00016, 0.00025, 0.00018, 0.00016, 0.00016, 0.00016, 0.00018, 0.00017, 0.00016, 0.00018, 0.00018, 0.00016, 0.00016, 0.00022, 0.00016, 0.00016, 0.00016, 0.00018, 0.00016, 0.00017, 0.00017, 0.00015, 0.00016, 0.00018, 0.00016, 0.00016, 0.00016, 0.00017, 0.00018, 0.00019, 0.00017, 0.00017, 0.00018, 0.00016, 0.00018, 0.00018, 0.00018, 0.00015, 0.00016, 0.00017, 0.00016, 0.00016, 0.00017, 0.00016, 0.00017, 0.00018, 0.00017, 0.00018, 0.00018, 0.00016, 0.00016, 0.00018, 0.00018, 0.00016, 0.00016, 0.00016, 0.00018, 0.00017, 0.00017, 0.00018, 0.00018, 0.00016, 0.00017, 0.00017, 0.00016, 0.00017, 0.00019, 0.00019, 0.00028, 0.00017, 0.00017, 0.00016, 0.00016, 0.00016, 0.00016, 0.00015, 0.00017, 0.00016, 0.00016, 0.00017, 0.00018, 0.00016, 0.00016, 0.00016, 0.00017, 0.00018, 0.0002, 0.00016, 0.00017, 0.00017, 0.00018, 0.00018, 0.00016, 0.00016, 0.00017, 0.00018, 0.00018, 0.00016, 0.00023, 0.00016, 0.00016, 0.00016, 0.00017, 0.00016, 0.00019, 0.00017, 0.00016, 0.00016, 0.00015, 0.00016, 0.00018, 0.00019, 0.00016, 0.00018, 0.00017, 0.00016, 0.00017, 0.00018, 0.00018, 
0.00022, 0.00016, 0.00016, 0.0002, 0.00019, 0.00017, 0.00016, 0.00018, 0.00016, 0.00016, 0.00017, 0.00016, 0.00017, 0.00019, 0.00016, 0.00016, 0.00018, 0.00017, 0.00018, 0.00015, 0.00016, 0.00016, 0.00019, 0.00018, 0.00018, 0.00016, 0.00018, 0.00016, 0.00016, 0.00016, 0.00017, 0.00016, 0.00016, 0.00018, 0.00016, 0.00016, 0.00016, 0.00018, 0.00018, 0.00018, 0.00016, 0.00017, 0.00022, 0.00016, 0.00017, 0.00016, 0.00018, 0.00016, 0.00018, 0.00018, 0.00018, 0.00017, 0.00016, 0.00017, 0.00016, 0.00026, 0.00018, 0.00016, 0.00016, 0.00016, 0.00017, 0.00018, 0.00016, 0.00016, 0.00019, 0.00017, 0.00018, 0.00016, 0.00016, 0.00016, 0.00016, 0.00018, 0.00019, 0.00016, 0.00016, 0.00016, 0.00016, 0.00016, 0.00018, 0.00019, 0.00016, 0.00018, 0.00018, 0.00016, 0.00017, 0.00018, 0.00031, 0.00018, 0.00017, 0.00016, 0.00016, 0.00016, 0.00016, 0.00016, 0.00017, 0.00016, 0.00016, 0.00017, 0.00016, 0.00016, 0.00019, 0.00018, 0.00016, 0.00017, 0.00018, 0.00018, 0.00018, 0.00016, 0.00018, 0.00019, 0.00016, 0.00019]}, "forward-send-backward-recv-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [7.32739, 0.12477, 0.12666, 0.128, 0.12835, 0.12967, 0.1275, 0.13153, 0.12112, 0.12816, 0.12128, 0.1203, 0.12267, 0.122, 0.12207, 0.1236, 0.12689, 0.12116, 0.11515, 0.1236, 0.11731, 0.11801, 0.12855, 0.12095, 0.12421, 0.12165, 0.12224, 0.11784, 0.12171, 0.11872, 0.11626, 0.12467, 0.1241, 0.11907, 0.11776, 0.12636, 0.11891, 0.12432, 0.12301, 0.12655, 0.12996, 0.13374, 0.12156, 0.12801, 0.13689, 0.1275, 0.13219, 0.13231, 0.13041, 0.12833, 0.13716, 0.13099, 0.1317, 0.1252, 0.12341, 0.12286, 0.12995, 0.12336, 0.13226, 0.13381, 0.12738, 0.13598, 0.13071, 0.13531, 0.14271, 0.14199, 0.13871, 0.142, 0.14001, 0.14332, 0.13666, 0.13328, 0.14543, 0.14315, 0.13564, 0.15173, 0.14153, 0.15109, 0.14782, 0.14157, 0.14168, 0.14516, 0.13449, 0.13595, 0.13466, 0.13854, 0.13617, 0.13542, 0.13551, 0.13682, 0.13396, 0.13632, 0.12977, 0.13179, 0.13436, 0.12818, 0.1318, 0.15065, 0.14138, 0.14121, 0.12829, 0.1243, 0.12753, 0.13425, 0.13136, 0.13043, 0.12709, 0.1367, 0.13831, 0.13249, 0.13782, 0.13352, 0.13464, 0.12973, 0.1292, 0.13364, 0.13332, 0.13424, 0.12997, 0.13345, 0.12818, 0.13196, 0.13345, 0.13333, 0.13254, 0.13659, 0.13184, 0.13348, 0.12597, 0.13454, 0.13192, 0.1375, 0.13257, 0.12337, 0.1345, 0.13062, 0.13753, 0.13119, 0.13426, 0.13825, 0.13839, 0.13388, 0.13726, 0.12898, 0.13377, 0.13935, 0.1381, 0.13416, 0.13521, 0.13765, 0.1373, 0.13402, 0.12531, 0.13371, 0.14559, 0.13302, 0.12679, 0.13579, 0.1348, 0.13764, 0.13247, 0.13464, 0.13235, 0.13117, 0.12868, 0.13327, 0.13496, 0.1324, 0.13728, 0.13904, 0.13275, 0.14304, 0.14323, 0.14887, 0.14315, 0.1468, 0.14026, 0.14574, 0.14975, 0.14342, 0.14555, 0.13943, 0.1403, 0.1444, 0.14205, 0.14177, 0.1462, 0.14686, 0.14634, 0.14245, 0.14549, 0.14618, 0.14887, 0.13512, 0.13541, 0.13381, 0.14182, 0.14007, 0.14152, 0.13605, 0.13807, 0.13717, 0.13509, 0.13546, 0.13698, 0.13358, 0.13623, 0.13205, 0.12316, 0.13181, 0.14145, 0.1317, 0.13396, 0.14106, 0.13611, 0.14089, 0.14373, 0.13469, 0.1384, 0.14246, 0.13291, 0.14068, 0.13738, 0.13421, 0.13749, 0.13088, 0.13458, 0.13609, 0.133, 0.14241, 0.13922, 0.13388, 0.14182, 0.13246, 0.13971, 0.14107, 0.13164, 0.13039, 0.13705, 0.12577, 0.13184, 0.13088, 0.13144, 0.13487, 0.13555, 0.12695, 0.23517, 0.1322, 0.13486, 0.16077, 0.13981, 0.23534, 0.13332, 0.13076, 0.13464, 0.12966, 0.13057, 0.13577, 0.13162, 0.12711, 0.13253, 0.13694, 0.13253, 0.1291, 0.13231, 0.13615, 0.13278, 0.13306, 0.13739, 0.13635, 0.12928, 0.12884, 0.13997, 0.13381, 0.13621, 
0.14094, 0.1347, 0.13224, 0.13078, 0.1333, 0.14059, 0.13768, 0.13345, 0.1394, 0.13204, 0.13595, 0.14267, 0.13406, 0.13447, 0.13958, 0.13493, 0.13657, 0.13256, 0.13241, 0.14205, 0.13985, 0.13748, 0.14438, 0.14105, 0.13704, 0.14125, 0.13958, 0.1371, 0.13476, 0.13221, 0.14116, 0.1413, 0.13323, 0.13777, 0.13451, 0.13785, 0.13827, 0.13489, 0.13565, 0.13632, 0.14132, 0.13954, 0.13567, 0.13798, 0.1411, 0.13641, 0.1346, 0.13417, 0.13059, 0.14076, 0.14564, 0.14703, 0.14826, 0.14723, 0.14169, 0.14389, 0.14245, 0.14606, 0.1389, 0.14429, 0.14006, 0.13171, 0.13461, 0.13482, 0.14111, 0.13415, 0.14396, 0.15035, 0.14874, 0.1481, 0.14804, 0.13867, 0.14775, 0.13614, 0.13103, 0.13832, 0.13379, 0.15425, 0.1329, 0.22576, 0.13539, 0.12996, 0.16565, 0.12569, 0.12696, 0.12758, 0.13901, 0.13127, 0.13219, 0.13915, 0.13046, 0.12996, 0.1351, 0.13312, 0.13428, 0.13394, 0.13287, 0.13398, 0.13368, 0.12682, 0.13561, 0.13323, 0.1307, 0.13416, 0.13272, 0.13142, 0.136, 0.13057, 0.13073, 0.13345, 0.13692, 0.13433, 0.13536, 0.13216, 0.13483, 0.13431, 0.13132, 0.13241, 0.13481, 0.13004, 0.13405, 0.12911, 0.13104, 0.13208, 0.13389]}, "backward-send-forward-recv-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [2.85465, 0.00835, 0.00699, 0.00741, 0.00706, 0.00797, 0.0072, 0.00701, 0.00796, 0.0097, 0.00702, 0.00774, 0.00734, 0.00774, 0.0089, 0.00828, 0.00699, 0.00781, 0.00859, 0.00782, 0.00885, 0.00849, 0.00699, 0.00689, 0.00726, 0.00698, 0.00708, 0.00765, 0.00904, 0.00754, 0.00764, 0.00719, 0.00699, 0.00717, 0.00867, 0.00723, 0.00713, 0.00719, 0.00696, 0.00695, 0.0071, 0.00724, 0.00738, 0.00696, 0.00708, 0.00738, 0.00771, 0.00745, 0.00704, 0.00878, 0.00742, 0.00713, 0.00774, 0.00714, 0.00691, 0.01011, 0.00831, 0.00755, 0.00829, 0.00713, 0.00712, 0.00776, 0.00714, 0.00703, 0.00812, 0.00754, 0.00844, 0.00686, 0.00703, 0.00718, 0.00709, 0.00784, 0.00743, 0.00744, 0.00705, 0.00773, 0.0077, 0.00752, 0.00823, 0.00721, 0.00697, 0.00777, 0.00754, 0.00704, 0.00687, 0.00767, 0.00697, 0.00724, 0.0081, 0.0081, 0.00692, 0.00799, 0.00739, 0.00705, 0.00849, 0.00694, 0.00742, 0.00767, 0.00711, 0.00824, 0.00696, 0.00742, 0.00848, 0.00758, 0.00786, 0.00691, 0.00711, 0.00709, 0.00692, 0.00764, 0.00779, 0.00699, 0.00727, 0.00768, 0.007, 0.0078, 0.00701, 0.00735, 0.00759, 0.00875, 0.00792, 0.00727, 0.00737, 0.00715, 0.00787, 0.00741, 0.00751, 0.00855, 0.00692, 0.00786, 0.00751, 0.00811, 0.00715, 0.00699, 0.00709, 0.00705, 0.00737, 0.0082, 0.00828, 0.00883, 0.00777, 0.00806, 0.00752, 0.0074, 0.00758, 0.00764, 0.00798, 0.00876, 0.0073, 0.00773, 0.00824, 0.00728, 0.00773, 0.00775, 0.00706, 0.00716, 0.00698, 0.00735, 0.00857, 0.00716, 0.00715, 0.00888, 0.00742, 0.00709, 0.00773, 0.00707, 0.00785, 0.00751, 0.00723, 0.00781, 0.00732, 0.00731, 0.00751, 0.00926, 0.00734, 0.00835, 0.00815, 0.00834, 0.00863, 0.00698, 0.00697, 0.00866, 0.00749, 0.00697, 0.00797, 0.00761, 0.00705, 0.00898, 0.00815, 0.00711, 0.00733, 0.00846, 0.00756, 0.00807, 0.00707, 0.00876, 0.00728, 0.00798, 0.00766, 0.00737, 0.00998, 0.00838, 0.0077, 0.00751, 0.00848, 0.00695, 0.00705, 0.00981, 0.00734, 0.00923, 0.0071, 0.00714, 0.00728, 0.00728, 0.0085, 0.00981, 0.00871, 0.00696, 0.00863, 0.00936, 0.01089, 0.00793, 0.00711, 0.00971, 0.00701, 0.00936, 0.00758, 0.00816, 0.00884, 0.00803, 0.00847, 0.01006, 0.00978, 0.00825, 0.0081, 0.00787, 0.00813, 0.00997, 0.00754, 0.00893, 0.00765, 0.00713, 0.0078, 0.0076, 0.00705, 0.00918, 0.11069, 0.00794, 0.00727, 0.07524, 0.00865, 0.00813, 0.007, 0.00696, 0.0071, 0.00698, 0.00706, 0.00709, 0.00901, 0.00738, 0.00798, 0.00783, 
0.00755, 0.00757, 0.00792, 0.0078, 0.00758, 0.00842, 0.00991, 0.00945, 0.00712, 0.00835, 0.00735, 0.00734, 0.00709, 0.00708, 0.00953, 0.00709, 0.00704, 0.00922, 0.00937, 0.00856, 0.00712, 0.00846, 0.01121, 0.00908, 0.00701, 0.01037, 0.00813, 0.00814, 0.00709, 0.00791, 0.0074, 0.00756, 0.00813, 0.00849, 0.00705, 0.00877, 0.00705, 0.00702, 0.00784, 0.00699, 0.00862, 0.00977, 0.0078, 0.00851, 0.00917, 0.00814, 0.00962, 0.0071, 0.00832, 0.01014, 0.00711, 0.00716, 0.00781, 0.00825, 0.01002, 0.00758, 0.00695, 0.01037, 0.00713, 0.0097, 0.00977, 0.00754, 0.00863, 0.00703, 0.00781, 0.00826, 0.00731, 0.00742, 0.00778, 0.00814, 0.00835, 0.00713, 0.00837, 0.0071, 0.00718, 0.00856, 0.00694, 0.00858, 0.00741, 0.00763, 0.00727, 0.00894, 0.00892, 0.0078, 0.00875, 0.00972, 0.00704, 0.00701, 0.00812, 0.00733, 0.0694, 0.00715, 0.09935, 0.00722, 0.00697, 0.0823, 0.00708, 0.00762, 0.00706, 0.00717, 0.00712, 0.0071, 0.00708, 0.00694, 0.00712, 0.00717, 0.00703, 0.00723, 0.00767, 0.007, 0.00705, 0.00716, 0.00837, 0.00992, 0.00743, 0.0076, 0.00795, 0.00785, 0.00774, 0.00828, 0.00864, 0.00714, 0.00767, 0.00727, 0.0089, 0.00821, 0.00781, 0.00855, 0.00777, 0.00721, 0.00716, 0.00875, 0.00792, 0.00919, 0.00807, 0.00884, 0.00881, 0.0088]}, "layernorm-grads-all-reduce-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [4e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 2e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 2e-05, 2e-05, 2e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 2e-05, 2e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 3e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 2e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 2e-05, 2e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 
1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 2e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 2e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05]}, "embedding-grads-all-reduce-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.00055, 0.00031, 0.00031, 0.00031, 0.00035, 0.00031, 0.00031, 0.00031, 0.00032, 0.00032, 0.00032, 0.00031, 0.00032, 0.00031, 0.00031, 0.00031, 0.00034, 0.00031, 0.00031, 0.00031, 0.00036, 0.00031, 0.00031, 0.00031, 0.00035, 0.00032, 0.00035, 0.00032, 0.00031, 0.00034, 0.00036, 0.00032, 0.00033, 0.00033, 0.00032, 0.00032, 0.00036, 0.00036, 0.00036, 0.00036, 0.00031, 0.00034, 0.00036, 0.00031, 0.00032, 0.00032, 0.00031, 0.00032, 0.00032, 0.00032, 0.00033, 0.00032, 0.00032, 0.00031, 0.00032, 0.00032, 0.00032, 0.00032, 0.00036, 0.00032, 0.00031, 0.00032, 0.00036, 0.00032, 0.00032, 0.00036, 0.00036, 0.00031, 0.00032, 0.00031, 0.00032, 0.00031, 0.00032, 0.00031, 0.00035, 0.00032, 0.00032, 0.00031, 0.00033, 0.00032, 0.00032, 0.00031, 0.00031, 0.00036, 0.00032, 0.00031, 0.00032, 0.00033, 0.00036, 0.00031, 0.00037, 0.00032, 0.00035, 0.00032, 0.00031, 0.00035, 0.00036, 0.00032, 0.00031, 0.00032, 0.00036, 0.00031, 0.00032, 0.00036, 0.00031, 0.00034, 0.00031, 0.00032, 0.00032, 0.00031, 0.00036, 0.00032, 0.00036, 0.00031, 0.00037, 0.00032, 0.00037, 0.0004, 0.00031, 0.00032, 0.00035, 0.00031, 0.00032, 0.00031, 0.00031, 0.00031, 0.00032, 0.00031, 0.00033, 0.00032, 0.00031, 0.00032, 0.00032, 0.00032, 0.00032, 0.00031, 0.00036, 0.00031, 0.00031, 0.00033, 0.00036, 0.00031, 0.00032, 0.00032, 0.00032, 0.00036, 0.00031, 0.00035, 0.00032, 0.00039, 0.00033, 0.00032, 0.00031, 0.00035, 0.00032, 0.00031, 0.00032, 0.00035, 0.00031, 0.00032, 0.00031, 0.00032, 0.00031, 0.00031, 0.00034, 0.00036, 0.00036, 0.00031, 0.00032, 0.00032, 0.00031, 0.00035, 0.00036, 0.00032, 0.00031, 0.00032, 0.00032, 0.00036, 0.00033, 0.00035, 0.00031, 0.00031, 0.00031, 0.00032, 0.00036, 0.00037, 0.00031, 0.00032, 0.00031, 0.00032, 0.00031, 0.00032, 0.00032, 0.00036, 0.00037, 0.00031, 0.00032, 0.00033, 0.00032, 0.00032, 0.00045, 0.00031, 0.00031, 0.00038, 0.00032, 0.00036, 0.00034, 0.00031, 0.00032, 0.00036, 0.00032, 0.00031, 0.00036, 0.00031, 0.00031, 0.00031, 0.00036, 0.00031, 0.00032, 0.00032, 0.0004, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00031, 0.00037, 0.00031, 0.00031, 0.00032, 0.00031, 0.00032, 0.00032, 0.00036, 0.00032, 0.00035, 0.00032, 0.00036, 0.00038, 0.00036, 0.00036, 0.00032, 0.00036, 0.00033, 0.00032, 0.00032, 0.00031, 0.00036, 0.00031, 0.00033, 0.00033, 0.00032, 0.00037, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00031, 0.00032, 0.00031, 0.00032, 0.00032, 0.00032, 0.00031, 0.00032, 0.00032, 0.00032, 0.00037, 0.00032, 0.00031, 0.00032, 0.00032, 0.00036, 0.00032, 0.00033, 0.00031, 0.00032, 0.00031, 0.00032, 0.00031, 0.00032, 0.00032, 0.00032, 0.00033, 0.00032, 0.00036, 0.00032, 0.00032, 0.00037, 0.00031, 0.00031, 0.00031, 0.00032, 0.00032, 0.00032, 0.00031, 0.00031, 0.00037, 0.00035, 0.00036, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00031, 0.00036, 0.00032, 0.00031, 0.00032, 0.00036, 0.00032, 0.00032, 0.00032, 0.00036, 0.00032, 
0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00031, 0.00032, 0.00033, 0.00038, 0.00034, 0.00036, 0.00032, 0.00033, 0.00032, 0.00032, 0.00035, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00035, 0.00032, 0.00032, 0.00031, 0.00032, 0.00036, 0.00036, 0.00032, 0.00032, 0.00032, 0.00036, 0.00032, 0.00032, 0.00031, 0.00036, 0.00032, 0.00036, 0.00033, 0.00031, 0.00031, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00036, 0.00035, 0.00031, 0.00032, 0.00036, 0.00032, 0.00033, 0.00036, 0.00032, 0.00032, 0.00032, 0.00031, 0.00032, 0.00033, 0.00032, 0.00031, 0.00032, 0.00035, 0.00032, 0.00032, 0.00035, 0.00032, 0.00035, 0.00032, 0.00037, 0.00032, 0.00031, 0.00037, 0.00032, 0.00035, 0.00031, 0.00036, 0.00032]}, "all-grads-sync-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [2.11402, 0.00057, 0.00063, 0.00057, 0.00058, 0.00057, 0.00058, 0.00058, 0.00057, 0.00063, 0.00057, 0.00058, 0.00058, 0.00057, 0.00057, 0.00058, 0.00059, 0.00058, 0.00058, 0.00059, 0.00058, 0.00058, 0.00058, 0.00066, 0.00058, 0.00058, 0.00058, 0.00058, 0.00058, 0.00058, 0.00058, 0.00058, 0.00058, 0.00059, 0.00059, 0.00058, 0.00059, 0.00059, 0.00058, 0.00059, 0.0006, 0.00059, 0.00059, 0.00063, 0.00059, 0.00058, 0.00058, 0.00059, 0.00063, 0.00058, 0.00059, 0.00059, 0.00058, 0.00059, 0.00058, 0.00059, 0.00059, 0.00058, 0.00059, 0.0006, 0.00058, 0.00058, 0.00058, 0.00057, 0.0007, 0.00059, 0.00064, 0.00058, 0.00059, 0.00058, 0.00059, 0.00058, 0.00059, 0.00059, 0.00059, 0.00058, 0.00058, 0.00059, 0.00058, 0.00059, 0.00061, 0.00058, 0.00064, 0.00058, 0.00059, 0.00059, 0.00059, 0.00064, 0.00058, 0.00058, 0.00059, 0.00059, 0.00058, 0.00058, 0.00059, 0.00059, 0.00059, 0.00057, 0.00059, 0.0006, 0.00058, 0.00059, 0.00058, 0.00059, 0.00059, 0.00059, 0.00058, 0.00065, 0.00058, 0.00059, 0.00058, 0.00064, 0.00059, 0.00059, 0.00059, 0.00062, 0.00059, 0.00064, 0.00059, 0.00059, 0.00059, 0.0006, 0.00058, 0.00058, 0.00059, 0.00059, 0.00059, 0.00059, 0.00059, 0.00058, 0.00059, 0.00059, 0.00058, 0.00064, 0.00065, 0.00059, 0.00058, 0.00059, 0.00058, 0.00059, 0.00059, 0.00059, 0.00059, 0.00059, 0.00059, 0.00058, 0.00058, 0.00061, 0.0006, 0.00067, 0.00058, 0.00059, 0.00058, 0.00059, 0.00059, 0.00058, 0.00059, 0.00057, 0.00059, 0.00059, 0.00061, 0.00059, 0.0006, 0.00064, 0.00058, 0.00059, 0.00059, 0.00058, 0.00058, 0.00058, 0.00058, 0.00059, 0.0006, 0.00058, 0.00059, 0.0006, 0.00059, 0.00059, 0.00057, 0.00058, 0.00058, 0.00058, 0.0006, 0.0006, 0.00058, 0.00059, 0.00058, 0.00059, 0.00058, 0.00059, 0.00059, 0.00059, 0.00064, 0.00059, 0.00059, 0.00058, 0.00059, 0.00058, 0.00059, 0.00059, 0.00059, 0.00062, 0.00058, 0.00059, 0.00059, 0.00059, 0.00058, 0.00063, 0.00058, 0.00058, 0.00059, 0.00058, 0.00058, 0.00064, 0.0006, 0.00059, 0.00058, 0.00059, 0.00058, 0.00058, 0.00059, 0.00058, 0.0006, 0.00064, 0.00058, 0.00058, 0.0006, 0.0006, 0.00057, 0.00058, 0.00059, 0.00059, 0.00059, 0.00062, 0.00059, 0.00058, 0.00059, 0.00059, 0.00059, 0.00059, 0.00059, 0.0006, 0.00059, 0.00059, 0.00058, 0.00058, 0.00064, 0.00059, 0.00064, 0.00058, 0.00058, 0.00059, 0.00059, 0.00058, 0.00058, 0.00058, 0.00058, 0.00065, 0.0006, 0.00057, 0.00059, 0.00059, 0.00058, 0.00058, 0.00058, 0.0006, 0.00058, 0.00058, 0.00059, 0.00059, 0.00057, 0.00058, 0.00057, 0.00064, 0.00057, 0.00058, 0.00068, 0.00058, 0.00058, 0.00058, 0.00059, 0.00059, 0.00059, 0.00058, 0.00058, 0.00057, 0.00058, 0.00058, 0.00058, 0.00058, 0.00059, 0.00058, 0.00057, 0.00059, 0.00062, 0.00058, 0.00059, 0.00058, 0.00059, 0.00058, 0.00058, 0.00058, 0.00058, 0.00058, 0.00058, 
0.0006, 0.00058, 0.00059, 0.00058, 0.00058, 0.00058, 0.00058, 0.00058, 0.00058, 0.00059, 0.00059, 0.00058, 0.00057, 0.00059, 0.00058, 0.00059, 0.00058, 0.00058, 0.00057, 0.00059, 0.00058, 0.00059, 0.00059, 0.0006, 0.00058, 0.00058, 0.00059, 0.00058, 0.00071, 0.00058, 0.00064, 0.00059, 0.00059, 0.00059, 0.00058, 0.00058, 0.00058, 0.00058, 0.00059, 0.00058, 0.00058, 0.00058, 0.00058, 0.00058, 0.00058, 0.00058, 0.00059, 0.00058, 0.00059, 0.00058, 0.00063, 0.00059, 0.00058, 0.00058, 0.00057, 0.00058, 0.00059, 0.00058, 0.00058, 0.00058, 0.00059, 0.00058, 0.00058, 0.00065, 0.00058, 0.00059, 0.00058, 0.00058, 0.00058, 0.00059, 0.00058, 0.00059, 0.00058, 0.00058, 0.00059, 0.00058, 0.00059, 0.00059, 0.00059, 0.00057, 0.00058, 0.00058, 0.00059, 0.00059, 0.00069, 0.00058, 0.0006, 0.00058, 0.00058, 0.00057, 0.00058, 0.00057, 0.00059, 0.00058, 0.00058]}, "optimizer-copy-to-main-grad-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.00021, 0.00012, 0.00013, 0.00013, 0.00013, 0.00013, 0.00013, 0.00013, 0.00013, 0.00014, 0.00013, 0.00013, 0.00013, 0.00013, 0.00013, 0.00013, 0.00013, 0.00013, 0.00012, 0.00013, 0.00013, 0.00013, 0.00013, 0.00013, 0.00013, 0.00014, 0.00013, 0.00013, 0.00013, 0.00013, 0.00013, 0.00013, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00013, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00013, 0.00014, 0.00014, 0.00014, 0.00013, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00013, 0.00014, 0.00013, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00015, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00013, 0.00014, 0.00014, 0.00013, 0.00014, 0.00014, 0.00015, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00013, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00015, 0.00014, 0.00014, 0.00014, 0.00015, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00016, 0.00014, 0.00014, 0.00014, 0.0002, 0.00014, 0.00014, 0.00014, 0.00015, 0.00014, 0.00015, 0.00014, 0.00014, 0.00013, 0.00014, 0.00014, 0.00014, 0.00013, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00015, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00013, 0.00013, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00013, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00015, 0.00014, 0.00014, 0.00014, 0.00014, 0.00015, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00015, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00013, 0.00014, 0.00013, 0.00013, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 
0.00014, 0.00014, 0.00014, 0.00014, 0.00013, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00015, 0.00014, 0.00014, 0.00014, 0.00014, 0.00013, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00013, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00015, 0.00014, 0.00015, 0.00014, 0.00015, 0.00014, 0.00014, 0.00014, 0.00014, 0.00013, 0.00014, 0.00014, 0.00013, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00015, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00015, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014]}, "optimizer-clip-main-grad-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [4.22691, 0.00055, 0.00056, 0.00056, 0.00056, 0.00057, 0.00056, 0.00056, 0.00055, 0.00056, 0.00056, 0.00056, 0.00056, 0.00055, 0.00057, 0.00057, 0.00056, 0.00056, 0.00054, 0.00056, 0.00056, 0.00055, 0.00055, 0.00056, 0.00056, 0.00055, 0.00061, 0.00058, 0.00058, 0.00056, 0.00056, 0.00056, 0.00057, 0.00061, 0.00059, 0.00057, 0.00058, 0.00056, 0.00058, 0.00058, 0.00058, 0.00058, 0.00059, 0.00056, 0.00058, 0.00058, 0.00059, 0.00057, 0.00059, 0.00057, 0.00058, 0.00058, 0.00059, 0.00058, 0.00059, 0.00058, 0.0006, 0.00057, 0.00058, 0.00058, 0.00056, 0.00058, 0.00059, 0.00059, 0.00058, 0.00059, 0.00058, 0.00057, 0.0006, 0.00061, 0.00058, 0.00059, 0.00058, 0.00057, 0.00059, 0.00058, 0.00058, 0.00058, 0.00058, 0.00059, 0.00058, 0.00056, 0.00057, 0.00058, 0.00059, 0.00058, 0.00057, 0.00057, 0.00058, 0.00057, 0.00058, 0.00058, 0.00056, 0.00057, 0.00049, 0.00057, 0.00057, 0.00057, 0.00048, 0.00057, 0.00058, 0.00059, 0.00058, 0.00058, 0.00058, 0.00058, 0.00058, 0.00058, 0.00058, 0.00059, 0.00058, 0.00048, 0.00048, 0.0005, 0.00058, 0.0006, 0.00058, 0.00058, 0.00059, 0.00056, 0.00058, 0.00058, 0.00058, 0.00059, 0.00057, 0.00058, 0.00057, 0.00058, 0.00057, 0.00073, 0.00058, 0.00058, 0.00057, 0.00058, 0.00058, 0.00059, 0.00058, 0.00058, 0.00046, 0.00058, 0.00057, 0.00059, 0.00058, 0.00057, 0.00048, 0.00058, 0.00058, 0.00059, 0.00059, 0.00059, 0.00058, 0.00057, 0.00059, 0.00058, 0.00057, 0.00057, 0.00058, 0.00056, 0.00058, 0.00058, 0.00058, 0.00057, 0.00047, 0.00047, 0.00067, 0.00057, 0.00058, 0.00059, 0.00057, 0.00058, 0.00066, 0.00058, 0.00058, 0.00059, 0.00048, 0.00059, 0.00059, 0.00059, 0.00057, 0.00062, 0.00058, 0.00057, 0.00057, 0.00057, 0.00058, 0.0006, 0.00057, 0.00057, 0.00058, 0.00058, 0.00057, 0.00058, 0.00059, 0.00058, 0.00059, 0.00058, 0.0006, 0.00058, 0.00058, 0.00058, 0.00064, 0.00057, 0.00058, 0.00059, 0.00059, 0.00058, 0.00059, 0.00059, 0.00058, 0.00057, 0.00057, 0.0006, 0.00058, 0.00057, 0.00058, 0.00059, 0.00059, 0.0006, 0.00059, 0.00059, 0.00059, 0.00059, 0.00059, 0.00058, 0.0006, 0.00058, 0.00061, 0.00059, 0.00057, 0.00056, 0.00058, 0.00059, 0.00059, 0.00059, 0.00059, 0.00058, 0.00059, 0.00063, 0.0006, 0.00059, 0.00058, 0.00059, 0.00058, 0.00058, 0.00058, 0.00058, 0.00061, 0.00059, 0.0006, 0.00058, 0.0006, 0.0006, 0.00059, 0.00059, 0.00059, 0.00058, 0.00058, 0.00059, 0.00058, 0.00057, 0.0006, 0.0006, 0.00059, 0.00059, 0.00059, 0.0006, 0.00059, 0.0006, 0.00059, 0.00058, 0.00058, 0.00057, 0.00058, 
0.00061, 0.00058, 0.00061, 0.00058, 0.00058, 0.00057, 0.00057, 0.00059, 0.00058, 0.00057, 0.00058, 0.00059, 0.00058, 0.00059, 0.00059, 0.00058, 0.0006, 0.00058, 0.0006, 0.00057, 0.0006, 0.00059, 0.00059, 0.00059, 0.0006, 0.0006, 0.00059, 0.00058, 0.0006, 0.00058, 0.0006, 0.0006, 0.00061, 0.00058, 0.00058, 0.00059, 0.00058, 0.00058, 0.00059, 0.00058, 0.00061, 0.00062, 0.00062, 0.00058, 0.00057, 0.00058, 0.0006, 0.00058, 0.00058, 0.00058, 0.00058, 0.00058, 0.00058, 0.00059, 0.00059, 0.00059, 0.00059, 0.00059, 0.00058, 0.00058, 0.00058, 0.00059, 0.00059, 0.00059, 0.00058, 0.00059, 0.00058, 0.00058, 0.00063, 0.0006, 0.00059, 0.00062, 0.00058, 0.00059, 0.00058, 0.00059, 0.00059, 0.00058, 0.00058, 0.00058, 0.00063, 0.00059, 0.00056, 0.00058, 0.00058, 0.00056, 0.00057, 0.00059, 0.00059, 0.00059, 0.00059, 0.00058, 0.00059, 0.00058, 0.00058, 0.00059, 0.00058, 0.00058, 0.00058, 0.0006, 0.00059, 0.00058, 0.00058, 0.00058, 0.00059, 0.0006, 0.00058, 0.0006, 0.00058, 0.00059, 0.00058, 0.00057, 0.00057, 0.0006, 0.00064, 0.00059, 0.00061, 0.00058, 0.00058, 0.0006, 0.00058, 0.0006, 0.00067, 0.00057, 0.00058, 0.0006, 0.00059]}, "optimizer-count-zeros-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.00354, 0.00262, 0.00261, 0.00266, 0.0026, 0.0026, 0.0026, 0.00261, 0.00259, 0.00259, 0.00261, 0.00261, 0.00261, 0.00262, 0.00262, 0.0026, 0.0026, 0.00258, 0.00264, 0.00259, 0.00269, 0.00267, 0.00262, 0.00291, 0.00262, 0.00271, 0.00259, 0.00259, 0.0026, 0.00261, 0.00261, 0.0026, 0.0026, 0.00257, 0.00262, 0.00261, 0.00262, 0.00265, 0.0026, 0.00261, 0.00261, 0.00259, 0.0026, 0.00265, 0.00262, 0.00261, 0.00265, 0.00258, 0.0026, 0.00263, 0.00261, 0.0026, 0.0026, 0.00258, 0.00258, 0.0026, 0.00261, 0.0026, 0.00261, 0.00261, 0.00263, 0.00259, 0.00262, 0.0026, 0.00261, 0.00258, 0.00261, 0.0026, 0.00267, 0.00261, 0.00258, 0.00265, 0.00259, 0.00261, 0.00258, 0.00258, 0.00261, 0.00261, 0.00261, 0.00259, 0.00258, 0.00262, 0.00261, 0.00261, 0.00261, 0.00259, 0.00262, 0.0026, 0.0026, 0.00259, 0.0026, 0.00261, 0.0026, 0.00261, 0.0026, 0.00272, 0.00259, 0.00262, 0.00257, 0.0026, 0.00261, 0.00259, 0.00263, 0.00259, 0.00261, 0.00261, 0.00267, 0.00258, 0.0026, 0.00259, 0.00262, 0.00259, 0.00259, 0.00481, 0.00261, 0.00259, 0.00263, 0.0029, 0.00259, 0.00261, 0.00263, 0.0026, 0.0026, 0.00261, 0.00261, 0.00262, 0.00261, 0.00259, 0.0026, 0.00308, 0.00357, 0.00364, 0.0026, 0.00259, 0.00266, 0.00258, 0.0026, 0.00264, 0.00261, 0.0026, 0.0026, 0.0026, 0.00261, 0.00261, 0.0026, 0.00258, 0.00262, 0.00262, 0.00264, 0.00258, 0.00262, 0.0026, 0.00259, 0.00268, 0.0026, 0.00263, 0.00257, 0.0026, 0.00259, 0.00262, 0.00262, 0.00261, 0.00261, 0.00261, 0.0026, 0.0026, 0.00261, 0.0026, 0.00266, 0.00266, 0.00264, 0.0027, 0.00268, 0.00266, 0.00266, 0.00267, 0.00263, 0.00266, 0.00264, 0.00459, 0.00266, 0.00266, 0.00267, 0.00266, 0.00265, 0.00269, 0.00266, 0.00267, 0.00272, 0.00267, 0.00265, 0.00272, 0.00266, 0.00266, 0.0027, 0.00266, 0.00265, 0.00269, 0.00265, 0.00265, 0.00265, 0.00268, 0.00265, 0.00266, 0.00266, 0.00267, 0.00266, 0.00265, 0.00267, 0.00266, 0.0027, 0.00266, 0.00264, 0.00266, 0.00264, 0.00266, 0.00265, 0.00265, 0.00266, 0.00268, 0.00268, 0.00266, 0.00266, 0.00266, 0.00264, 0.00265, 0.00269, 0.00267, 0.00267, 0.00269, 0.00266, 0.00266, 0.00266, 0.00266, 0.00265, 0.00268, 0.0027, 0.00351, 0.00265, 0.00266, 0.00267, 0.00267, 0.00265, 0.00267, 0.00265, 0.00267, 0.00266, 0.00266, 0.00275, 0.00266, 0.00264, 0.00265, 0.00266, 0.0027, 0.00287, 0.00267, 0.00306, 0.00267, 0.00265, 0.00268, 0.00266, 0.00266, 0.00265, 
0.00265, 0.00265, 0.00266, 0.00271, 0.00266, 0.00266, 0.00267, 0.00267, 0.00273, 0.00267, 0.00267, 0.00264, 0.00267, 0.00266, 0.00264, 0.00267, 0.00267, 0.00266, 0.00267, 0.00266, 0.00263, 0.00266, 0.00268, 0.00265, 0.00266, 0.00266, 0.00267, 0.00267, 0.00265, 0.00268, 0.00266, 0.00267, 0.00272, 0.00264, 0.00266, 0.00266, 0.00265, 0.00277, 0.00266, 0.00269, 0.00264, 0.00265, 0.00266, 0.00259, 0.00259, 0.0026, 0.00261, 0.0026, 0.00262, 0.0026, 0.00261, 0.00261, 0.00261, 0.00261, 0.00272, 0.00262, 0.00323, 0.0026, 0.00261, 0.00262, 0.00269, 0.00259, 0.00261, 0.00261, 0.00261, 0.00261, 0.0026, 0.00259, 0.00258, 0.0026, 0.00262, 0.00261, 0.00261, 0.00262, 0.0026, 0.0026, 0.00264, 0.00259, 0.00285, 0.0026, 0.00259, 0.00259, 0.0026, 0.00258, 0.00261, 0.00261, 0.00259, 0.0026, 0.00261, 0.0026, 0.00273, 0.0026, 0.00258, 0.00261, 0.0026, 0.00259, 0.0026, 0.00259, 0.00259, 0.00261, 0.00266, 0.00266, 0.00265, 0.00269, 0.00269, 0.00266, 0.00266, 0.00266, 0.00264, 0.00266, 0.00267, 0.00265, 0.00273, 0.00265, 0.00265, 0.0027, 0.00266, 0.00274, 0.00267, 0.00267, 0.00267, 0.00266, 0.00266, 0.00266, 0.00299, 0.00266, 0.00268, 0.00265, 0.00267, 0.00265, 0.00268, 0.00265, 0.00266, 0.00267, 0.00267, 0.00271, 0.00267]}, "optimizer-inner-step-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.00249, 0.00044, 0.00044, 0.00045, 0.00045, 0.00044, 0.00045, 0.00045, 0.00045, 0.00044, 0.00045, 0.00045, 0.00044, 0.00044, 0.00044, 0.00044, 0.00045, 0.00044, 0.00044, 0.00045, 0.00044, 0.00044, 0.00045, 0.00046, 0.00044, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00044, 0.00045, 0.00045, 0.00047, 0.00045, 0.00046, 0.00046, 0.00045, 0.00046, 0.00046, 0.00046, 0.00046, 0.00046, 0.00045, 0.00046, 0.00046, 0.00046, 0.00045, 0.00046, 0.00046, 0.00046, 0.00046, 0.00046, 0.00046, 0.00046, 0.00046, 0.00046, 0.00045, 0.00046, 0.00046, 0.00045, 0.00046, 0.00046, 0.00046, 0.00046, 0.00045, 0.00046, 0.00045, 0.00045, 0.00046, 0.00046, 0.00046, 0.00046, 0.00046, 0.00047, 0.00047, 0.00046, 0.00046, 0.00046, 0.00046, 0.00046, 0.00045, 0.00046, 0.00045, 0.00045, 0.00047, 0.00046, 0.00046, 0.00045, 0.00045, 0.00046, 0.00046, 0.00045, 0.00045, 0.00046, 0.00046, 0.00046, 0.00046, 0.00047, 0.00046, 0.00046, 0.00046, 0.00046, 0.00046, 0.00046, 0.00046, 0.00047, 0.00046, 0.00046, 0.00046, 0.00046, 0.00047, 0.00048, 0.00056, 0.00046, 0.00046, 0.00046, 0.00046, 0.00046, 0.00046, 0.00047, 0.00046, 0.00046, 0.00046, 0.00047, 0.00046, 0.00046, 0.00047, 0.00046, 0.00049, 0.00051, 0.00046, 0.00045, 0.00046, 0.00046, 0.00046, 0.00047, 0.00047, 0.00045, 0.00047, 0.00045, 0.00046, 0.00046, 0.00045, 0.00046, 0.00046, 0.00046, 0.00046, 0.00046, 0.00046, 0.00047, 0.00047, 0.00047, 0.00046, 0.00047, 0.00046, 0.00046, 0.00045, 0.00046, 0.00046, 0.00046, 0.00046, 0.00046, 0.00046, 0.00047, 0.00047, 0.00046, 0.00046, 0.00046, 0.00046, 0.00047, 0.00046, 0.00046, 0.00046, 0.00047, 0.00049, 0.00048, 0.00046, 0.00045, 0.00046, 0.00046, 0.00045, 0.00046, 0.00045, 0.00045, 0.00047, 0.00045, 0.00045, 0.00046, 0.00046, 0.00045, 0.00047, 0.00046, 0.00047, 0.00046, 0.00046, 0.00047, 0.00046, 0.00046, 0.00046, 0.00047, 0.00045, 0.00046, 0.00046, 0.00046, 0.00046, 0.00046, 0.00046, 0.00046, 0.00045, 0.00046, 0.00046, 0.00046, 0.00045, 0.00047, 0.00047, 0.00046, 0.00047, 0.00046, 0.00047, 0.00047, 0.00047, 0.00046, 0.00046, 0.00046, 0.00046, 0.00046, 0.00046, 0.00045, 0.00045, 0.00047, 0.00046, 0.00046, 0.00046, 0.00046, 0.00046, 0.00046, 0.00046, 0.00046, 0.00046, 0.00046, 0.00046, 0.00047, 0.00046, 0.00046, 0.00046, 0.00047, 0.00046, 
0.00046, 0.00046, 0.00046, 0.00047, 0.00046, 0.00046, 0.00048, 0.00046, 0.00046, 0.00047, 0.00046, 0.00045, 0.00046, 0.00046, 0.00046, 0.00046, 0.00046, 0.00046, 0.00046, 0.00047, 0.00047, 0.00047, 0.00046, 0.00046, 0.00046, 0.00046, 0.00046, 0.00045, 0.00046, 0.00046, 0.00045, 0.00045, 0.00046, 0.00045, 0.00046, 0.0005, 0.00046, 0.00046, 0.00047, 0.00046, 0.00047, 0.00046, 0.00046, 0.00046, 0.00046, 0.00046, 0.00047, 0.00046, 0.00046, 0.00046, 0.00046, 0.00046, 0.00045, 0.00046, 0.00047, 0.00046, 0.00047, 0.00046, 0.00047, 0.00047, 0.00046, 0.00046, 0.00045, 0.00045, 0.00045, 0.00046, 0.00046, 0.00046, 0.00046, 0.00045, 0.00046, 0.00046, 0.00046, 0.00046, 0.00046, 0.00046, 0.00045, 0.00046, 0.00046, 0.00046, 0.00046, 0.00047, 0.00046, 0.00046, 0.00046, 0.00046, 0.00047, 0.00046, 0.00046, 0.00046, 0.00046, 0.00047, 0.00045, 0.00046, 0.00047, 0.00045, 0.00046, 0.00046, 0.00046, 0.00047, 0.00047, 0.00047, 0.00046, 0.00057, 0.00046, 0.00046, 0.00047, 0.00045, 0.00046, 0.00045, 0.00045, 0.00045, 0.00047, 0.00047, 0.00046, 0.00046, 0.00046, 0.00046, 0.00046, 0.00046, 0.00045, 0.00046, 0.00046, 0.00046, 0.00046, 0.00046, 0.00046, 0.00046, 0.00046, 0.00047, 0.00044, 0.00046, 0.00046, 0.00047, 0.00046, 0.00045, 0.00045, 0.00045, 0.00046, 0.00047, 0.00046, 0.00047, 0.00046, 0.00047, 0.00047, 0.00045, 0.00045, 0.00045, 0.00045, 0.00047, 0.00046, 0.00046]}, "optimizer-copy-main-to-model-params-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.00056, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.0005, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00048, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.0005, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00048, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00051, 0.00049, 0.00049, 0.00049, 0.00049, 0.0005, 0.00049, 0.00049, 0.00048, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.0005, 0.00049, 0.00049, 0.0005, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.0005, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00051, 0.00049, 0.00049, 0.00049, 0.00049, 0.0005, 0.00069, 0.00049, 0.0005, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00051, 0.00049, 0.00049, 0.00049, 0.00049, 0.0005, 0.00049, 0.00049, 0.00053, 0.00064, 0.00049, 0.00049, 0.00049, 0.0005, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.0005, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00051, 0.0005, 0.00049, 0.00049, 0.00049, 0.00049, 0.00052, 0.00049, 0.00049, 0.00051, 0.00049, 0.0005, 0.00051, 0.00049, 0.00049, 0.00053, 0.00049, 0.00049, 0.0005, 0.00049, 0.00049, 0.0005, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.0005, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.0005, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00051, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.0005, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.0005, 0.00049, 0.00049, 0.00049, 0.00049, 0.0005, 0.00049, 0.0005, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.0005, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.0005, 0.0005, 
0.00049, 0.00049, 0.0005, 0.00049, 0.0005, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.0005, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.0005, 0.00049, 0.00049, 0.00051, 0.00049, 0.00049, 0.00059, 0.00051, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.0005, 0.00049, 0.00049, 0.0005, 0.00049, 0.00049, 0.0005, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.0005, 0.0005, 0.00049, 0.00049, 0.00068, 0.0005, 0.00049, 0.00049, 0.00049, 0.00077, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.0005, 0.00049, 0.0005, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.0005, 0.0005, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.0005, 0.0005, 0.00049, 0.00049, 0.00049, 0.00049, 0.0005, 0.00062, 0.00049, 0.0005, 0.00049, 0.00049, 0.00049, 0.0005, 0.00049, 0.0005, 0.0005, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.0005, 0.00049, 0.00049, 0.00049, 0.0005, 0.0005, 0.00049, 0.00049, 0.00049, 0.0005, 0.00049, 0.00049, 0.0005, 0.00049, 0.0005, 0.00049, 0.0005, 0.00064, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.0005, 0.00049, 0.00049, 0.00049, 0.00049, 0.00061, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00052, 0.00049, 0.0005, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.0005, 0.00049, 0.0005, 0.00049, 0.0005, 0.00049, 0.00049, 0.00049, 0.00049, 0.0005, 0.00049, 0.00049, 0.00049]}, "optimizer-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [4.23567, 0.00458, 0.00457, 0.00463, 0.00456, 0.00458, 0.00456, 0.00457, 0.00457, 0.00456, 0.00457, 0.00457, 0.00457, 0.00456, 0.00459, 0.00457, 0.00455, 0.00458, 0.00456, 0.00456, 0.00465, 0.00463, 0.00457, 0.005, 0.00457, 0.00468, 0.0046, 0.00458, 0.00461, 0.0046, 0.00456, 0.00456, 0.00462, 0.00463, 0.00464, 0.0046, 0.00464, 0.00464, 0.00461, 0.00462, 0.00462, 0.00459, 0.00465, 0.00464, 0.00462, 0.00462, 0.00467, 0.00457, 0.00462, 0.00465, 0.00462, 0.00462, 0.00473, 0.00459, 0.0046, 0.00464, 0.00463, 0.00458, 0.00462, 0.00462, 0.00462, 0.00459, 0.00465, 0.00461, 0.00463, 0.00459, 0.0046, 0.00462, 0.00469, 0.00466, 0.00461, 0.00468, 0.0046, 0.00461, 0.0046, 0.00464, 0.00463, 0.00465, 0.00465, 0.00462, 0.00459, 0.00459, 0.00461, 0.00461, 0.00462, 0.00461, 0.00463, 0.00459, 0.00461, 0.00458, 0.00461, 0.00463, 0.00459, 0.0046, 0.00456, 0.00476, 0.00459, 0.00465, 0.00449, 0.00462, 0.00463, 0.0046, 0.00465, 0.0046, 0.00462, 0.00462, 0.00468, 0.00461, 0.00462, 0.00462, 0.00464, 0.0045, 0.00453, 0.00715, 0.00463, 0.00463, 0.00466, 0.00492, 0.00461, 0.00459, 0.00464, 0.00466, 0.00461, 0.00462, 0.00461, 0.00464, 0.00462, 0.00461, 0.0046, 0.00561, 0.00589, 0.00578, 0.0046, 0.0046, 0.00467, 0.0046, 0.00462, 0.00468, 0.00449, 0.00462, 0.00461, 0.00464, 0.00463, 0.00464, 0.0045, 0.0046, 0.00464, 0.00464, 0.00466, 0.00463, 0.00464, 0.00464, 0.00462, 0.00469, 0.00461, 0.00467, 0.00459, 0.00458, 0.00465, 0.00466, 0.00462, 0.00464, 0.00454, 0.00452, 0.00487, 0.00461, 0.00461, 0.00463, 0.00466, 0.00467, 0.00477, 0.00473, 0.00469, 0.00473, 0.00459, 0.00473, 0.00467, 0.00467, 0.00466, 0.0068, 0.00467, 0.00466, 0.00467, 0.00465, 0.00466, 0.00472, 0.00467, 0.00466, 0.00474, 0.00468, 0.00464, 0.00474, 0.00468, 0.00473, 0.00472, 0.00468, 0.0047, 0.00472, 0.00465, 0.00466, 0.00496, 0.00468, 0.00467, 0.00471, 0.0047, 0.00468, 0.00472, 0.00467, 0.00467, 0.00466, 0.00472, 0.00469, 0.00466, 0.00464, 0.00467, 0.00469, 0.00466, 0.00468, 0.00469, 0.00474, 0.00473, 0.00468, 0.0047, 0.00468, 
0.00467, 0.00469, 0.00477, 0.00469, 0.00464, 0.00465, 0.0047, 0.0047, 0.00469, 0.00468, 0.00472, 0.00469, 0.00472, 0.00563, 0.00469, 0.00469, 0.00469, 0.0047, 0.00467, 0.0047, 0.00467, 0.00467, 0.00472, 0.00469, 0.00478, 0.00471, 0.00475, 0.00469, 0.00469, 0.00472, 0.00495, 0.00468, 0.0051, 0.00473, 0.0047, 0.00468, 0.00485, 0.00471, 0.00466, 0.0047, 0.00468, 0.00471, 0.00473, 0.00471, 0.0047, 0.00469, 0.00469, 0.00472, 0.00468, 0.00471, 0.00464, 0.00469, 0.00465, 0.00469, 0.00468, 0.00465, 0.00471, 0.00469, 0.0047, 0.00498, 0.00469, 0.00468, 0.00467, 0.00468, 0.00506, 0.0047, 0.00468, 0.00467, 0.00466, 0.00468, 0.0047, 0.00474, 0.00468, 0.00469, 0.0047, 0.00467, 0.00478, 0.00468, 0.00471, 0.0047, 0.00469, 0.00471, 0.00461, 0.00466, 0.00461, 0.00462, 0.0046, 0.00465, 0.00463, 0.00465, 0.00465, 0.00468, 0.00461, 0.00471, 0.00465, 0.00542, 0.00464, 0.00463, 0.00463, 0.00472, 0.0046, 0.00464, 0.00463, 0.0048, 0.00465, 0.00463, 0.00461, 0.00463, 0.0046, 0.00463, 0.00465, 0.00464, 0.00463, 0.00463, 0.00465, 0.00469, 0.00459, 0.00495, 0.00468, 0.00461, 0.00465, 0.00461, 0.00464, 0.00464, 0.00466, 0.00462, 0.00464, 0.00508, 0.00461, 0.0048, 0.00463, 0.00454, 0.00463, 0.00461, 0.00456, 0.0046, 0.00466, 0.00462, 0.00465, 0.00468, 0.00486, 0.00469, 0.00471, 0.00469, 0.00468, 0.00468, 0.00467, 0.00468, 0.00468, 0.00471, 0.00469, 0.00474, 0.00469, 0.00467, 0.00472, 0.00467, 0.00477, 0.00472, 0.00471, 0.00468, 0.00467, 0.00465, 0.00469, 0.00513, 0.00471, 0.00489, 0.00466, 0.00469, 0.00468, 0.00474, 0.00467, 0.00475, 0.00467, 0.00469, 0.00476, 0.0047]}, "learning-rate": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 
7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05]}, "learning-rate vs samples": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 
9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05]}, "batch-size": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0]}, "batch-size vs samples": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 
128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0]}, "lm loss": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [10.84424, 10.87342, 10.85055, 10.81078, 10.64469, 10.6386, 10.4283, 10.13518, 9.93546, 9.83538, 9.5857, 9.84804, 9.88588, 9.63127, 9.79022, 9.5114, 9.4597, 9.65546, 9.38988, 9.33928, 9.24947, 9.15126, 9.18199, 9.00445, 9.19836, 9.06663, 9.16101, 9.1698, 9.30057, 8.98927, 8.92967, 9.05035, 9.04657, 8.66029, 8.72527, 8.75664, 8.69468, 8.74328, 8.66681, 8.77286, 8.67044, 8.86119, 8.84295, 8.50873, 8.39852, 8.43801, 8.49532, 8.39321, 8.44017, 8.59221, 8.37564, 8.19958, 8.2329, 8.22974, 8.27495, 7.92044, 8.0993, 7.89755, 8.2517, 8.23397, 8.00952, 7.97507, 7.92567, 7.74377, 7.74735, 7.64935, 7.51967, 7.91031, 7.70174, 7.45536, 7.74632, 7.77446, 7.54372, 7.30243, 7.45569, 7.34305, 7.4658, 7.22841, 7.63683, 7.28242, 7.34884, 7.21343, 7.21124, 7.41956, 7.17365, 7.2819, 6.99462, 7.00325, 7.04012, 7.13712, 6.82214, 6.98588, 7.08949, 6.99872, 6.87479, 6.75655, 6.99059, 7.06011, 6.70413, 6.58421, 6.72746, 6.74527, 6.73409, 6.73823, 6.65852, 6.40615, 6.63686, 6.6194, 6.44648, 6.62844, 6.74357, 6.61132, 6.72657, 6.69405, 6.62733, 6.50769, 6.59795, 6.40666, 6.66519, 6.24881, 6.25106, 6.30401, 
6.39198, 6.34989, 6.45173, 6.29422, 6.33969, 6.23719, 6.20153, 6.39655, 6.32455, 6.32086, 6.16315, 6.15667, 6.23617, 6.38123, 6.19858, 6.14609, 6.17459, 6.11003, 6.05359, 6.06531, 6.24848, 6.39923, 6.24762, 6.28436, 6.08885, 6.1659, 5.99117, 6.01964, 5.94446, 6.23937, 6.17942, 5.95871, 5.7764, 6.11339, 5.84425, 6.10156, 5.77953, 6.15415, 6.13822, 6.07746, 5.92004, 6.10968, 5.93741, 6.19122, 5.88685, 5.78306, 5.77148, 5.68041, 6.00813, 5.99187, 6.05986, 5.88016, 6.03137, 5.96131, 5.99374, 5.98716, 5.94573, 5.83722, 5.94198, 5.61328, 5.69729, 5.88553, 5.83625, 5.85543, 5.75718, 5.83246, 5.71985, 5.55522, 5.71497, 5.61505, 5.82338, 5.59492, 5.70181, 5.69956, 5.89291, 5.6334, 5.84186, 5.73328, 5.86061, 5.32413, 5.89063, 5.86923, 5.84806, 5.40969, 5.40238, 5.62094, 5.5916, 5.47979, 5.57337, 5.67122, 5.47407, 5.73944, 5.51167, 5.59101, 5.62347, 5.61736, 5.50921, 5.61182, 5.67274, 5.68001, 5.58479, 5.65971, 5.37206, 5.67757, 5.62674, 5.42131, 5.58249, 5.62904, 5.55375, 5.34106, 5.53431, 5.48176, 5.48104, 5.38026, 5.55107, 5.59981, 5.38504, 5.51817, 5.48713, 5.33135, 5.50212, 5.40894, 5.44244, 5.31335, 5.06368, 5.47625, 5.56822, 5.71202, 5.40926, 5.59783, 5.63205, 5.23113, 5.2684, 5.39256, 5.39509, 5.32651, 5.49543, 5.18174, 5.2944, 5.24351, 5.3743, 5.25187, 5.4403, 5.53394, 5.30526, 5.42762, 5.33573, 5.07536, 5.30828, 5.24915, 5.30097, 5.10794, 5.27462, 5.25882, 5.46931, 5.15605, 5.26147, 5.20567, 5.34991, 4.9789, 4.90972, 5.32269, 5.39016, 5.22419, 5.31593, 5.10145, 5.16054, 5.25953, 5.0667, 5.26007, 5.06659, 5.33924, 5.2437, 5.14669, 5.24181, 5.03908, 5.31189, 5.0508, 5.02718, 5.13824, 5.11134, 5.26999, 5.14813, 5.27491, 5.09204, 5.0944, 5.24441, 5.32532, 5.25266, 5.18964, 5.14218, 5.28959, 4.95048, 5.2045, 5.09444, 5.30302, 5.17003, 5.18518, 5.11668, 4.98204, 4.99495, 5.222, 5.30847, 5.098, 5.05553, 4.91636, 5.12137, 5.11611, 4.9291, 5.33462, 5.02406, 5.09871, 5.16424, 5.00257, 5.06588, 5.06465, 4.99336, 5.07822, 5.15996, 4.97519, 5.18105, 4.9261, 4.91748, 5.06072, 4.99116, 4.90494, 4.77574, 4.94081, 5.11232, 5.01149, 5.01672, 5.32706, 4.95549, 4.99178, 5.04351, 4.80691, 4.73281, 4.99471, 5.04386, 4.87342, 4.9541, 5.04639, 5.02142, 4.81154, 4.89155, 4.90243, 4.82954, 4.73696, 5.00591, 4.75497, 5.20346, 4.791, 4.99509, 4.73426, 4.7815, 4.81632, 4.64705, 4.65335, 4.84192, 4.80637, 4.79718, 4.91906, 4.87982, 4.9259, 4.76993, 4.87999, 4.73114, 4.91345, 4.95513, 4.87047, 4.70341, 4.77964, 4.89818, 4.70591, 4.85482, 4.68983, 4.68887, 4.64189]}, "lm loss vs samples": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [10.84424, 10.87342, 10.85055, 10.81078, 10.64469, 10.6386, 10.4283, 10.13518, 9.93546, 9.83538, 9.5857, 9.84804, 9.88588, 9.63127, 9.79022, 9.5114, 9.4597, 9.65546, 9.38988, 9.33928, 9.24947, 9.15126, 9.18199, 9.00445, 9.19836, 9.06663, 9.16101, 9.1698, 9.30057, 8.98927, 8.92967, 9.05035, 9.04657, 8.66029, 8.72527, 8.75664, 8.69468, 8.74328, 8.66681, 8.77286, 8.67044, 8.86119, 8.84295, 8.50873, 8.39852, 8.43801, 8.49532, 8.39321, 8.44017, 8.59221, 8.37564, 8.19958, 8.2329, 8.22974, 8.27495, 7.92044, 8.0993, 7.89755, 8.2517, 8.23397, 8.00952, 7.97507, 7.92567, 7.74377, 7.74735, 7.64935, 7.51967, 7.91031, 7.70174, 7.45536, 7.74632, 7.77446, 7.54372, 7.30243, 7.45569, 7.34305, 7.4658, 7.22841, 7.63683, 7.28242, 7.34884, 7.21343, 7.21124, 7.41956, 7.17365, 7.2819, 6.99462, 7.00325, 7.04012, 7.13712, 6.82214, 6.98588, 7.08949, 6.99872, 6.87479, 6.75655, 6.99059, 7.06011, 6.70413, 6.58421, 6.72746, 6.74527, 6.73409, 6.73823, 6.65852, 6.40615, 6.63686, 6.6194, 6.44648, 6.62844, 
6.74357, 6.61132, 6.72657, 6.69405, 6.62733, 6.50769, 6.59795, 6.40666, 6.66519, 6.24881, 6.25106, 6.30401, 6.39198, 6.34989, 6.45173, 6.29422, 6.33969, 6.23719, 6.20153, 6.39655, 6.32455, 6.32086, 6.16315, 6.15667, 6.23617, 6.38123, 6.19858, 6.14609, 6.17459, 6.11003, 6.05359, 6.06531, 6.24848, 6.39923, 6.24762, 6.28436, 6.08885, 6.1659, 5.99117, 6.01964, 5.94446, 6.23937, 6.17942, 5.95871, 5.7764, 6.11339, 5.84425, 6.10156, 5.77953, 6.15415, 6.13822, 6.07746, 5.92004, 6.10968, 5.93741, 6.19122, 5.88685, 5.78306, 5.77148, 5.68041, 6.00813, 5.99187, 6.05986, 5.88016, 6.03137, 5.96131, 5.99374, 5.98716, 5.94573, 5.83722, 5.94198, 5.61328, 5.69729, 5.88553, 5.83625, 5.85543, 5.75718, 5.83246, 5.71985, 5.55522, 5.71497, 5.61505, 5.82338, 5.59492, 5.70181, 5.69956, 5.89291, 5.6334, 5.84186, 5.73328, 5.86061, 5.32413, 5.89063, 5.86923, 5.84806, 5.40969, 5.40238, 5.62094, 5.5916, 5.47979, 5.57337, 5.67122, 5.47407, 5.73944, 5.51167, 5.59101, 5.62347, 5.61736, 5.50921, 5.61182, 5.67274, 5.68001, 5.58479, 5.65971, 5.37206, 5.67757, 5.62674, 5.42131, 5.58249, 5.62904, 5.55375, 5.34106, 5.53431, 5.48176, 5.48104, 5.38026, 5.55107, 5.59981, 5.38504, 5.51817, 5.48713, 5.33135, 5.50212, 5.40894, 5.44244, 5.31335, 5.06368, 5.47625, 5.56822, 5.71202, 5.40926, 5.59783, 5.63205, 5.23113, 5.2684, 5.39256, 5.39509, 5.32651, 5.49543, 5.18174, 5.2944, 5.24351, 5.3743, 5.25187, 5.4403, 5.53394, 5.30526, 5.42762, 5.33573, 5.07536, 5.30828, 5.24915, 5.30097, 5.10794, 5.27462, 5.25882, 5.46931, 5.15605, 5.26147, 5.20567, 5.34991, 4.9789, 4.90972, 5.32269, 5.39016, 5.22419, 5.31593, 5.10145, 5.16054, 5.25953, 5.0667, 5.26007, 5.06659, 5.33924, 5.2437, 5.14669, 5.24181, 5.03908, 5.31189, 5.0508, 5.02718, 5.13824, 5.11134, 5.26999, 5.14813, 5.27491, 5.09204, 5.0944, 5.24441, 5.32532, 5.25266, 5.18964, 5.14218, 5.28959, 4.95048, 5.2045, 5.09444, 5.30302, 5.17003, 5.18518, 5.11668, 4.98204, 4.99495, 5.222, 5.30847, 5.098, 5.05553, 4.91636, 5.12137, 5.11611, 4.9291, 5.33462, 5.02406, 5.09871, 5.16424, 5.00257, 5.06588, 5.06465, 4.99336, 5.07822, 5.15996, 4.97519, 5.18105, 4.9261, 4.91748, 5.06072, 4.99116, 4.90494, 4.77574, 4.94081, 5.11232, 5.01149, 5.01672, 5.32706, 4.95549, 4.99178, 5.04351, 4.80691, 4.73281, 4.99471, 5.04386, 4.87342, 4.9541, 5.04639, 5.02142, 4.81154, 4.89155, 4.90243, 4.82954, 4.73696, 5.00591, 4.75497, 5.20346, 4.791, 4.99509, 4.73426, 4.7815, 4.81632, 4.64705, 4.65335, 4.84192, 4.80637, 4.79718, 4.91906, 4.87982, 4.9259, 4.76993, 4.87999, 4.73114, 4.91345, 4.95513, 4.87047, 4.70341, 4.77964, 4.89818, 4.70591, 4.85482, 4.68983, 4.68887, 4.64189]}, "loss-scale": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 
1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]}, "loss-scale vs samples": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]}, "grad-norm": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [13.93626, 13.32689, 13.8137, 12.62172, 11.96992, 9.43513, 6.80799, 6.88665, 5.95498, 4.54619, 4.13053, 2.82596, 2.39543, 2.34537, 2.05773, 2.21996, 2.14537, 1.88392, 2.17069, 2.06105, 2.12373, 2.16615, 2.00976, 2.20876, 1.97308, 2.09194, 1.90863, 1.88776, 
1.95054, 2.15308, 2.08778, 2.10616, 1.95646, 2.17094, 2.31724, 2.02642, 2.04764, 1.84545, 1.93704, 1.75657, 2.13069, 1.75993, 1.70876, 1.86665, 1.92331, 1.79127, 1.74297, 1.74426, 1.75161, 1.53485, 1.75292, 1.73299, 1.79809, 1.83477, 1.59059, 1.79085, 1.74313, 1.81505, 1.54888, 1.47615, 1.68285, 1.4812, 1.79315, 1.92171, 1.63149, 1.63813, 1.6586, 1.59744, 1.47545, 1.65909, 1.42464, 1.41939, 1.49901, 1.42049, 1.40172, 1.46225, 1.44185, 1.3706, 1.36838, 1.26055, 1.34627, 1.29904, 1.25687, 1.20642, 1.27731, 1.27576, 1.4537, 1.34738, 1.41703, 1.10279, 1.09805, 1.25584, 1.13228, 1.20775, 0.93229, 1.32305, 1.10083, 1.31134, 0.99675, 1.32116, 1.31807, 1.20377, 1.14298, 1.25982, 1.11587, 1.06268, 1.1383, 1.13456, 1.18344, 1.01042, 1.19822, 0.96542, 0.98282, 0.98083, 1.21915, 1.08304, 1.00478, 1.26788, 1.10619, 1.30807, 1.1248, 1.36119, 1.37901, 1.4392, 1.56444, 1.29037, 1.19911, 1.00927, 1.14759, 1.2293, 1.07062, 1.374, 1.0323, 1.06393, 1.18259, 1.20195, 1.16586, 1.44753, 0.94529, 1.13538, 1.05269, 1.34467, 1.18959, 1.01819, 0.86119, 1.06946, 1.34129, 1.684, 1.13519, 1.32985, 1.38775, 1.34761, 1.74434, 1.43622, 1.39335, 1.37538, 1.86703, 2.00418, 1.35288, 1.23486, 1.3698, 1.32764, 0.9773, 0.96112, 1.19304, 1.38421, 1.30281, 1.24815, 1.29487, 1.60508, 1.50397, 1.88527, 1.44501, 1.35752, 0.94887, 1.377, 2.16776, 1.36769, 1.5918, 1.53974, 1.46219, 1.57752, 1.18503, 1.28159, 1.42022, 1.06676, 1.57312, 1.38623, 1.21566, 1.67634, 1.0445, 1.27733, 1.33704, 1.42129, 1.46397, 1.28187, 1.4299, 1.30773, 1.5098, 1.44392, 1.45291, 1.64364, 1.49176, 1.37459, 1.51541, 1.63213, 1.48678, 1.52484, 1.4594, 1.29967, 1.2736, 1.3991, 1.32876, 1.30752, 2.30271, 1.55904, 1.8449, 1.46033, 1.24296, 1.20709, 1.62628, 1.5864, 1.26763, 1.43759, 1.47487, 1.37697, 1.3542, 1.33151, 1.73529, 1.34567, 1.25198, 1.32539, 1.47482, 1.18237, 1.36743, 1.49708, 1.35135, 1.39444, 1.32979, 1.17935, 1.87393, 1.4264, 1.47427, 1.49289, 1.23046, 1.40513, 1.22641, 1.41026, 1.60243, 1.3143, 1.19178, 1.29275, 1.40778, 1.27321, 1.41008, 1.70248, 1.64394, 1.51805, 1.52213, 1.56958, 1.37322, 1.23197, 1.2534, 1.33391, 1.27155, 1.71409, 1.36328, 1.34111, 1.56216, 1.69178, 1.34859, 1.23125, 1.30141, 1.35618, 1.71086, 1.21378, 1.62762, 1.35769, 1.32471, 1.3449, 1.37393, 1.16861, 1.52125, 1.65464, 1.84529, 1.4419, 1.39298, 1.45439, 1.43606, 1.60436, 1.56537, 1.49466, 1.35372, 1.44924, 1.44717, 1.59557, 1.51747, 1.64905, 1.33058, 1.31553, 1.61355, 1.23394, 1.40751, 1.24118, 1.39003, 1.46524, 1.46231, 1.5848, 1.30142, 1.49751, 1.49494, 1.35146, 1.32779, 1.48392, 1.42067, 1.43745, 1.57573, 1.52413, 1.22763, 1.19418, 1.89055, 1.53347, 1.40105, 1.60967, 1.38946, 1.31243, 1.45306, 1.42686, 1.36629, 1.4597, 1.59178, 1.37262, 1.28569, 1.49855, 1.29513, 1.26508, 1.32564, 1.18627, 1.52963, 1.41157, 1.22284, 1.09058, 1.41662, 1.39267, 1.29437, 1.39958, 1.3399, 1.36221, 1.4319, 1.07457, 1.45594, 1.29022, 1.47328, 1.63456, 1.35731, 1.53342, 1.23853, 1.30778, 1.37885, 1.39437, 1.58806, 1.41021, 1.41084, 1.3741, 1.18704, 1.36438, 1.50507, 1.3615, 1.43368, 1.39267, 1.48306, 1.60864, 1.92464, 1.65072, 1.54144, 1.35616, 1.29657, 1.5044, 1.29558, 1.3191, 1.41541, 1.44176, 1.48919, 1.28271, 1.18322, 1.31948, 1.34975, 1.36515, 1.26883, 1.48957, 1.40195, 1.45318, 1.67399, 1.47474, 1.53573, 1.49973, 1.39375, 1.51272, 1.36339, 1.21633]}, "grad-norm vs samples": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [13.93626, 13.32689, 13.8137, 12.62172, 11.96992, 9.43513, 6.80799, 6.88665, 5.95498, 4.54619, 4.13053, 2.82596, 2.39543, 2.34537, 2.05773, 2.21996, 
2.14537, 1.88392, 2.17069, 2.06105, 2.12373, 2.16615, 2.00976, 2.20876, 1.97308, 2.09194, 1.90863, 1.88776, 1.95054, 2.15308, 2.08778, 2.10616, 1.95646, 2.17094, 2.31724, 2.02642, 2.04764, 1.84545, 1.93704, 1.75657, 2.13069, 1.75993, 1.70876, 1.86665, 1.92331, 1.79127, 1.74297, 1.74426, 1.75161, 1.53485, 1.75292, 1.73299, 1.79809, 1.83477, 1.59059, 1.79085, 1.74313, 1.81505, 1.54888, 1.47615, 1.68285, 1.4812, 1.79315, 1.92171, 1.63149, 1.63813, 1.6586, 1.59744, 1.47545, 1.65909, 1.42464, 1.41939, 1.49901, 1.42049, 1.40172, 1.46225, 1.44185, 1.3706, 1.36838, 1.26055, 1.34627, 1.29904, 1.25687, 1.20642, 1.27731, 1.27576, 1.4537, 1.34738, 1.41703, 1.10279, 1.09805, 1.25584, 1.13228, 1.20775, 0.93229, 1.32305, 1.10083, 1.31134, 0.99675, 1.32116, 1.31807, 1.20377, 1.14298, 1.25982, 1.11587, 1.06268, 1.1383, 1.13456, 1.18344, 1.01042, 1.19822, 0.96542, 0.98282, 0.98083, 1.21915, 1.08304, 1.00478, 1.26788, 1.10619, 1.30807, 1.1248, 1.36119, 1.37901, 1.4392, 1.56444, 1.29037, 1.19911, 1.00927, 1.14759, 1.2293, 1.07062, 1.374, 1.0323, 1.06393, 1.18259, 1.20195, 1.16586, 1.44753, 0.94529, 1.13538, 1.05269, 1.34467, 1.18959, 1.01819, 0.86119, 1.06946, 1.34129, 1.684, 1.13519, 1.32985, 1.38775, 1.34761, 1.74434, 1.43622, 1.39335, 1.37538, 1.86703, 2.00418, 1.35288, 1.23486, 1.3698, 1.32764, 0.9773, 0.96112, 1.19304, 1.38421, 1.30281, 1.24815, 1.29487, 1.60508, 1.50397, 1.88527, 1.44501, 1.35752, 0.94887, 1.377, 2.16776, 1.36769, 1.5918, 1.53974, 1.46219, 1.57752, 1.18503, 1.28159, 1.42022, 1.06676, 1.57312, 1.38623, 1.21566, 1.67634, 1.0445, 1.27733, 1.33704, 1.42129, 1.46397, 1.28187, 1.4299, 1.30773, 1.5098, 1.44392, 1.45291, 1.64364, 1.49176, 1.37459, 1.51541, 1.63213, 1.48678, 1.52484, 1.4594, 1.29967, 1.2736, 1.3991, 1.32876, 1.30752, 2.30271, 1.55904, 1.8449, 1.46033, 1.24296, 1.20709, 1.62628, 1.5864, 1.26763, 1.43759, 1.47487, 1.37697, 1.3542, 1.33151, 1.73529, 1.34567, 1.25198, 1.32539, 1.47482, 1.18237, 1.36743, 1.49708, 1.35135, 1.39444, 1.32979, 1.17935, 1.87393, 1.4264, 1.47427, 1.49289, 1.23046, 1.40513, 1.22641, 1.41026, 1.60243, 1.3143, 1.19178, 1.29275, 1.40778, 1.27321, 1.41008, 1.70248, 1.64394, 1.51805, 1.52213, 1.56958, 1.37322, 1.23197, 1.2534, 1.33391, 1.27155, 1.71409, 1.36328, 1.34111, 1.56216, 1.69178, 1.34859, 1.23125, 1.30141, 1.35618, 1.71086, 1.21378, 1.62762, 1.35769, 1.32471, 1.3449, 1.37393, 1.16861, 1.52125, 1.65464, 1.84529, 1.4419, 1.39298, 1.45439, 1.43606, 1.60436, 1.56537, 1.49466, 1.35372, 1.44924, 1.44717, 1.59557, 1.51747, 1.64905, 1.33058, 1.31553, 1.61355, 1.23394, 1.40751, 1.24118, 1.39003, 1.46524, 1.46231, 1.5848, 1.30142, 1.49751, 1.49494, 1.35146, 1.32779, 1.48392, 1.42067, 1.43745, 1.57573, 1.52413, 1.22763, 1.19418, 1.89055, 1.53347, 1.40105, 1.60967, 1.38946, 1.31243, 1.45306, 1.42686, 1.36629, 1.4597, 1.59178, 1.37262, 1.28569, 1.49855, 1.29513, 1.26508, 1.32564, 1.18627, 1.52963, 1.41157, 1.22284, 1.09058, 1.41662, 1.39267, 1.29437, 1.39958, 1.3399, 1.36221, 1.4319, 1.07457, 1.45594, 1.29022, 1.47328, 1.63456, 1.35731, 1.53342, 1.23853, 1.30778, 1.37885, 1.39437, 1.58806, 1.41021, 1.41084, 1.3741, 1.18704, 1.36438, 1.50507, 1.3615, 1.43368, 1.39267, 1.48306, 1.60864, 1.92464, 1.65072, 1.54144, 1.35616, 1.29657, 1.5044, 1.29558, 1.3191, 1.41541, 1.44176, 1.48919, 1.28271, 1.18322, 1.31948, 1.34975, 1.36515, 1.26883, 1.48957, 1.40195, 1.45318, 1.67399, 1.47474, 1.53573, 1.49973, 1.39375, 1.51272, 1.36339, 1.21633]}, "num-zeros": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [69.0, 86.0, 77.0, 73.0, 78.0, 81.0, 100.0, 105.0, 134.0, 
134.0, 122.0, 173.0, 158.0, 179.0, 178.0, 172.0, 173.0, 192.0, 186.0, 185.0, 155.0, 157.0, 183.0, 172.0, 179.0, 162.0, 166.0, 176.0, 162.0, 177.0, 178.0, 149.0, 163.0, 200.0, 122.0, 151.0, 160.0, 216.0, 173.0, 192.0, 163.0, 174.0, 167.0, 195.0, 177.0, 181.0, 195.0, 201.0, 171.0, 240.0, 190.0, 187.0, 177.0, 159.0, 167.0, 211.0, 151.0, 167.0, 226.0, 215.0, 184.0, 206.0, 174.0, 166.0, 203.0, 236.0, 215.0, 192.0, 197.0, 197.0, 250.0, 225.0, 178.0, 210.0, 205.0, 223.0, 233.0, 196.0, 258.0, 221.0, 228.0, 237.0, 226.0, 223.0, 188.0, 182.0, 179.0, 198.0, 147.0, 189.0, 211.0, 214.0, 206.0, 216.0, 245.0, 156.0, 216.0, 214.0, 192.0, 170.0, 167.0, 167.0, 171.0, 168.0, 164.0, 141.0, 174.0, 143.0, 140.0, 184.0, 153.0, 162.0, 175.0, 144.0, 145.0, 144.0, 166.0, 110.0, 159.0, 132.0, 128.0, 137.0, 112.0, 132.0, 126.0, 136.0, 128.0, 172.0, 158.0, 131.0, 135.0, 133.0, 133.0, 144.0, 114.0, 123.0, 127.0, 129.0, 121.0, 139.0, 118.0, 107.0, 135.0, 149.0, 155.0, 123.0, 118.0, 109.0, 109.0, 111.0, 101.0, 119.0, 87.0, 118.0, 99.0, 104.0, 99.0, 88.0, 112.0, 112.0, 136.0, 110.0, 122.0, 128.0, 102.0, 105.0, 114.0, 106.0, 103.0, 119.0, 109.0, 83.0, 87.0, 99.0, 136.0, 116.0, 91.0, 112.0, 94.0, 98.0, 128.0, 100.0, 108.0, 115.0, 104.0, 128.0, 109.0, 99.0, 112.0, 96.0, 123.0, 103.0, 109.0, 84.0, 117.0, 105.0, 92.0, 104.0, 83.0, 96.0, 128.0, 71.0, 107.0, 110.0, 99.0, 96.0, 100.0, 100.0, 99.0, 122.0, 94.0, 98.0, 121.0, 118.0, 83.0, 96.0, 99.0, 123.0, 108.0, 107.0, 108.0, 93.0, 89.0, 101.0, 121.0, 121.0, 113.0, 108.0, 83.0, 123.0, 89.0, 105.0, 99.0, 100.0, 108.0, 105.0, 95.0, 112.0, 101.0, 110.0, 93.0, 108.0, 94.0, 120.0, 118.0, 107.0, 98.0, 121.0, 102.0, 97.0, 111.0, 126.0, 102.0, 108.0, 107.0, 108.0, 95.0, 97.0, 96.0, 118.0, 100.0, 111.0, 103.0, 92.0, 100.0, 101.0, 100.0, 103.0, 112.0, 87.0, 86.0, 119.0, 97.0, 101.0, 119.0, 120.0, 124.0, 114.0, 108.0, 105.0, 101.0, 104.0, 103.0, 98.0, 86.0, 101.0, 115.0, 98.0, 90.0, 108.0, 102.0, 102.0, 108.0, 125.0, 109.0, 90.0, 115.0, 94.0, 114.0, 113.0, 98.0, 113.0, 122.0, 101.0, 97.0, 109.0, 106.0, 105.0, 115.0, 95.0, 117.0, 118.0, 95.0, 111.0, 88.0, 121.0, 121.0, 117.0, 138.0, 134.0, 89.0, 99.0, 117.0, 93.0, 106.0, 123.0, 117.0, 107.0, 117.0, 108.0, 86.0, 121.0, 125.0, 105.0, 114.0, 107.0, 129.0, 114.0, 114.0, 107.0, 120.0, 118.0, 101.0, 109.0, 107.0, 124.0, 120.0, 116.0, 103.0, 127.0, 126.0, 90.0, 102.0, 114.0, 111.0, 108.0, 136.0, 107.0, 112.0, 104.0, 113.0, 117.0, 133.0, 104.0, 125.0, 119.0, 111.0, 122.0, 100.0, 118.0, 119.0, 104.0, 85.0, 133.0, 104.0, 119.0, 118.0, 95.0, 117.0, 123.0, 101.0, 132.0, 121.0, 110.0, 116.0, 116.0, 111.0, 91.0, 104.0, 104.0, 115.0, 124.0, 105.0, 104.0, 105.0, 101.0, 99.0, 112.0, 126.0, 139.0]}, "num-zeros vs samples": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [69.0, 86.0, 77.0, 73.0, 78.0, 81.0, 100.0, 105.0, 134.0, 134.0, 122.0, 173.0, 158.0, 179.0, 178.0, 172.0, 173.0, 192.0, 186.0, 185.0, 155.0, 157.0, 183.0, 172.0, 179.0, 162.0, 166.0, 176.0, 162.0, 177.0, 178.0, 149.0, 163.0, 200.0, 122.0, 151.0, 160.0, 216.0, 173.0, 192.0, 163.0, 174.0, 167.0, 195.0, 177.0, 181.0, 195.0, 201.0, 171.0, 240.0, 190.0, 187.0, 177.0, 159.0, 167.0, 211.0, 151.0, 167.0, 226.0, 215.0, 184.0, 206.0, 174.0, 166.0, 203.0, 236.0, 215.0, 192.0, 197.0, 197.0, 250.0, 225.0, 178.0, 210.0, 205.0, 223.0, 233.0, 196.0, 258.0, 221.0, 228.0, 237.0, 226.0, 223.0, 188.0, 182.0, 179.0, 198.0, 147.0, 189.0, 211.0, 214.0, 206.0, 216.0, 245.0, 156.0, 216.0, 214.0, 192.0, 170.0, 167.0, 167.0, 171.0, 168.0, 164.0, 141.0, 174.0, 143.0, 140.0, 184.0, 153.0, 162.0, 175.0, 
144.0, 145.0, 144.0, 166.0, 110.0, 159.0, 132.0, 128.0, 137.0, 112.0, 132.0, 126.0, 136.0, 128.0, 172.0, 158.0, 131.0, 135.0, 133.0, 133.0, 144.0, 114.0, 123.0, 127.0, 129.0, 121.0, 139.0, 118.0, 107.0, 135.0, 149.0, 155.0, 123.0, 118.0, 109.0, 109.0, 111.0, 101.0, 119.0, 87.0, 118.0, 99.0, 104.0, 99.0, 88.0, 112.0, 112.0, 136.0, 110.0, 122.0, 128.0, 102.0, 105.0, 114.0, 106.0, 103.0, 119.0, 109.0, 83.0, 87.0, 99.0, 136.0, 116.0, 91.0, 112.0, 94.0, 98.0, 128.0, 100.0, 108.0, 115.0, 104.0, 128.0, 109.0, 99.0, 112.0, 96.0, 123.0, 103.0, 109.0, 84.0, 117.0, 105.0, 92.0, 104.0, 83.0, 96.0, 128.0, 71.0, 107.0, 110.0, 99.0, 96.0, 100.0, 100.0, 99.0, 122.0, 94.0, 98.0, 121.0, 118.0, 83.0, 96.0, 99.0, 123.0, 108.0, 107.0, 108.0, 93.0, 89.0, 101.0, 121.0, 121.0, 113.0, 108.0, 83.0, 123.0, 89.0, 105.0, 99.0, 100.0, 108.0, 105.0, 95.0, 112.0, 101.0, 110.0, 93.0, 108.0, 94.0, 120.0, 118.0, 107.0, 98.0, 121.0, 102.0, 97.0, 111.0, 126.0, 102.0, 108.0, 107.0, 108.0, 95.0, 97.0, 96.0, 118.0, 100.0, 111.0, 103.0, 92.0, 100.0, 101.0, 100.0, 103.0, 112.0, 87.0, 86.0, 119.0, 97.0, 101.0, 119.0, 120.0, 124.0, 114.0, 108.0, 105.0, 101.0, 104.0, 103.0, 98.0, 86.0, 101.0, 115.0, 98.0, 90.0, 108.0, 102.0, 102.0, 108.0, 125.0, 109.0, 90.0, 115.0, 94.0, 114.0, 113.0, 98.0, 113.0, 122.0, 101.0, 97.0, 109.0, 106.0, 105.0, 115.0, 95.0, 117.0, 118.0, 95.0, 111.0, 88.0, 121.0, 121.0, 117.0, 138.0, 134.0, 89.0, 99.0, 117.0, 93.0, 106.0, 123.0, 117.0, 107.0, 117.0, 108.0, 86.0, 121.0, 125.0, 105.0, 114.0, 107.0, 129.0, 114.0, 114.0, 107.0, 120.0, 118.0, 101.0, 109.0, 107.0, 124.0, 120.0, 116.0, 103.0, 127.0, 126.0, 90.0, 102.0, 114.0, 111.0, 108.0, 136.0, 107.0, 112.0, 104.0, 113.0, 117.0, 133.0, 104.0, 125.0, 119.0, 111.0, 122.0, 100.0, 118.0, 119.0, 104.0, 85.0, 133.0, 104.0, 119.0, 118.0, 95.0, 117.0, 123.0, 101.0, 132.0, 121.0, 110.0, 116.0, 116.0, 111.0, 91.0, 104.0, 104.0, 115.0, 124.0, 105.0, 104.0, 105.0, 101.0, 99.0, 112.0, 126.0, 139.0]}, "params-norm": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [180.0215, 180.0215, 180.0215, 180.02148, 180.02148, 180.02147, 180.02145, 180.02135, 180.02115, 180.02094, 180.01961, 180.01897, 180.01846, 180.01622, 180.01544, 180.01474, 180.01422, 180.01404, 180.01433, 180.01431, 180.01425, 180.01393, 180.01378, 180.01398, 180.01486, 180.01613, 180.01736, 180.01823, 180.01926, 180.02052, 180.02249, 180.0247, 180.0269, 180.02905, 180.03157, 180.03452, 180.03809, 180.04181, 180.04561, 180.04982, 180.05472, 180.06001, 180.06567, 180.07184, 180.0788, 180.08618, 180.09402, 180.10249, 180.11177, 180.12202, 180.13301, 180.14465, 180.15689, 180.16972, 180.18321, 180.19737, 180.21191, 180.22699, 180.24295, 180.26004, 180.27771, 180.29611, 180.31612, 180.33702, 180.35811, 180.38084, 180.40419, 180.4287, 180.45442, 180.48056, 180.50702, 180.53406, 180.56171, 180.58975, 180.61829, 180.64751, 180.67677, 180.70682, 180.73743, 180.76886, 180.80061, 180.83215, 180.86478, 180.89844, 180.93239, 180.96716, 181.00246, 181.03769, 181.07275, 181.10832, 181.14499, 181.18263, 181.21957, 181.25639, 181.29378, 181.33115, 181.36745, 181.40192, 181.43672, 181.47206, 181.50702, 181.54108, 181.57564, 181.61107, 181.64665, 181.68359, 181.72212, 181.76016, 181.79727, 181.83466, 181.87212, 181.91078, 181.94928, 181.98863, 182.02866, 182.0679, 182.10756, 182.14766, 182.18661, 182.22534, 182.26395, 182.30188, 182.33997, 182.3786, 182.41617, 182.45273, 182.48906, 182.52652, 182.56755, 182.60834, 182.64743, 182.68629, 182.72655, 182.76643, 182.80617, 182.84549, 182.8847, 182.92358, 182.96255, 
183.00255, 183.04317, 183.08311, 183.12239, 183.16113, 183.20087, 183.24062, 183.27989, 183.31709, 183.35413, 183.39204, 183.42976, 183.46664, 183.50266, 183.5378, 183.57317, 183.60986, 183.64481, 183.67638, 183.7079, 183.74036, 183.77179, 183.80507, 183.8432, 183.8837, 183.92522, 183.96664, 184.00832, 184.04984, 184.09091, 184.13011, 184.16745, 184.20192, 184.2364, 184.27042, 184.30766, 184.34671, 184.38367, 184.41844, 184.45454, 184.49117, 184.52921, 184.56746, 184.60696, 184.64819, 184.69025, 184.73074, 184.77034, 184.80975, 184.84845, 184.88777, 184.92712, 184.96806, 185.00996, 185.0508, 185.09145, 185.13165, 185.17198, 185.21196, 185.25362, 185.29736, 185.33859, 185.37759, 185.41449, 185.45093, 185.48775, 185.52527, 185.56303, 185.60017, 185.63844, 185.67694, 185.717, 185.75711, 185.79745, 185.83626, 185.87444, 185.91074, 185.94763, 185.98566, 186.02451, 186.06494, 186.10443, 186.14497, 186.18584, 186.22533, 186.26512, 186.30524, 186.34587, 186.38719, 186.42752, 186.46732, 186.5069, 186.54416, 186.58186, 186.62146, 186.66272, 186.7025, 186.74118, 186.78197, 186.82381, 186.86591, 186.90703, 186.94699, 186.98782, 187.02896, 187.07161, 187.11592, 187.16006, 187.20297, 187.24727, 187.29167, 187.33688, 187.38315, 187.43051, 187.47704, 187.52306, 187.56926, 187.61435, 187.65848, 187.70207, 187.74612, 187.791, 187.83688, 187.88379, 187.93002, 187.97664, 188.02202, 188.06602, 188.10904, 188.15352, 188.19698, 188.23994, 188.28452, 188.3309, 188.37823, 188.4254, 188.47156, 188.51752, 188.5639, 188.60988, 188.65466, 188.69901, 188.74353, 188.78758, 188.82999, 188.87415, 188.91789, 188.9626, 189.00793, 189.05475, 189.10188, 189.14818, 189.1933, 189.23761, 189.28363, 189.33023, 189.37675, 189.42268, 189.46941, 189.51593, 189.56395, 189.61171, 189.65927, 189.70778, 189.75581, 189.80321, 189.8503, 189.89809, 189.9472, 189.9967, 190.04593, 190.09396, 190.14343, 190.1933, 190.24219, 190.29274, 190.34343, 190.39359, 190.44443, 190.49617, 190.54893, 190.60107, 190.65158, 190.70294, 190.75449, 190.80663, 190.86197, 190.91545, 190.96892, 191.02086, 191.07315, 191.12288, 191.17188, 191.22237, 191.27545, 191.32816, 191.38139, 191.43503, 191.48665, 191.53937, 191.58943, 191.64163, 191.69427, 191.74928, 191.8026, 191.85596, 191.90891, 191.96182, 192.01491, 192.06815, 192.12227, 192.17641, 192.23074, 192.28561, 192.34024, 192.39484, 192.44731, 192.50171, 192.55782, 192.61383, 192.67009, 192.72624, 192.78252, 192.83763, 192.89287, 192.94981, 193.00703, 193.06404, 193.12177, 193.17989, 193.23723, 193.29391, 193.34985, 193.40605, 193.45912, 193.51132, 193.56346, 193.61696, 193.67215, 193.72841, 193.78329, 193.83797, 193.89262, 193.94887, 194.00604, 194.064, 194.12062, 194.17807, 194.23741, 194.29666, 194.35547, 194.41553, 194.47499, 194.53378, 194.59259, 194.65202, 194.70923, 194.76607, 194.82375, 194.88065, 194.93935]}, "params-norm vs samples": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [180.0215, 180.0215, 180.0215, 180.02148, 180.02148, 180.02147, 180.02145, 180.02135, 180.02115, 180.02094, 180.01961, 180.01897, 180.01846, 180.01622, 180.01544, 180.01474, 180.01422, 180.01404, 180.01433, 180.01431, 180.01425, 180.01393, 180.01378, 180.01398, 180.01486, 180.01613, 180.01736, 180.01823, 180.01926, 180.02052, 180.02249, 180.0247, 180.0269, 180.02905, 180.03157, 180.03452, 180.03809, 180.04181, 180.04561, 180.04982, 180.05472, 180.06001, 180.06567, 180.07184, 180.0788, 180.08618, 180.09402, 180.10249, 180.11177, 180.12202, 180.13301, 180.14465, 180.15689, 180.16972, 180.18321, 180.19737, 
180.21191, 180.22699, 180.24295, 180.26004, 180.27771, 180.29611, 180.31612, 180.33702, 180.35811, 180.38084, 180.40419, 180.4287, 180.45442, 180.48056, 180.50702, 180.53406, 180.56171, 180.58975, 180.61829, 180.64751, 180.67677, 180.70682, 180.73743, 180.76886, 180.80061, 180.83215, 180.86478, 180.89844, 180.93239, 180.96716, 181.00246, 181.03769, 181.07275, 181.10832, 181.14499, 181.18263, 181.21957, 181.25639, 181.29378, 181.33115, 181.36745, 181.40192, 181.43672, 181.47206, 181.50702, 181.54108, 181.57564, 181.61107, 181.64665, 181.68359, 181.72212, 181.76016, 181.79727, 181.83466, 181.87212, 181.91078, 181.94928, 181.98863, 182.02866, 182.0679, 182.10756, 182.14766, 182.18661, 182.22534, 182.26395, 182.30188, 182.33997, 182.3786, 182.41617, 182.45273, 182.48906, 182.52652, 182.56755, 182.60834, 182.64743, 182.68629, 182.72655, 182.76643, 182.80617, 182.84549, 182.8847, 182.92358, 182.96255, 183.00255, 183.04317, 183.08311, 183.12239, 183.16113, 183.20087, 183.24062, 183.27989, 183.31709, 183.35413, 183.39204, 183.42976, 183.46664, 183.50266, 183.5378, 183.57317, 183.60986, 183.64481, 183.67638, 183.7079, 183.74036, 183.77179, 183.80507, 183.8432, 183.8837, 183.92522, 183.96664, 184.00832, 184.04984, 184.09091, 184.13011, 184.16745, 184.20192, 184.2364, 184.27042, 184.30766, 184.34671, 184.38367, 184.41844, 184.45454, 184.49117, 184.52921, 184.56746, 184.60696, 184.64819, 184.69025, 184.73074, 184.77034, 184.80975, 184.84845, 184.88777, 184.92712, 184.96806, 185.00996, 185.0508, 185.09145, 185.13165, 185.17198, 185.21196, 185.25362, 185.29736, 185.33859, 185.37759, 185.41449, 185.45093, 185.48775, 185.52527, 185.56303, 185.60017, 185.63844, 185.67694, 185.717, 185.75711, 185.79745, 185.83626, 185.87444, 185.91074, 185.94763, 185.98566, 186.02451, 186.06494, 186.10443, 186.14497, 186.18584, 186.22533, 186.26512, 186.30524, 186.34587, 186.38719, 186.42752, 186.46732, 186.5069, 186.54416, 186.58186, 186.62146, 186.66272, 186.7025, 186.74118, 186.78197, 186.82381, 186.86591, 186.90703, 186.94699, 186.98782, 187.02896, 187.07161, 187.11592, 187.16006, 187.20297, 187.24727, 187.29167, 187.33688, 187.38315, 187.43051, 187.47704, 187.52306, 187.56926, 187.61435, 187.65848, 187.70207, 187.74612, 187.791, 187.83688, 187.88379, 187.93002, 187.97664, 188.02202, 188.06602, 188.10904, 188.15352, 188.19698, 188.23994, 188.28452, 188.3309, 188.37823, 188.4254, 188.47156, 188.51752, 188.5639, 188.60988, 188.65466, 188.69901, 188.74353, 188.78758, 188.82999, 188.87415, 188.91789, 188.9626, 189.00793, 189.05475, 189.10188, 189.14818, 189.1933, 189.23761, 189.28363, 189.33023, 189.37675, 189.42268, 189.46941, 189.51593, 189.56395, 189.61171, 189.65927, 189.70778, 189.75581, 189.80321, 189.8503, 189.89809, 189.9472, 189.9967, 190.04593, 190.09396, 190.14343, 190.1933, 190.24219, 190.29274, 190.34343, 190.39359, 190.44443, 190.49617, 190.54893, 190.60107, 190.65158, 190.70294, 190.75449, 190.80663, 190.86197, 190.91545, 190.96892, 191.02086, 191.07315, 191.12288, 191.17188, 191.22237, 191.27545, 191.32816, 191.38139, 191.43503, 191.48665, 191.53937, 191.58943, 191.64163, 191.69427, 191.74928, 191.8026, 191.85596, 191.90891, 191.96182, 192.01491, 192.06815, 192.12227, 192.17641, 192.23074, 192.28561, 192.34024, 192.39484, 192.44731, 192.50171, 192.55782, 192.61383, 192.67009, 192.72624, 192.78252, 192.83763, 192.89287, 192.94981, 193.00703, 193.06404, 193.12177, 193.17989, 193.23723, 193.29391, 193.34985, 193.40605, 193.45912, 193.51132, 193.56346, 193.61696, 193.67215, 193.72841, 193.78329, 193.83797, 
193.89262, 193.94887, 194.00604, 194.064, 194.12062, 194.17807, 194.23741, 194.29666, 194.35547, 194.41553, 194.47499, 194.53378, 194.59259, 194.65202, 194.70923, 194.76607, 194.82375, 194.88065, 194.93935]}, "iteration-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [25.13033, 1.48166, 1.46987, 1.47023, 1.48503, 1.46592, 1.47336, 1.47508, 1.47402, 1.4685, 1.46594, 1.46551, 1.47349, 1.47267, 1.46624, 1.4694, 1.46787, 1.46277, 1.47132, 1.47851, 1.46741, 1.46542, 1.4696, 1.47275, 1.46461, 1.47691, 1.4675, 1.4656, 1.47118, 1.46861, 1.46276, 1.46336, 1.46191, 1.46454, 1.46661, 1.45397, 1.45433, 1.45318, 1.47248, 1.45987, 1.4605, 1.47021, 1.46471, 1.46712, 1.47916, 1.46564, 1.46806, 1.48231, 1.47331, 1.47647, 1.4749, 1.47736, 1.47088, 1.48046, 1.47029, 1.4749, 1.47423, 1.4743, 1.47451, 1.47312, 1.46669, 1.48162, 1.47248, 1.47813, 1.47924, 1.47693, 1.4857, 1.47407, 1.47761, 1.47904, 1.47169, 1.46697, 1.48901, 1.47837, 1.47292, 1.48078, 1.49273, 1.48823, 1.48311, 1.48576, 1.48783, 1.48617, 1.47144, 1.46991, 1.46885, 1.47351, 1.47373, 1.46882, 1.46809, 1.46714, 1.4672, 1.47772, 1.46612, 1.46651, 1.47094, 1.47578, 1.46913, 1.48331, 1.4865, 1.48787, 1.47171, 1.46821, 1.4802, 1.46723, 1.47379, 1.46841, 1.46785, 1.47559, 1.47509, 1.46854, 1.47345, 1.47159, 1.46793, 1.47819, 1.48813, 1.4716, 1.47495, 1.46872, 1.47829, 1.47064, 1.47018, 1.47559, 1.47576, 1.47037, 1.47433, 1.47533, 1.47013, 1.47921, 1.47494, 1.4767, 1.47607, 1.47345, 1.47128, 1.47431, 1.46759, 1.46948, 1.46669, 1.47222, 1.46674, 1.47388, 1.47388, 1.46524, 1.47407, 1.47207, 1.46963, 1.47611, 1.47057, 1.47046, 1.47507, 1.4718, 1.47093, 1.46875, 1.47966, 1.47691, 1.47958, 1.46848, 1.47659, 1.47233, 1.46829, 1.47134, 1.47162, 1.47084, 1.46812, 1.46169, 1.47005, 1.47196, 1.47131, 1.4779, 1.47053, 1.46873, 1.47177, 1.47562, 1.47441, 1.47279, 1.4738, 1.47473, 1.47647, 1.4711, 1.47612, 1.47591, 1.48126, 1.47512, 1.47351, 1.47769, 1.46263, 1.47234, 1.47526, 1.47224, 1.47085, 1.46942, 1.46803, 1.4759, 1.47343, 1.46362, 1.4685, 1.47079, 1.47101, 1.47158, 1.47044, 1.46992, 1.46298, 1.47836, 1.46169, 1.46751, 1.47839, 1.47255, 1.47103, 1.47052, 1.46863, 1.4668, 1.4769, 1.47204, 1.4723, 1.47157, 1.4667, 1.47441, 1.48003, 1.47181, 1.48009, 1.48373, 1.47652, 1.4796, 1.47353, 1.47567, 1.47796, 1.47632, 1.48009, 1.4717, 1.47188, 1.48104, 1.47363, 1.47129, 1.47793, 1.47574, 1.47484, 1.47619, 1.47177, 1.47614, 1.47933, 1.47156, 1.46844, 1.4802, 1.47829, 1.47093, 1.4754, 1.47276, 1.57859, 1.4684, 1.47537, 1.54583, 1.47639, 1.57948, 1.47918, 1.48066, 1.48212, 1.4774, 1.47852, 1.47639, 1.47826, 1.48039, 1.4739, 1.4819, 1.48028, 1.47407, 1.47624, 1.48205, 1.47628, 1.48393, 1.48589, 1.47517, 1.47758, 1.47729, 1.48745, 1.47685, 1.48033, 1.47602, 1.47812, 1.48054, 1.47432, 1.47337, 1.47804, 1.47123, 1.47425, 1.47715, 1.47794, 1.47273, 1.47454, 1.47875, 1.4782, 1.47577, 1.47167, 1.47763, 1.4744, 1.47683, 1.48168, 1.47497, 1.47434, 1.4796, 1.4776, 1.47214, 1.47435, 1.47766, 1.4835, 1.48072, 1.4744, 1.48392, 1.47533, 1.47683, 1.47742, 1.48516, 1.47634, 1.478, 1.47244, 1.48265, 1.47422, 1.48296, 1.48311, 1.47628, 1.47751, 1.48129, 1.47507, 1.48075, 1.47775, 1.47657, 1.48203, 1.48345, 1.48818, 1.48194, 1.48374, 1.482, 1.48749, 1.48551, 1.48527, 1.4871, 1.49114, 1.48723, 1.47874, 1.47877, 1.48314, 1.47745, 1.47138, 1.4823, 1.4909, 1.48278, 1.48582, 1.48063, 1.47195, 1.47501, 1.47117, 1.47685, 1.47555, 1.47306, 1.54386, 1.47358, 1.57973, 1.47563, 1.47575, 1.56224, 1.47774, 1.4817, 1.48012, 1.48778, 1.47737, 1.47738, 1.48069, 1.47712, 1.47909, 
1.47385, 1.47532, 1.47459, 1.47167, 1.47808, 1.48123, 1.47993, 1.46614, 1.46983, 1.47318, 1.47539, 1.47425, 1.47523, 1.47895, 1.47481, 1.4698, 1.46941, 1.47466, 1.47011, 1.46611, 1.47663, 1.47626, 1.4741, 1.47847, 1.46407, 1.47268, 1.47738, 1.46488, 1.48113, 1.47284, 1.46934, 1.47784, 1.4777]}, "lm loss validation": {"start_step": 0, "end_step": 4, "step_interval": 5, "values": [5.6001]}, "lm loss validation vs samples": {"start_step": 0, "end_step": 4, "step_interval": 5, "values": [5.6001]}, "lm loss validation ppl": {"start_step": 0, "end_step": 4, "step_interval": 5, "values": [270.45398]}, "lm loss validation ppl vs samples": {"start_step": 0, "end_step": 4, "step_interval": 5, "values": [270.45398]}} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_fp8_tp_pp/golden_values_lts.json b/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_fp8_tp_pp/golden_values_lts.json new file mode 100644 index 0000000000..d314392934 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_fp8_tp_pp/golden_values_lts.json @@ -0,0 +1 @@ +{"forward-backward-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [20.88514, 1.46887, 1.45698, 1.45724, 1.47204, 1.4532, 1.46049, 1.46232, 1.46114, 1.45572, 1.45278, 1.45251, 1.4606, 1.45971, 1.45327, 1.45649, 1.45387, 1.44992, 1.45853, 1.46565, 1.45437, 1.4525, 1.45638, 1.45952, 1.45173, 1.46389, 1.45431, 1.45274, 1.4583, 1.45541, 1.44989, 1.45048, 1.44894, 1.45131, 1.45345, 1.44108, 1.44133, 1.44014, 1.45925, 1.44689, 1.44677, 1.45727, 1.45173, 1.45401, 1.46616, 1.45271, 1.45499, 1.46938, 1.4604, 1.4635, 1.4619, 1.46438, 1.45747, 1.46752, 1.45729, 1.46194, 1.46122, 1.46137, 1.46148, 1.46024, 1.45382, 1.46877, 1.45937, 1.46525, 1.46624, 1.46409, 1.4727, 1.46116, 1.46451, 1.4659, 1.45827, 1.45377, 1.47607, 1.46536, 1.45984, 1.46776, 1.47935, 1.47512, 1.47012, 1.47272, 1.47499, 1.47329, 1.4585, 1.45704, 1.4555, 1.46025, 1.46072, 1.45592, 1.45507, 1.45416, 1.45424, 1.46471, 1.45308, 1.45358, 1.45797, 1.46272, 1.45587, 1.47021, 1.47373, 1.47488, 1.45879, 1.45526, 1.46684, 1.45424, 1.46048, 1.45539, 1.45476, 1.46257, 1.46204, 1.4552, 1.46046, 1.45792, 1.45501, 1.46191, 1.47519, 1.45861, 1.46195, 1.4555, 1.46541, 1.45771, 1.45708, 1.46256, 1.46253, 1.45733, 1.46154, 1.46224, 1.45714, 1.46628, 1.462, 1.46251, 1.46041, 1.45921, 1.45844, 1.46129, 1.45453, 1.45615, 1.45383, 1.45915, 1.45368, 1.46097, 1.4609, 1.4519, 1.46109, 1.45906, 1.45677, 1.46323, 1.45746, 1.45755, 1.46188, 1.45867, 1.45807, 1.45578, 1.46681, 1.46385, 1.46569, 1.4551, 1.46369, 1.45943, 1.45524, 1.45829, 1.45857, 1.45785, 1.45457, 1.44886, 1.45654, 1.4591, 1.4583, 1.46482, 1.45668, 1.45572, 1.45853, 1.46203, 1.46116, 1.45964, 1.4598, 1.46157, 1.46339, 1.45804, 1.46302, 1.4604, 1.4681, 1.4619, 1.46043, 1.46458, 1.44955, 1.45921, 1.46214, 1.45918, 1.45767, 1.45627, 1.45501, 1.46271, 1.46011, 1.45047, 1.45537, 1.45774, 1.45791, 1.45844, 1.45736, 1.45685, 1.44897, 1.46515, 1.44824, 1.4544, 1.46501, 1.45918, 1.45782, 1.45713, 1.45546, 1.4536, 1.46366, 1.45823, 1.45916, 1.45823, 1.45337, 1.46118, 1.46699, 1.4587, 1.46699, 1.47055, 1.46344, 1.46652, 1.46046, 1.46265, 1.46449, 1.46285, 1.46692, 1.45814, 1.45886, 1.46803, 1.46061, 1.45819, 1.4648, 1.46266, 1.46133, 1.46278, 1.4587, 1.46188, 1.46627, 1.45851, 1.45538, 1.46707, 1.4652, 1.45779, 1.46235, 1.45952, 1.56522, 1.45535, 1.46212, 1.53267, 1.46331, 1.56631, 1.46611, 1.4675, 1.46789, 
1.46422, 1.46465, 1.46332, 1.46526, 1.46728, 1.46084, 1.46879, 1.4673, 1.46097, 1.4632, 1.46893, 1.46312, 1.47082, 1.47286, 1.46203, 1.46457, 1.46392, 1.47428, 1.46372, 1.46741, 1.46293, 1.46502, 1.46743, 1.46135, 1.45986, 1.46485, 1.45803, 1.46118, 1.46355, 1.46477, 1.4597, 1.46145, 1.46577, 1.46316, 1.46246, 1.45852, 1.46444, 1.46127, 1.46343, 1.46846, 1.46172, 1.4611, 1.46651, 1.46449, 1.45901, 1.46118, 1.46452, 1.47046, 1.46733, 1.46134, 1.4708, 1.46233, 1.46381, 1.46441, 1.47211, 1.46336, 1.46499, 1.45935, 1.46955, 1.46104, 1.46986, 1.47015, 1.46324, 1.46425, 1.46739, 1.46074, 1.46764, 1.46483, 1.46352, 1.46907, 1.4704, 1.47514, 1.4677, 1.47074, 1.46865, 1.4746, 1.47247, 1.47112, 1.47411, 1.47813, 1.47421, 1.46569, 1.46574, 1.47004, 1.46433, 1.45849, 1.46834, 1.47747, 1.46919, 1.47242, 1.46719, 1.45884, 1.462, 1.45808, 1.46357, 1.46256, 1.4583, 1.53085, 1.46007, 1.56675, 1.46277, 1.46292, 1.54903, 1.46448, 1.46847, 1.46708, 1.47477, 1.46444, 1.46433, 1.46714, 1.46403, 1.46557, 1.4607, 1.4618, 1.4615, 1.45857, 1.46496, 1.46801, 1.46664, 1.45296, 1.45665, 1.46006, 1.46236, 1.46106, 1.4622, 1.46573, 1.46166, 1.45667, 1.4563, 1.46152, 1.45678, 1.45303, 1.46242, 1.46316, 1.46041, 1.4655, 1.45096, 1.45962, 1.46428, 1.45196, 1.46789, 1.45986, 1.45627, 1.46454, 1.46424]}, "forward-compute-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [6.36252, 0.75642, 0.75338, 0.74782, 0.75864, 0.75119, 0.75271, 0.75652, 0.75238, 0.74967, 0.74518, 0.74699, 0.74982, 0.74683, 0.74477, 0.74825, 0.75424, 0.74304, 0.74908, 0.74831, 0.74285, 0.74505, 0.75194, 0.75268, 0.74597, 0.75419, 0.74822, 0.74832, 0.75308, 0.7494, 0.74312, 0.74787, 0.74249, 0.74586, 0.74659, 0.74391, 0.7376, 0.74214, 0.75476, 0.74522, 0.74687, 0.75765, 0.7462, 0.75118, 0.75883, 0.7495, 0.7508, 0.75734, 0.7532, 0.75555, 0.75913, 0.75728, 0.75891, 0.75923, 0.75304, 0.75387, 0.75689, 0.75658, 0.76074, 0.76432, 0.75769, 0.76347, 0.75739, 0.7616, 0.76613, 0.76452, 0.76556, 0.76205, 0.76331, 0.76266, 0.7584, 0.75596, 0.77338, 0.76537, 0.75847, 0.77247, 0.7698, 0.76711, 0.76502, 0.76683, 0.76807, 0.76879, 0.75959, 0.75609, 0.7542, 0.75889, 0.7586, 0.75685, 0.75677, 0.7569, 0.75222, 0.75781, 0.74463, 0.74619, 0.75051, 0.75082, 0.74909, 0.7631, 0.75774, 0.76204, 0.75145, 0.745, 0.75456, 0.75, 0.75135, 0.75247, 0.74698, 0.7545, 0.75599, 0.74765, 0.75411, 0.75279, 0.74869, 0.75208, 0.75762, 0.74974, 0.75249, 0.74767, 0.75172, 0.74899, 0.751, 0.74685, 0.75057, 0.75145, 0.7525, 0.75608, 0.74708, 0.75458, 0.7537, 0.74712, 0.75411, 0.7543, 0.74836, 0.74769, 0.74953, 0.75136, 0.75937, 0.76403, 0.75925, 0.76123, 0.76488, 0.75935, 0.76327, 0.7569, 0.75895, 0.76622, 0.76412, 0.75914, 0.76039, 0.76442, 0.76455, 0.76016, 0.76196, 0.76613, 0.76729, 0.75679, 0.75985, 0.75945, 0.76323, 0.7635, 0.75457, 0.75811, 0.75642, 0.74425, 0.74872, 0.75503, 0.74958, 0.75606, 0.7608, 0.75663, 0.75567, 0.76176, 0.76045, 0.76145, 0.76278, 0.76702, 0.76166, 0.75954, 0.76405, 0.76075, 0.76028, 0.75744, 0.76195, 0.75996, 0.76397, 0.76843, 0.76911, 0.76882, 0.76899, 0.76126, 0.76583, 0.77184, 0.76598, 0.76126, 0.76043, 0.75584, 0.7596, 0.7606, 0.75826, 0.75896, 0.75754, 0.76441, 0.75157, 0.75476, 0.76479, 0.75674, 0.75885, 0.75822, 0.75074, 0.75763, 0.76244, 0.75885, 0.75847, 0.7616, 0.75912, 0.76519, 0.75935, 0.75886, 0.75905, 0.76846, 0.7612, 0.7615, 0.76008, 0.76429, 0.75844, 0.75869, 0.76255, 0.76097, 0.75995, 0.76319, 0.76129, 0.76036, 0.76016, 0.76111, 0.76323, 0.76537, 0.759, 0.7601, 0.76445, 0.75571, 0.75685, 0.76075, 0.75723, 0.75653, 0.75845, 
0.75674, 0.86396, 0.75777, 0.76008, 0.79802, 0.76226, 0.86191, 0.76011, 0.76317, 0.76386, 0.7605, 0.76066, 0.76276, 0.76322, 0.7613, 0.7592, 0.762, 0.76075, 0.75635, 0.75896, 0.7677, 0.7624, 0.76381, 0.76676, 0.75786, 0.75925, 0.76099, 0.76684, 0.7623, 0.76206, 0.76286, 0.76089, 0.75817, 0.75534, 0.75831, 0.76571, 0.76592, 0.76306, 0.76728, 0.76327, 0.76387, 0.7666, 0.76417, 0.7663, 0.7669, 0.76023, 0.76799, 0.76358, 0.76252, 0.76815, 0.76889, 0.76519, 0.77456, 0.76596, 0.76411, 0.76815, 0.77016, 0.77392, 0.76784, 0.76277, 0.77204, 0.76778, 0.7655, 0.76653, 0.76663, 0.7655, 0.76981, 0.76378, 0.76855, 0.76427, 0.77286, 0.76279, 0.75723, 0.75876, 0.76093, 0.75608, 0.76062, 0.75705, 0.75985, 0.76693, 0.76742, 0.77256, 0.76978, 0.76789, 0.76969, 0.76933, 0.77265, 0.76608, 0.76739, 0.77128, 0.76748, 0.75765, 0.75397, 0.76206, 0.75882, 0.75813, 0.76547, 0.77479, 0.76791, 0.77465, 0.76715, 0.75994, 0.76202, 0.75688, 0.75371, 0.75879, 0.75648, 0.78313, 0.75471, 0.85298, 0.75745, 0.75629, 0.79889, 0.75755, 0.7675, 0.76401, 0.77476, 0.7623, 0.76426, 0.77061, 0.76259, 0.76592, 0.76419, 0.76322, 0.76581, 0.76288, 0.76458, 0.76887, 0.76604, 0.7592, 0.7636, 0.76038, 0.76398, 0.76433, 0.76564, 0.7642, 0.76491, 0.76122, 0.76383, 0.76659, 0.76312, 0.76135, 0.76522, 0.76474, 0.76522, 0.76449, 0.75942, 0.76396, 0.76563, 0.75814, 0.76753, 0.76464, 0.7621, 0.77007, 0.76728]}, "backward-compute-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [4.28133, 0.68196, 0.6748, 0.67881, 0.68478, 0.67217, 0.67802, 0.67659, 0.67892, 0.67668, 0.67659, 0.67465, 0.67463, 0.67462, 0.67762, 0.67642, 0.6769, 0.67572, 0.67809, 0.68097, 0.67934, 0.67704, 0.67406, 0.67837, 0.6757, 0.67949, 0.67968, 0.6787, 0.67717, 0.68038, 0.67537, 0.67968, 0.67434, 0.67314, 0.67835, 0.66827, 0.67483, 0.66865, 0.67777, 0.67612, 0.66888, 0.68034, 0.67914, 0.67754, 0.686, 0.67891, 0.6825, 0.69249, 0.68805, 0.68071, 0.6807, 0.68401, 0.68197, 0.68831, 0.67921, 0.68344, 0.68292, 0.68269, 0.67859, 0.67491, 0.67595, 0.68683, 0.68164, 0.68009, 0.68194, 0.68378, 0.68844, 0.68048, 0.67795, 0.68343, 0.6796, 0.67682, 0.6863, 0.68552, 0.67712, 0.67901, 0.6881, 0.68205, 0.67931, 0.68414, 0.68584, 0.68259, 0.67712, 0.67748, 0.67636, 0.67686, 0.67957, 0.67669, 0.67544, 0.67461, 0.67469, 0.68134, 0.68, 0.67587, 0.68021, 0.68045, 0.67544, 0.67937, 0.68676, 0.68585, 0.67936, 0.68061, 0.68245, 0.67815, 0.67775, 0.6759, 0.67787, 0.68054, 0.6803, 0.67305, 0.67653, 0.67563, 0.67417, 0.68429, 0.68658, 0.67537, 0.68025, 0.6803, 0.68056, 0.6828, 0.68066, 0.68532, 0.67902, 0.67418, 0.68192, 0.6772, 0.6791, 0.68139, 0.68311, 0.68253, 0.67839, 0.67915, 0.67948, 0.68314, 0.67734, 0.67756, 0.67316, 0.67604, 0.6758, 0.67978, 0.67641, 0.67242, 0.67813, 0.67872, 0.6783, 0.67885, 0.67431, 0.67749, 0.67801, 0.6758, 0.67622, 0.67701, 0.68426, 0.6762, 0.67926, 0.67417, 0.68505, 0.67444, 0.67174, 0.67764, 0.67913, 0.67644, 0.67728, 0.67567, 0.67951, 0.67766, 0.67997, 0.68347, 0.67314, 0.66987, 0.67882, 0.67735, 0.67469, 0.67484, 0.67452, 0.67036, 0.67219, 0.66928, 0.67596, 0.68103, 0.68041, 0.67951, 0.67362, 0.6784, 0.6726, 0.67127, 0.67283, 0.67413, 0.67371, 0.67426, 0.67198, 0.67275, 0.67579, 0.66994, 0.67168, 0.6776, 0.67237, 0.67165, 0.67104, 0.67192, 0.67427, 0.67627, 0.66668, 0.66922, 0.67584, 0.67473, 0.6708, 0.67557, 0.67335, 0.67079, 0.67545, 0.67499, 0.67953, 0.67406, 0.67059, 0.67194, 0.67815, 0.67685, 0.67968, 0.67768, 0.67845, 0.68065, 0.67662, 0.67606, 0.68139, 0.67895, 0.67961, 0.67462, 0.67355, 0.68106, 0.67561, 0.67393, 0.67793, 0.67786, 
0.6746, 0.67779, 0.67398, 0.67743, 0.67735, 0.67743, 0.67124, 0.68018, 0.68312, 0.67575, 0.67441, 0.67795, 0.77498, 0.67162, 0.6764, 0.67127, 0.67597, 0.68008, 0.68042, 0.67905, 0.68174, 0.67734, 0.68026, 0.6787, 0.67714, 0.682, 0.67394, 0.68013, 0.68188, 0.67889, 0.67722, 0.67427, 0.67656, 0.68229, 0.68021, 0.6768, 0.68025, 0.67886, 0.68439, 0.67958, 0.6764, 0.67518, 0.67551, 0.68714, 0.67915, 0.67531, 0.67638, 0.674, 0.67847, 0.67644, 0.67977, 0.674, 0.67593, 0.68097, 0.67926, 0.67773, 0.67609, 0.6796, 0.67785, 0.67882, 0.67923, 0.6747, 0.67544, 0.67361, 0.68038, 0.67547, 0.67624, 0.67248, 0.67952, 0.68043, 0.67937, 0.67985, 0.67588, 0.68025, 0.67916, 0.68539, 0.67959, 0.67855, 0.67714, 0.68454, 0.67696, 0.67981, 0.683, 0.68247, 0.6825, 0.68134, 0.67836, 0.68273, 0.68212, 0.68044, 0.67659, 0.67798, 0.67887, 0.67623, 0.67774, 0.67659, 0.67891, 0.67811, 0.68204, 0.68313, 0.68107, 0.68061, 0.68094, 0.68548, 0.68238, 0.67942, 0.67349, 0.67874, 0.67949, 0.67779, 0.67431, 0.67512, 0.67432, 0.67473, 0.67593, 0.68238, 0.67917, 0.67651, 0.68094, 0.67897, 0.68533, 0.67806, 0.68435, 0.68504, 0.682, 0.68404, 0.68368, 0.68461, 0.68091, 0.6825, 0.67628, 0.68089, 0.6828, 0.67779, 0.67875, 0.67869, 0.67726, 0.67954, 0.68441, 0.67716, 0.67303, 0.67398, 0.67541, 0.6785, 0.67881, 0.67645, 0.68188, 0.67884, 0.67565, 0.67403, 0.67785, 0.67584, 0.67366, 0.67828, 0.67909, 0.67494, 0.68175, 0.67414, 0.67764, 0.68174, 0.67366, 0.68332, 0.67954, 0.67548, 0.67937, 0.67851]}, "batch-generator-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [2.31358, 0.01342, 0.01402, 0.01374, 0.01299, 0.01268, 0.01392, 0.01354, 0.01304, 0.01288, 0.01303, 0.01298, 0.01232, 0.01255, 0.01299, 0.01326, 0.01362, 0.0129, 0.01443, 0.01263, 0.01254, 0.01285, 0.01249, 0.01344, 0.01424, 0.01237, 0.01372, 0.01224, 0.013, 0.01253, 0.01341, 0.01286, 0.01401, 0.01393, 0.01367, 0.01532, 0.01387, 0.01392, 0.01291, 0.01426, 0.0158, 0.01586, 0.01402, 0.01614, 0.01699, 0.0155, 0.01558, 0.01634, 0.01595, 0.01549, 0.01633, 0.01561, 0.01611, 0.01605, 0.01621, 0.01402, 0.01567, 0.01545, 0.0163, 0.01651, 0.01564, 0.01603, 0.01693, 0.01689, 0.01357, 0.0139, 0.01398, 0.01321, 0.0147, 0.01234, 0.01211, 0.01284, 0.01261, 0.01263, 0.01246, 0.01271, 0.01272, 0.01352, 0.01254, 0.01474, 0.01286, 0.01466, 0.01388, 0.01269, 0.01267, 0.01231, 0.01228, 0.01211, 0.01249, 0.01199, 0.01406, 0.01239, 0.012, 0.01243, 0.01264, 0.01202, 0.01259, 0.01295, 0.01265, 0.01251, 0.01294, 0.01235, 0.01204, 0.01263, 0.01427, 0.01248, 0.01231, 0.01225, 0.01258, 0.01178, 0.01262, 0.01236, 0.01219, 0.01244, 0.01253, 0.01287, 0.01341, 0.01255, 0.01211, 0.01241, 0.01252, 0.01245, 0.01248, 0.01249, 0.01246, 0.01257, 0.01439, 0.01257, 0.01277, 0.01231, 0.01239, 0.01246, 0.01285, 0.01264, 0.01226, 0.01308, 0.01475, 0.01426, 0.01226, 0.01234, 0.0128, 0.01255, 0.01327, 0.01286, 0.01198, 0.0126, 0.01182, 0.01221, 0.01291, 0.01266, 0.0138, 0.01491, 0.01556, 0.01521, 0.01547, 0.01523, 0.01535, 0.01539, 0.01545, 0.01502, 0.01553, 0.01548, 0.01523, 0.0158, 0.0149, 0.01554, 0.01524, 0.01563, 0.01495, 0.01509, 0.01539, 0.01542, 0.01541, 0.01496, 0.0133, 0.01391, 0.01409, 0.01274, 0.01438, 0.01341, 0.01299, 0.01457, 0.0135, 0.01472, 0.01228, 0.01294, 0.01287, 0.01243, 0.01296, 0.01232, 0.0131, 0.01254, 0.01253, 0.01203, 0.01548, 0.01457, 0.01673, 0.01491, 0.01608, 0.01713, 0.20109, 0.01559, 0.01542, 0.01587, 0.01537, 0.01617, 0.01548, 0.01476, 0.01531, 0.01468, 0.01359, 0.01328, 0.01334, 0.01271, 0.01326, 0.01281, 0.01274, 0.01235, 0.01343, 0.01378, 0.01234, 0.01331, 0.01322, 
0.01409, 0.01395, 0.01384, 0.01454, 0.01599, 0.01706, 0.01595, 0.01555, 0.01494, 0.01652, 0.01668, 0.01556, 0.01656, 0.01651, 0.01523, 0.01549, 0.01748, 0.0151, 0.01561, 0.01593, 0.01703, 0.01695, 0.01519, 0.11815, 0.01383, 0.01413, 0.01352, 0.0127, 0.01447, 0.01336, 0.0136, 0.0135, 0.01283, 0.01313, 0.01327, 0.01457, 0.0137, 0.01312, 0.01422, 0.01356, 0.01359, 0.01298, 0.01365, 0.01348, 0.01345, 0.01333, 0.01313, 0.01267, 0.01374, 0.01318, 0.01263, 0.01428, 0.01505, 0.01249, 0.01321, 0.01297, 0.01239, 0.01264, 0.01257, 0.01217, 0.0122, 0.0122, 0.01198, 0.0127, 0.01478, 0.01247, 0.01244, 0.01216, 0.0125, 0.01376, 0.01279, 0.01258, 0.01297, 0.01503, 0.01572, 0.01498, 0.01367, 0.01289, 0.01246, 0.01343, 0.01425, 0.01243, 0.01244, 0.0128, 0.01271, 0.01294, 0.01314, 0.01241, 0.01281, 0.01413, 0.01267, 0.01236, 0.01278, 0.01212, 0.01253, 0.01258, 0.01307, 0.0136, 0.01249, 0.0128, 0.01213, 0.01404, 0.01391, 0.01279, 0.0132, 0.01312, 0.01257, 0.01296, 0.01486, 0.01348, 0.01408, 0.01312, 0.01352, 0.01264, 0.01361, 0.01373, 0.01287, 0.01447, 0.01273, 0.0134, 0.01256, 0.01471, 0.01292, 0.01296, 0.01556, 0.01269, 0.01275, 0.01262, 0.01243, 0.01254, 0.01292, 0.01389, 0.01214, 0.01259, 0.01322, 0.01252, 0.01284, 0.01326, 0.01406, 0.01221, 0.01209, 0.01445, 0.01235, 0.01243, 0.01521, 0.01303, 0.01308, 0.01361, 0.01255, 0.01227, 0.01283, 0.01623, 0.01515, 0.01582, 0.01716, 0.01637, 0.01737, 0.01732, 0.01611, 0.01683, 0.01561, 0.01502, 0.01608, 0.015, 0.01699, 0.017, 0.0159, 0.01671, 0.016, 0.01726, 0.01765, 0.01553, 0.01619, 0.01499, 0.01559, 0.01568, 0.01579]}, "forward-recv-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [5.69523, 0.02394, 0.02348, 0.02329, 0.02364, 0.02293, 0.02376, 0.0234, 0.02371, 0.02468, 0.02324, 0.02396, 0.02501, 0.0256, 0.02468, 0.02408, 0.02484, 0.02364, 0.02322, 0.02328, 0.02362, 0.02407, 0.02284, 0.02422, 0.02402, 0.02397, 0.0233, 0.02317, 0.0238, 0.02388, 0.02326, 0.02363, 0.02416, 0.02354, 0.02309, 0.02365, 0.02345, 0.02308, 0.02317, 0.02313, 0.02335, 0.023, 0.02326, 0.0233, 0.0238, 0.02375, 0.02493, 0.02394, 0.02412, 0.0238, 0.02339, 0.02351, 0.02335, 0.0266, 0.0234, 0.02405, 0.02373, 0.0237, 0.02385, 0.02378, 0.02359, 0.02689, 0.02333, 0.02338, 0.02322, 0.02354, 0.0233, 0.02329, 0.02452, 0.02693, 0.02345, 0.02326, 0.02375, 0.02341, 0.02388, 0.0233, 0.02333, 0.02476, 0.02365, 0.0236, 0.02356, 0.02344, 0.02363, 0.02334, 0.0233, 0.02313, 0.02387, 0.02342, 0.02362, 0.02319, 0.02461, 0.02359, 0.0234, 0.02397, 0.02524, 0.02331, 0.02386, 0.02533, 0.02416, 0.02445, 0.02309, 0.02381, 0.02352, 0.02393, 0.02341, 0.02313, 0.02371, 0.02364, 0.02387, 0.02355, 0.02449, 0.02408, 0.02363, 0.02317, 0.02331, 0.0239, 0.02385, 0.0235, 0.02309, 0.0239, 0.02371, 0.0232, 0.0236, 0.0237, 0.0241, 0.02434, 0.02347, 0.02522, 0.02461, 0.02418, 0.02376, 0.02318, 0.02386, 0.02379, 0.02334, 0.02333, 0.02452, 0.02365, 0.02364, 0.02368, 0.02399, 0.02426, 0.02355, 0.02382, 0.02423, 0.02653, 0.02379, 0.02327, 0.02414, 0.02462, 0.02631, 0.02476, 0.02402, 0.02578, 0.02427, 0.02403, 0.02365, 0.02467, 0.02569, 0.02364, 0.02413, 0.02503, 0.02507, 0.02438, 0.02416, 0.02449, 0.02518, 0.02522, 0.02409, 0.02476, 0.02466, 0.02482, 0.02437, 0.02418, 0.0241, 0.02501, 0.02478, 0.02401, 0.02483, 0.02545, 0.02468, 0.02391, 0.02507, 0.02466, 0.02414, 0.02353, 0.0242, 0.02477, 0.02356, 0.02431, 0.02316, 0.02439, 0.02399, 0.02385, 0.02354, 0.02465, 0.02547, 0.02508, 0.02419, 0.02477, 0.01768, 0.02429, 0.02356, 0.02577, 0.02434, 0.02473, 0.02445, 0.02378, 0.02439, 0.02389, 0.02352, 0.02408, 0.02328, 
0.02452, 0.02367, 0.02386, 0.02413, 0.02431, 0.02462, 0.02369, 0.02376, 0.02491, 0.02439, 0.02403, 0.02377, 0.02464, 0.02435, 0.02348, 0.02371, 0.0252, 0.02368, 0.02387, 0.02399, 0.02427, 0.02729, 0.02472, 0.02405, 0.02401, 0.02437, 0.02492, 0.02402, 0.02449, 0.02457, 0.02418, 0.02405, 0.02463, 0.02494, 0.02411, 0.02427, 0.02434, 0.02507, 0.02381, 0.02365, 0.02529, 0.02396, 0.02466, 0.0235, 0.02361, 0.02374, 0.02465, 0.02472, 0.02388, 0.02377, 0.02493, 0.02356, 0.02375, 0.024, 0.02421, 0.02437, 0.02348, 0.02314, 0.02411, 0.02461, 0.02389, 0.0247, 0.02407, 0.0246, 0.02474, 0.02412, 0.02434, 0.02469, 0.02369, 0.02397, 0.02513, 0.02411, 0.02363, 0.02383, 0.02511, 0.02474, 0.02401, 0.02392, 0.0241, 0.02386, 0.02404, 0.02408, 0.02406, 0.02452, 0.02544, 0.02797, 0.0258, 0.02429, 0.02521, 0.02549, 0.02471, 0.02437, 0.02521, 0.02445, 0.0245, 0.0237, 0.02743, 0.02449, 0.02397, 0.02369, 0.02461, 0.02423, 0.02547, 0.02366, 0.02466, 0.02473, 0.02447, 0.02511, 0.02472, 0.02518, 0.02397, 0.02404, 0.02493, 0.02555, 0.02496, 0.02436, 0.02395, 0.02507, 0.02456, 0.0243, 0.02385, 0.02539, 0.02483, 0.02431, 0.02399, 0.02469, 0.0254, 0.02512, 0.03429, 0.0364, 0.03571, 0.03561, 0.03474, 0.02415, 0.02604, 0.02499, 0.02494, 0.0246, 0.02567, 0.02501, 0.02468, 0.02397, 0.02793, 0.02468, 0.02491, 0.02539, 0.02409, 0.02475, 0.02441, 0.02562, 0.02394, 0.02557, 0.02449, 0.02381, 0.02425, 0.02474, 0.02431, 0.02389, 0.02357, 0.02526, 0.0266, 0.02574, 0.02347, 0.02485, 0.02498, 0.02413, 0.02387, 0.02515, 0.02481, 0.02439, 0.02404, 0.02457, 0.02585, 0.02502, 0.02382, 0.02429, 0.02509, 0.02444, 0.02418, 0.02439, 0.02469, 0.0242, 0.0249, 0.02556, 0.0254, 0.02589, 0.02426]}, "forward-send-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [2.90859, 0.00013, 0.00013, 0.00012, 0.00012, 0.00011, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00013, 0.00012, 0.00012, 0.00014, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00011, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00041, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00013, 0.00011, 0.00013, 0.00011, 0.00012, 0.00011, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00011, 0.00012, 0.00012, 0.00011, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00011, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00011, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00014, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00013, 0.00012, 0.00013, 0.00012, 0.00013, 0.00012, 0.00012, 0.00013, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00013, 0.00012, 0.00014, 0.00012, 0.00013, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 
0.00011, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00013, 0.00013, 0.00012, 0.00012, 0.00013, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00013, 0.00014, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00013, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00014, 0.00012, 0.00014, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00013, 0.00013, 0.00013, 0.00012, 0.00013, 0.00013, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00013, 0.00012, 0.00014, 0.00012, 0.00011, 0.00012, 0.00013, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00013, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00013, 0.00013, 0.00014, 0.00017, 0.00016, 0.00012, 0.00017, 0.00011, 0.00012, 0.00012, 0.00012, 0.00014, 0.00013, 0.00012, 0.00013, 0.00012, 0.00013, 0.00013, 0.00014, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00011, 0.00013, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00013, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00011, 0.00012, 0.00012, 0.00013, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00011, 0.00012, 0.00013, 0.00013]}, "backward-recv-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.02368, 0.02348, 0.02394, 0.02364, 0.02449, 0.02409, 0.02505, 0.02374, 0.02528, 0.0259, 0.02358, 0.0242, 0.02637, 0.02354, 0.0251, 0.02307, 0.02342, 0.02386, 0.02487, 0.02353, 0.02241, 0.02358, 0.02336, 0.02385, 0.02423, 0.02362, 0.02431, 0.02368, 0.02447, 0.02388, 0.02278, 0.02395, 0.02289, 0.02372, 0.0236, 0.02367, 0.02368, 0.02432, 0.02399, 0.02338, 0.02355, 0.02343, 0.02344, 0.02565, 0.02464, 0.02367, 0.02563, 0.02365, 0.02498, 0.02382, 0.02437, 0.02419, 0.02505, 0.02388, 0.02389, 0.02396, 0.02377, 0.02399, 0.02396, 0.02304, 0.02377, 0.02724, 0.02399, 0.02408, 0.02416, 0.02465, 0.02583, 0.02394, 0.02408, 0.02617, 0.02288, 0.02529, 0.0259, 0.02468, 0.02405, 0.02424, 0.02366, 0.02431, 0.02501, 0.02416, 0.02392, 0.02398, 0.02395, 0.02361, 0.02493, 0.02419, 0.02355, 0.02345, 0.02429, 0.02305, 0.02433, 0.02418, 0.02434, 0.02361, 0.02432, 0.02418, 0.0234, 0.02415, 0.02349, 0.02463, 0.02416, 0.02344, 0.02561, 0.02358, 0.02435, 0.024, 0.02522, 0.02503, 0.02562, 0.02467, 0.02425, 0.02421, 0.02382, 0.0242, 0.02401, 0.02416, 0.02588, 0.0247, 0.02434, 0.02473, 0.02524, 0.02511, 0.02494, 0.02375, 0.02595, 0.02432, 0.02337, 0.02414, 0.02486, 0.0245, 0.02433, 0.02431, 0.02365, 0.02411, 0.02342, 0.02427, 0.02467, 0.02469, 0.02352, 0.02452, 0.02337, 0.02463, 0.02478, 0.02463, 0.02462, 0.02668, 0.02409, 0.02498, 0.02302, 0.02351, 0.02626, 0.02404, 0.02319, 0.02423, 0.02437, 0.02371, 0.02423, 0.02372, 0.02372, 0.02417, 0.02394, 0.02401, 0.02428, 0.02406, 0.02443, 0.02396, 0.02341, 0.02439, 0.02392, 0.02389, 0.02372, 0.02654, 0.02468, 0.02413, 0.02396, 0.02411, 0.02434, 0.02436, 0.02416, 0.02432, 0.02413, 0.02462, 0.0275, 0.02423, 0.02396, 0.027, 
0.02446, 0.02452, 0.025, 0.02481, 0.02389, 0.02952, 0.02408, 0.02468, 0.02725, 0.02317, 0.02402, 0.02623, 0.02326, 0.02418, 0.0249, 0.0242, 0.02443, 0.02409, 0.0256, 0.02406, 0.02355, 0.02409, 0.02372, 0.02539, 0.02507, 0.02461, 0.02483, 0.02426, 0.02423, 0.02431, 0.02427, 0.02447, 0.02382, 0.02564, 0.02441, 0.02556, 0.02403, 0.02573, 0.02428, 0.02401, 0.02513, 0.02382, 0.02364, 0.02454, 0.02477, 0.02397, 0.0253, 0.02422, 0.02361, 0.02617, 0.02493, 0.02542, 0.0241, 0.02392, 0.02412, 0.02369, 0.02392, 0.02434, 0.02381, 0.02437, 0.02629, 0.02397, 0.0244, 0.02457, 0.02396, 0.02392, 0.02359, 0.02513, 0.02438, 0.02434, 0.02525, 0.02462, 0.02406, 0.02675, 0.0243, 0.02493, 0.02442, 0.02465, 0.02474, 0.02404, 0.02508, 0.02549, 0.02338, 0.02287, 0.02444, 0.02513, 0.02493, 0.02474, 0.0248, 0.02431, 0.0245, 0.02863, 0.02409, 0.02427, 0.02391, 0.02367, 0.02441, 0.02399, 0.02425, 0.02368, 0.0241, 0.02393, 0.02417, 0.02474, 0.02369, 0.02638, 0.02436, 0.02611, 0.02434, 0.02576, 0.02383, 0.02442, 0.02353, 0.02419, 0.02477, 0.02466, 0.02579, 0.02455, 0.0242, 0.02475, 0.02338, 0.02403, 0.02538, 0.02364, 0.02364, 0.02423, 0.02324, 0.02408, 0.02434, 0.02456, 0.0243, 0.02403, 0.02448, 0.02338, 0.02413, 0.02447, 0.02323, 0.02365, 0.02506, 0.02554, 0.02565, 0.02416, 0.025, 0.02532, 0.02482, 0.02683, 0.02458, 0.02498, 0.02491, 0.02422, 0.0243, 0.02428, 0.02417, 0.02376, 0.02431, 0.02339, 0.02362, 0.02365, 0.02371, 0.02421, 0.02393, 0.02386, 0.02374, 0.0249, 0.02454, 0.02401, 0.02418, 0.02411, 0.02461, 0.02418, 0.02303, 0.02369, 0.02384, 0.02685, 0.02364, 0.02436, 0.02417, 0.02486, 0.02423, 0.02448, 0.02462, 0.02366, 0.02415, 0.02421, 0.0243, 0.02378, 0.02574, 0.02403, 0.02374, 0.02434, 0.02432, 0.02579, 0.02343, 0.02354, 0.02396, 0.02392, 0.02373, 0.02416, 0.02348, 0.02355, 0.02427, 0.0252, 0.02486, 0.02405, 0.02393, 0.0234, 0.02443, 0.02418, 0.02422, 0.02504, 0.02408, 0.0243, 0.02762, 0.02382]}, "backward-send-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.00016, 0.00016, 0.00019, 0.00016, 0.00017, 0.00018, 0.00016, 0.00016, 0.00019, 0.00016, 0.00017, 0.00016, 0.00016, 0.00016, 0.00018, 0.00018, 0.00016, 0.00016, 0.00019, 0.00016, 0.00018, 0.00019, 0.00018, 0.00015, 0.00016, 0.00016, 0.00016, 0.00016, 0.00016, 0.00016, 0.00016, 0.00017, 0.00018, 0.00016, 0.00016, 0.00016, 0.00016, 0.00017, 0.00018, 0.00019, 0.00018, 0.00018, 0.00018, 0.00017, 0.00018, 0.00016, 0.00016, 0.00016, 0.00017, 0.00016, 0.00017, 0.00019, 0.00016, 0.00017, 0.00017, 0.00016, 0.00019, 0.00018, 0.00018, 0.00016, 0.00018, 0.00016, 0.00017, 0.00016, 0.00018, 0.00016, 0.00019, 0.00018, 0.00018, 0.00016, 0.00018, 0.00016, 0.00016, 0.00016, 0.00018, 0.00016, 0.00017, 0.00016, 0.00018, 0.00018, 0.00018, 0.00017, 0.00017, 0.00017, 0.00018, 0.00016, 0.00018, 0.00018, 0.00019, 0.00016, 0.00018, 0.00016, 0.00016, 0.00016, 0.00016, 0.00016, 0.00019, 0.00016, 0.00018, 0.00016, 0.00017, 0.00016, 0.00018, 0.00016, 0.00018, 0.00016, 0.00017, 0.00032, 0.00018, 0.00016, 0.00016, 0.00016, 0.00018, 0.00018, 0.00018, 0.00019, 0.00016, 0.00016, 0.00016, 0.00016, 0.00018, 0.00019, 0.00016, 0.00016, 0.00016, 0.00017, 0.00019, 0.00018, 0.00016, 0.00018, 0.00018, 0.00016, 0.00018, 0.00017, 0.00016, 0.00016, 0.00018, 0.00016, 0.00018, 0.00017, 0.00016, 0.00017, 0.00025, 0.00016, 0.00016, 0.00018, 0.00016, 0.00016, 0.00016, 0.00018, 0.00016, 0.00016, 0.00016, 0.00016, 0.00018, 0.00017, 0.00019, 0.00016, 0.00016, 0.00018, 0.00018, 0.00016, 0.00016, 0.00016, 0.00016, 0.00016, 0.00016, 0.00031, 0.00016, 0.00016, 0.00025, 0.00018, 0.00016, 
0.00016, 0.00016, 0.00018, 0.00017, 0.00016, 0.00018, 0.00018, 0.00016, 0.00016, 0.00022, 0.00016, 0.00016, 0.00016, 0.00018, 0.00016, 0.00017, 0.00017, 0.00015, 0.00016, 0.00018, 0.00016, 0.00016, 0.00016, 0.00017, 0.00018, 0.00019, 0.00017, 0.00017, 0.00018, 0.00016, 0.00018, 0.00018, 0.00018, 0.00015, 0.00016, 0.00017, 0.00016, 0.00016, 0.00017, 0.00016, 0.00017, 0.00018, 0.00017, 0.00018, 0.00018, 0.00016, 0.00016, 0.00018, 0.00018, 0.00016, 0.00016, 0.00016, 0.00018, 0.00017, 0.00017, 0.00018, 0.00018, 0.00016, 0.00017, 0.00017, 0.00016, 0.00017, 0.00019, 0.00019, 0.00028, 0.00017, 0.00017, 0.00016, 0.00016, 0.00016, 0.00016, 0.00015, 0.00017, 0.00016, 0.00016, 0.00017, 0.00018, 0.00016, 0.00016, 0.00016, 0.00017, 0.00018, 0.0002, 0.00016, 0.00017, 0.00017, 0.00018, 0.00018, 0.00016, 0.00016, 0.00017, 0.00018, 0.00018, 0.00016, 0.00023, 0.00016, 0.00016, 0.00016, 0.00017, 0.00016, 0.00019, 0.00017, 0.00016, 0.00016, 0.00015, 0.00016, 0.00018, 0.00019, 0.00016, 0.00018, 0.00017, 0.00016, 0.00017, 0.00018, 0.00018, 0.00022, 0.00016, 0.00016, 0.0002, 0.00019, 0.00017, 0.00016, 0.00018, 0.00016, 0.00016, 0.00017, 0.00016, 0.00017, 0.00019, 0.00016, 0.00016, 0.00018, 0.00017, 0.00018, 0.00015, 0.00016, 0.00016, 0.00019, 0.00018, 0.00018, 0.00016, 0.00018, 0.00016, 0.00016, 0.00016, 0.00017, 0.00016, 0.00016, 0.00018, 0.00016, 0.00016, 0.00016, 0.00018, 0.00018, 0.00018, 0.00016, 0.00017, 0.00022, 0.00016, 0.00017, 0.00016, 0.00018, 0.00016, 0.00018, 0.00018, 0.00018, 0.00017, 0.00016, 0.00017, 0.00016, 0.00026, 0.00018, 0.00016, 0.00016, 0.00016, 0.00017, 0.00018, 0.00016, 0.00016, 0.00019, 0.00017, 0.00018, 0.00016, 0.00016, 0.00016, 0.00016, 0.00018, 0.00019, 0.00016, 0.00016, 0.00016, 0.00016, 0.00016, 0.00018, 0.00019, 0.00016, 0.00018, 0.00018, 0.00016, 0.00017, 0.00018, 0.00031, 0.00018, 0.00017, 0.00016, 0.00016, 0.00016, 0.00016, 0.00016, 0.00017, 0.00016, 0.00016, 0.00017, 0.00016, 0.00016, 0.00019, 0.00018, 0.00016, 0.00017, 0.00018, 0.00018, 0.00018, 0.00016, 0.00018, 0.00019, 0.00016, 0.00019]}, "forward-send-backward-recv-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [7.32739, 0.12477, 0.12666, 0.128, 0.12835, 0.12967, 0.1275, 0.13153, 0.12112, 0.12816, 0.12128, 0.1203, 0.12267, 0.122, 0.12207, 0.1236, 0.12689, 0.12116, 0.11515, 0.1236, 0.11731, 0.11801, 0.12855, 0.12095, 0.12421, 0.12165, 0.12224, 0.11784, 0.12171, 0.11872, 0.11626, 0.12467, 0.1241, 0.11907, 0.11776, 0.12636, 0.11891, 0.12432, 0.12301, 0.12655, 0.12996, 0.13374, 0.12156, 0.12801, 0.13689, 0.1275, 0.13219, 0.13231, 0.13041, 0.12833, 0.13716, 0.13099, 0.1317, 0.1252, 0.12341, 0.12286, 0.12995, 0.12336, 0.13226, 0.13381, 0.12738, 0.13598, 0.13071, 0.13531, 0.14271, 0.14199, 0.13871, 0.142, 0.14001, 0.14332, 0.13666, 0.13328, 0.14543, 0.14315, 0.13564, 0.15173, 0.14153, 0.15109, 0.14782, 0.14157, 0.14168, 0.14516, 0.13449, 0.13595, 0.13466, 0.13854, 0.13617, 0.13542, 0.13551, 0.13682, 0.13396, 0.13632, 0.12977, 0.13179, 0.13436, 0.12818, 0.1318, 0.15065, 0.14138, 0.14121, 0.12829, 0.1243, 0.12753, 0.13425, 0.13136, 0.13043, 0.12709, 0.1367, 0.13831, 0.13249, 0.13782, 0.13352, 0.13464, 0.12973, 0.1292, 0.13364, 0.13332, 0.13424, 0.12997, 0.13345, 0.12818, 0.13196, 0.13345, 0.13333, 0.13254, 0.13659, 0.13184, 0.13348, 0.12597, 0.13454, 0.13192, 0.1375, 0.13257, 0.12337, 0.1345, 0.13062, 0.13753, 0.13119, 0.13426, 0.13825, 0.13839, 0.13388, 0.13726, 0.12898, 0.13377, 0.13935, 0.1381, 0.13416, 0.13521, 0.13765, 0.1373, 0.13402, 0.12531, 0.13371, 0.14559, 0.13302, 0.12679, 0.13579, 0.1348, 
0.13764, 0.13247, 0.13464, 0.13235, 0.13117, 0.12868, 0.13327, 0.13496, 0.1324, 0.13728, 0.13904, 0.13275, 0.14304, 0.14323, 0.14887, 0.14315, 0.1468, 0.14026, 0.14574, 0.14975, 0.14342, 0.14555, 0.13943, 0.1403, 0.1444, 0.14205, 0.14177, 0.1462, 0.14686, 0.14634, 0.14245, 0.14549, 0.14618, 0.14887, 0.13512, 0.13541, 0.13381, 0.14182, 0.14007, 0.14152, 0.13605, 0.13807, 0.13717, 0.13509, 0.13546, 0.13698, 0.13358, 0.13623, 0.13205, 0.12316, 0.13181, 0.14145, 0.1317, 0.13396, 0.14106, 0.13611, 0.14089, 0.14373, 0.13469, 0.1384, 0.14246, 0.13291, 0.14068, 0.13738, 0.13421, 0.13749, 0.13088, 0.13458, 0.13609, 0.133, 0.14241, 0.13922, 0.13388, 0.14182, 0.13246, 0.13971, 0.14107, 0.13164, 0.13039, 0.13705, 0.12577, 0.13184, 0.13088, 0.13144, 0.13487, 0.13555, 0.12695, 0.23517, 0.1322, 0.13486, 0.16077, 0.13981, 0.23534, 0.13332, 0.13076, 0.13464, 0.12966, 0.13057, 0.13577, 0.13162, 0.12711, 0.13253, 0.13694, 0.13253, 0.1291, 0.13231, 0.13615, 0.13278, 0.13306, 0.13739, 0.13635, 0.12928, 0.12884, 0.13997, 0.13381, 0.13621, 0.14094, 0.1347, 0.13224, 0.13078, 0.1333, 0.14059, 0.13768, 0.13345, 0.1394, 0.13204, 0.13595, 0.14267, 0.13406, 0.13447, 0.13958, 0.13493, 0.13657, 0.13256, 0.13241, 0.14205, 0.13985, 0.13748, 0.14438, 0.14105, 0.13704, 0.14125, 0.13958, 0.1371, 0.13476, 0.13221, 0.14116, 0.1413, 0.13323, 0.13777, 0.13451, 0.13785, 0.13827, 0.13489, 0.13565, 0.13632, 0.14132, 0.13954, 0.13567, 0.13798, 0.1411, 0.13641, 0.1346, 0.13417, 0.13059, 0.14076, 0.14564, 0.14703, 0.14826, 0.14723, 0.14169, 0.14389, 0.14245, 0.14606, 0.1389, 0.14429, 0.14006, 0.13171, 0.13461, 0.13482, 0.14111, 0.13415, 0.14396, 0.15035, 0.14874, 0.1481, 0.14804, 0.13867, 0.14775, 0.13614, 0.13103, 0.13832, 0.13379, 0.15425, 0.1329, 0.22576, 0.13539, 0.12996, 0.16565, 0.12569, 0.12696, 0.12758, 0.13901, 0.13127, 0.13219, 0.13915, 0.13046, 0.12996, 0.1351, 0.13312, 0.13428, 0.13394, 0.13287, 0.13398, 0.13368, 0.12682, 0.13561, 0.13323, 0.1307, 0.13416, 0.13272, 0.13142, 0.136, 0.13057, 0.13073, 0.13345, 0.13692, 0.13433, 0.13536, 0.13216, 0.13483, 0.13431, 0.13132, 0.13241, 0.13481, 0.13004, 0.13405, 0.12911, 0.13104, 0.13208, 0.13389]}, "backward-send-forward-recv-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [2.85465, 0.00835, 0.00699, 0.00741, 0.00706, 0.00797, 0.0072, 0.00701, 0.00796, 0.0097, 0.00702, 0.00774, 0.00734, 0.00774, 0.0089, 0.00828, 0.00699, 0.00781, 0.00859, 0.00782, 0.00885, 0.00849, 0.00699, 0.00689, 0.00726, 0.00698, 0.00708, 0.00765, 0.00904, 0.00754, 0.00764, 0.00719, 0.00699, 0.00717, 0.00867, 0.00723, 0.00713, 0.00719, 0.00696, 0.00695, 0.0071, 0.00724, 0.00738, 0.00696, 0.00708, 0.00738, 0.00771, 0.00745, 0.00704, 0.00878, 0.00742, 0.00713, 0.00774, 0.00714, 0.00691, 0.01011, 0.00831, 0.00755, 0.00829, 0.00713, 0.00712, 0.00776, 0.00714, 0.00703, 0.00812, 0.00754, 0.00844, 0.00686, 0.00703, 0.00718, 0.00709, 0.00784, 0.00743, 0.00744, 0.00705, 0.00773, 0.0077, 0.00752, 0.00823, 0.00721, 0.00697, 0.00777, 0.00754, 0.00704, 0.00687, 0.00767, 0.00697, 0.00724, 0.0081, 0.0081, 0.00692, 0.00799, 0.00739, 0.00705, 0.00849, 0.00694, 0.00742, 0.00767, 0.00711, 0.00824, 0.00696, 0.00742, 0.00848, 0.00758, 0.00786, 0.00691, 0.00711, 0.00709, 0.00692, 0.00764, 0.00779, 0.00699, 0.00727, 0.00768, 0.007, 0.0078, 0.00701, 0.00735, 0.00759, 0.00875, 0.00792, 0.00727, 0.00737, 0.00715, 0.00787, 0.00741, 0.00751, 0.00855, 0.00692, 0.00786, 0.00751, 0.00811, 0.00715, 0.00699, 0.00709, 0.00705, 0.00737, 0.0082, 0.00828, 0.00883, 0.00777, 0.00806, 0.00752, 0.0074, 0.00758, 0.00764, 
0.00798, 0.00876, 0.0073, 0.00773, 0.00824, 0.00728, 0.00773, 0.00775, 0.00706, 0.00716, 0.00698, 0.00735, 0.00857, 0.00716, 0.00715, 0.00888, 0.00742, 0.00709, 0.00773, 0.00707, 0.00785, 0.00751, 0.00723, 0.00781, 0.00732, 0.00731, 0.00751, 0.00926, 0.00734, 0.00835, 0.00815, 0.00834, 0.00863, 0.00698, 0.00697, 0.00866, 0.00749, 0.00697, 0.00797, 0.00761, 0.00705, 0.00898, 0.00815, 0.00711, 0.00733, 0.00846, 0.00756, 0.00807, 0.00707, 0.00876, 0.00728, 0.00798, 0.00766, 0.00737, 0.00998, 0.00838, 0.0077, 0.00751, 0.00848, 0.00695, 0.00705, 0.00981, 0.00734, 0.00923, 0.0071, 0.00714, 0.00728, 0.00728, 0.0085, 0.00981, 0.00871, 0.00696, 0.00863, 0.00936, 0.01089, 0.00793, 0.00711, 0.00971, 0.00701, 0.00936, 0.00758, 0.00816, 0.00884, 0.00803, 0.00847, 0.01006, 0.00978, 0.00825, 0.0081, 0.00787, 0.00813, 0.00997, 0.00754, 0.00893, 0.00765, 0.00713, 0.0078, 0.0076, 0.00705, 0.00918, 0.11069, 0.00794, 0.00727, 0.07524, 0.00865, 0.00813, 0.007, 0.00696, 0.0071, 0.00698, 0.00706, 0.00709, 0.00901, 0.00738, 0.00798, 0.00783, 0.00755, 0.00757, 0.00792, 0.0078, 0.00758, 0.00842, 0.00991, 0.00945, 0.00712, 0.00835, 0.00735, 0.00734, 0.00709, 0.00708, 0.00953, 0.00709, 0.00704, 0.00922, 0.00937, 0.00856, 0.00712, 0.00846, 0.01121, 0.00908, 0.00701, 0.01037, 0.00813, 0.00814, 0.00709, 0.00791, 0.0074, 0.00756, 0.00813, 0.00849, 0.00705, 0.00877, 0.00705, 0.00702, 0.00784, 0.00699, 0.00862, 0.00977, 0.0078, 0.00851, 0.00917, 0.00814, 0.00962, 0.0071, 0.00832, 0.01014, 0.00711, 0.00716, 0.00781, 0.00825, 0.01002, 0.00758, 0.00695, 0.01037, 0.00713, 0.0097, 0.00977, 0.00754, 0.00863, 0.00703, 0.00781, 0.00826, 0.00731, 0.00742, 0.00778, 0.00814, 0.00835, 0.00713, 0.00837, 0.0071, 0.00718, 0.00856, 0.00694, 0.00858, 0.00741, 0.00763, 0.00727, 0.00894, 0.00892, 0.0078, 0.00875, 0.00972, 0.00704, 0.00701, 0.00812, 0.00733, 0.0694, 0.00715, 0.09935, 0.00722, 0.00697, 0.0823, 0.00708, 0.00762, 0.00706, 0.00717, 0.00712, 0.0071, 0.00708, 0.00694, 0.00712, 0.00717, 0.00703, 0.00723, 0.00767, 0.007, 0.00705, 0.00716, 0.00837, 0.00992, 0.00743, 0.0076, 0.00795, 0.00785, 0.00774, 0.00828, 0.00864, 0.00714, 0.00767, 0.00727, 0.0089, 0.00821, 0.00781, 0.00855, 0.00777, 0.00721, 0.00716, 0.00875, 0.00792, 0.00919, 0.00807, 0.00884, 0.00881, 0.0088]}, "layernorm-grads-all-reduce-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [4e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 2e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 2e-05, 2e-05, 2e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 2e-05, 2e-05, 1e-05, 2e-05, 1e-05, 1e-05, 
1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 3e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 2e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 2e-05, 2e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 2e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 2e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05]}, "embedding-grads-all-reduce-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.00055, 0.00031, 0.00031, 0.00031, 0.00035, 0.00031, 0.00031, 0.00031, 0.00032, 0.00032, 0.00032, 0.00031, 0.00032, 0.00031, 0.00031, 0.00031, 0.00034, 0.00031, 0.00031, 0.00031, 0.00036, 0.00031, 0.00031, 0.00031, 0.00035, 0.00032, 0.00035, 0.00032, 0.00031, 0.00034, 0.00036, 0.00032, 0.00033, 0.00033, 0.00032, 0.00032, 0.00036, 0.00036, 0.00036, 0.00036, 0.00031, 0.00034, 0.00036, 0.00031, 0.00032, 0.00032, 0.00031, 0.00032, 0.00032, 0.00032, 0.00033, 0.00032, 0.00032, 0.00031, 0.00032, 0.00032, 0.00032, 0.00032, 0.00036, 0.00032, 0.00031, 0.00032, 0.00036, 0.00032, 0.00032, 0.00036, 0.00036, 0.00031, 0.00032, 0.00031, 0.00032, 0.00031, 0.00032, 0.00031, 0.00035, 0.00032, 0.00032, 0.00031, 0.00033, 0.00032, 0.00032, 0.00031, 0.00031, 0.00036, 0.00032, 0.00031, 0.00032, 0.00033, 0.00036, 0.00031, 0.00037, 0.00032, 0.00035, 0.00032, 0.00031, 0.00035, 0.00036, 0.00032, 0.00031, 0.00032, 0.00036, 0.00031, 0.00032, 0.00036, 0.00031, 0.00034, 0.00031, 0.00032, 0.00032, 0.00031, 0.00036, 0.00032, 0.00036, 0.00031, 0.00037, 0.00032, 0.00037, 0.0004, 0.00031, 0.00032, 0.00035, 0.00031, 0.00032, 0.00031, 0.00031, 0.00031, 0.00032, 0.00031, 0.00033, 0.00032, 0.00031, 0.00032, 0.00032, 0.00032, 0.00032, 0.00031, 0.00036, 0.00031, 0.00031, 0.00033, 0.00036, 0.00031, 0.00032, 0.00032, 0.00032, 0.00036, 0.00031, 0.00035, 0.00032, 0.00039, 0.00033, 0.00032, 0.00031, 0.00035, 0.00032, 0.00031, 0.00032, 0.00035, 0.00031, 0.00032, 0.00031, 0.00032, 0.00031, 0.00031, 0.00034, 0.00036, 0.00036, 0.00031, 0.00032, 0.00032, 0.00031, 0.00035, 0.00036, 0.00032, 0.00031, 0.00032, 0.00032, 0.00036, 0.00033, 0.00035, 0.00031, 0.00031, 0.00031, 0.00032, 0.00036, 0.00037, 0.00031, 0.00032, 0.00031, 0.00032, 0.00031, 0.00032, 0.00032, 0.00036, 0.00037, 0.00031, 0.00032, 0.00033, 0.00032, 0.00032, 0.00045, 0.00031, 0.00031, 0.00038, 
0.00032, 0.00036, 0.00034, 0.00031, 0.00032, 0.00036, 0.00032, 0.00031, 0.00036, 0.00031, 0.00031, 0.00031, 0.00036, 0.00031, 0.00032, 0.00032, 0.0004, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00031, 0.00037, 0.00031, 0.00031, 0.00032, 0.00031, 0.00032, 0.00032, 0.00036, 0.00032, 0.00035, 0.00032, 0.00036, 0.00038, 0.00036, 0.00036, 0.00032, 0.00036, 0.00033, 0.00032, 0.00032, 0.00031, 0.00036, 0.00031, 0.00033, 0.00033, 0.00032, 0.00037, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00031, 0.00032, 0.00031, 0.00032, 0.00032, 0.00032, 0.00031, 0.00032, 0.00032, 0.00032, 0.00037, 0.00032, 0.00031, 0.00032, 0.00032, 0.00036, 0.00032, 0.00033, 0.00031, 0.00032, 0.00031, 0.00032, 0.00031, 0.00032, 0.00032, 0.00032, 0.00033, 0.00032, 0.00036, 0.00032, 0.00032, 0.00037, 0.00031, 0.00031, 0.00031, 0.00032, 0.00032, 0.00032, 0.00031, 0.00031, 0.00037, 0.00035, 0.00036, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00031, 0.00036, 0.00032, 0.00031, 0.00032, 0.00036, 0.00032, 0.00032, 0.00032, 0.00036, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00031, 0.00032, 0.00033, 0.00038, 0.00034, 0.00036, 0.00032, 0.00033, 0.00032, 0.00032, 0.00035, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00035, 0.00032, 0.00032, 0.00031, 0.00032, 0.00036, 0.00036, 0.00032, 0.00032, 0.00032, 0.00036, 0.00032, 0.00032, 0.00031, 0.00036, 0.00032, 0.00036, 0.00033, 0.00031, 0.00031, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00036, 0.00035, 0.00031, 0.00032, 0.00036, 0.00032, 0.00033, 0.00036, 0.00032, 0.00032, 0.00032, 0.00031, 0.00032, 0.00033, 0.00032, 0.00031, 0.00032, 0.00035, 0.00032, 0.00032, 0.00035, 0.00032, 0.00035, 0.00032, 0.00037, 0.00032, 0.00031, 0.00037, 0.00032, 0.00035, 0.00031, 0.00036, 0.00032]}, "all-grads-sync-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [2.11402, 0.00057, 0.00063, 0.00057, 0.00058, 0.00057, 0.00058, 0.00058, 0.00057, 0.00063, 0.00057, 0.00058, 0.00058, 0.00057, 0.00057, 0.00058, 0.00059, 0.00058, 0.00058, 0.00059, 0.00058, 0.00058, 0.00058, 0.00066, 0.00058, 0.00058, 0.00058, 0.00058, 0.00058, 0.00058, 0.00058, 0.00058, 0.00058, 0.00059, 0.00059, 0.00058, 0.00059, 0.00059, 0.00058, 0.00059, 0.0006, 0.00059, 0.00059, 0.00063, 0.00059, 0.00058, 0.00058, 0.00059, 0.00063, 0.00058, 0.00059, 0.00059, 0.00058, 0.00059, 0.00058, 0.00059, 0.00059, 0.00058, 0.00059, 0.0006, 0.00058, 0.00058, 0.00058, 0.00057, 0.0007, 0.00059, 0.00064, 0.00058, 0.00059, 0.00058, 0.00059, 0.00058, 0.00059, 0.00059, 0.00059, 0.00058, 0.00058, 0.00059, 0.00058, 0.00059, 0.00061, 0.00058, 0.00064, 0.00058, 0.00059, 0.00059, 0.00059, 0.00064, 0.00058, 0.00058, 0.00059, 0.00059, 0.00058, 0.00058, 0.00059, 0.00059, 0.00059, 0.00057, 0.00059, 0.0006, 0.00058, 0.00059, 0.00058, 0.00059, 0.00059, 0.00059, 0.00058, 0.00065, 0.00058, 0.00059, 0.00058, 0.00064, 0.00059, 0.00059, 0.00059, 0.00062, 0.00059, 0.00064, 0.00059, 0.00059, 0.00059, 0.0006, 0.00058, 0.00058, 0.00059, 0.00059, 0.00059, 0.00059, 0.00059, 0.00058, 0.00059, 0.00059, 0.00058, 0.00064, 0.00065, 0.00059, 0.00058, 0.00059, 0.00058, 0.00059, 0.00059, 0.00059, 0.00059, 0.00059, 0.00059, 0.00058, 0.00058, 0.00061, 0.0006, 0.00067, 0.00058, 0.00059, 0.00058, 0.00059, 0.00059, 0.00058, 0.00059, 0.00057, 0.00059, 0.00059, 0.00061, 0.00059, 0.0006, 0.00064, 0.00058, 0.00059, 0.00059, 0.00058, 0.00058, 0.00058, 0.00058, 0.00059, 0.0006, 0.00058, 0.00059, 0.0006, 0.00059, 0.00059, 0.00057, 0.00058, 0.00058, 0.00058, 0.0006, 0.0006, 0.00058, 0.00059, 0.00058, 0.00059, 0.00058, 0.00059, 
0.00059, 0.00059, 0.00064, 0.00059, 0.00059, 0.00058, 0.00059, 0.00058, 0.00059, 0.00059, 0.00059, 0.00062, 0.00058, 0.00059, 0.00059, 0.00059, 0.00058, 0.00063, 0.00058, 0.00058, 0.00059, 0.00058, 0.00058, 0.00064, 0.0006, 0.00059, 0.00058, 0.00059, 0.00058, 0.00058, 0.00059, 0.00058, 0.0006, 0.00064, 0.00058, 0.00058, 0.0006, 0.0006, 0.00057, 0.00058, 0.00059, 0.00059, 0.00059, 0.00062, 0.00059, 0.00058, 0.00059, 0.00059, 0.00059, 0.00059, 0.00059, 0.0006, 0.00059, 0.00059, 0.00058, 0.00058, 0.00064, 0.00059, 0.00064, 0.00058, 0.00058, 0.00059, 0.00059, 0.00058, 0.00058, 0.00058, 0.00058, 0.00065, 0.0006, 0.00057, 0.00059, 0.00059, 0.00058, 0.00058, 0.00058, 0.0006, 0.00058, 0.00058, 0.00059, 0.00059, 0.00057, 0.00058, 0.00057, 0.00064, 0.00057, 0.00058, 0.00068, 0.00058, 0.00058, 0.00058, 0.00059, 0.00059, 0.00059, 0.00058, 0.00058, 0.00057, 0.00058, 0.00058, 0.00058, 0.00058, 0.00059, 0.00058, 0.00057, 0.00059, 0.00062, 0.00058, 0.00059, 0.00058, 0.00059, 0.00058, 0.00058, 0.00058, 0.00058, 0.00058, 0.00058, 0.0006, 0.00058, 0.00059, 0.00058, 0.00058, 0.00058, 0.00058, 0.00058, 0.00058, 0.00059, 0.00059, 0.00058, 0.00057, 0.00059, 0.00058, 0.00059, 0.00058, 0.00058, 0.00057, 0.00059, 0.00058, 0.00059, 0.00059, 0.0006, 0.00058, 0.00058, 0.00059, 0.00058, 0.00071, 0.00058, 0.00064, 0.00059, 0.00059, 0.00059, 0.00058, 0.00058, 0.00058, 0.00058, 0.00059, 0.00058, 0.00058, 0.00058, 0.00058, 0.00058, 0.00058, 0.00058, 0.00059, 0.00058, 0.00059, 0.00058, 0.00063, 0.00059, 0.00058, 0.00058, 0.00057, 0.00058, 0.00059, 0.00058, 0.00058, 0.00058, 0.00059, 0.00058, 0.00058, 0.00065, 0.00058, 0.00059, 0.00058, 0.00058, 0.00058, 0.00059, 0.00058, 0.00059, 0.00058, 0.00058, 0.00059, 0.00058, 0.00059, 0.00059, 0.00059, 0.00057, 0.00058, 0.00058, 0.00059, 0.00059, 0.00069, 0.00058, 0.0006, 0.00058, 0.00058, 0.00057, 0.00058, 0.00057, 0.00059, 0.00058, 0.00058]}, "optimizer-copy-to-main-grad-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.00021, 0.00012, 0.00013, 0.00013, 0.00013, 0.00013, 0.00013, 0.00013, 0.00013, 0.00014, 0.00013, 0.00013, 0.00013, 0.00013, 0.00013, 0.00013, 0.00013, 0.00013, 0.00012, 0.00013, 0.00013, 0.00013, 0.00013, 0.00013, 0.00013, 0.00014, 0.00013, 0.00013, 0.00013, 0.00013, 0.00013, 0.00013, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00013, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00013, 0.00014, 0.00014, 0.00014, 0.00013, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00013, 0.00014, 0.00013, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00015, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00013, 0.00014, 0.00014, 0.00013, 0.00014, 0.00014, 0.00015, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00013, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00015, 0.00014, 0.00014, 0.00014, 0.00015, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 
0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00016, 0.00014, 0.00014, 0.00014, 0.0002, 0.00014, 0.00014, 0.00014, 0.00015, 0.00014, 0.00015, 0.00014, 0.00014, 0.00013, 0.00014, 0.00014, 0.00014, 0.00013, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00015, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00013, 0.00013, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00013, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00015, 0.00014, 0.00014, 0.00014, 0.00014, 0.00015, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00015, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00013, 0.00014, 0.00013, 0.00013, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00013, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00015, 0.00014, 0.00014, 0.00014, 0.00014, 0.00013, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00013, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00015, 0.00014, 0.00015, 0.00014, 0.00015, 0.00014, 0.00014, 0.00014, 0.00014, 0.00013, 0.00014, 0.00014, 0.00013, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00015, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00015, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014]}, "optimizer-clip-main-grad-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [4.22691, 0.00055, 0.00056, 0.00056, 0.00056, 0.00057, 0.00056, 0.00056, 0.00055, 0.00056, 0.00056, 0.00056, 0.00056, 0.00055, 0.00057, 0.00057, 0.00056, 0.00056, 0.00054, 0.00056, 0.00056, 0.00055, 0.00055, 0.00056, 0.00056, 0.00055, 0.00061, 0.00058, 0.00058, 0.00056, 0.00056, 0.00056, 0.00057, 0.00061, 0.00059, 0.00057, 0.00058, 0.00056, 0.00058, 0.00058, 0.00058, 0.00058, 0.00059, 0.00056, 0.00058, 0.00058, 0.00059, 0.00057, 0.00059, 0.00057, 0.00058, 0.00058, 0.00059, 0.00058, 0.00059, 0.00058, 0.0006, 0.00057, 0.00058, 0.00058, 0.00056, 0.00058, 0.00059, 0.00059, 0.00058, 0.00059, 0.00058, 0.00057, 0.0006, 0.00061, 0.00058, 0.00059, 0.00058, 0.00057, 0.00059, 0.00058, 0.00058, 0.00058, 0.00058, 0.00059, 0.00058, 0.00056, 0.00057, 0.00058, 0.00059, 0.00058, 0.00057, 0.00057, 0.00058, 0.00057, 0.00058, 0.00058, 0.00056, 0.00057, 0.00049, 0.00057, 0.00057, 0.00057, 0.00048, 0.00057, 0.00058, 0.00059, 0.00058, 0.00058, 0.00058, 0.00058, 0.00058, 0.00058, 0.00058, 0.00059, 0.00058, 0.00048, 0.00048, 0.0005, 0.00058, 0.0006, 0.00058, 0.00058, 0.00059, 0.00056, 0.00058, 0.00058, 0.00058, 0.00059, 0.00057, 0.00058, 0.00057, 0.00058, 0.00057, 0.00073, 0.00058, 0.00058, 0.00057, 0.00058, 0.00058, 0.00059, 0.00058, 0.00058, 0.00046, 0.00058, 0.00057, 0.00059, 0.00058, 0.00057, 0.00048, 0.00058, 0.00058, 0.00059, 0.00059, 0.00059, 0.00058, 0.00057, 0.00059, 0.00058, 0.00057, 0.00057, 0.00058, 0.00056, 
0.00058, 0.00058, 0.00058, 0.00057, 0.00047, 0.00047, 0.00067, 0.00057, 0.00058, 0.00059, 0.00057, 0.00058, 0.00066, 0.00058, 0.00058, 0.00059, 0.00048, 0.00059, 0.00059, 0.00059, 0.00057, 0.00062, 0.00058, 0.00057, 0.00057, 0.00057, 0.00058, 0.0006, 0.00057, 0.00057, 0.00058, 0.00058, 0.00057, 0.00058, 0.00059, 0.00058, 0.00059, 0.00058, 0.0006, 0.00058, 0.00058, 0.00058, 0.00064, 0.00057, 0.00058, 0.00059, 0.00059, 0.00058, 0.00059, 0.00059, 0.00058, 0.00057, 0.00057, 0.0006, 0.00058, 0.00057, 0.00058, 0.00059, 0.00059, 0.0006, 0.00059, 0.00059, 0.00059, 0.00059, 0.00059, 0.00058, 0.0006, 0.00058, 0.00061, 0.00059, 0.00057, 0.00056, 0.00058, 0.00059, 0.00059, 0.00059, 0.00059, 0.00058, 0.00059, 0.00063, 0.0006, 0.00059, 0.00058, 0.00059, 0.00058, 0.00058, 0.00058, 0.00058, 0.00061, 0.00059, 0.0006, 0.00058, 0.0006, 0.0006, 0.00059, 0.00059, 0.00059, 0.00058, 0.00058, 0.00059, 0.00058, 0.00057, 0.0006, 0.0006, 0.00059, 0.00059, 0.00059, 0.0006, 0.00059, 0.0006, 0.00059, 0.00058, 0.00058, 0.00057, 0.00058, 0.00061, 0.00058, 0.00061, 0.00058, 0.00058, 0.00057, 0.00057, 0.00059, 0.00058, 0.00057, 0.00058, 0.00059, 0.00058, 0.00059, 0.00059, 0.00058, 0.0006, 0.00058, 0.0006, 0.00057, 0.0006, 0.00059, 0.00059, 0.00059, 0.0006, 0.0006, 0.00059, 0.00058, 0.0006, 0.00058, 0.0006, 0.0006, 0.00061, 0.00058, 0.00058, 0.00059, 0.00058, 0.00058, 0.00059, 0.00058, 0.00061, 0.00062, 0.00062, 0.00058, 0.00057, 0.00058, 0.0006, 0.00058, 0.00058, 0.00058, 0.00058, 0.00058, 0.00058, 0.00059, 0.00059, 0.00059, 0.00059, 0.00059, 0.00058, 0.00058, 0.00058, 0.00059, 0.00059, 0.00059, 0.00058, 0.00059, 0.00058, 0.00058, 0.00063, 0.0006, 0.00059, 0.00062, 0.00058, 0.00059, 0.00058, 0.00059, 0.00059, 0.00058, 0.00058, 0.00058, 0.00063, 0.00059, 0.00056, 0.00058, 0.00058, 0.00056, 0.00057, 0.00059, 0.00059, 0.00059, 0.00059, 0.00058, 0.00059, 0.00058, 0.00058, 0.00059, 0.00058, 0.00058, 0.00058, 0.0006, 0.00059, 0.00058, 0.00058, 0.00058, 0.00059, 0.0006, 0.00058, 0.0006, 0.00058, 0.00059, 0.00058, 0.00057, 0.00057, 0.0006, 0.00064, 0.00059, 0.00061, 0.00058, 0.00058, 0.0006, 0.00058, 0.0006, 0.00067, 0.00057, 0.00058, 0.0006, 0.00059]}, "optimizer-count-zeros-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.00354, 0.00262, 0.00261, 0.00266, 0.0026, 0.0026, 0.0026, 0.00261, 0.00259, 0.00259, 0.00261, 0.00261, 0.00261, 0.00262, 0.00262, 0.0026, 0.0026, 0.00258, 0.00264, 0.00259, 0.00269, 0.00267, 0.00262, 0.00291, 0.00262, 0.00271, 0.00259, 0.00259, 0.0026, 0.00261, 0.00261, 0.0026, 0.0026, 0.00257, 0.00262, 0.00261, 0.00262, 0.00265, 0.0026, 0.00261, 0.00261, 0.00259, 0.0026, 0.00265, 0.00262, 0.00261, 0.00265, 0.00258, 0.0026, 0.00263, 0.00261, 0.0026, 0.0026, 0.00258, 0.00258, 0.0026, 0.00261, 0.0026, 0.00261, 0.00261, 0.00263, 0.00259, 0.00262, 0.0026, 0.00261, 0.00258, 0.00261, 0.0026, 0.00267, 0.00261, 0.00258, 0.00265, 0.00259, 0.00261, 0.00258, 0.00258, 0.00261, 0.00261, 0.00261, 0.00259, 0.00258, 0.00262, 0.00261, 0.00261, 0.00261, 0.00259, 0.00262, 0.0026, 0.0026, 0.00259, 0.0026, 0.00261, 0.0026, 0.00261, 0.0026, 0.00272, 0.00259, 0.00262, 0.00257, 0.0026, 0.00261, 0.00259, 0.00263, 0.00259, 0.00261, 0.00261, 0.00267, 0.00258, 0.0026, 0.00259, 0.00262, 0.00259, 0.00259, 0.00481, 0.00261, 0.00259, 0.00263, 0.0029, 0.00259, 0.00261, 0.00263, 0.0026, 0.0026, 0.00261, 0.00261, 0.00262, 0.00261, 0.00259, 0.0026, 0.00308, 0.00357, 0.00364, 0.0026, 0.00259, 0.00266, 0.00258, 0.0026, 0.00264, 0.00261, 0.0026, 0.0026, 0.0026, 0.00261, 0.00261, 0.0026, 0.00258, 0.00262, 0.00262, 0.00264, 
0.00258, 0.00262, 0.0026, 0.00259, 0.00268, 0.0026, 0.00263, 0.00257, 0.0026, 0.00259, 0.00262, 0.00262, 0.00261, 0.00261, 0.00261, 0.0026, 0.0026, 0.00261, 0.0026, 0.00266, 0.00266, 0.00264, 0.0027, 0.00268, 0.00266, 0.00266, 0.00267, 0.00263, 0.00266, 0.00264, 0.00459, 0.00266, 0.00266, 0.00267, 0.00266, 0.00265, 0.00269, 0.00266, 0.00267, 0.00272, 0.00267, 0.00265, 0.00272, 0.00266, 0.00266, 0.0027, 0.00266, 0.00265, 0.00269, 0.00265, 0.00265, 0.00265, 0.00268, 0.00265, 0.00266, 0.00266, 0.00267, 0.00266, 0.00265, 0.00267, 0.00266, 0.0027, 0.00266, 0.00264, 0.00266, 0.00264, 0.00266, 0.00265, 0.00265, 0.00266, 0.00268, 0.00268, 0.00266, 0.00266, 0.00266, 0.00264, 0.00265, 0.00269, 0.00267, 0.00267, 0.00269, 0.00266, 0.00266, 0.00266, 0.00266, 0.00265, 0.00268, 0.0027, 0.00351, 0.00265, 0.00266, 0.00267, 0.00267, 0.00265, 0.00267, 0.00265, 0.00267, 0.00266, 0.00266, 0.00275, 0.00266, 0.00264, 0.00265, 0.00266, 0.0027, 0.00287, 0.00267, 0.00306, 0.00267, 0.00265, 0.00268, 0.00266, 0.00266, 0.00265, 0.00265, 0.00265, 0.00266, 0.00271, 0.00266, 0.00266, 0.00267, 0.00267, 0.00273, 0.00267, 0.00267, 0.00264, 0.00267, 0.00266, 0.00264, 0.00267, 0.00267, 0.00266, 0.00267, 0.00266, 0.00263, 0.00266, 0.00268, 0.00265, 0.00266, 0.00266, 0.00267, 0.00267, 0.00265, 0.00268, 0.00266, 0.00267, 0.00272, 0.00264, 0.00266, 0.00266, 0.00265, 0.00277, 0.00266, 0.00269, 0.00264, 0.00265, 0.00266, 0.00259, 0.00259, 0.0026, 0.00261, 0.0026, 0.00262, 0.0026, 0.00261, 0.00261, 0.00261, 0.00261, 0.00272, 0.00262, 0.00323, 0.0026, 0.00261, 0.00262, 0.00269, 0.00259, 0.00261, 0.00261, 0.00261, 0.00261, 0.0026, 0.00259, 0.00258, 0.0026, 0.00262, 0.00261, 0.00261, 0.00262, 0.0026, 0.0026, 0.00264, 0.00259, 0.00285, 0.0026, 0.00259, 0.00259, 0.0026, 0.00258, 0.00261, 0.00261, 0.00259, 0.0026, 0.00261, 0.0026, 0.00273, 0.0026, 0.00258, 0.00261, 0.0026, 0.00259, 0.0026, 0.00259, 0.00259, 0.00261, 0.00266, 0.00266, 0.00265, 0.00269, 0.00269, 0.00266, 0.00266, 0.00266, 0.00264, 0.00266, 0.00267, 0.00265, 0.00273, 0.00265, 0.00265, 0.0027, 0.00266, 0.00274, 0.00267, 0.00267, 0.00267, 0.00266, 0.00266, 0.00266, 0.00299, 0.00266, 0.00268, 0.00265, 0.00267, 0.00265, 0.00268, 0.00265, 0.00266, 0.00267, 0.00267, 0.00271, 0.00267]}, "optimizer-inner-step-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.00249, 0.00044, 0.00044, 0.00045, 0.00045, 0.00044, 0.00045, 0.00045, 0.00045, 0.00044, 0.00045, 0.00045, 0.00044, 0.00044, 0.00044, 0.00044, 0.00045, 0.00044, 0.00044, 0.00045, 0.00044, 0.00044, 0.00045, 0.00046, 0.00044, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00044, 0.00045, 0.00045, 0.00047, 0.00045, 0.00046, 0.00046, 0.00045, 0.00046, 0.00046, 0.00046, 0.00046, 0.00046, 0.00045, 0.00046, 0.00046, 0.00046, 0.00045, 0.00046, 0.00046, 0.00046, 0.00046, 0.00046, 0.00046, 0.00046, 0.00046, 0.00046, 0.00045, 0.00046, 0.00046, 0.00045, 0.00046, 0.00046, 0.00046, 0.00046, 0.00045, 0.00046, 0.00045, 0.00045, 0.00046, 0.00046, 0.00046, 0.00046, 0.00046, 0.00047, 0.00047, 0.00046, 0.00046, 0.00046, 0.00046, 0.00046, 0.00045, 0.00046, 0.00045, 0.00045, 0.00047, 0.00046, 0.00046, 0.00045, 0.00045, 0.00046, 0.00046, 0.00045, 0.00045, 0.00046, 0.00046, 0.00046, 0.00046, 0.00047, 0.00046, 0.00046, 0.00046, 0.00046, 0.00046, 0.00046, 0.00046, 0.00047, 0.00046, 0.00046, 0.00046, 0.00046, 0.00047, 0.00048, 0.00056, 0.00046, 0.00046, 0.00046, 0.00046, 0.00046, 0.00046, 0.00047, 0.00046, 0.00046, 0.00046, 0.00047, 0.00046, 0.00046, 0.00047, 0.00046, 0.00049, 0.00051, 0.00046, 0.00045, 0.00046, 0.00046, 0.00046, 
0.00047, 0.00047, 0.00045, 0.00047, 0.00045, 0.00046, 0.00046, 0.00045, 0.00046, 0.00046, 0.00046, 0.00046, 0.00046, 0.00046, 0.00047, 0.00047, 0.00047, 0.00046, 0.00047, 0.00046, 0.00046, 0.00045, 0.00046, 0.00046, 0.00046, 0.00046, 0.00046, 0.00046, 0.00047, 0.00047, 0.00046, 0.00046, 0.00046, 0.00046, 0.00047, 0.00046, 0.00046, 0.00046, 0.00047, 0.00049, 0.00048, 0.00046, 0.00045, 0.00046, 0.00046, 0.00045, 0.00046, 0.00045, 0.00045, 0.00047, 0.00045, 0.00045, 0.00046, 0.00046, 0.00045, 0.00047, 0.00046, 0.00047, 0.00046, 0.00046, 0.00047, 0.00046, 0.00046, 0.00046, 0.00047, 0.00045, 0.00046, 0.00046, 0.00046, 0.00046, 0.00046, 0.00046, 0.00046, 0.00045, 0.00046, 0.00046, 0.00046, 0.00045, 0.00047, 0.00047, 0.00046, 0.00047, 0.00046, 0.00047, 0.00047, 0.00047, 0.00046, 0.00046, 0.00046, 0.00046, 0.00046, 0.00046, 0.00045, 0.00045, 0.00047, 0.00046, 0.00046, 0.00046, 0.00046, 0.00046, 0.00046, 0.00046, 0.00046, 0.00046, 0.00046, 0.00046, 0.00047, 0.00046, 0.00046, 0.00046, 0.00047, 0.00046, 0.00046, 0.00046, 0.00046, 0.00047, 0.00046, 0.00046, 0.00048, 0.00046, 0.00046, 0.00047, 0.00046, 0.00045, 0.00046, 0.00046, 0.00046, 0.00046, 0.00046, 0.00046, 0.00046, 0.00047, 0.00047, 0.00047, 0.00046, 0.00046, 0.00046, 0.00046, 0.00046, 0.00045, 0.00046, 0.00046, 0.00045, 0.00045, 0.00046, 0.00045, 0.00046, 0.0005, 0.00046, 0.00046, 0.00047, 0.00046, 0.00047, 0.00046, 0.00046, 0.00046, 0.00046, 0.00046, 0.00047, 0.00046, 0.00046, 0.00046, 0.00046, 0.00046, 0.00045, 0.00046, 0.00047, 0.00046, 0.00047, 0.00046, 0.00047, 0.00047, 0.00046, 0.00046, 0.00045, 0.00045, 0.00045, 0.00046, 0.00046, 0.00046, 0.00046, 0.00045, 0.00046, 0.00046, 0.00046, 0.00046, 0.00046, 0.00046, 0.00045, 0.00046, 0.00046, 0.00046, 0.00046, 0.00047, 0.00046, 0.00046, 0.00046, 0.00046, 0.00047, 0.00046, 0.00046, 0.00046, 0.00046, 0.00047, 0.00045, 0.00046, 0.00047, 0.00045, 0.00046, 0.00046, 0.00046, 0.00047, 0.00047, 0.00047, 0.00046, 0.00057, 0.00046, 0.00046, 0.00047, 0.00045, 0.00046, 0.00045, 0.00045, 0.00045, 0.00047, 0.00047, 0.00046, 0.00046, 0.00046, 0.00046, 0.00046, 0.00046, 0.00045, 0.00046, 0.00046, 0.00046, 0.00046, 0.00046, 0.00046, 0.00046, 0.00046, 0.00047, 0.00044, 0.00046, 0.00046, 0.00047, 0.00046, 0.00045, 0.00045, 0.00045, 0.00046, 0.00047, 0.00046, 0.00047, 0.00046, 0.00047, 0.00047, 0.00045, 0.00045, 0.00045, 0.00045, 0.00047, 0.00046, 0.00046]}, "optimizer-copy-main-to-model-params-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.00056, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.0005, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00048, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.0005, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00048, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00051, 0.00049, 0.00049, 0.00049, 0.00049, 0.0005, 0.00049, 0.00049, 0.00048, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.0005, 0.00049, 0.00049, 0.0005, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.0005, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00051, 0.00049, 0.00049, 0.00049, 0.00049, 0.0005, 0.00069, 0.00049, 0.0005, 0.00049, 0.00049, 0.00049, 
0.00049, 0.00049, 0.00051, 0.00049, 0.00049, 0.00049, 0.00049, 0.0005, 0.00049, 0.00049, 0.00053, 0.00064, 0.00049, 0.00049, 0.00049, 0.0005, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.0005, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00051, 0.0005, 0.00049, 0.00049, 0.00049, 0.00049, 0.00052, 0.00049, 0.00049, 0.00051, 0.00049, 0.0005, 0.00051, 0.00049, 0.00049, 0.00053, 0.00049, 0.00049, 0.0005, 0.00049, 0.00049, 0.0005, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.0005, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.0005, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00051, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.0005, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.0005, 0.00049, 0.00049, 0.00049, 0.00049, 0.0005, 0.00049, 0.0005, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.0005, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.0005, 0.0005, 0.00049, 0.00049, 0.0005, 0.00049, 0.0005, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.0005, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.0005, 0.00049, 0.00049, 0.00051, 0.00049, 0.00049, 0.00059, 0.00051, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.0005, 0.00049, 0.00049, 0.0005, 0.00049, 0.00049, 0.0005, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.0005, 0.0005, 0.00049, 0.00049, 0.00068, 0.0005, 0.00049, 0.00049, 0.00049, 0.00077, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.0005, 0.00049, 0.0005, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.0005, 0.0005, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.0005, 0.0005, 0.00049, 0.00049, 0.00049, 0.00049, 0.0005, 0.00062, 0.00049, 0.0005, 0.00049, 0.00049, 0.00049, 0.0005, 0.00049, 0.0005, 0.0005, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.0005, 0.00049, 0.00049, 0.00049, 0.0005, 0.0005, 0.00049, 0.00049, 0.00049, 0.0005, 0.00049, 0.00049, 0.0005, 0.00049, 0.0005, 0.00049, 0.0005, 0.00064, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.0005, 0.00049, 0.00049, 0.00049, 0.00049, 0.00061, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00052, 0.00049, 0.0005, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.0005, 0.00049, 0.0005, 0.00049, 0.0005, 0.00049, 0.00049, 0.00049, 0.00049, 0.0005, 0.00049, 0.00049, 0.00049]}, "optimizer-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [4.23567, 0.00458, 0.00457, 0.00463, 0.00456, 0.00458, 0.00456, 0.00457, 0.00457, 0.00456, 0.00457, 0.00457, 0.00457, 0.00456, 0.00459, 0.00457, 0.00455, 0.00458, 0.00456, 0.00456, 0.00465, 0.00463, 0.00457, 0.005, 0.00457, 0.00468, 0.0046, 0.00458, 0.00461, 0.0046, 0.00456, 0.00456, 0.00462, 0.00463, 0.00464, 0.0046, 0.00464, 0.00464, 0.00461, 0.00462, 0.00462, 0.00459, 0.00465, 0.00464, 0.00462, 0.00462, 0.00467, 0.00457, 0.00462, 0.00465, 0.00462, 0.00462, 0.00473, 0.00459, 0.0046, 0.00464, 0.00463, 0.00458, 0.00462, 0.00462, 0.00462, 0.00459, 0.00465, 0.00461, 0.00463, 0.00459, 0.0046, 0.00462, 0.00469, 0.00466, 0.00461, 0.00468, 0.0046, 0.00461, 0.0046, 0.00464, 0.00463, 0.00465, 0.00465, 0.00462, 0.00459, 0.00459, 0.00461, 0.00461, 0.00462, 0.00461, 0.00463, 0.00459, 0.00461, 0.00458, 0.00461, 0.00463, 0.00459, 0.0046, 0.00456, 0.00476, 0.00459, 0.00465, 0.00449, 0.00462, 0.00463, 0.0046, 0.00465, 0.0046, 0.00462, 0.00462, 0.00468, 0.00461, 0.00462, 0.00462, 0.00464, 
0.0045, 0.00453, 0.00715, 0.00463, 0.00463, 0.00466, 0.00492, 0.00461, 0.00459, 0.00464, 0.00466, 0.00461, 0.00462, 0.00461, 0.00464, 0.00462, 0.00461, 0.0046, 0.00561, 0.00589, 0.00578, 0.0046, 0.0046, 0.00467, 0.0046, 0.00462, 0.00468, 0.00449, 0.00462, 0.00461, 0.00464, 0.00463, 0.00464, 0.0045, 0.0046, 0.00464, 0.00464, 0.00466, 0.00463, 0.00464, 0.00464, 0.00462, 0.00469, 0.00461, 0.00467, 0.00459, 0.00458, 0.00465, 0.00466, 0.00462, 0.00464, 0.00454, 0.00452, 0.00487, 0.00461, 0.00461, 0.00463, 0.00466, 0.00467, 0.00477, 0.00473, 0.00469, 0.00473, 0.00459, 0.00473, 0.00467, 0.00467, 0.00466, 0.0068, 0.00467, 0.00466, 0.00467, 0.00465, 0.00466, 0.00472, 0.00467, 0.00466, 0.00474, 0.00468, 0.00464, 0.00474, 0.00468, 0.00473, 0.00472, 0.00468, 0.0047, 0.00472, 0.00465, 0.00466, 0.00496, 0.00468, 0.00467, 0.00471, 0.0047, 0.00468, 0.00472, 0.00467, 0.00467, 0.00466, 0.00472, 0.00469, 0.00466, 0.00464, 0.00467, 0.00469, 0.00466, 0.00468, 0.00469, 0.00474, 0.00473, 0.00468, 0.0047, 0.00468, 0.00467, 0.00469, 0.00477, 0.00469, 0.00464, 0.00465, 0.0047, 0.0047, 0.00469, 0.00468, 0.00472, 0.00469, 0.00472, 0.00563, 0.00469, 0.00469, 0.00469, 0.0047, 0.00467, 0.0047, 0.00467, 0.00467, 0.00472, 0.00469, 0.00478, 0.00471, 0.00475, 0.00469, 0.00469, 0.00472, 0.00495, 0.00468, 0.0051, 0.00473, 0.0047, 0.00468, 0.00485, 0.00471, 0.00466, 0.0047, 0.00468, 0.00471, 0.00473, 0.00471, 0.0047, 0.00469, 0.00469, 0.00472, 0.00468, 0.00471, 0.00464, 0.00469, 0.00465, 0.00469, 0.00468, 0.00465, 0.00471, 0.00469, 0.0047, 0.00498, 0.00469, 0.00468, 0.00467, 0.00468, 0.00506, 0.0047, 0.00468, 0.00467, 0.00466, 0.00468, 0.0047, 0.00474, 0.00468, 0.00469, 0.0047, 0.00467, 0.00478, 0.00468, 0.00471, 0.0047, 0.00469, 0.00471, 0.00461, 0.00466, 0.00461, 0.00462, 0.0046, 0.00465, 0.00463, 0.00465, 0.00465, 0.00468, 0.00461, 0.00471, 0.00465, 0.00542, 0.00464, 0.00463, 0.00463, 0.00472, 0.0046, 0.00464, 0.00463, 0.0048, 0.00465, 0.00463, 0.00461, 0.00463, 0.0046, 0.00463, 0.00465, 0.00464, 0.00463, 0.00463, 0.00465, 0.00469, 0.00459, 0.00495, 0.00468, 0.00461, 0.00465, 0.00461, 0.00464, 0.00464, 0.00466, 0.00462, 0.00464, 0.00508, 0.00461, 0.0048, 0.00463, 0.00454, 0.00463, 0.00461, 0.00456, 0.0046, 0.00466, 0.00462, 0.00465, 0.00468, 0.00486, 0.00469, 0.00471, 0.00469, 0.00468, 0.00468, 0.00467, 0.00468, 0.00468, 0.00471, 0.00469, 0.00474, 0.00469, 0.00467, 0.00472, 0.00467, 0.00477, 0.00472, 0.00471, 0.00468, 0.00467, 0.00465, 0.00469, 0.00513, 0.00471, 0.00489, 0.00466, 0.00469, 0.00468, 0.00474, 0.00467, 0.00475, 0.00467, 0.00469, 0.00476, 0.0047]}, "learning-rate": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 
3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05]}, "learning-rate vs samples": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 
5e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05]}, "batch-size": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 
128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0]}, "batch-size vs samples": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0]}, "lm loss": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [10.84424, 10.87342, 10.85055, 10.81078, 10.64469, 10.6386, 10.4283, 10.13518, 9.93546, 
9.83538, 9.5857, 9.84804, 9.88588, 9.63127, 9.79022, 9.5114, 9.4597, 9.65546, 9.38988, 9.33928, 9.24947, 9.15126, 9.18199, 9.00445, 9.19836, 9.06663, 9.16101, 9.1698, 9.30057, 8.98927, 8.92967, 9.05035, 9.04657, 8.66029, 8.72527, 8.75664, 8.69468, 8.74328, 8.66681, 8.77286, 8.67044, 8.86119, 8.84295, 8.50873, 8.39852, 8.43801, 8.49532, 8.39321, 8.44017, 8.59221, 8.37564, 8.19958, 8.2329, 8.22974, 8.27495, 7.92044, 8.0993, 7.89755, 8.2517, 8.23397, 8.00952, 7.97507, 7.92567, 7.74377, 7.74735, 7.64935, 7.51967, 7.91031, 7.70174, 7.45536, 7.74632, 7.77446, 7.54372, 7.30243, 7.45569, 7.34305, 7.4658, 7.22841, 7.63683, 7.28242, 7.34884, 7.21343, 7.21124, 7.41956, 7.17365, 7.2819, 6.99462, 7.00325, 7.04012, 7.13712, 6.82214, 6.98588, 7.08949, 6.99872, 6.87479, 6.75655, 6.99059, 7.06011, 6.70413, 6.58421, 6.72746, 6.74527, 6.73409, 6.73823, 6.65852, 6.40615, 6.63686, 6.6194, 6.44648, 6.62844, 6.74357, 6.61132, 6.72657, 6.69405, 6.62733, 6.50769, 6.59795, 6.40666, 6.66519, 6.24881, 6.25106, 6.30401, 6.39198, 6.34989, 6.45173, 6.29422, 6.33969, 6.23719, 6.20153, 6.39655, 6.32455, 6.32086, 6.16315, 6.15667, 6.23617, 6.38123, 6.19858, 6.14609, 6.17459, 6.11003, 6.05359, 6.06531, 6.24848, 6.39923, 6.24762, 6.28436, 6.08885, 6.1659, 5.99117, 6.01964, 5.94446, 6.23937, 6.17942, 5.95871, 5.7764, 6.11339, 5.84425, 6.10156, 5.77953, 6.15415, 6.13822, 6.07746, 5.92004, 6.10968, 5.93741, 6.19122, 5.88685, 5.78306, 5.77148, 5.68041, 6.00813, 5.99187, 6.05986, 5.88016, 6.03137, 5.96131, 5.99374, 5.98716, 5.94573, 5.83722, 5.94198, 5.61328, 5.69729, 5.88553, 5.83625, 5.85543, 5.75718, 5.83246, 5.71985, 5.55522, 5.71497, 5.61505, 5.82338, 5.59492, 5.70181, 5.69956, 5.89291, 5.6334, 5.84186, 5.73328, 5.86061, 5.32413, 5.89063, 5.86923, 5.84806, 5.40969, 5.40238, 5.62094, 5.5916, 5.47979, 5.57337, 5.67122, 5.47407, 5.73944, 5.51167, 5.59101, 5.62347, 5.61736, 5.50921, 5.61182, 5.67274, 5.68001, 5.58479, 5.65971, 5.37206, 5.67757, 5.62674, 5.42131, 5.58249, 5.62904, 5.55375, 5.34106, 5.53431, 5.48176, 5.48104, 5.38026, 5.55107, 5.59981, 5.38504, 5.51817, 5.48713, 5.33135, 5.50212, 5.40894, 5.44244, 5.31335, 5.06368, 5.47625, 5.56822, 5.71202, 5.40926, 5.59783, 5.63205, 5.23113, 5.2684, 5.39256, 5.39509, 5.32651, 5.49543, 5.18174, 5.2944, 5.24351, 5.3743, 5.25187, 5.4403, 5.53394, 5.30526, 5.42762, 5.33573, 5.07536, 5.30828, 5.24915, 5.30097, 5.10794, 5.27462, 5.25882, 5.46931, 5.15605, 5.26147, 5.20567, 5.34991, 4.9789, 4.90972, 5.32269, 5.39016, 5.22419, 5.31593, 5.10145, 5.16054, 5.25953, 5.0667, 5.26007, 5.06659, 5.33924, 5.2437, 5.14669, 5.24181, 5.03908, 5.31189, 5.0508, 5.02718, 5.13824, 5.11134, 5.26999, 5.14813, 5.27491, 5.09204, 5.0944, 5.24441, 5.32532, 5.25266, 5.18964, 5.14218, 5.28959, 4.95048, 5.2045, 5.09444, 5.30302, 5.17003, 5.18518, 5.11668, 4.98204, 4.99495, 5.222, 5.30847, 5.098, 5.05553, 4.91636, 5.12137, 5.11611, 4.9291, 5.33462, 5.02406, 5.09871, 5.16424, 5.00257, 5.06588, 5.06465, 4.99336, 5.07822, 5.15996, 4.97519, 5.18105, 4.9261, 4.91748, 5.06072, 4.99116, 4.90494, 4.77574, 4.94081, 5.11232, 5.01149, 5.01672, 5.32706, 4.95549, 4.99178, 5.04351, 4.80691, 4.73281, 4.99471, 5.04386, 4.87342, 4.9541, 5.04639, 5.02142, 4.81154, 4.89155, 4.90243, 4.82954, 4.73696, 5.00591, 4.75497, 5.20346, 4.791, 4.99509, 4.73426, 4.7815, 4.81632, 4.64705, 4.65335, 4.84192, 4.80637, 4.79718, 4.91906, 4.87982, 4.9259, 4.76993, 4.87999, 4.73114, 4.91345, 4.95513, 4.87047, 4.70341, 4.77964, 4.89818, 4.70591, 4.85482, 4.68983, 4.68887, 4.64189]}, "lm loss vs samples": {"start_step": 0, "end_step": 2000, 
"step_interval": 5, "values": [10.84424, 10.87342, 10.85055, 10.81078, 10.64469, 10.6386, 10.4283, 10.13518, 9.93546, 9.83538, 9.5857, 9.84804, 9.88588, 9.63127, 9.79022, 9.5114, 9.4597, 9.65546, 9.38988, 9.33928, 9.24947, 9.15126, 9.18199, 9.00445, 9.19836, 9.06663, 9.16101, 9.1698, 9.30057, 8.98927, 8.92967, 9.05035, 9.04657, 8.66029, 8.72527, 8.75664, 8.69468, 8.74328, 8.66681, 8.77286, 8.67044, 8.86119, 8.84295, 8.50873, 8.39852, 8.43801, 8.49532, 8.39321, 8.44017, 8.59221, 8.37564, 8.19958, 8.2329, 8.22974, 8.27495, 7.92044, 8.0993, 7.89755, 8.2517, 8.23397, 8.00952, 7.97507, 7.92567, 7.74377, 7.74735, 7.64935, 7.51967, 7.91031, 7.70174, 7.45536, 7.74632, 7.77446, 7.54372, 7.30243, 7.45569, 7.34305, 7.4658, 7.22841, 7.63683, 7.28242, 7.34884, 7.21343, 7.21124, 7.41956, 7.17365, 7.2819, 6.99462, 7.00325, 7.04012, 7.13712, 6.82214, 6.98588, 7.08949, 6.99872, 6.87479, 6.75655, 6.99059, 7.06011, 6.70413, 6.58421, 6.72746, 6.74527, 6.73409, 6.73823, 6.65852, 6.40615, 6.63686, 6.6194, 6.44648, 6.62844, 6.74357, 6.61132, 6.72657, 6.69405, 6.62733, 6.50769, 6.59795, 6.40666, 6.66519, 6.24881, 6.25106, 6.30401, 6.39198, 6.34989, 6.45173, 6.29422, 6.33969, 6.23719, 6.20153, 6.39655, 6.32455, 6.32086, 6.16315, 6.15667, 6.23617, 6.38123, 6.19858, 6.14609, 6.17459, 6.11003, 6.05359, 6.06531, 6.24848, 6.39923, 6.24762, 6.28436, 6.08885, 6.1659, 5.99117, 6.01964, 5.94446, 6.23937, 6.17942, 5.95871, 5.7764, 6.11339, 5.84425, 6.10156, 5.77953, 6.15415, 6.13822, 6.07746, 5.92004, 6.10968, 5.93741, 6.19122, 5.88685, 5.78306, 5.77148, 5.68041, 6.00813, 5.99187, 6.05986, 5.88016, 6.03137, 5.96131, 5.99374, 5.98716, 5.94573, 5.83722, 5.94198, 5.61328, 5.69729, 5.88553, 5.83625, 5.85543, 5.75718, 5.83246, 5.71985, 5.55522, 5.71497, 5.61505, 5.82338, 5.59492, 5.70181, 5.69956, 5.89291, 5.6334, 5.84186, 5.73328, 5.86061, 5.32413, 5.89063, 5.86923, 5.84806, 5.40969, 5.40238, 5.62094, 5.5916, 5.47979, 5.57337, 5.67122, 5.47407, 5.73944, 5.51167, 5.59101, 5.62347, 5.61736, 5.50921, 5.61182, 5.67274, 5.68001, 5.58479, 5.65971, 5.37206, 5.67757, 5.62674, 5.42131, 5.58249, 5.62904, 5.55375, 5.34106, 5.53431, 5.48176, 5.48104, 5.38026, 5.55107, 5.59981, 5.38504, 5.51817, 5.48713, 5.33135, 5.50212, 5.40894, 5.44244, 5.31335, 5.06368, 5.47625, 5.56822, 5.71202, 5.40926, 5.59783, 5.63205, 5.23113, 5.2684, 5.39256, 5.39509, 5.32651, 5.49543, 5.18174, 5.2944, 5.24351, 5.3743, 5.25187, 5.4403, 5.53394, 5.30526, 5.42762, 5.33573, 5.07536, 5.30828, 5.24915, 5.30097, 5.10794, 5.27462, 5.25882, 5.46931, 5.15605, 5.26147, 5.20567, 5.34991, 4.9789, 4.90972, 5.32269, 5.39016, 5.22419, 5.31593, 5.10145, 5.16054, 5.25953, 5.0667, 5.26007, 5.06659, 5.33924, 5.2437, 5.14669, 5.24181, 5.03908, 5.31189, 5.0508, 5.02718, 5.13824, 5.11134, 5.26999, 5.14813, 5.27491, 5.09204, 5.0944, 5.24441, 5.32532, 5.25266, 5.18964, 5.14218, 5.28959, 4.95048, 5.2045, 5.09444, 5.30302, 5.17003, 5.18518, 5.11668, 4.98204, 4.99495, 5.222, 5.30847, 5.098, 5.05553, 4.91636, 5.12137, 5.11611, 4.9291, 5.33462, 5.02406, 5.09871, 5.16424, 5.00257, 5.06588, 5.06465, 4.99336, 5.07822, 5.15996, 4.97519, 5.18105, 4.9261, 4.91748, 5.06072, 4.99116, 4.90494, 4.77574, 4.94081, 5.11232, 5.01149, 5.01672, 5.32706, 4.95549, 4.99178, 5.04351, 4.80691, 4.73281, 4.99471, 5.04386, 4.87342, 4.9541, 5.04639, 5.02142, 4.81154, 4.89155, 4.90243, 4.82954, 4.73696, 5.00591, 4.75497, 5.20346, 4.791, 4.99509, 4.73426, 4.7815, 4.81632, 4.64705, 4.65335, 4.84192, 4.80637, 4.79718, 4.91906, 4.87982, 4.9259, 4.76993, 4.87999, 4.73114, 4.91345, 4.95513, 4.87047, 4.70341, 4.77964, 
4.89818, 4.70591, 4.85482, 4.68983, 4.68887, 4.64189]}, "loss-scale": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]}, "loss-scale vs samples": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 
1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]}, "grad-norm": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [13.93626, 13.32689, 13.8137, 12.62172, 11.96992, 9.43513, 6.80799, 6.88665, 5.95498, 4.54619, 4.13053, 2.82596, 2.39543, 2.34537, 2.05773, 2.21996, 2.14537, 1.88392, 2.17069, 2.06105, 2.12373, 2.16615, 2.00976, 2.20876, 1.97308, 2.09194, 1.90863, 1.88776, 1.95054, 2.15308, 2.08778, 2.10616, 1.95646, 2.17094, 2.31724, 2.02642, 2.04764, 1.84545, 1.93704, 1.75657, 2.13069, 1.75993, 1.70876, 1.86665, 1.92331, 1.79127, 1.74297, 1.74426, 1.75161, 1.53485, 1.75292, 1.73299, 1.79809, 1.83477, 1.59059, 1.79085, 1.74313, 1.81505, 1.54888, 1.47615, 1.68285, 1.4812, 1.79315, 1.92171, 1.63149, 1.63813, 1.6586, 1.59744, 1.47545, 1.65909, 1.42464, 1.41939, 1.49901, 1.42049, 1.40172, 1.46225, 1.44185, 1.3706, 1.36838, 1.26055, 1.34627, 1.29904, 1.25687, 1.20642, 1.27731, 1.27576, 1.4537, 1.34738, 1.41703, 1.10279, 1.09805, 1.25584, 1.13228, 1.20775, 0.93229, 1.32305, 1.10083, 1.31134, 0.99675, 1.32116, 1.31807, 1.20377, 1.14298, 1.25982, 1.11587, 1.06268, 1.1383, 1.13456, 1.18344, 1.01042, 1.19822, 0.96542, 0.98282, 0.98083, 1.21915, 1.08304, 1.00478, 1.26788, 1.10619, 1.30807, 1.1248, 1.36119, 1.37901, 1.4392, 1.56444, 1.29037, 1.19911, 1.00927, 1.14759, 1.2293, 1.07062, 1.374, 1.0323, 1.06393, 1.18259, 1.20195, 1.16586, 1.44753, 0.94529, 1.13538, 1.05269, 1.34467, 1.18959, 1.01819, 0.86119, 1.06946, 1.34129, 1.684, 1.13519, 1.32985, 1.38775, 1.34761, 1.74434, 1.43622, 1.39335, 1.37538, 1.86703, 2.00418, 1.35288, 1.23486, 1.3698, 1.32764, 0.9773, 0.96112, 1.19304, 1.38421, 1.30281, 1.24815, 1.29487, 1.60508, 1.50397, 1.88527, 1.44501, 1.35752, 0.94887, 1.377, 2.16776, 1.36769, 1.5918, 1.53974, 1.46219, 1.57752, 1.18503, 1.28159, 1.42022, 1.06676, 1.57312, 1.38623, 1.21566, 1.67634, 1.0445, 1.27733, 1.33704, 1.42129, 1.46397, 1.28187, 1.4299, 1.30773, 1.5098, 1.44392, 1.45291, 1.64364, 1.49176, 1.37459, 1.51541, 1.63213, 1.48678, 1.52484, 1.4594, 1.29967, 1.2736, 1.3991, 1.32876, 1.30752, 2.30271, 1.55904, 1.8449, 1.46033, 1.24296, 1.20709, 1.62628, 1.5864, 1.26763, 1.43759, 1.47487, 1.37697, 1.3542, 1.33151, 1.73529, 1.34567, 1.25198, 1.32539, 1.47482, 1.18237, 1.36743, 1.49708, 1.35135, 1.39444, 1.32979, 1.17935, 1.87393, 1.4264, 1.47427, 1.49289, 1.23046, 1.40513, 1.22641, 1.41026, 1.60243, 1.3143, 1.19178, 1.29275, 1.40778, 1.27321, 1.41008, 1.70248, 1.64394, 1.51805, 1.52213, 1.56958, 1.37322, 1.23197, 1.2534, 1.33391, 1.27155, 1.71409, 1.36328, 1.34111, 1.56216, 1.69178, 1.34859, 1.23125, 1.30141, 1.35618, 1.71086, 1.21378, 1.62762, 1.35769, 1.32471, 1.3449, 1.37393, 1.16861, 1.52125, 1.65464, 1.84529, 1.4419, 1.39298, 1.45439, 1.43606, 1.60436, 1.56537, 1.49466, 1.35372, 1.44924, 1.44717, 1.59557, 1.51747, 1.64905, 1.33058, 1.31553, 1.61355, 1.23394, 1.40751, 1.24118, 1.39003, 1.46524, 1.46231, 1.5848, 1.30142, 1.49751, 1.49494, 1.35146, 1.32779, 
1.48392, 1.42067, 1.43745, 1.57573, 1.52413, 1.22763, 1.19418, 1.89055, 1.53347, 1.40105, 1.60967, 1.38946, 1.31243, 1.45306, 1.42686, 1.36629, 1.4597, 1.59178, 1.37262, 1.28569, 1.49855, 1.29513, 1.26508, 1.32564, 1.18627, 1.52963, 1.41157, 1.22284, 1.09058, 1.41662, 1.39267, 1.29437, 1.39958, 1.3399, 1.36221, 1.4319, 1.07457, 1.45594, 1.29022, 1.47328, 1.63456, 1.35731, 1.53342, 1.23853, 1.30778, 1.37885, 1.39437, 1.58806, 1.41021, 1.41084, 1.3741, 1.18704, 1.36438, 1.50507, 1.3615, 1.43368, 1.39267, 1.48306, 1.60864, 1.92464, 1.65072, 1.54144, 1.35616, 1.29657, 1.5044, 1.29558, 1.3191, 1.41541, 1.44176, 1.48919, 1.28271, 1.18322, 1.31948, 1.34975, 1.36515, 1.26883, 1.48957, 1.40195, 1.45318, 1.67399, 1.47474, 1.53573, 1.49973, 1.39375, 1.51272, 1.36339, 1.21633]}, "grad-norm vs samples": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [13.93626, 13.32689, 13.8137, 12.62172, 11.96992, 9.43513, 6.80799, 6.88665, 5.95498, 4.54619, 4.13053, 2.82596, 2.39543, 2.34537, 2.05773, 2.21996, 2.14537, 1.88392, 2.17069, 2.06105, 2.12373, 2.16615, 2.00976, 2.20876, 1.97308, 2.09194, 1.90863, 1.88776, 1.95054, 2.15308, 2.08778, 2.10616, 1.95646, 2.17094, 2.31724, 2.02642, 2.04764, 1.84545, 1.93704, 1.75657, 2.13069, 1.75993, 1.70876, 1.86665, 1.92331, 1.79127, 1.74297, 1.74426, 1.75161, 1.53485, 1.75292, 1.73299, 1.79809, 1.83477, 1.59059, 1.79085, 1.74313, 1.81505, 1.54888, 1.47615, 1.68285, 1.4812, 1.79315, 1.92171, 1.63149, 1.63813, 1.6586, 1.59744, 1.47545, 1.65909, 1.42464, 1.41939, 1.49901, 1.42049, 1.40172, 1.46225, 1.44185, 1.3706, 1.36838, 1.26055, 1.34627, 1.29904, 1.25687, 1.20642, 1.27731, 1.27576, 1.4537, 1.34738, 1.41703, 1.10279, 1.09805, 1.25584, 1.13228, 1.20775, 0.93229, 1.32305, 1.10083, 1.31134, 0.99675, 1.32116, 1.31807, 1.20377, 1.14298, 1.25982, 1.11587, 1.06268, 1.1383, 1.13456, 1.18344, 1.01042, 1.19822, 0.96542, 0.98282, 0.98083, 1.21915, 1.08304, 1.00478, 1.26788, 1.10619, 1.30807, 1.1248, 1.36119, 1.37901, 1.4392, 1.56444, 1.29037, 1.19911, 1.00927, 1.14759, 1.2293, 1.07062, 1.374, 1.0323, 1.06393, 1.18259, 1.20195, 1.16586, 1.44753, 0.94529, 1.13538, 1.05269, 1.34467, 1.18959, 1.01819, 0.86119, 1.06946, 1.34129, 1.684, 1.13519, 1.32985, 1.38775, 1.34761, 1.74434, 1.43622, 1.39335, 1.37538, 1.86703, 2.00418, 1.35288, 1.23486, 1.3698, 1.32764, 0.9773, 0.96112, 1.19304, 1.38421, 1.30281, 1.24815, 1.29487, 1.60508, 1.50397, 1.88527, 1.44501, 1.35752, 0.94887, 1.377, 2.16776, 1.36769, 1.5918, 1.53974, 1.46219, 1.57752, 1.18503, 1.28159, 1.42022, 1.06676, 1.57312, 1.38623, 1.21566, 1.67634, 1.0445, 1.27733, 1.33704, 1.42129, 1.46397, 1.28187, 1.4299, 1.30773, 1.5098, 1.44392, 1.45291, 1.64364, 1.49176, 1.37459, 1.51541, 1.63213, 1.48678, 1.52484, 1.4594, 1.29967, 1.2736, 1.3991, 1.32876, 1.30752, 2.30271, 1.55904, 1.8449, 1.46033, 1.24296, 1.20709, 1.62628, 1.5864, 1.26763, 1.43759, 1.47487, 1.37697, 1.3542, 1.33151, 1.73529, 1.34567, 1.25198, 1.32539, 1.47482, 1.18237, 1.36743, 1.49708, 1.35135, 1.39444, 1.32979, 1.17935, 1.87393, 1.4264, 1.47427, 1.49289, 1.23046, 1.40513, 1.22641, 1.41026, 1.60243, 1.3143, 1.19178, 1.29275, 1.40778, 1.27321, 1.41008, 1.70248, 1.64394, 1.51805, 1.52213, 1.56958, 1.37322, 1.23197, 1.2534, 1.33391, 1.27155, 1.71409, 1.36328, 1.34111, 1.56216, 1.69178, 1.34859, 1.23125, 1.30141, 1.35618, 1.71086, 1.21378, 1.62762, 1.35769, 1.32471, 1.3449, 1.37393, 1.16861, 1.52125, 1.65464, 1.84529, 1.4419, 1.39298, 1.45439, 1.43606, 1.60436, 1.56537, 1.49466, 1.35372, 1.44924, 1.44717, 1.59557, 1.51747, 1.64905, 1.33058, 1.31553, 1.61355, 
1.23394, 1.40751, 1.24118, 1.39003, 1.46524, 1.46231, 1.5848, 1.30142, 1.49751, 1.49494, 1.35146, 1.32779, 1.48392, 1.42067, 1.43745, 1.57573, 1.52413, 1.22763, 1.19418, 1.89055, 1.53347, 1.40105, 1.60967, 1.38946, 1.31243, 1.45306, 1.42686, 1.36629, 1.4597, 1.59178, 1.37262, 1.28569, 1.49855, 1.29513, 1.26508, 1.32564, 1.18627, 1.52963, 1.41157, 1.22284, 1.09058, 1.41662, 1.39267, 1.29437, 1.39958, 1.3399, 1.36221, 1.4319, 1.07457, 1.45594, 1.29022, 1.47328, 1.63456, 1.35731, 1.53342, 1.23853, 1.30778, 1.37885, 1.39437, 1.58806, 1.41021, 1.41084, 1.3741, 1.18704, 1.36438, 1.50507, 1.3615, 1.43368, 1.39267, 1.48306, 1.60864, 1.92464, 1.65072, 1.54144, 1.35616, 1.29657, 1.5044, 1.29558, 1.3191, 1.41541, 1.44176, 1.48919, 1.28271, 1.18322, 1.31948, 1.34975, 1.36515, 1.26883, 1.48957, 1.40195, 1.45318, 1.67399, 1.47474, 1.53573, 1.49973, 1.39375, 1.51272, 1.36339, 1.21633]}, "num-zeros": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [69.0, 86.0, 77.0, 73.0, 78.0, 81.0, 100.0, 105.0, 134.0, 134.0, 122.0, 173.0, 158.0, 179.0, 178.0, 172.0, 173.0, 192.0, 186.0, 185.0, 155.0, 157.0, 183.0, 172.0, 179.0, 162.0, 166.0, 176.0, 162.0, 177.0, 178.0, 149.0, 163.0, 200.0, 122.0, 151.0, 160.0, 216.0, 173.0, 192.0, 163.0, 174.0, 167.0, 195.0, 177.0, 181.0, 195.0, 201.0, 171.0, 240.0, 190.0, 187.0, 177.0, 159.0, 167.0, 211.0, 151.0, 167.0, 226.0, 215.0, 184.0, 206.0, 174.0, 166.0, 203.0, 236.0, 215.0, 192.0, 197.0, 197.0, 250.0, 225.0, 178.0, 210.0, 205.0, 223.0, 233.0, 196.0, 258.0, 221.0, 228.0, 237.0, 226.0, 223.0, 188.0, 182.0, 179.0, 198.0, 147.0, 189.0, 211.0, 214.0, 206.0, 216.0, 245.0, 156.0, 216.0, 214.0, 192.0, 170.0, 167.0, 167.0, 171.0, 168.0, 164.0, 141.0, 174.0, 143.0, 140.0, 184.0, 153.0, 162.0, 175.0, 144.0, 145.0, 144.0, 166.0, 110.0, 159.0, 132.0, 128.0, 137.0, 112.0, 132.0, 126.0, 136.0, 128.0, 172.0, 158.0, 131.0, 135.0, 133.0, 133.0, 144.0, 114.0, 123.0, 127.0, 129.0, 121.0, 139.0, 118.0, 107.0, 135.0, 149.0, 155.0, 123.0, 118.0, 109.0, 109.0, 111.0, 101.0, 119.0, 87.0, 118.0, 99.0, 104.0, 99.0, 88.0, 112.0, 112.0, 136.0, 110.0, 122.0, 128.0, 102.0, 105.0, 114.0, 106.0, 103.0, 119.0, 109.0, 83.0, 87.0, 99.0, 136.0, 116.0, 91.0, 112.0, 94.0, 98.0, 128.0, 100.0, 108.0, 115.0, 104.0, 128.0, 109.0, 99.0, 112.0, 96.0, 123.0, 103.0, 109.0, 84.0, 117.0, 105.0, 92.0, 104.0, 83.0, 96.0, 128.0, 71.0, 107.0, 110.0, 99.0, 96.0, 100.0, 100.0, 99.0, 122.0, 94.0, 98.0, 121.0, 118.0, 83.0, 96.0, 99.0, 123.0, 108.0, 107.0, 108.0, 93.0, 89.0, 101.0, 121.0, 121.0, 113.0, 108.0, 83.0, 123.0, 89.0, 105.0, 99.0, 100.0, 108.0, 105.0, 95.0, 112.0, 101.0, 110.0, 93.0, 108.0, 94.0, 120.0, 118.0, 107.0, 98.0, 121.0, 102.0, 97.0, 111.0, 126.0, 102.0, 108.0, 107.0, 108.0, 95.0, 97.0, 96.0, 118.0, 100.0, 111.0, 103.0, 92.0, 100.0, 101.0, 100.0, 103.0, 112.0, 87.0, 86.0, 119.0, 97.0, 101.0, 119.0, 120.0, 124.0, 114.0, 108.0, 105.0, 101.0, 104.0, 103.0, 98.0, 86.0, 101.0, 115.0, 98.0, 90.0, 108.0, 102.0, 102.0, 108.0, 125.0, 109.0, 90.0, 115.0, 94.0, 114.0, 113.0, 98.0, 113.0, 122.0, 101.0, 97.0, 109.0, 106.0, 105.0, 115.0, 95.0, 117.0, 118.0, 95.0, 111.0, 88.0, 121.0, 121.0, 117.0, 138.0, 134.0, 89.0, 99.0, 117.0, 93.0, 106.0, 123.0, 117.0, 107.0, 117.0, 108.0, 86.0, 121.0, 125.0, 105.0, 114.0, 107.0, 129.0, 114.0, 114.0, 107.0, 120.0, 118.0, 101.0, 109.0, 107.0, 124.0, 120.0, 116.0, 103.0, 127.0, 126.0, 90.0, 102.0, 114.0, 111.0, 108.0, 136.0, 107.0, 112.0, 104.0, 113.0, 117.0, 133.0, 104.0, 125.0, 119.0, 111.0, 122.0, 100.0, 118.0, 119.0, 104.0, 85.0, 133.0, 104.0, 119.0, 118.0, 95.0, 117.0, 
123.0, 101.0, 132.0, 121.0, 110.0, 116.0, 116.0, 111.0, 91.0, 104.0, 104.0, 115.0, 124.0, 105.0, 104.0, 105.0, 101.0, 99.0, 112.0, 126.0, 139.0]}, "num-zeros vs samples": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [69.0, 86.0, 77.0, 73.0, 78.0, 81.0, 100.0, 105.0, 134.0, 134.0, 122.0, 173.0, 158.0, 179.0, 178.0, 172.0, 173.0, 192.0, 186.0, 185.0, 155.0, 157.0, 183.0, 172.0, 179.0, 162.0, 166.0, 176.0, 162.0, 177.0, 178.0, 149.0, 163.0, 200.0, 122.0, 151.0, 160.0, 216.0, 173.0, 192.0, 163.0, 174.0, 167.0, 195.0, 177.0, 181.0, 195.0, 201.0, 171.0, 240.0, 190.0, 187.0, 177.0, 159.0, 167.0, 211.0, 151.0, 167.0, 226.0, 215.0, 184.0, 206.0, 174.0, 166.0, 203.0, 236.0, 215.0, 192.0, 197.0, 197.0, 250.0, 225.0, 178.0, 210.0, 205.0, 223.0, 233.0, 196.0, 258.0, 221.0, 228.0, 237.0, 226.0, 223.0, 188.0, 182.0, 179.0, 198.0, 147.0, 189.0, 211.0, 214.0, 206.0, 216.0, 245.0, 156.0, 216.0, 214.0, 192.0, 170.0, 167.0, 167.0, 171.0, 168.0, 164.0, 141.0, 174.0, 143.0, 140.0, 184.0, 153.0, 162.0, 175.0, 144.0, 145.0, 144.0, 166.0, 110.0, 159.0, 132.0, 128.0, 137.0, 112.0, 132.0, 126.0, 136.0, 128.0, 172.0, 158.0, 131.0, 135.0, 133.0, 133.0, 144.0, 114.0, 123.0, 127.0, 129.0, 121.0, 139.0, 118.0, 107.0, 135.0, 149.0, 155.0, 123.0, 118.0, 109.0, 109.0, 111.0, 101.0, 119.0, 87.0, 118.0, 99.0, 104.0, 99.0, 88.0, 112.0, 112.0, 136.0, 110.0, 122.0, 128.0, 102.0, 105.0, 114.0, 106.0, 103.0, 119.0, 109.0, 83.0, 87.0, 99.0, 136.0, 116.0, 91.0, 112.0, 94.0, 98.0, 128.0, 100.0, 108.0, 115.0, 104.0, 128.0, 109.0, 99.0, 112.0, 96.0, 123.0, 103.0, 109.0, 84.0, 117.0, 105.0, 92.0, 104.0, 83.0, 96.0, 128.0, 71.0, 107.0, 110.0, 99.0, 96.0, 100.0, 100.0, 99.0, 122.0, 94.0, 98.0, 121.0, 118.0, 83.0, 96.0, 99.0, 123.0, 108.0, 107.0, 108.0, 93.0, 89.0, 101.0, 121.0, 121.0, 113.0, 108.0, 83.0, 123.0, 89.0, 105.0, 99.0, 100.0, 108.0, 105.0, 95.0, 112.0, 101.0, 110.0, 93.0, 108.0, 94.0, 120.0, 118.0, 107.0, 98.0, 121.0, 102.0, 97.0, 111.0, 126.0, 102.0, 108.0, 107.0, 108.0, 95.0, 97.0, 96.0, 118.0, 100.0, 111.0, 103.0, 92.0, 100.0, 101.0, 100.0, 103.0, 112.0, 87.0, 86.0, 119.0, 97.0, 101.0, 119.0, 120.0, 124.0, 114.0, 108.0, 105.0, 101.0, 104.0, 103.0, 98.0, 86.0, 101.0, 115.0, 98.0, 90.0, 108.0, 102.0, 102.0, 108.0, 125.0, 109.0, 90.0, 115.0, 94.0, 114.0, 113.0, 98.0, 113.0, 122.0, 101.0, 97.0, 109.0, 106.0, 105.0, 115.0, 95.0, 117.0, 118.0, 95.0, 111.0, 88.0, 121.0, 121.0, 117.0, 138.0, 134.0, 89.0, 99.0, 117.0, 93.0, 106.0, 123.0, 117.0, 107.0, 117.0, 108.0, 86.0, 121.0, 125.0, 105.0, 114.0, 107.0, 129.0, 114.0, 114.0, 107.0, 120.0, 118.0, 101.0, 109.0, 107.0, 124.0, 120.0, 116.0, 103.0, 127.0, 126.0, 90.0, 102.0, 114.0, 111.0, 108.0, 136.0, 107.0, 112.0, 104.0, 113.0, 117.0, 133.0, 104.0, 125.0, 119.0, 111.0, 122.0, 100.0, 118.0, 119.0, 104.0, 85.0, 133.0, 104.0, 119.0, 118.0, 95.0, 117.0, 123.0, 101.0, 132.0, 121.0, 110.0, 116.0, 116.0, 111.0, 91.0, 104.0, 104.0, 115.0, 124.0, 105.0, 104.0, 105.0, 101.0, 99.0, 112.0, 126.0, 139.0]}, "params-norm": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [180.0215, 180.0215, 180.0215, 180.02148, 180.02148, 180.02147, 180.02145, 180.02135, 180.02115, 180.02094, 180.01961, 180.01897, 180.01846, 180.01622, 180.01544, 180.01474, 180.01422, 180.01404, 180.01433, 180.01431, 180.01425, 180.01393, 180.01378, 180.01398, 180.01486, 180.01613, 180.01736, 180.01823, 180.01926, 180.02052, 180.02249, 180.0247, 180.0269, 180.02905, 180.03157, 180.03452, 180.03809, 180.04181, 180.04561, 180.04982, 180.05472, 180.06001, 180.06567, 180.07184, 180.0788, 180.08618, 
180.09402, 180.10249, 180.11177, 180.12202, 180.13301, 180.14465, 180.15689, 180.16972, 180.18321, 180.19737, 180.21191, 180.22699, 180.24295, 180.26004, 180.27771, 180.29611, 180.31612, 180.33702, 180.35811, 180.38084, 180.40419, 180.4287, 180.45442, 180.48056, 180.50702, 180.53406, 180.56171, 180.58975, 180.61829, 180.64751, 180.67677, 180.70682, 180.73743, 180.76886, 180.80061, 180.83215, 180.86478, 180.89844, 180.93239, 180.96716, 181.00246, 181.03769, 181.07275, 181.10832, 181.14499, 181.18263, 181.21957, 181.25639, 181.29378, 181.33115, 181.36745, 181.40192, 181.43672, 181.47206, 181.50702, 181.54108, 181.57564, 181.61107, 181.64665, 181.68359, 181.72212, 181.76016, 181.79727, 181.83466, 181.87212, 181.91078, 181.94928, 181.98863, 182.02866, 182.0679, 182.10756, 182.14766, 182.18661, 182.22534, 182.26395, 182.30188, 182.33997, 182.3786, 182.41617, 182.45273, 182.48906, 182.52652, 182.56755, 182.60834, 182.64743, 182.68629, 182.72655, 182.76643, 182.80617, 182.84549, 182.8847, 182.92358, 182.96255, 183.00255, 183.04317, 183.08311, 183.12239, 183.16113, 183.20087, 183.24062, 183.27989, 183.31709, 183.35413, 183.39204, 183.42976, 183.46664, 183.50266, 183.5378, 183.57317, 183.60986, 183.64481, 183.67638, 183.7079, 183.74036, 183.77179, 183.80507, 183.8432, 183.8837, 183.92522, 183.96664, 184.00832, 184.04984, 184.09091, 184.13011, 184.16745, 184.20192, 184.2364, 184.27042, 184.30766, 184.34671, 184.38367, 184.41844, 184.45454, 184.49117, 184.52921, 184.56746, 184.60696, 184.64819, 184.69025, 184.73074, 184.77034, 184.80975, 184.84845, 184.88777, 184.92712, 184.96806, 185.00996, 185.0508, 185.09145, 185.13165, 185.17198, 185.21196, 185.25362, 185.29736, 185.33859, 185.37759, 185.41449, 185.45093, 185.48775, 185.52527, 185.56303, 185.60017, 185.63844, 185.67694, 185.717, 185.75711, 185.79745, 185.83626, 185.87444, 185.91074, 185.94763, 185.98566, 186.02451, 186.06494, 186.10443, 186.14497, 186.18584, 186.22533, 186.26512, 186.30524, 186.34587, 186.38719, 186.42752, 186.46732, 186.5069, 186.54416, 186.58186, 186.62146, 186.66272, 186.7025, 186.74118, 186.78197, 186.82381, 186.86591, 186.90703, 186.94699, 186.98782, 187.02896, 187.07161, 187.11592, 187.16006, 187.20297, 187.24727, 187.29167, 187.33688, 187.38315, 187.43051, 187.47704, 187.52306, 187.56926, 187.61435, 187.65848, 187.70207, 187.74612, 187.791, 187.83688, 187.88379, 187.93002, 187.97664, 188.02202, 188.06602, 188.10904, 188.15352, 188.19698, 188.23994, 188.28452, 188.3309, 188.37823, 188.4254, 188.47156, 188.51752, 188.5639, 188.60988, 188.65466, 188.69901, 188.74353, 188.78758, 188.82999, 188.87415, 188.91789, 188.9626, 189.00793, 189.05475, 189.10188, 189.14818, 189.1933, 189.23761, 189.28363, 189.33023, 189.37675, 189.42268, 189.46941, 189.51593, 189.56395, 189.61171, 189.65927, 189.70778, 189.75581, 189.80321, 189.8503, 189.89809, 189.9472, 189.9967, 190.04593, 190.09396, 190.14343, 190.1933, 190.24219, 190.29274, 190.34343, 190.39359, 190.44443, 190.49617, 190.54893, 190.60107, 190.65158, 190.70294, 190.75449, 190.80663, 190.86197, 190.91545, 190.96892, 191.02086, 191.07315, 191.12288, 191.17188, 191.22237, 191.27545, 191.32816, 191.38139, 191.43503, 191.48665, 191.53937, 191.58943, 191.64163, 191.69427, 191.74928, 191.8026, 191.85596, 191.90891, 191.96182, 192.01491, 192.06815, 192.12227, 192.17641, 192.23074, 192.28561, 192.34024, 192.39484, 192.44731, 192.50171, 192.55782, 192.61383, 192.67009, 192.72624, 192.78252, 192.83763, 192.89287, 192.94981, 193.00703, 193.06404, 193.12177, 193.17989, 193.23723, 193.29391, 
193.34985, 193.40605, 193.45912, 193.51132, 193.56346, 193.61696, 193.67215, 193.72841, 193.78329, 193.83797, 193.89262, 193.94887, 194.00604, 194.064, 194.12062, 194.17807, 194.23741, 194.29666, 194.35547, 194.41553, 194.47499, 194.53378, 194.59259, 194.65202, 194.70923, 194.76607, 194.82375, 194.88065, 194.93935]}, "params-norm vs samples": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [180.0215, 180.0215, 180.0215, 180.02148, 180.02148, 180.02147, 180.02145, 180.02135, 180.02115, 180.02094, 180.01961, 180.01897, 180.01846, 180.01622, 180.01544, 180.01474, 180.01422, 180.01404, 180.01433, 180.01431, 180.01425, 180.01393, 180.01378, 180.01398, 180.01486, 180.01613, 180.01736, 180.01823, 180.01926, 180.02052, 180.02249, 180.0247, 180.0269, 180.02905, 180.03157, 180.03452, 180.03809, 180.04181, 180.04561, 180.04982, 180.05472, 180.06001, 180.06567, 180.07184, 180.0788, 180.08618, 180.09402, 180.10249, 180.11177, 180.12202, 180.13301, 180.14465, 180.15689, 180.16972, 180.18321, 180.19737, 180.21191, 180.22699, 180.24295, 180.26004, 180.27771, 180.29611, 180.31612, 180.33702, 180.35811, 180.38084, 180.40419, 180.4287, 180.45442, 180.48056, 180.50702, 180.53406, 180.56171, 180.58975, 180.61829, 180.64751, 180.67677, 180.70682, 180.73743, 180.76886, 180.80061, 180.83215, 180.86478, 180.89844, 180.93239, 180.96716, 181.00246, 181.03769, 181.07275, 181.10832, 181.14499, 181.18263, 181.21957, 181.25639, 181.29378, 181.33115, 181.36745, 181.40192, 181.43672, 181.47206, 181.50702, 181.54108, 181.57564, 181.61107, 181.64665, 181.68359, 181.72212, 181.76016, 181.79727, 181.83466, 181.87212, 181.91078, 181.94928, 181.98863, 182.02866, 182.0679, 182.10756, 182.14766, 182.18661, 182.22534, 182.26395, 182.30188, 182.33997, 182.3786, 182.41617, 182.45273, 182.48906, 182.52652, 182.56755, 182.60834, 182.64743, 182.68629, 182.72655, 182.76643, 182.80617, 182.84549, 182.8847, 182.92358, 182.96255, 183.00255, 183.04317, 183.08311, 183.12239, 183.16113, 183.20087, 183.24062, 183.27989, 183.31709, 183.35413, 183.39204, 183.42976, 183.46664, 183.50266, 183.5378, 183.57317, 183.60986, 183.64481, 183.67638, 183.7079, 183.74036, 183.77179, 183.80507, 183.8432, 183.8837, 183.92522, 183.96664, 184.00832, 184.04984, 184.09091, 184.13011, 184.16745, 184.20192, 184.2364, 184.27042, 184.30766, 184.34671, 184.38367, 184.41844, 184.45454, 184.49117, 184.52921, 184.56746, 184.60696, 184.64819, 184.69025, 184.73074, 184.77034, 184.80975, 184.84845, 184.88777, 184.92712, 184.96806, 185.00996, 185.0508, 185.09145, 185.13165, 185.17198, 185.21196, 185.25362, 185.29736, 185.33859, 185.37759, 185.41449, 185.45093, 185.48775, 185.52527, 185.56303, 185.60017, 185.63844, 185.67694, 185.717, 185.75711, 185.79745, 185.83626, 185.87444, 185.91074, 185.94763, 185.98566, 186.02451, 186.06494, 186.10443, 186.14497, 186.18584, 186.22533, 186.26512, 186.30524, 186.34587, 186.38719, 186.42752, 186.46732, 186.5069, 186.54416, 186.58186, 186.62146, 186.66272, 186.7025, 186.74118, 186.78197, 186.82381, 186.86591, 186.90703, 186.94699, 186.98782, 187.02896, 187.07161, 187.11592, 187.16006, 187.20297, 187.24727, 187.29167, 187.33688, 187.38315, 187.43051, 187.47704, 187.52306, 187.56926, 187.61435, 187.65848, 187.70207, 187.74612, 187.791, 187.83688, 187.88379, 187.93002, 187.97664, 188.02202, 188.06602, 188.10904, 188.15352, 188.19698, 188.23994, 188.28452, 188.3309, 188.37823, 188.4254, 188.47156, 188.51752, 188.5639, 188.60988, 188.65466, 188.69901, 188.74353, 188.78758, 188.82999, 188.87415, 188.91789, 188.9626, 189.00793, 
189.05475, 189.10188, 189.14818, 189.1933, 189.23761, 189.28363, 189.33023, 189.37675, 189.42268, 189.46941, 189.51593, 189.56395, 189.61171, 189.65927, 189.70778, 189.75581, 189.80321, 189.8503, 189.89809, 189.9472, 189.9967, 190.04593, 190.09396, 190.14343, 190.1933, 190.24219, 190.29274, 190.34343, 190.39359, 190.44443, 190.49617, 190.54893, 190.60107, 190.65158, 190.70294, 190.75449, 190.80663, 190.86197, 190.91545, 190.96892, 191.02086, 191.07315, 191.12288, 191.17188, 191.22237, 191.27545, 191.32816, 191.38139, 191.43503, 191.48665, 191.53937, 191.58943, 191.64163, 191.69427, 191.74928, 191.8026, 191.85596, 191.90891, 191.96182, 192.01491, 192.06815, 192.12227, 192.17641, 192.23074, 192.28561, 192.34024, 192.39484, 192.44731, 192.50171, 192.55782, 192.61383, 192.67009, 192.72624, 192.78252, 192.83763, 192.89287, 192.94981, 193.00703, 193.06404, 193.12177, 193.17989, 193.23723, 193.29391, 193.34985, 193.40605, 193.45912, 193.51132, 193.56346, 193.61696, 193.67215, 193.72841, 193.78329, 193.83797, 193.89262, 193.94887, 194.00604, 194.064, 194.12062, 194.17807, 194.23741, 194.29666, 194.35547, 194.41553, 194.47499, 194.53378, 194.59259, 194.65202, 194.70923, 194.76607, 194.82375, 194.88065, 194.93935]}, "iteration-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [25.13033, 1.48166, 1.46987, 1.47023, 1.48503, 1.46592, 1.47336, 1.47508, 1.47402, 1.4685, 1.46594, 1.46551, 1.47349, 1.47267, 1.46624, 1.4694, 1.46787, 1.46277, 1.47132, 1.47851, 1.46741, 1.46542, 1.4696, 1.47275, 1.46461, 1.47691, 1.4675, 1.4656, 1.47118, 1.46861, 1.46276, 1.46336, 1.46191, 1.46454, 1.46661, 1.45397, 1.45433, 1.45318, 1.47248, 1.45987, 1.4605, 1.47021, 1.46471, 1.46712, 1.47916, 1.46564, 1.46806, 1.48231, 1.47331, 1.47647, 1.4749, 1.47736, 1.47088, 1.48046, 1.47029, 1.4749, 1.47423, 1.4743, 1.47451, 1.47312, 1.46669, 1.48162, 1.47248, 1.47813, 1.47924, 1.47693, 1.4857, 1.47407, 1.47761, 1.47904, 1.47169, 1.46697, 1.48901, 1.47837, 1.47292, 1.48078, 1.49273, 1.48823, 1.48311, 1.48576, 1.48783, 1.48617, 1.47144, 1.46991, 1.46885, 1.47351, 1.47373, 1.46882, 1.46809, 1.46714, 1.4672, 1.47772, 1.46612, 1.46651, 1.47094, 1.47578, 1.46913, 1.48331, 1.4865, 1.48787, 1.47171, 1.46821, 1.4802, 1.46723, 1.47379, 1.46841, 1.46785, 1.47559, 1.47509, 1.46854, 1.47345, 1.47159, 1.46793, 1.47819, 1.48813, 1.4716, 1.47495, 1.46872, 1.47829, 1.47064, 1.47018, 1.47559, 1.47576, 1.47037, 1.47433, 1.47533, 1.47013, 1.47921, 1.47494, 1.4767, 1.47607, 1.47345, 1.47128, 1.47431, 1.46759, 1.46948, 1.46669, 1.47222, 1.46674, 1.47388, 1.47388, 1.46524, 1.47407, 1.47207, 1.46963, 1.47611, 1.47057, 1.47046, 1.47507, 1.4718, 1.47093, 1.46875, 1.47966, 1.47691, 1.47958, 1.46848, 1.47659, 1.47233, 1.46829, 1.47134, 1.47162, 1.47084, 1.46812, 1.46169, 1.47005, 1.47196, 1.47131, 1.4779, 1.47053, 1.46873, 1.47177, 1.47562, 1.47441, 1.47279, 1.4738, 1.47473, 1.47647, 1.4711, 1.47612, 1.47591, 1.48126, 1.47512, 1.47351, 1.47769, 1.46263, 1.47234, 1.47526, 1.47224, 1.47085, 1.46942, 1.46803, 1.4759, 1.47343, 1.46362, 1.4685, 1.47079, 1.47101, 1.47158, 1.47044, 1.46992, 1.46298, 1.47836, 1.46169, 1.46751, 1.47839, 1.47255, 1.47103, 1.47052, 1.46863, 1.4668, 1.4769, 1.47204, 1.4723, 1.47157, 1.4667, 1.47441, 1.48003, 1.47181, 1.48009, 1.48373, 1.47652, 1.4796, 1.47353, 1.47567, 1.47796, 1.47632, 1.48009, 1.4717, 1.47188, 1.48104, 1.47363, 1.47129, 1.47793, 1.47574, 1.47484, 1.47619, 1.47177, 1.47614, 1.47933, 1.47156, 1.46844, 1.4802, 1.47829, 1.47093, 1.4754, 1.47276, 1.57859, 1.4684, 1.47537, 1.54583, 1.47639, 1.57948, 
1.47918, 1.48066, 1.48212, 1.4774, 1.47852, 1.47639, 1.47826, 1.48039, 1.4739, 1.4819, 1.48028, 1.47407, 1.47624, 1.48205, 1.47628, 1.48393, 1.48589, 1.47517, 1.47758, 1.47729, 1.48745, 1.47685, 1.48033, 1.47602, 1.47812, 1.48054, 1.47432, 1.47337, 1.47804, 1.47123, 1.47425, 1.47715, 1.47794, 1.47273, 1.47454, 1.47875, 1.4782, 1.47577, 1.47167, 1.47763, 1.4744, 1.47683, 1.48168, 1.47497, 1.47434, 1.4796, 1.4776, 1.47214, 1.47435, 1.47766, 1.4835, 1.48072, 1.4744, 1.48392, 1.47533, 1.47683, 1.47742, 1.48516, 1.47634, 1.478, 1.47244, 1.48265, 1.47422, 1.48296, 1.48311, 1.47628, 1.47751, 1.48129, 1.47507, 1.48075, 1.47775, 1.47657, 1.48203, 1.48345, 1.48818, 1.48194, 1.48374, 1.482, 1.48749, 1.48551, 1.48527, 1.4871, 1.49114, 1.48723, 1.47874, 1.47877, 1.48314, 1.47745, 1.47138, 1.4823, 1.4909, 1.48278, 1.48582, 1.48063, 1.47195, 1.47501, 1.47117, 1.47685, 1.47555, 1.47306, 1.54386, 1.47358, 1.57973, 1.47563, 1.47575, 1.56224, 1.47774, 1.4817, 1.48012, 1.48778, 1.47737, 1.47738, 1.48069, 1.47712, 1.47909, 1.47385, 1.47532, 1.47459, 1.47167, 1.47808, 1.48123, 1.47993, 1.46614, 1.46983, 1.47318, 1.47539, 1.47425, 1.47523, 1.47895, 1.47481, 1.4698, 1.46941, 1.47466, 1.47011, 1.46611, 1.47663, 1.47626, 1.4741, 1.47847, 1.46407, 1.47268, 1.47738, 1.46488, 1.48113, 1.47284, 1.46934, 1.47784, 1.4777]}, "lm loss validation": {"start_step": 0, "end_step": 4, "step_interval": 5, "values": [5.6001]}, "lm loss validation vs samples": {"start_step": 0, "end_step": 4, "step_interval": 5, "values": [5.6001]}, "lm loss validation ppl": {"start_step": 0, "end_step": 4, "step_interval": 5, "values": [270.45398]}, "lm loss validation ppl vs samples": {"start_step": 0, "end_step": 4, "step_interval": 5, "values": [270.45398]}} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_fp8_tp_pp/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_fp8_tp_pp/model_config.yaml new file mode 100644 index 0000000000..743064e121 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_fp8_tp_pp/model_config.yaml @@ -0,0 +1,52 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NVTE_FUSED_ATTN: 0 + NCCL_ALGO: Tree + CUBLAS_WORKSPACE_CONFIG: :4096:8 +MODEL_ARGS: + --num-layers: 12 + --hidden-size: 512 + --num-attention-heads: 8 + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --tensorboard-dir: ${TENSORBOARD_PATH} + --micro-batch-size: 2 + --global-batch-size: 128 + --seq-length: 1024 + --max-position-embeddings: 1024 + --train-iters: 2000 + --timing-log-level: 2 + --lr-decay-iters: 320000 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --vocab-file: ${DATA_PATH}/bpe/vocab.json + --merge-file: ${DATA_PATH}/bpe/merges.txt + --split: 949,50,1 + --distributed-backend: nccl + --lr: 0.00015 + --lr-decay-style: cosine + --min-lr: 1.0e-5 + --weight-decay: 1e-2 + --clip-grad: 1.0 + --lr-warmup-fraction: .01 + --log-interval: 1 + --save-interval: 10000 + --eval-interval: 1000 + --eval-iters: 10 + --transformer-impl: transformer_engine + --tensor-model-parallel-size: 2 + --pipeline-model-parallel-size: 2 + --deterministic-mode: true + --no-gradient-accumulation-fusion: true + --fp8-format: hybrid + --fp8-amax-history-len: 1024 + --fp8-amax-compute-algo: max + 
--attention-softmax-in-fp32: true + --ckpt-format: torch_dist + --data-cache-path: ${DATA_CACHE_PATH} + --bf16: true +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_fp8_tp_pp_sp/golden_values_dev.json b/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_fp8_tp_pp_sp/golden_values_dev.json new file mode 100644 index 0000000000..0af59da700 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_fp8_tp_pp_sp/golden_values_dev.json @@ -0,0 +1 @@ +{"forward-backward-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [21.65799, 1.57316, 1.56036, 1.56197, 1.56002, 1.57036, 1.57498, 1.57179, 1.57223, 1.56447, 1.57065, 1.57253, 1.56833, 1.57388, 1.58074, 1.57741, 1.58388, 1.58795, 1.5903, 1.58075, 1.57656, 1.58312, 1.57306, 1.57348, 1.58999, 1.57118, 1.56942, 1.57642, 1.58455, 1.57798, 1.57753, 1.5848, 1.57952, 1.57466, 1.5634, 1.5759, 1.57055, 1.56518, 1.64863, 1.56915, 1.57234, 1.57176, 1.59307, 1.58513, 1.59397, 1.59455, 1.58862, 1.58627, 1.57781, 1.5836, 1.59175, 1.58787, 1.58531, 1.56743, 1.56768, 1.57061, 1.57416, 1.56759, 1.5696, 1.57589, 1.57313, 1.571, 1.58684, 1.58081, 1.58172, 1.57572, 1.58332, 1.58369, 1.5742, 1.58521, 1.57857, 1.57985, 1.59598, 1.58564, 1.58954, 1.58921, 1.58516, 1.58693, 1.58278, 1.58855, 1.58036, 1.58425, 1.57404, 1.56846, 1.57061, 1.57471, 1.57444, 1.57552, 1.58566, 1.59602, 1.57809, 1.59795, 1.58523, 1.58552, 1.58948, 1.5857, 1.58918, 1.58406, 1.58274, 1.58292, 1.5878, 1.57929, 1.57852, 1.57229, 1.58645, 1.58337, 1.57647, 1.56993, 1.57461, 1.57583, 1.57981, 1.58228, 1.58026, 1.58041, 1.57147, 1.57774, 1.57198, 1.56711, 1.56216, 1.57948, 1.57013, 1.5652, 1.57538, 1.59385, 1.58672, 1.57603, 1.57508, 1.58044, 1.56643, 1.57319, 1.56412, 1.56703, 1.57342, 1.57169, 1.58538, 1.57905, 1.57735, 1.5713, 1.56908, 1.56945, 1.57129, 1.5672, 1.57775, 1.58937, 1.59019, 1.5751, 1.58049, 1.58855, 1.58446, 1.59003, 1.58787, 1.58871, 1.59524, 1.59317, 1.59223, 1.59165, 1.58901, 1.59193, 1.5866, 1.59184, 1.59323, 1.59575, 1.58596, 1.59591, 1.58463, 1.58779, 1.59392, 1.59398, 1.59893, 1.5974, 1.59446, 1.58691, 1.58241, 1.58352, 1.59639, 1.58013, 1.59181, 1.58597, 1.58425, 1.58787, 1.58445, 1.58197, 1.58869, 1.5852, 1.58751, 1.5889, 1.58458, 1.57701, 1.58666, 1.584, 1.57776, 1.58858, 1.58222, 1.58721, 1.60018, 1.59115, 1.59271, 1.58842, 1.59023, 1.58933, 1.57882, 1.59135, 1.5868, 1.57554, 1.58258, 1.58243, 1.58389, 1.58426, 1.5849, 1.58819, 1.58199, 1.58031, 1.58504, 1.58277, 1.5863, 1.57949, 1.58628, 1.58781, 1.58443, 1.57924, 1.58531, 1.59139, 1.58724, 1.58582, 1.59165, 1.58221, 1.58782, 1.59196, 1.58549, 1.58279, 1.59669, 1.58729, 1.58776, 1.58434, 1.58643, 1.57486, 1.58484, 1.57875, 1.58178, 1.58296, 1.57564, 1.57269, 1.73935, 1.63419, 1.58507, 1.59194, 1.5809, 1.60067, 1.59666, 1.59408, 1.59512, 1.68832, 1.59093, 1.57923, 1.58167, 1.5802, 1.58149, 1.59105, 1.58674, 1.59021, 1.59488, 1.60007, 1.59231, 1.59296, 1.59159, 1.588, 1.58471, 1.58515, 1.58686, 1.58415, 1.58593, 1.58185, 1.58805, 1.59063, 1.58623, 1.58868, 1.5863, 1.58712, 1.58387, 1.58919, 1.58738, 1.58618, 1.58901, 1.58673, 1.5896, 1.59327, 1.58995, 1.59034, 1.59043, 1.58508, 1.58835, 1.59575, 1.59028, 1.58788, 1.59495, 1.59031, 1.58998, 1.58896, 1.59037, 1.58923, 1.59259, 1.59082, 1.59843, 1.59394, 1.59716, 1.58592, 1.58443, 1.59841, 1.58588, 1.59009, 1.58471, 1.58793, 1.59585, 1.58806, 1.59097, 1.59974, 1.58594, 1.59971, 1.5913, 1.5727, 1.57474, 1.58074, 
1.57644, 1.58641, 1.58808, 1.58075, 1.5907, 1.58838, 1.58642, 1.58856, 1.58469, 1.58982, 1.59264, 1.59172, 1.58848, 1.59119, 1.59145, 1.58124, 1.60003, 1.58841, 1.59199, 1.58955, 1.59024, 1.58713, 1.58159, 1.58812, 1.58697, 1.59477, 1.58735, 1.68808, 1.60409, 1.59368, 1.68921, 1.59656, 1.59503, 1.59737, 1.5981, 1.6072, 1.60584, 1.60205, 1.60339, 1.59005, 1.59398, 1.59059, 1.5983, 1.59588, 1.58451, 1.59372, 1.59209, 1.58828, 1.59305, 1.59272, 1.59217, 1.59417, 1.59371, 1.60293, 1.6081, 1.59666, 1.59861, 1.59979, 1.59362, 1.60255, 1.60302, 1.60884, 1.60587, 1.5947, 1.59209, 1.60211, 1.60023, 1.60283, 1.60565, 1.6008, 1.5957, 1.60008, 1.59899, 1.59865, 1.59781, 1.59196, 1.59478, 1.59227]}, "forward-compute-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [6.22042, 0.7887, 0.79083, 0.78962, 0.78756, 0.78885, 0.8016, 0.80118, 0.79635, 0.79549, 0.79171, 0.803, 0.8016, 0.79277, 0.79347, 0.80205, 0.80724, 0.8102, 0.80595, 0.79227, 0.78683, 0.79736, 0.79666, 0.79876, 0.80245, 0.79592, 0.79874, 0.79753, 0.81164, 0.79672, 0.79701, 0.80746, 0.80543, 0.79696, 0.79511, 0.79932, 0.79557, 0.79429, 0.84751, 0.79126, 0.79445, 0.79427, 0.81209, 0.80591, 0.79877, 0.8166, 0.8125, 0.80956, 0.80732, 0.79604, 0.80371, 0.80021, 0.79673, 0.78625, 0.79742, 0.79855, 0.79833, 0.79792, 0.79392, 0.79627, 0.78993, 0.80003, 0.78776, 0.80568, 0.77968, 0.7912, 0.79925, 0.79922, 0.79071, 0.79884, 0.78877, 0.79858, 0.81252, 0.8067, 0.79219, 0.81833, 0.81779, 0.80094, 0.80137, 0.81945, 0.80719, 0.79232, 0.79516, 0.80871, 0.80104, 0.79685, 0.80162, 0.80637, 0.80248, 0.80857, 0.81037, 0.80869, 0.7965, 0.80743, 0.8098, 0.80128, 0.80589, 0.80206, 0.80032, 0.80015, 0.79522, 0.79329, 0.80165, 0.80384, 0.80062, 0.79949, 0.80381, 0.78559, 0.80393, 0.80321, 0.80107, 0.79216, 0.79542, 0.79246, 0.80303, 0.8106, 0.79065, 0.79761, 0.79846, 0.80131, 0.80281, 0.79732, 0.7963, 0.81465, 0.81139, 0.79778, 0.80117, 0.79101, 0.78623, 0.79644, 0.7976, 0.79653, 0.79953, 0.79765, 0.80015, 0.81095, 0.80579, 0.7998, 0.7917, 0.79794, 0.79775, 0.79275, 0.80199, 0.81948, 0.81204, 0.79625, 0.79973, 0.79652, 0.80445, 0.80534, 0.80518, 0.79884, 0.81423, 0.80952, 0.81247, 0.80766, 0.80443, 0.81182, 0.80591, 0.81339, 0.80677, 0.79581, 0.79801, 0.81209, 0.7963, 0.79413, 0.8031, 0.80814, 0.80927, 0.81215, 0.81255, 0.79604, 0.80852, 0.80814, 0.81295, 0.80402, 0.81318, 0.8097, 0.80155, 0.81294, 0.81295, 0.80384, 0.81085, 0.80809, 0.81049, 0.81462, 0.81121, 0.80114, 0.81317, 0.8073, 0.80801, 0.81335, 0.81351, 0.81644, 0.8235, 0.8092, 0.81494, 0.80197, 0.80738, 0.80524, 0.80729, 0.81006, 0.81098, 0.8058, 0.81736, 0.81018, 0.81686, 0.81077, 0.81584, 0.81737, 0.81149, 0.81076, 0.81213, 0.8138, 0.81013, 0.80497, 0.82135, 0.81652, 0.81154, 0.81448, 0.81949, 0.81162, 0.81162, 0.80853, 0.81191, 0.81703, 0.8125, 0.80932, 0.80851, 0.79798, 0.81183, 0.80938, 0.80838, 0.81083, 0.81336, 0.81205, 0.81618, 0.80587, 0.81362, 0.81042, 0.80604, 0.80513, 0.95515, 0.83951, 0.81274, 0.80912, 0.80158, 0.81243, 0.81495, 0.81427, 0.81731, 0.90437, 0.812, 0.81127, 0.80335, 0.80701, 0.81174, 0.81789, 0.8062, 0.81818, 0.81364, 0.82457, 0.81861, 0.81831, 0.81451, 0.81624, 0.819, 0.81664, 0.81149, 0.81897, 0.82098, 0.80639, 0.82356, 0.81998, 0.82291, 0.8172, 0.81813, 0.82015, 0.82009, 0.8243, 0.82188, 0.82103, 0.81895, 0.8227, 0.81898, 0.81687, 0.82231, 0.82276, 0.82281, 0.81752, 0.81589, 0.81308, 0.81283, 0.8171, 0.82039, 0.81907, 0.81497, 0.81934, 0.81714, 0.8101, 0.8135, 0.81914, 0.82468, 0.81829, 0.82195, 0.81334, 0.81505, 0.83, 0.82284, 0.82566, 0.82499, 0.82531, 
0.81828, 0.81665, 0.82509, 0.82012, 0.82215, 0.82179, 0.81542, 0.80285, 0.81044, 0.80469, 0.8102, 0.8158, 0.81485, 0.82051, 0.80883, 0.82724, 0.81536, 0.8108, 0.81338, 0.81843, 0.81932, 0.81808, 0.81079, 0.81136, 0.82409, 0.81369, 0.81194, 0.81256, 0.81683, 0.81111, 0.8172, 0.80945, 0.80932, 0.8134, 0.81086, 0.81202, 0.81131, 0.86018, 0.81312, 0.81026, 0.91292, 0.81781, 0.81732, 0.82904, 0.82523, 0.83411, 0.83407, 0.83166, 0.82856, 0.81239, 0.81494, 0.82555, 0.83157, 0.82113, 0.80701, 0.81497, 0.8215, 0.80867, 0.81134, 0.82362, 0.81971, 0.808, 0.80408, 0.81663, 0.82201, 0.81271, 0.82346, 0.82415, 0.81743, 0.8063, 0.80216, 0.80964, 0.8105, 0.8118, 0.81122, 0.81369, 0.81864, 0.82566, 0.81149, 0.80986, 0.81981, 0.81964, 0.82004, 0.80608, 0.81446, 0.81929, 0.8075, 0.80881]}, "backward-compute-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [4.62942, 0.75097, 0.74, 0.74537, 0.74999, 0.75094, 0.74822, 0.74322, 0.74143, 0.74188, 0.75087, 0.75511, 0.75059, 0.75125, 0.75555, 0.7505, 0.76577, 0.75929, 0.75813, 0.75798, 0.75777, 0.75449, 0.75219, 0.76004, 0.76606, 0.74726, 0.75154, 0.75719, 0.75304, 0.75913, 0.75194, 0.76105, 0.75155, 0.75361, 0.75194, 0.74863, 0.75344, 0.75699, 0.76125, 0.76168, 0.75845, 0.75545, 0.76173, 0.76702, 0.76538, 0.76769, 0.75666, 0.75657, 0.75518, 0.75767, 0.75791, 0.75998, 0.76253, 0.75636, 0.75269, 0.75165, 0.75005, 0.74953, 0.7487, 0.76173, 0.75616, 0.75523, 0.77089, 0.75678, 0.76, 0.7504, 0.7563, 0.75155, 0.75497, 0.74943, 0.75435, 0.75485, 0.76133, 0.75829, 0.75424, 0.74885, 0.75032, 0.76341, 0.76306, 0.75225, 0.74967, 0.75803, 0.74607, 0.74997, 0.75189, 0.75522, 0.75126, 0.75345, 0.75402, 0.76221, 0.75573, 0.75879, 0.7447, 0.75592, 0.75875, 0.76088, 0.76149, 0.75471, 0.75716, 0.7483, 0.75544, 0.7486, 0.75419, 0.75681, 0.75858, 0.76287, 0.75413, 0.75433, 0.75404, 0.75102, 0.75167, 0.75697, 0.75394, 0.75963, 0.75308, 0.75609, 0.74811, 0.74816, 0.74646, 0.74523, 0.74868, 0.74707, 0.74934, 0.7508, 0.76531, 0.76133, 0.75869, 0.75454, 0.74851, 0.74933, 0.74654, 0.74315, 0.74234, 0.74764, 0.75289, 0.7578, 0.75618, 0.75315, 0.75232, 0.75728, 0.75011, 0.75412, 0.75242, 0.74889, 0.75119, 0.75527, 0.75085, 0.7583, 0.76477, 0.75215, 0.75071, 0.76072, 0.75986, 0.76825, 0.75337, 0.75661, 0.75384, 0.76056, 0.76054, 0.76494, 0.7674, 0.76549, 0.75611, 0.76183, 0.75053, 0.75482, 0.75715, 0.76983, 0.77042, 0.76028, 0.77021, 0.75151, 0.75914, 0.75118, 0.76133, 0.75325, 0.76558, 0.75951, 0.76119, 0.75926, 0.75073, 0.75384, 0.75883, 0.7634, 0.76168, 0.76652, 0.75731, 0.75344, 0.76068, 0.75369, 0.75137, 0.75963, 0.7697, 0.751, 0.77098, 0.75284, 0.75939, 0.75995, 0.75928, 0.75802, 0.75677, 0.76065, 0.75638, 0.75119, 0.76038, 0.75423, 0.75553, 0.75918, 0.75995, 0.75408, 0.76136, 0.74612, 0.75854, 0.75865, 0.7593, 0.75419, 0.75151, 0.75761, 0.76577, 0.75463, 0.74788, 0.75358, 0.76279, 0.76172, 0.76321, 0.75292, 0.75124, 0.75794, 0.76269, 0.76049, 0.75669, 0.7573, 0.75738, 0.75375, 0.76126, 0.75621, 0.75055, 0.75297, 0.75603, 0.75099, 0.75101, 0.74554, 0.83246, 0.7545, 0.75293, 0.75203, 0.75391, 0.7554, 0.75839, 0.75728, 0.76242, 0.75203, 0.75857, 0.7516, 0.75317, 0.75327, 0.75445, 0.7579, 0.753, 0.753, 0.75219, 0.75665, 0.75118, 0.75048, 0.74602, 0.74682, 0.75041, 0.74864, 0.75542, 0.74976, 0.74748, 0.75186, 0.75401, 0.75027, 0.74959, 0.75363, 0.74766, 0.75374, 0.751, 0.75381, 0.75069, 0.74504, 0.75077, 0.75083, 0.75402, 0.74825, 0.75092, 0.75145, 0.75314, 0.75502, 0.74951, 0.7579, 0.75347, 0.7511, 0.75538, 0.75696, 0.7579, 0.75511, 0.75693, 0.75306, 0.74836, 0.7533, 
0.75717, 0.76271, 0.75482, 0.75341, 0.74896, 0.75096, 0.74632, 0.75083, 0.74516, 0.74075, 0.75065, 0.75718, 0.75375, 0.7557, 0.7462, 0.75504, 0.75655, 0.74982, 0.75081, 0.74949, 0.74808, 0.75239, 0.75544, 0.74273, 0.75537, 0.75449, 0.75109, 0.7469, 0.7528, 0.75193, 0.75171, 0.75366, 0.75959, 0.74847, 0.75215, 0.75052, 0.76098, 0.75632, 0.75747, 0.74845, 0.74437, 0.75406, 0.75357, 0.75105, 0.75484, 0.75765, 0.75917, 0.7582, 0.75622, 0.75762, 0.74952, 0.75592, 0.75778, 0.74829, 0.75888, 0.75085, 0.75064, 0.74667, 0.751, 0.75208, 0.75768, 0.74883, 0.75857, 0.7487, 0.75962, 0.76274, 0.75413, 0.75644, 0.75008, 0.75022, 0.75465, 0.76027, 0.75685, 0.7526, 0.7567, 0.75515, 0.75552, 0.75496, 0.75875, 0.76104, 0.77511, 0.77406, 0.768, 0.7781, 0.77247, 0.78055, 0.77825, 0.76677, 0.78188, 0.77415, 0.77114, 0.77225, 0.77049, 0.77717, 0.77115, 0.76807, 0.77259, 0.77472]}, "batch-generator-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [2.20334, 0.0143, 0.01667, 0.01326, 0.01295, 0.01293, 0.01334, 0.01436, 0.01318, 0.01437, 0.01301, 0.01378, 0.01472, 0.01468, 0.01314, 0.01281, 0.01302, 0.01378, 0.01285, 0.01444, 0.01432, 0.01486, 0.01305, 0.01348, 0.01674, 0.01301, 0.01444, 0.01426, 0.01437, 0.01321, 0.01305, 0.01316, 0.01395, 0.01333, 0.01301, 0.01363, 0.01284, 0.01423, 0.01642, 0.01753, 0.01691, 0.01476, 0.01495, 0.01652, 0.01707, 0.02019, 0.01642, 0.01534, 0.01555, 0.01455, 0.01613, 0.01682, 0.01611, 0.01302, 0.01316, 0.01386, 0.0152, 0.01835, 0.01342, 0.01579, 0.01295, 0.01372, 0.01717, 0.0153, 0.01567, 0.01348, 0.01623, 0.0153, 0.01466, 0.01622, 0.01222, 0.01602, 0.02111, 0.01556, 0.01731, 0.01708, 0.01773, 0.0175, 0.01682, 0.0175, 0.01625, 0.0172, 0.01748, 0.02121, 0.01676, 0.01653, 0.01683, 0.01767, 0.01788, 0.01764, 0.01715, 0.02209, 0.01681, 0.01797, 0.01754, 0.01797, 0.01781, 0.01828, 0.0179, 0.01691, 0.01823, 0.0176, 0.01724, 0.0166, 0.01718, 0.01732, 0.0149, 0.01363, 0.01477, 0.01454, 0.01309, 0.01297, 0.01408, 0.0145, 0.01297, 0.01965, 0.01506, 0.01303, 0.01404, 0.01373, 0.01435, 0.01442, 0.01449, 0.01568, 0.01599, 0.01299, 0.01288, 0.01478, 0.01302, 0.01354, 0.01604, 0.01518, 0.01493, 0.01391, 0.01308, 0.01275, 0.01267, 0.01483, 0.0133, 0.01279, 0.01339, 0.01261, 0.01553, 0.01269, 0.0125, 0.01256, 0.01329, 0.0129, 0.01284, 0.01681, 0.01599, 0.01537, 0.0153, 0.01362, 0.01518, 0.01566, 0.01486, 0.01485, 0.01522, 0.01745, 0.01558, 0.01496, 0.01484, 0.01693, 0.01487, 0.01546, 0.02093, 0.01683, 0.01724, 0.01738, 0.01648, 0.01861, 0.01776, 0.01745, 0.01724, 0.01583, 0.02118, 0.01682, 0.01836, 0.02112, 0.01766, 0.0169, 0.01696, 0.01695, 0.01754, 0.01652, 0.0184, 0.0173, 0.01627, 0.01667, 0.01742, 0.01775, 0.01745, 0.01643, 0.01709, 0.01696, 0.01761, 0.01648, 0.01725, 0.01672, 0.21908, 0.01675, 0.01611, 0.01752, 0.01616, 0.01728, 0.01777, 0.0171, 0.01749, 0.01847, 0.01858, 0.01789, 0.01723, 0.01628, 0.01773, 0.01691, 0.01878, 0.01787, 0.0209, 0.01796, 0.01741, 0.01777, 0.01829, 0.01892, 0.01729, 0.01774, 0.01727, 0.02061, 0.01571, 0.01771, 0.01838, 0.01772, 0.0174, 0.01766, 0.01725, 0.01763, 0.01752, 0.01709, 0.01817, 0.02143, 0.0161, 0.01751, 0.09405, 0.06723, 0.01758, 0.01661, 0.02181, 0.02167, 0.01822, 0.01785, 0.01747, 0.01708, 0.01826, 0.01765, 0.01811, 0.01727, 0.01812, 0.01807, 0.01812, 0.01919, 0.01774, 0.01749, 0.01737, 0.01751, 0.01714, 0.02283, 0.01759, 0.01975, 0.02057, 0.01799, 0.01752, 0.01739, 0.01757, 0.01773, 0.01789, 0.01729, 0.01642, 0.01712, 0.0176, 0.01717, 0.01691, 0.01727, 0.01589, 0.01789, 0.0174, 0.0174, 0.01722, 0.01761, 0.01802, 0.0174, 0.02069, 
0.0171, 0.01719, 0.01766, 0.01768, 0.01677, 0.01705, 0.01777, 0.01669, 0.02073, 0.01723, 0.01707, 0.01707, 0.01723, 0.01751, 0.01953, 0.0174, 0.0167, 0.01749, 0.01753, 0.01974, 0.01695, 0.01888, 0.01805, 0.01809, 0.01779, 0.0192, 0.01732, 0.01965, 0.01793, 0.01875, 0.01855, 0.01915, 0.01839, 0.01868, 0.01864, 0.01893, 0.01823, 0.01908, 0.01892, 0.01884, 0.01914, 0.02012, 0.01861, 0.02283, 0.01928, 0.01945, 0.01841, 0.01795, 0.01816, 0.0187, 0.01867, 0.01891, 0.02308, 0.0188, 0.01869, 0.01974, 0.02014, 0.02234, 0.0193, 0.01762, 0.01819, 0.0184, 0.01952, 0.01974, 0.01869, 0.0205, 0.018, 0.0183, 0.01719, 0.01915, 0.01879, 0.0194, 0.01781, 0.01856, 0.01773, 0.01734, 0.01914, 0.0169, 0.019, 0.01792, 0.01743, 0.02488, 0.01724, 0.01703, 0.01755, 0.01784, 0.01774, 0.01824, 0.01859, 0.02236, 0.01639, 0.0181, 0.01772, 0.01786, 0.01787, 0.01629, 0.01663, 0.01687, 0.01734, 0.01643, 0.0175, 0.0166, 0.01686, 0.0162, 0.01662, 0.02025, 0.01762, 0.01683, 0.01837]}, "forward-recv-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [5.65416, 0.02537, 0.02635, 0.02461, 0.02504, 0.02484, 0.02542, 0.02517, 0.02613, 0.02496, 0.02499, 0.02526, 0.02517, 0.02669, 0.02527, 0.02523, 0.02555, 0.02514, 0.02531, 0.02544, 0.02502, 0.02866, 0.02534, 0.02519, 0.02546, 0.02642, 0.02449, 0.02505, 0.02448, 0.02468, 0.02481, 0.02534, 0.02569, 0.02662, 0.02525, 0.02575, 0.02553, 0.02468, 0.02518, 0.02486, 0.02617, 0.0262, 0.02498, 0.02481, 0.02556, 0.02544, 0.02525, 0.02507, 0.02521, 0.02526, 0.02607, 0.02518, 0.02513, 0.02559, 0.02488, 0.02586, 0.02585, 0.02611, 0.02926, 0.02566, 0.02649, 0.02556, 0.02541, 0.02684, 0.0255, 0.02555, 0.0255, 0.0255, 0.02545, 0.02694, 0.02533, 0.02962, 0.02527, 0.02528, 0.02579, 0.02515, 0.02509, 0.02553, 0.02514, 0.02532, 0.02535, 0.02565, 0.02505, 0.02564, 0.02529, 0.02581, 0.02662, 0.02629, 0.02709, 0.02508, 0.0255, 0.02567, 0.02579, 0.0251, 0.02471, 0.02553, 0.02567, 0.02524, 0.02526, 0.02542, 0.02549, 0.02485, 0.0254, 0.02557, 0.02563, 0.02532, 0.02527, 0.02538, 0.02679, 0.02564, 0.02917, 0.02565, 0.02736, 0.02515, 0.02504, 0.02493, 0.02534, 0.0255, 0.02468, 0.02576, 0.02535, 0.02502, 0.02542, 0.02937, 0.02618, 0.02564, 0.02552, 0.02493, 0.02464, 0.02534, 0.02541, 0.02506, 0.02906, 0.02585, 0.02551, 0.02458, 0.02524, 0.0254, 0.02487, 0.02705, 0.02476, 0.02422, 0.02846, 0.02862, 0.02919, 0.02491, 0.02528, 0.0255, 0.02536, 0.02481, 0.02663, 0.02537, 0.02529, 0.02555, 0.02495, 0.02532, 0.02892, 0.02477, 0.02508, 0.0255, 0.02505, 0.0255, 0.02603, 0.02601, 0.02543, 0.0257, 0.02514, 0.02658, 0.02696, 0.02519, 0.02558, 0.02777, 0.027, 0.02528, 0.02566, 0.02491, 0.02592, 0.02533, 0.02595, 0.0256, 0.02521, 0.02524, 0.02528, 0.02552, 0.02639, 0.02554, 0.02548, 0.02553, 0.02553, 0.02546, 0.02481, 0.02518, 0.02516, 0.02541, 0.02568, 0.02495, 0.02523, 0.02848, 0.02556, 0.02499, 0.022, 0.02884, 0.02809, 0.02537, 0.02485, 0.02541, 0.0241, 0.02529, 0.02531, 0.02522, 0.02532, 0.02491, 0.02523, 0.02501, 0.02691, 0.02738, 0.02935, 0.02585, 0.02542, 0.02516, 0.02571, 0.03013, 0.02563, 0.02483, 0.0253, 0.02509, 0.02525, 0.0255, 0.02513, 0.02517, 0.02489, 0.02524, 0.02485, 0.02507, 0.02536, 0.02583, 0.02534, 0.02509, 0.0251, 0.02531, 0.02518, 0.02475, 0.02917, 0.02567, 0.02587, 0.02568, 0.02609, 0.02628, 0.02622, 0.02564, 0.02497, 0.02578, 0.02549, 0.02526, 0.02494, 0.02571, 0.02582, 0.02631, 0.02647, 0.02581, 0.02643, 0.02664, 0.0263, 0.02556, 0.025, 0.02535, 0.02517, 0.02527, 0.0252, 0.02486, 0.02861, 0.02534, 0.02604, 0.02568, 0.02564, 0.02728, 0.02552, 0.02578, 0.02551, 0.02575, 0.02545, 
0.02536, 0.02514, 0.02619, 0.02548, 0.02549, 0.02561, 0.02555, 0.02574, 0.02616, 0.02572, 0.02599, 0.02561, 0.02503, 0.02535, 0.02684, 0.02548, 0.02545, 0.02557, 0.02504, 0.02542, 0.0261, 0.02567, 0.02546, 0.0255, 0.02529, 0.02633, 0.03021, 0.0287, 0.0293, 0.0291, 0.03051, 0.03077, 0.02941, 0.03025, 0.02889, 0.02504, 0.02563, 0.02509, 0.02514, 0.02874, 0.02525, 0.02524, 0.02529, 0.02567, 0.02595, 0.02539, 0.02551, 0.02571, 0.02607, 0.02531, 0.02862, 0.02572, 0.02526, 0.02664, 0.02609, 0.02882, 0.02605, 0.02621, 0.02593, 0.02588, 0.02619, 0.02534, 0.02604, 0.02557, 0.02616, 0.02561, 0.02542, 0.02469, 0.02539, 0.02533, 0.02624, 0.02525, 0.02545, 0.02533, 0.02553, 0.02573, 0.02577, 0.0253, 0.02529, 0.02629, 0.02636, 0.02548, 0.02577, 0.0255, 0.02611, 0.02473, 0.02582, 0.02551, 0.02567, 0.0253, 0.02519, 0.0256, 0.02642, 0.02489, 0.02549, 0.02566, 0.0257, 0.02523, 0.02566, 0.02708, 0.02568, 0.025, 0.02826, 0.02772, 0.02446, 0.02415, 0.0242, 0.02452, 0.02402, 0.02491, 0.02511, 0.02443, 0.0247, 0.02457, 0.02433, 0.02427, 0.02485, 0.02473, 0.02411]}, "forward-send-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [2.82565, 0.00013, 0.00013, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00019, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00013, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00013, 0.00013, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00013, 0.00013, 0.00012, 0.00012, 0.00013, 0.00012, 0.00013, 0.00012, 0.00012, 0.00013, 0.00012, 0.00013, 0.00013, 0.00013, 0.00013, 0.00012, 0.00013, 0.00013, 0.00012, 0.00013, 0.00013, 0.00012, 0.00013, 0.00012, 0.00013, 0.00013, 0.00012, 0.00013, 0.00013, 0.00012, 0.00012, 0.00014, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00013, 0.00013, 0.00012, 0.00013, 0.00012, 0.00013, 0.00013, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00013, 0.00013, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00015, 0.00012, 0.00013, 0.00013, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00013, 0.00013, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00013, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00013, 0.00014, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00013, 0.00013, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00013, 0.00013, 0.00013, 0.00013, 0.00013, 0.00012, 0.00013, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00018, 0.00012, 0.00012, 0.00013, 0.00013, 0.00012, 0.00013, 0.00012, 0.00013, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00013, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00013, 0.00012, 0.00013, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00014, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00013, 0.00013, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00013, 0.00013, 0.00012, 
0.00012, 0.00012, 0.00012, 0.00014, 0.00013, 0.00012, 0.00013, 0.00013, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00013, 0.00013, 0.00012, 0.00013, 0.00013, 0.00013, 0.00013, 0.00013, 0.00014, 0.00013, 0.00013, 0.00013, 0.00012, 0.00012, 0.00013, 0.00012, 0.00013, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00014, 0.00014, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00013, 0.00013, 0.00013, 0.00013, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00013, 0.00012, 0.00013, 0.00012, 0.00012, 0.00013, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012]}, "backward-recv-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.02047, 0.0283, 0.02457, 0.02402, 0.02376, 0.02455, 0.02368, 0.02489, 0.03547, 0.02397, 0.02483, 0.02383, 0.02354, 0.02677, 0.02403, 0.02404, 0.02385, 0.02413, 0.02382, 0.02401, 0.02447, 0.02418, 0.02565, 0.02458, 0.02399, 0.02426, 0.02371, 0.02373, 0.02497, 0.02531, 0.02428, 0.02424, 0.02812, 0.02847, 0.02391, 0.0276, 0.02414, 0.02342, 0.02403, 0.0241, 0.02246, 0.0239, 0.02373, 0.02354, 0.024, 0.02551, 0.02523, 0.02434, 0.02333, 0.02695, 0.02802, 0.03335, 0.024, 0.02415, 0.02428, 0.0235, 0.02721, 0.02385, 0.02396, 0.02372, 0.02372, 0.02589, 0.02448, 0.02657, 0.02807, 0.02364, 0.02407, 0.02393, 0.02278, 0.02609, 0.02324, 0.02406, 0.02392, 0.02575, 0.02435, 0.02335, 0.02423, 0.02688, 0.02482, 0.02464, 0.0283, 0.02798, 0.02454, 0.02403, 0.02385, 0.02375, 0.024, 0.02436, 0.02658, 0.02418, 0.02444, 0.02438, 0.02772, 0.02445, 0.02469, 0.02482, 0.025, 0.0236, 0.02423, 0.02583, 0.02383, 0.02532, 0.02443, 0.02397, 0.02832, 0.02453, 0.02425, 0.02386, 0.02401, 0.02329, 0.02374, 0.02459, 0.02345, 0.02812, 0.02257, 0.02428, 0.03159, 0.02496, 0.02394, 0.02407, 0.02348, 0.02404, 0.0242, 0.02606, 0.02405, 0.02413, 0.02672, 0.02751, 0.02579, 0.02343, 0.02459, 0.02392, 0.02467, 0.02321, 0.02966, 0.02406, 0.02342, 0.02901, 0.02438, 0.02338, 0.02418, 0.02428, 0.02389, 0.02408, 0.02451, 0.02382, 0.02778, 0.02307, 0.02734, 0.02437, 0.02405, 0.02422, 0.02458, 0.02387, 0.02398, 0.02622, 0.0253, 0.02883, 0.02608, 0.02311, 0.02341, 0.0239, 0.02486, 0.02775, 0.02913, 0.02946, 0.03162, 0.03164, 0.03243, 0.02904, 0.03427, 0.02606, 0.02427, 0.02426, 0.02481, 0.02533, 0.02412, 0.02331, 0.02327, 0.02433, 0.02456, 0.02446, 0.02307, 0.02419, 0.02354, 0.02436, 0.02445, 0.02378, 0.02468, 0.02434, 0.02455, 0.02741, 0.02293, 0.02633, 0.02903, 0.02671, 0.02326, 0.0238, 0.02369, 0.02323, 0.02472, 0.02363, 0.02637, 0.02415, 0.0239, 0.02407, 0.02419, 0.0237, 0.02387, 0.02419, 0.02417, 0.02427, 0.02439, 0.02456, 0.02399, 0.02419, 0.0259, 0.02715, 0.02432, 0.02384, 0.02406, 0.02463, 0.02389, 0.02404, 0.02528, 0.02496, 0.0241, 0.02492, 0.02586, 0.02752, 0.02936, 0.02831, 0.02641, 0.02748, 0.02535, 0.0236, 0.02441, 0.02391, 0.02402, 0.02375, 0.02392, 0.02658, 0.02281, 0.02404, 0.02443, 0.02393, 0.02425, 0.02565, 0.02492, 0.02922, 0.02822, 0.02695, 0.02827, 0.02425, 
0.02791, 0.02429, 0.02507, 0.02421, 0.02448, 0.02504, 0.02444, 0.02428, 0.02484, 0.02431, 0.0247, 0.02476, 0.02429, 0.02826, 0.02806, 0.02466, 0.02444, 0.02446, 0.02398, 0.0246, 0.02694, 0.02743, 0.02754, 0.02821, 0.02752, 0.02768, 0.02846, 0.02827, 0.02821, 0.02757, 0.02781, 0.03032, 0.0282, 0.02767, 0.02766, 0.02791, 0.02891, 0.02728, 0.02724, 0.02826, 0.02818, 0.0275, 0.02704, 0.02768, 0.02881, 0.02841, 0.02812, 0.02758, 0.02852, 0.02732, 0.02863, 0.0247, 0.02488, 0.02405, 0.02493, 0.02485, 0.025, 0.02485, 0.0248, 0.02492, 0.02512, 0.02464, 0.02467, 0.02816, 0.02752, 0.02469, 0.02368, 0.02464, 0.02438, 0.02448, 0.02474, 0.0246, 0.0247, 0.02471, 0.02492, 0.02452, 0.02459, 0.02436, 0.02461, 0.02714, 0.02468, 0.02624, 0.02941, 0.02449, 0.02703, 0.02762, 0.0284, 0.02681, 0.02872, 0.02442, 0.02456, 0.02406, 0.02457, 0.02358, 0.02347, 0.02871, 0.03113, 0.02849, 0.02643, 0.02442, 0.02499, 0.02477, 0.02568, 0.02464, 0.02487, 0.02408, 0.0248, 0.0262, 0.02523, 0.02571, 0.02565, 0.02504, 0.02409, 0.02564, 0.02393, 0.02423, 0.02644, 0.0241, 0.02354, 0.02445, 0.02479, 0.02481, 0.02499, 0.02444, 0.02433, 0.02438, 0.02439, 0.02468, 0.02426, 0.02465, 0.02263, 0.02673, 0.0262, 0.02622, 0.02641, 0.0272, 0.02655, 0.02722, 0.02659, 0.02705, 0.02744, 0.02687, 0.02797, 0.02579, 0.0241, 0.02442]}, "backward-send-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.00019, 0.00019, 0.00016, 0.0002, 0.00018, 0.00018, 0.00016, 0.00018, 0.00022, 0.00017, 0.00018, 0.00017, 0.00018, 0.00016, 0.00017, 0.00017, 0.00018, 0.00017, 0.00016, 0.00016, 0.00019, 0.00019, 0.00016, 0.00018, 0.00016, 0.00016, 0.00018, 0.00016, 0.00017, 0.00018, 0.00016, 0.00019, 0.00018, 0.00016, 0.00019, 0.00016, 0.00016, 0.00018, 0.00018, 0.00018, 0.0002, 0.00018, 0.00018, 0.00017, 0.00016, 0.00018, 0.00017, 0.00017, 0.00018, 0.00021, 0.00019, 0.00018, 0.00019, 0.00018, 0.00016, 0.00017, 0.00018, 0.00018, 0.00018, 0.00016, 0.00018, 0.00018, 0.00016, 0.00018, 0.0002, 0.00018, 0.00018, 0.00017, 0.00018, 0.00016, 0.00016, 0.00019, 0.00021, 0.00017, 0.00016, 0.00016, 0.00018, 0.00018, 0.00019, 0.00018, 0.00019, 0.00019, 0.00019, 0.00016, 0.00018, 0.00021, 0.00017, 0.00018, 0.00017, 0.00018, 0.00016, 0.00016, 0.00018, 0.00017, 0.00016, 0.00018, 0.00036, 0.00016, 0.00022, 0.00016, 0.00016, 0.00019, 0.00019, 0.00018, 0.00017, 0.00016, 0.00018, 0.00016, 0.00018, 0.00018, 0.00016, 0.00017, 0.00018, 0.00019, 0.00018, 0.00018, 0.00032, 0.00018, 0.00018, 0.00016, 0.00021, 0.00018, 0.00017, 0.00018, 0.00018, 0.00018, 0.00016, 0.00018, 0.00017, 0.00018, 0.00018, 0.00018, 0.00021, 0.00016, 0.00019, 0.00019, 0.00018, 0.00017, 0.00017, 0.00018, 0.00018, 0.00018, 0.00018, 0.00019, 0.00018, 0.00016, 0.00018, 0.00019, 0.00018, 0.00021, 0.00016, 0.00017, 0.00016, 0.00016, 0.00017, 0.0002, 0.00018, 0.00017, 0.00018, 0.00017, 0.00018, 0.00019, 0.00018, 0.00018, 0.00018, 0.00019, 0.00027, 0.00031, 0.00017, 0.00017, 0.00016, 0.00016, 0.00019, 0.00018, 0.00016, 0.00018, 0.00018, 0.0002, 0.00018, 0.00018, 0.00018, 0.00018, 0.00018, 0.00016, 0.00018, 0.00018, 0.00019, 0.00018, 0.00019, 0.00018, 0.0002, 0.0002, 0.00018, 0.00017, 0.00019, 0.00018, 0.00018, 0.00018, 0.00016, 0.00019, 0.00016, 0.00018, 0.00018, 0.00018, 0.00016, 0.00018, 0.00019, 0.00018, 0.00018, 0.00016, 0.00018, 0.00018, 0.00018, 0.00017, 0.00017, 0.00019, 0.00019, 0.00018, 0.00019, 0.00018, 0.00016, 0.00018, 0.00018, 0.00016, 0.0002, 0.00016, 0.00018, 0.00016, 0.00017, 0.00018, 0.00018, 0.00019, 0.00018, 0.00016, 0.00018, 0.00019, 0.00018, 0.00016, 0.00016, 0.00018, 0.00016, 
0.00016, 0.00016, 0.00019, 0.00018, 0.00018, 0.00018, 0.00018, 0.00019, 0.00019, 0.00016, 0.00018, 0.00017, 0.00019, 0.00037, 0.00017, 0.00017, 0.00019, 0.00018, 0.00018, 0.00016, 0.00018, 0.00018, 0.00018, 0.00019, 0.0002, 0.00016, 0.00018, 0.00029, 0.00019, 0.0002, 0.00018, 0.00016, 0.00016, 0.00016, 0.00016, 0.00018, 0.00018, 0.00018, 0.00017, 0.00037, 0.00019, 0.00019, 0.00018, 0.00018, 0.00018, 0.0002, 0.00016, 0.00018, 0.00029, 0.00017, 0.00024, 0.00016, 0.00019, 0.00016, 0.00017, 0.00035, 0.00036, 0.00017, 0.00016, 0.0002, 0.00034, 0.0002, 0.00016, 0.00017, 0.0002, 0.00016, 0.00018, 0.00018, 0.00016, 0.00017, 0.00017, 0.00016, 0.00017, 0.00018, 0.00018, 0.00018, 0.00025, 0.00018, 0.00016, 0.00016, 0.00016, 0.00017, 0.00017, 0.00018, 0.00016, 0.00017, 0.00018, 0.00017, 0.00016, 0.00018, 0.00018, 0.00017, 0.00018, 0.00018, 0.00018, 0.00017, 0.00016, 0.00016, 0.00019, 0.00017, 0.00018, 0.00019, 0.00016, 0.00018, 0.00018, 0.00019, 0.00016, 0.00016, 0.00019, 0.00017, 0.00019, 0.00017, 0.00017, 0.00018, 0.00017, 0.00016, 0.00018, 0.00016, 0.00018, 0.00019, 0.00016, 0.00017, 0.00019, 0.00016, 0.00017, 0.00016, 0.00016, 0.00019, 0.00017, 0.00018, 0.00018, 0.00018, 0.00016, 0.00019, 0.00018, 0.00017, 0.00018, 0.00016, 0.00018, 0.0002, 0.00017, 0.00016, 0.00017, 0.00017, 0.00018, 0.00018, 0.00018, 0.00016, 0.00019, 0.00018, 0.00018, 0.00018, 0.00019, 0.00016, 0.00016, 0.00017, 0.00018, 0.00018, 0.00016]}, "forward-send-backward-recv-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [7.72045, 0.09004, 0.10467, 0.09849, 0.09238, 0.09943, 0.10332, 0.10911, 0.10563, 0.10498, 0.10272, 0.10382, 0.10192, 0.10289, 0.10891, 0.10722, 0.1057, 0.11565, 0.11445, 0.10746, 0.11354, 0.10514, 0.10376, 0.08937, 0.09262, 0.08764, 0.08288, 0.09035, 0.09702, 0.09008, 0.09616, 0.09645, 0.09564, 0.08936, 0.08325, 0.08878, 0.08887, 0.08097, 0.16157, 0.08262, 0.08896, 0.09145, 0.09803, 0.08184, 0.09702, 0.0971, 0.09683, 0.09764, 0.08935, 0.0971, 0.10578, 0.09846, 0.10251, 0.08742, 0.08778, 0.08971, 0.09353, 0.08897, 0.09, 0.08803, 0.08686, 0.08756, 0.09058, 0.08647, 0.08759, 0.09747, 0.10439, 0.10521, 0.09647, 0.10904, 0.09397, 0.09736, 0.10653, 0.0936, 0.10631, 0.1059, 0.10256, 0.09952, 0.09927, 0.10519, 0.10149, 0.09551, 0.10221, 0.10051, 0.09736, 0.09577, 0.0979, 0.09361, 0.09726, 0.10742, 0.0922, 0.10792, 0.10335, 0.10219, 0.1015, 0.09685, 0.09726, 0.10184, 0.09792, 0.10191, 0.1005, 0.10051, 0.09742, 0.09427, 0.09441, 0.08885, 0.09704, 0.09172, 0.09714, 0.09629, 0.10183, 0.09676, 0.09562, 0.09133, 0.09003, 0.10068, 0.09125, 0.0941, 0.09629, 0.10409, 0.09294, 0.09359, 0.10104, 0.10583, 0.09162, 0.08569, 0.08813, 0.093, 0.08756, 0.10008, 0.09688, 0.1054, 0.10747, 0.10112, 0.10023, 0.10296, 0.09747, 0.0945, 0.09503, 0.09075, 0.10094, 0.09821, 0.10359, 0.11126, 0.11094, 0.10686, 0.10472, 0.10387, 0.09679, 0.10627, 0.11005, 0.10858, 0.10916, 0.10819, 0.11254, 0.11227, 0.1067, 0.10979, 0.10635, 0.10862, 0.11093, 0.10588, 0.1078, 0.11054, 0.10333, 0.10314, 0.11111, 0.10133, 0.10064, 0.10338, 0.09919, 0.10252, 0.10368, 0.10692, 0.11169, 0.10373, 0.1082, 0.11025, 0.09905, 0.10905, 0.11343, 0.10499, 0.10807, 0.10315, 0.09841, 0.10583, 0.10804, 0.09746, 0.10771, 0.10609, 0.10625, 0.1058, 0.10401, 0.10832, 0.10595, 0.10705, 0.11742, 0.10139, 0.10969, 0.09952, 0.10696, 0.11066, 0.10165, 0.10114, 0.10538, 0.10594, 0.11402, 0.10492, 0.10645, 0.11173, 0.10848, 0.11309, 0.10714, 0.10786, 0.10722, 0.10193, 0.11309, 0.0997, 0.10535, 0.10927, 0.11186, 0.11523, 0.10176, 0.11174, 0.10738, 0.10339, 0.10818, 
0.10428, 0.10357, 0.102, 0.11031, 0.10504, 0.10603, 0.10464, 0.10777, 0.10003, 0.11154, 0.10215, 0.10884, 0.1135, 0.10294, 0.10521, 0.18146, 0.15513, 0.10795, 0.10192, 0.09492, 0.1123, 0.11068, 0.10753, 0.10062, 0.20176, 0.10053, 0.10546, 0.10178, 0.10047, 0.10162, 0.10317, 0.10396, 0.10664, 0.11601, 0.12091, 0.11596, 0.11321, 0.11757, 0.11585, 0.1102, 0.10582, 0.10902, 0.11204, 0.11498, 0.11048, 0.11561, 0.12266, 0.11204, 0.10563, 0.11232, 0.10806, 0.10523, 0.11245, 0.10857, 0.10998, 0.10637, 0.11004, 0.10832, 0.1137, 0.11249, 0.1137, 0.11325, 0.10714, 0.10913, 0.11342, 0.10767, 0.11168, 0.1127, 0.10979, 0.10867, 0.10899, 0.11074, 0.10988, 0.11196, 0.11045, 0.10625, 0.10876, 0.11621, 0.10786, 0.11166, 0.1137, 0.1159, 0.12034, 0.12688, 0.13086, 0.12051, 0.11583, 0.12425, 0.12785, 0.11994, 0.1156, 0.11305, 0.1064, 0.11037, 0.11458, 0.10783, 0.11267, 0.11832, 0.11674, 0.12221, 0.11896, 0.11355, 0.12228, 0.11929, 0.11934, 0.11071, 0.11311, 0.12323, 0.11815, 0.1124, 0.10574, 0.10714, 0.11404, 0.1155, 0.11749, 0.11507, 0.11217, 0.11336, 0.11724, 0.11529, 0.11873, 0.11413, 0.11342, 0.11662, 0.11253, 0.21031, 0.1153, 0.11949, 0.12203, 0.12384, 0.12782, 0.12363, 0.12548, 0.12785, 0.11974, 0.12339, 0.11698, 0.1138, 0.11801, 0.11508, 0.12193, 0.1161, 0.11722, 0.11675, 0.12016, 0.12149, 0.12239, 0.12005, 0.12773, 0.12921, 0.11853, 0.11824, 0.12298, 0.11989, 0.12376, 0.12606, 0.12268, 0.12167, 0.11886, 0.10748, 0.11973, 0.11767, 0.12515, 0.11708, 0.11935, 0.12016, 0.12159, 0.11803, 0.11151, 0.11606, 0.11651, 0.12057, 0.10879]}, "backward-send-forward-recv-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [3.17241, 0.01112, 0.01172, 0.00869, 0.00901, 0.01001, 0.01115, 0.00794, 0.00798, 0.0109, 0.01029, 0.01093, 0.01077, 0.01317, 0.01259, 0.00838, 0.01022, 0.00884, 0.01678, 0.0152, 0.00915, 0.00886, 0.00872, 0.00978, 0.01165, 0.00864, 0.01118, 0.01286, 0.00996, 0.0125, 0.01039, 0.01705, 0.00824, 0.00886, 0.00817, 0.00863, 0.0105, 0.00871, 0.08171, 0.01193, 0.01314, 0.01206, 0.01407, 0.01071, 0.01251, 0.01179, 0.01146, 0.00929, 0.01052, 0.01215, 0.0084, 0.00818, 0.00939, 0.0111, 0.00825, 0.01008, 0.01023, 0.00961, 0.0079, 0.01198, 0.0144, 0.00802, 0.01242, 0.00847, 0.01011, 0.00724, 0.00808, 0.0078, 0.00899, 0.00896, 0.00949, 0.00922, 0.01098, 0.01, 0.01342, 0.00965, 0.00844, 0.01778, 0.01504, 0.00876, 0.01126, 0.01156, 0.00994, 0.00745, 0.01045, 0.01139, 0.01102, 0.01004, 0.01044, 0.01421, 0.01363, 0.0147, 0.01748, 0.01497, 0.01481, 0.01661, 0.00933, 0.01088, 0.01211, 0.01187, 0.0114, 0.01087, 0.00985, 0.01082, 0.01058, 0.01129, 0.00882, 0.01084, 0.00902, 0.0079, 0.01036, 0.01589, 0.01561, 0.01591, 0.00899, 0.01108, 0.00841, 0.01003, 0.00851, 0.00882, 0.00846, 0.00785, 0.01152, 0.00747, 0.01326, 0.01202, 0.01211, 0.01078, 0.00952, 0.00873, 0.00881, 0.00874, 0.00915, 0.00875, 0.01297, 0.01552, 0.0151, 0.01016, 0.00992, 0.01251, 0.01115, 0.01149, 0.00982, 0.01462, 0.01529, 0.0145, 0.01056, 0.01488, 0.01365, 0.01448, 0.00917, 0.0134, 0.01205, 0.01572, 0.0126, 0.01488, 0.01305, 0.01335, 0.0138, 0.0164, 0.01209, 0.01237, 0.01442, 0.01402, 0.01277, 0.01318, 0.01188, 0.0129, 0.01144, 0.01322, 0.01297, 0.0121, 0.01209, 0.01029, 0.01079, 0.01249, 0.01233, 0.0121, 0.01022, 0.0128, 0.01174, 0.01218, 0.01303, 0.01323, 0.01318, 0.01287, 0.00961, 0.01202, 0.0124, 0.00992, 0.00876, 0.00935, 0.01319, 0.01636, 0.01632, 0.01494, 0.01298, 0.01614, 0.01406, 0.01537, 0.01153, 0.01115, 0.01271, 0.0107, 0.01222, 0.01248, 0.01198, 0.01383, 0.01146, 0.01187, 0.01068, 0.01125, 0.00998, 0.01224, 0.01454, 
0.01162, 0.00956, 0.01122, 0.0154, 0.01199, 0.01342, 0.01294, 0.01456, 0.01293, 0.01589, 0.01161, 0.01349, 0.01587, 0.0161, 0.01506, 0.01604, 0.01245, 0.01415, 0.01038, 0.01375, 0.01225, 0.01179, 0.01138, 0.01149, 0.0114, 0.01157, 0.01201, 0.09678, 0.06875, 0.01665, 0.01943, 0.01672, 0.01779, 0.01975, 0.01513, 0.01188, 0.01383, 0.01055, 0.01209, 0.01624, 0.01171, 0.01034, 0.00943, 0.0124, 0.01104, 0.01002, 0.00883, 0.01064, 0.01032, 0.00949, 0.01005, 0.01087, 0.01209, 0.01055, 0.00979, 0.00997, 0.01044, 0.01106, 0.01088, 0.01076, 0.01045, 0.01152, 0.01085, 0.0105, 0.01114, 0.01146, 0.01082, 0.01229, 0.01175, 0.01162, 0.01101, 0.01116, 0.01256, 0.01128, 0.01152, 0.0107, 0.00988, 0.0095, 0.01009, 0.01045, 0.01003, 0.00992, 0.01213, 0.01087, 0.01368, 0.00953, 0.01064, 0.01243, 0.01214, 0.01155, 0.01008, 0.00976, 0.01033, 0.00912, 0.0081, 0.00967, 0.01116, 0.00911, 0.00921, 0.00997, 0.01136, 0.01025, 0.01241, 0.01273, 0.01327, 0.01109, 0.01279, 0.01226, 0.0121, 0.01061, 0.01401, 0.0134, 0.01432, 0.01133, 0.01394, 0.01414, 0.01459, 0.01155, 0.01481, 0.01262, 0.01169, 0.01079, 0.01328, 0.01375, 0.01229, 0.01428, 0.01132, 0.0128, 0.01126, 0.01216, 0.01314, 0.01251, 0.01231, 0.01489, 0.10504, 0.01146, 0.01181, 0.10182, 0.00974, 0.01066, 0.01245, 0.01188, 0.01268, 0.01247, 0.01243, 0.0136, 0.0116, 0.01212, 0.01459, 0.01641, 0.0161, 0.01189, 0.01301, 0.01594, 0.01101, 0.01209, 0.0146, 0.01388, 0.01439, 0.01206, 0.01364, 0.01212, 0.01313, 0.01581, 0.01511, 0.01362, 0.01411, 0.0139, 0.01423, 0.01307, 0.01509, 0.01644, 0.01567, 0.01653, 0.01601, 0.0161, 0.01324, 0.01587, 0.01735, 0.01691, 0.01574, 0.01699, 0.01222, 0.01273, 0.0119]}, "layernorm-grads-all-reduce-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.00124, 0.00087, 0.00088, 0.00087, 0.00086, 0.00085, 0.00085, 0.00085, 0.00098, 0.00088, 0.00087, 0.00087, 0.00087, 0.00088, 0.00085, 0.00085, 0.00086, 0.00082, 0.00084, 0.00083, 0.00103, 0.00352, 0.00085, 0.00084, 0.00084, 0.00089, 0.00086, 0.00084, 0.00085, 0.00084, 0.00085, 0.00087, 0.00085, 0.00085, 0.00086, 0.00086, 0.00084, 0.00086, 0.00086, 0.00085, 0.00087, 0.00086, 0.00085, 0.00087, 0.00084, 0.00086, 0.00085, 0.00084, 0.00167, 0.00083, 0.00086, 0.00111, 0.00108, 0.00101, 0.00084, 0.00085, 0.00085, 0.00086, 0.00084, 0.00084, 0.00086, 0.00083, 0.00083, 0.00083, 0.00111, 0.0009, 0.00086, 0.00088, 0.00086, 0.00084, 0.00086, 0.00084, 0.00091, 0.00085, 0.00084, 0.00087, 0.00083, 0.00083, 0.00241, 0.00085, 0.00086, 0.00109, 0.00086, 0.00085, 0.00085, 0.00084, 0.00084, 0.00084, 0.00084, 0.00092, 0.00087, 0.00083, 0.00087, 0.00532, 0.00083, 0.00085, 0.00101, 0.00113, 0.0011, 0.00089, 0.00088, 0.00086, 0.00113, 0.00084, 0.00122, 0.00087, 0.00086, 0.00085, 0.00086, 0.00088, 0.00085, 0.00088, 0.0031, 0.00085, 0.00087, 0.00085, 0.001, 0.00116, 0.00088, 0.00088, 0.00086, 0.00085, 0.00085, 0.00084, 0.00426, 0.00086, 0.00086, 0.00116, 0.00089, 0.00087, 0.00087, 0.00085, 0.00085, 0.00084, 0.00087, 0.00084, 0.00084, 0.0009, 0.00108, 0.00085, 0.00085, 0.00086, 0.00086, 0.00088, 0.00084, 0.00085, 0.00084, 0.00104, 0.00087, 0.00104, 0.00084, 0.00083, 0.00084, 0.00086, 0.00086, 0.00087, 0.00084, 0.00083, 0.00086, 0.00218, 0.00084, 0.004, 0.00086, 0.00087, 0.00087, 0.00105, 0.00103, 0.00103, 0.00107, 0.00089, 0.00107, 0.00114, 0.00113, 0.00085, 0.00107, 0.00086, 0.00089, 0.00088, 0.00089, 0.00086, 0.00085, 0.00085, 0.00086, 0.00088, 0.00087, 0.00085, 0.00086, 0.00087, 0.00085, 0.00085, 0.00087, 0.00089, 0.00085, 0.00088, 0.00087, 0.00086, 0.00241, 0.00085, 0.00084, 0.00087, 0.00099, 
0.001, 0.00108, 0.00085, 0.00084, 0.00086, 0.00085, 0.00088, 0.00085, 0.00085, 0.00084, 0.00086, 0.00088, 0.00084, 0.00085, 0.00087, 0.00087, 0.00087, 0.00111, 0.00086, 0.00085, 0.00086, 0.00086, 0.00084, 0.00083, 0.00084, 0.00083, 0.00088, 0.00084, 0.00085, 0.0011, 0.0011, 0.00116, 0.00089, 0.00115, 0.00087, 0.00378, 0.00087, 0.00085, 0.00085, 0.0009, 0.00086, 0.00089, 0.00086, 0.00085, 0.00085, 0.00084, 0.00087, 0.00086, 0.00086, 0.00104, 0.00088, 0.00085, 0.00115, 0.00106, 0.00088, 0.00086, 0.00106, 0.00086, 0.00087, 0.00086, 0.0026, 0.00449, 0.00471, 0.00277, 0.00087, 0.00088, 0.00085, 0.00107, 0.0011, 0.00118, 0.00086, 0.00089, 0.00084, 0.00084, 0.00084, 0.00085, 0.00087, 0.00108, 0.0011, 0.00098, 0.00109, 0.00111, 0.0011, 0.0011, 0.0011, 0.0011, 0.00111, 0.00111, 0.00107, 0.0011, 0.00103, 0.00103, 0.00111, 0.00112, 0.00109, 0.00106, 0.00108, 0.00103, 0.00103, 0.00111, 0.00102, 0.00112, 0.00112, 0.00111, 0.00112, 0.00109, 0.00329, 0.00093, 0.00085, 0.00089, 0.00085, 0.00089, 0.00087, 0.00086, 0.00536, 0.0011, 0.00111, 0.00111, 0.00116, 0.00086, 0.00084, 0.00087, 0.0009, 0.00085, 0.00084, 0.00087, 0.00086, 0.00087, 0.00086, 0.00084, 0.00085, 0.00088, 0.00086, 0.00086, 0.00417, 0.00088, 0.00121, 0.00085, 0.00085, 0.00085, 0.00085, 0.00095, 0.00116, 0.00086, 0.00086, 0.00086, 0.00499, 0.00318, 0.00107, 0.00371, 0.00087, 0.00089, 0.00087, 0.00086, 0.00085, 0.00084, 0.00084, 0.00086, 0.00083, 0.00088, 0.00085, 0.00085, 0.00087, 0.00085, 0.00087, 0.00086, 0.00086, 0.00087, 0.00085, 0.00084, 0.00085, 0.00085, 0.00086, 0.00086, 0.00085, 0.00084, 0.00088, 0.00086, 0.00085, 0.00086, 0.00085, 0.0009, 0.00095, 0.00448, 0.00088, 0.00088, 0.00089, 0.00089, 0.00086, 0.00087, 0.00087, 0.0009, 0.00086, 0.00086, 0.00088, 0.00087, 0.00088, 0.0009, 0.00101]}, "embedding-grads-all-reduce-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.00038, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00033, 0.00032, 0.00032, 0.00031, 0.00032, 0.00031, 0.00032, 0.00032, 0.00032, 0.00032, 0.00031, 0.00032, 0.00032, 0.00032, 0.00032, 0.00031, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00031, 0.00032, 0.00032, 0.00031, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00031, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00033, 0.00031, 0.00032, 0.00032, 0.00032, 0.00032, 0.00031, 0.00031, 0.00031, 0.00032, 0.00032, 0.00031, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00033, 0.00031, 0.00032, 0.00032, 0.00034, 0.00032, 0.00032, 0.00031, 0.00031, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00031, 0.00034, 0.00031, 0.00031, 0.00032, 0.00032, 0.00032, 0.00031, 0.00032, 0.00033, 0.00032, 0.00032, 0.00031, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00034, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00031, 0.00032, 0.00031, 0.00032, 0.00032, 0.00031, 0.00032, 0.00031, 0.00032, 0.00031, 0.00032, 0.00032, 0.00032, 0.00032, 0.00033, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00031, 0.00031, 0.00032, 0.00034, 0.00032, 0.00033, 0.00032, 0.00032, 0.00032, 0.00031, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00033, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 
0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00031, 0.00032, 0.00031, 0.00032, 0.00033, 0.00032, 0.00031, 0.00032, 0.00031, 0.00032, 0.00032, 0.00032, 0.00031, 0.00032, 0.00032, 0.00031, 0.00031, 0.00032, 0.00031, 0.00032, 0.00032, 0.00031, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00031, 0.00032, 0.00031, 0.00031, 0.00032, 0.00031, 0.00032, 0.00031, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00031, 0.00032, 0.00033, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00031, 0.00032, 0.00032, 0.00032, 0.00033, 0.00032, 0.00032, 0.00032, 0.00031, 0.00032, 0.00032, 0.00031, 0.00032, 0.00032, 0.00031, 0.00032, 0.00031, 0.00034, 0.00032, 0.00032, 0.00032, 0.00032, 0.00031, 0.00031, 0.00032, 0.00032, 0.00032, 0.00033, 0.00032, 0.00031, 0.00032, 0.00032, 0.00031, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00031, 0.00031, 0.00033, 0.00031, 0.00032, 0.00031, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00031, 0.00031, 0.00033, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00031, 0.00032, 0.00032, 0.00032, 0.00031, 0.00032, 0.00031, 0.00032, 0.00032, 0.00033, 0.00033, 0.00031, 0.00032, 0.00031, 0.00031, 0.00032, 0.00031, 0.00031, 0.00032, 0.00032, 0.00032, 0.00032, 0.00033, 0.00032, 0.00031, 0.00032, 0.00032, 0.00032, 0.00031, 0.00032, 0.00032, 0.00031, 0.00031, 0.00032, 0.00032, 0.00033, 0.00032, 0.00034, 0.00032, 0.00032, 0.00031, 0.00032, 0.00034, 0.00034, 0.00032, 0.00033, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00031, 0.00032, 0.00032, 0.00032, 0.00031, 0.00032, 0.00032, 0.00032, 0.00032, 0.00033, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00031, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032]}, "all-grads-sync-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [2.1656, 0.00059, 0.0006, 0.0006, 0.00059, 0.00062, 0.0006, 0.00059, 0.00058, 0.0006, 0.00059, 0.00058, 0.00059, 0.00059, 0.0006, 0.00058, 0.00059, 0.00058, 0.00059, 0.00058, 0.00059, 0.00065, 0.00064, 0.00063, 0.00059, 0.00059, 0.0006, 0.00058, 0.00059, 0.00059, 0.00059, 0.00059, 0.00061, 0.0006, 0.00058, 0.00064, 0.00058, 0.00058, 0.0006, 0.0006, 0.00058, 0.00058, 0.00059, 0.00059, 0.00063, 0.00059, 0.00058, 0.00058, 0.00059, 0.00059, 0.00059, 0.00059, 0.00063, 0.00059, 0.00059, 0.00059, 0.00058, 0.00059, 0.00064, 0.00058, 0.0006, 0.00059, 0.00059, 0.00059, 0.0006, 0.00059, 0.00059, 0.00059, 0.00059, 0.00058, 0.0006, 0.00058, 0.0006, 0.00059, 0.0006, 0.0006, 0.00057, 0.00059, 0.0006, 0.00058, 0.00059, 0.00059, 0.00064, 0.00058, 0.00059, 0.00063, 0.00059, 0.00058, 0.00059, 0.0006, 0.00059, 0.00058, 0.00058, 0.00058, 0.00059, 0.00058, 0.00058, 0.00059, 0.00058, 0.00059, 0.00059, 0.00059, 0.00059, 0.00063, 0.00059, 0.00059, 0.00059, 0.00057, 0.00058, 0.00059, 0.00058, 0.00062, 0.00059, 0.00059, 0.00059, 0.00058, 0.00059, 0.0006, 0.00058, 0.0006, 0.00058, 0.00062, 0.00059, 0.00063, 0.0006, 0.00059, 0.00059, 0.0006, 0.00059, 0.00059, 0.00059, 0.00058, 0.00063, 0.00058, 0.00059, 0.00058, 0.00058, 0.00058, 0.0006, 0.00058, 0.00058, 0.00059, 0.00059, 0.00059, 0.00058, 0.00057, 0.00059, 0.00058, 0.00059, 0.00059, 0.00058, 0.00058, 0.0006, 0.00058, 0.00058, 0.00059, 0.00058, 0.00059, 0.00059, 0.00058, 0.00058, 0.00059, 0.00059, 0.0006, 0.00063, 0.00059, 0.00059, 0.00058, 0.00059, 0.00062, 0.00062, 0.00058, 0.00059, 
0.00059, 0.00058, 0.00059, 0.00059, 0.00058, 0.00059, 0.00058, 0.00058, 0.00059, 0.00059, 0.00059, 0.00058, 0.00059, 0.00059, 0.00058, 0.00059, 0.00059, 0.00058, 0.00059, 0.00059, 0.00058, 0.00058, 0.00059, 0.00074, 0.00059, 0.00059, 0.00059, 0.0006, 0.0006, 0.0006, 0.00058, 0.00058, 0.00058, 0.00059, 0.00059, 0.00059, 0.00059, 0.00058, 0.00059, 0.00059, 0.00059, 0.00058, 0.00059, 0.00058, 0.00064, 0.00059, 0.00063, 0.00059, 0.00059, 0.0006, 0.00058, 0.00058, 0.00058, 0.00058, 0.00059, 0.00058, 0.0006, 0.0006, 0.00059, 0.00058, 0.00058, 0.00057, 0.00058, 0.0006, 0.00059, 0.00058, 0.00058, 0.00059, 0.0006, 0.00058, 0.00058, 0.00059, 0.00058, 0.00057, 0.00058, 0.00059, 0.00058, 0.00059, 0.00059, 0.00059, 0.00058, 0.00058, 0.00058, 0.00059, 0.0006, 0.00058, 0.00065, 0.00059, 0.00062, 0.00058, 0.00057, 0.00061, 0.00059, 0.00059, 0.00058, 0.0006, 0.00063, 0.00059, 0.00058, 0.00059, 0.00058, 0.00062, 0.00059, 0.00058, 0.00059, 0.00058, 0.00058, 0.00058, 0.00058, 0.0006, 0.0006, 0.00059, 0.00058, 0.00059, 0.0006, 0.00058, 0.00059, 0.00059, 0.00059, 0.00059, 0.00058, 0.00058, 0.00058, 0.00059, 0.00064, 0.00058, 0.00057, 0.00059, 0.00058, 0.00059, 0.00057, 0.00058, 0.00058, 0.00058, 0.00059, 0.00059, 0.00064, 0.00058, 0.00058, 0.00059, 0.00058, 0.00058, 0.00058, 0.00057, 0.00059, 0.00058, 0.00059, 0.00058, 0.00058, 0.00058, 0.00059, 0.00059, 0.00058, 0.00058, 0.00058, 0.00059, 0.00059, 0.00059, 0.00059, 0.00059, 0.00058, 0.00058, 0.00063, 0.00058, 0.00063, 0.00059, 0.0006, 0.00057, 0.00058, 0.00059, 0.00059, 0.00059, 0.00059, 0.00062, 0.00062, 0.00058, 0.00057, 0.00058, 0.00059, 0.00059, 0.00059, 0.00059, 0.00059, 0.00059, 0.00059, 0.00059, 0.00059, 0.00059, 0.00059, 0.00059, 0.0006, 0.0006, 0.00058, 0.00058, 0.00059, 0.00063, 0.00057, 0.00059, 0.00063, 0.00059, 0.00059, 0.00059, 0.00058, 0.00058, 0.00058, 0.00059, 0.00059, 0.00063, 0.00059, 0.00059, 0.00059, 0.00059, 0.0006, 0.00059, 0.00059, 0.00059, 0.00059, 0.00059, 0.00059, 0.00058, 0.00059, 0.00058, 0.00059, 0.00059, 0.00058, 0.00059, 0.00059]}, "optimizer-copy-to-main-grad-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.00016, 0.00012, 0.00011, 0.00011, 0.00011, 0.00011, 0.00012, 0.00011, 0.00012, 0.00011, 0.00011, 0.00011, 0.0001, 0.00011, 0.0001, 0.00011, 0.00011, 0.0001, 0.00011, 0.0001, 0.00011, 0.00011, 0.00011, 0.00011, 0.00011, 0.00011, 0.00011, 0.0001, 0.00011, 0.00011, 0.00011, 0.00012, 0.00011, 0.0001, 0.0001, 0.00011, 0.00011, 0.00012, 0.00011, 0.00011, 0.00011, 0.00011, 0.00011, 0.00011, 0.0001, 0.00011, 0.0001, 0.0001, 0.00011, 0.0001, 0.00011, 0.0001, 0.00011, 0.00011, 0.0001, 0.00011, 0.00011, 0.00011, 0.00011, 0.00011, 0.00011, 0.0001, 0.00011, 0.0001, 0.00011, 0.00011, 0.0001, 0.0001, 0.00011, 0.00011, 0.00011, 0.0001, 0.00012, 0.00011, 0.00011, 0.00012, 0.0001, 0.00011, 0.00011, 0.00011, 0.00011, 0.00011, 0.00011, 0.00011, 0.00011, 0.00011, 0.00011, 0.00011, 0.0001, 0.00012, 0.00011, 0.00012, 0.0001, 0.0001, 0.00011, 0.00011, 0.0001, 0.00011, 0.00011, 0.00011, 0.00011, 0.00011, 0.00011, 0.0001, 0.00011, 0.00011, 0.00011, 0.0001, 0.0001, 0.00012, 0.00011, 0.00011, 0.00011, 0.00011, 0.00011, 0.0001, 0.0001, 0.00011, 0.0001, 0.00011, 0.0001, 0.0001, 0.00011, 0.00011, 0.00011, 0.00011, 0.0001, 0.0001, 0.00011, 0.00011, 0.00011, 0.0001, 0.0001, 0.0001, 0.00011, 0.00011, 0.00011, 0.0001, 0.00012, 0.00011, 0.00011, 0.00011, 0.0001, 0.00011, 0.00011, 0.00011, 0.00011, 0.0001, 0.00011, 0.00011, 0.0001, 0.0001, 0.00012, 0.00012, 0.00011, 0.0001, 0.0001, 0.0001, 0.00011, 0.00011, 0.0001, 0.00011, 0.00011, 
0.00011, 0.00011, 0.00011, 0.00011, 0.00011, 0.00011, 0.00011, 0.00011, 0.00011, 0.00011, 0.00011, 0.00012, 0.00011, 0.0001, 0.0001, 0.00011, 0.00011, 0.00011, 0.00011, 0.0001, 0.00011, 0.00011, 0.0001, 0.0001, 0.0001, 0.00011, 0.00011, 0.00012, 0.00011, 0.0001, 0.0001, 0.00011, 0.0001, 0.00011, 0.00011, 0.00011, 0.00011, 0.00012, 0.0001, 0.00011, 0.00011, 0.00011, 0.00011, 0.0001, 0.00011, 0.0001, 0.0001, 0.0001, 0.0001, 0.00011, 0.0001, 0.0001, 0.0001, 0.0001, 0.00011, 0.00011, 0.0001, 0.00011, 0.00011, 0.00011, 0.00011, 0.0001, 0.00011, 0.0001, 0.00011, 0.0001, 0.0001, 0.0001, 0.0001, 0.00011, 0.0001, 0.0001, 0.0001, 0.0001, 0.00011, 0.0001, 0.0001, 0.0001, 0.0001, 0.00011, 0.00011, 0.0001, 0.0001, 0.0001, 0.0001, 0.00011, 0.00011, 0.0001, 0.00011, 0.0001, 0.0001, 0.0001, 0.00011, 0.00011, 0.00011, 0.0001, 0.0001, 0.00011, 0.00011, 0.00011, 0.00012, 0.0001, 0.00011, 0.00011, 0.00011, 0.0001, 0.0001, 0.00011, 0.00011, 0.00011, 0.0001, 0.0001, 0.0001, 0.0001, 0.00011, 0.0001, 0.0001, 0.0001, 0.0001, 0.00011, 0.0001, 0.00012, 0.00011, 0.00011, 0.00011, 0.0001, 0.0001, 0.0001, 0.0001, 0.00011, 0.0001, 0.00012, 0.0001, 0.00011, 0.00011, 0.0001, 0.0001, 0.0001, 0.0001, 0.00011, 0.00011, 0.00011, 0.0001, 0.00011, 0.00011, 0.00011, 0.0001, 0.00011, 0.0001, 0.00011, 0.00011, 0.00019, 0.0001, 0.0001, 0.00011, 0.00011, 0.00011, 0.0001, 0.0001, 0.00011, 0.0001, 0.0001, 0.0001, 0.00011, 0.00011, 0.0001, 0.0001, 0.0001, 0.00011, 0.00011, 0.00011, 0.0001, 0.00011, 0.00011, 0.00011, 0.00011, 0.00011, 0.0001, 0.0001, 0.00011, 0.00011, 0.00011, 0.0001, 0.00011, 0.00011, 0.00011, 0.0001, 0.00011, 0.0001, 0.00012, 0.00011, 0.0001, 0.0001, 0.0001, 0.00011, 0.00011, 0.0001, 0.0001, 0.0001, 0.00011, 0.00011, 0.0001, 0.0001, 0.0001, 0.00011, 0.00011, 0.0001, 0.0001, 0.0001, 0.00011, 0.00011, 0.0001, 0.0001, 0.0001, 0.0001, 0.00011, 0.0001, 0.00011, 0.0001, 0.00011, 0.00011, 0.00011, 0.00011, 0.00011, 0.00011, 0.00011, 0.00011, 0.00011, 0.00011, 0.00011, 0.00011, 0.00011, 0.00011, 0.0001, 0.00011, 0.00011, 0.00011]}, "optimizer-clip-main-grad-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [4.25848, 0.00058, 0.00058, 0.00057, 0.00057, 0.00058, 0.00058, 0.00057, 0.00057, 0.00058, 0.00057, 0.00057, 0.00056, 0.00056, 0.00057, 0.00056, 0.00059, 0.00056, 0.00056, 0.00055, 0.00056, 0.00057, 0.00056, 0.00056, 0.00056, 0.00057, 0.00056, 0.00056, 0.00057, 0.00056, 0.00057, 0.00056, 0.00055, 0.00055, 0.00057, 0.00057, 0.00058, 0.00055, 0.00056, 0.00056, 0.00056, 0.00055, 0.00057, 0.00056, 0.00056, 0.00056, 0.00058, 0.00056, 0.00055, 0.00056, 0.00056, 0.00057, 0.00057, 0.0006, 0.00056, 0.00056, 0.00055, 0.00056, 0.00056, 0.00055, 0.00056, 0.00055, 0.00055, 0.00056, 0.00057, 0.00057, 0.00056, 0.00056, 0.00056, 0.00057, 0.00059, 0.00056, 0.00058, 0.00056, 0.00056, 0.00057, 0.00055, 0.00055, 0.00056, 0.00056, 0.00056, 0.00071, 0.00056, 0.00056, 0.00057, 0.00057, 0.00055, 0.00056, 0.00055, 0.0006, 0.00055, 0.00056, 0.00055, 0.00055, 0.00057, 0.00055, 0.00055, 0.00057, 0.00046, 0.00057, 0.00057, 0.00057, 0.00056, 0.00055, 0.00071, 0.00056, 0.00056, 0.00057, 0.00057, 0.00047, 0.00056, 0.00048, 0.00046, 0.00056, 0.00057, 0.00055, 0.00055, 0.00056, 0.00055, 0.00057, 0.00056, 0.00056, 0.00056, 0.00056, 0.00046, 0.00056, 0.00055, 0.00055, 0.00056, 0.00058, 0.00045, 0.00056, 0.00057, 0.00055, 0.00057, 0.00055, 0.00055, 0.00055, 0.00056, 0.00056, 0.00055, 0.00055, 0.00057, 0.00046, 0.00046, 0.00056, 0.00056, 0.00056, 0.00056, 0.00056, 0.00055, 0.00055, 0.00055, 0.00055, 0.00056, 0.00057, 0.00055, 0.00055, 
0.00057, 0.00057, 0.00064, 0.00056, 0.00056, 0.00057, 0.00057, 0.00055, 0.00056, 0.00055, 0.00055, 0.00056, 0.00056, 0.00057, 0.00056, 0.00056, 0.00056, 0.00056, 0.00055, 0.00055, 0.00056, 0.00055, 0.00055, 0.00056, 0.00057, 0.00056, 0.00057, 0.00056, 0.00055, 0.00058, 0.00056, 0.00056, 0.00056, 0.00056, 0.00055, 0.00056, 0.00056, 0.00055, 0.00056, 0.00056, 0.00055, 0.00057, 0.00077, 0.00056, 0.00056, 0.00057, 0.00056, 0.00057, 0.00056, 0.00057, 0.00056, 0.00056, 0.00055, 0.00056, 0.00058, 0.00055, 0.00056, 0.00055, 0.00056, 0.00056, 0.00056, 0.00056, 0.00056, 0.00056, 0.00056, 0.00056, 0.00056, 0.00055, 0.00055, 0.00054, 0.00055, 0.00055, 0.00056, 0.00062, 0.00058, 0.00055, 0.00056, 0.00056, 0.00056, 0.00056, 0.00055, 0.00055, 0.00055, 0.00056, 0.00056, 0.00055, 0.00057, 0.00057, 0.00056, 0.00055, 0.00055, 0.00055, 0.00055, 0.00058, 0.00055, 0.00056, 0.00056, 0.00056, 0.00055, 0.00056, 0.00055, 0.00056, 0.00056, 0.00057, 0.00057, 0.00061, 0.00057, 0.00057, 0.00056, 0.00057, 0.00055, 0.00056, 0.00056, 0.00056, 0.00058, 0.00056, 0.00057, 0.00055, 0.0006, 0.00056, 0.00057, 0.00056, 0.00055, 0.00056, 0.00056, 0.00057, 0.00057, 0.00055, 0.00056, 0.00056, 0.0006, 0.00063, 0.00057, 0.00056, 0.00056, 0.00057, 0.00058, 0.00056, 0.00059, 0.00057, 0.00056, 0.00055, 0.00056, 0.00064, 0.00056, 0.00056, 0.00057, 0.00056, 0.00056, 0.00057, 0.00068, 0.00056, 0.00056, 0.00056, 0.00058, 0.00056, 0.00059, 0.00056, 0.00055, 0.00057, 0.00057, 0.00055, 0.00057, 0.00056, 0.00057, 0.00057, 0.00056, 0.00056, 0.00055, 0.00057, 0.00057, 0.00055, 0.00056, 0.00057, 0.00056, 0.00056, 0.00056, 0.00057, 0.00056, 0.00056, 0.00058, 0.00056, 0.00055, 0.00056, 0.00056, 0.00057, 0.00056, 0.00056, 0.00076, 0.00058, 0.00057, 0.00057, 0.00056, 0.00056, 0.00057, 0.00056, 0.00056, 0.00056, 0.00057, 0.00057, 0.00056, 0.00055, 0.00055, 0.00057, 0.00056, 0.00056, 0.00056, 0.00055, 0.00056, 0.00057, 0.00056, 0.00055, 0.00061, 0.00056, 0.00055, 0.00056, 0.00055, 0.00056, 0.00056, 0.00055, 0.00057, 0.00055, 0.00055, 0.00056, 0.00057, 0.00056, 0.00057, 0.00056, 0.00056, 0.00056, 0.00057, 0.00057, 0.00057, 0.00057, 0.00057, 0.00057, 0.00057, 0.00056, 0.00056, 0.00056, 0.00056, 0.00056]}, "optimizer-count-zeros-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.00381, 0.00273, 0.0027, 0.0027, 0.00273, 0.00271, 0.00267, 0.00283, 0.00274, 0.00269, 0.0027, 0.00269, 0.00272, 0.00273, 0.0027, 0.0027, 0.00269, 0.00268, 0.0027, 0.0027, 0.00273, 0.00272, 0.00268, 0.0027, 0.00278, 0.00278, 0.00271, 0.00269, 0.00268, 0.0027, 0.00271, 0.00271, 0.00269, 0.00273, 0.00271, 0.0027, 0.00267, 0.00269, 0.0027, 0.00271, 0.00271, 0.00269, 0.00269, 0.00267, 0.00269, 0.00269, 0.00269, 0.0027, 0.0027, 0.00271, 0.00271, 0.00288, 0.00277, 0.00297, 0.0027, 0.00269, 0.00268, 0.00269, 0.00268, 0.00269, 0.00269, 0.0027, 0.00268, 0.0027, 0.00272, 0.00269, 0.0027, 0.00271, 0.00273, 0.0027, 0.00284, 0.0027, 0.00271, 0.00282, 0.0027, 0.00268, 0.00268, 0.00268, 0.0027, 0.0027, 0.00272, 0.00496, 0.0027, 0.00268, 0.00269, 0.00269, 0.00271, 0.00269, 0.00271, 0.00292, 0.0027, 0.00269, 0.00269, 0.00268, 0.00269, 0.00271, 0.00271, 0.00275, 0.00271, 0.00271, 0.00268, 0.00271, 0.00291, 0.00269, 0.00286, 0.00271, 0.00269, 0.00269, 0.00271, 0.00269, 0.0027, 0.00272, 0.00269, 0.00267, 0.00268, 0.00269, 0.00272, 0.00269, 0.00272, 0.0027, 0.00268, 0.00268, 0.00269, 0.0027, 0.00269, 0.0027, 0.00272, 0.0027, 0.00271, 0.00269, 0.00273, 0.0027, 0.0027, 0.0027, 0.00268, 0.00269, 0.0027, 0.00272, 0.00271, 0.00271, 0.00269, 0.0027, 0.00267, 0.00271, 0.00269, 0.00268, 
0.00268, 0.0027, 0.00269, 0.00269, 0.00267, 0.0027, 0.00268, 0.00269, 0.0027, 0.0027, 0.00269, 0.00269, 0.00268, 0.00269, 0.00269, 0.00269, 0.00269, 0.00281, 0.0028, 0.00273, 0.00272, 0.00273, 0.00273, 0.00274, 0.00271, 0.00272, 0.0027, 0.00271, 0.0027, 0.00271, 0.00273, 0.00271, 0.00269, 0.00271, 0.00272, 0.00272, 0.00272, 0.0027, 0.00269, 0.00281, 0.00272, 0.00282, 0.00271, 0.0027, 0.00269, 0.00272, 0.00273, 0.00271, 0.00269, 0.0027, 0.0027, 0.00269, 0.00271, 0.00271, 0.00282, 0.00271, 0.00269, 0.00271, 0.0027, 0.00313, 0.0027, 0.00269, 0.00271, 0.00271, 0.0027, 0.0027, 0.00271, 0.00269, 0.00278, 0.00269, 0.00272, 0.00278, 0.00271, 0.0027, 0.00269, 0.00271, 0.0027, 0.0027, 0.0027, 0.00269, 0.00271, 0.00271, 0.00269, 0.00272, 0.00271, 0.00296, 0.00271, 0.00271, 0.0027, 0.00271, 0.00271, 0.00275, 0.00269, 0.00267, 0.00271, 0.00274, 0.00267, 0.00271, 0.0027, 0.00273, 0.00272, 0.00271, 0.00271, 0.00273, 0.00272, 0.0027, 0.00274, 0.00273, 0.0027, 0.00272, 0.00271, 0.0027, 0.00271, 0.00265, 0.00264, 0.00264, 0.00273, 0.00262, 0.00291, 0.00266, 0.00273, 0.00265, 0.00265, 0.00263, 0.00265, 0.00264, 0.00274, 0.00272, 0.00262, 0.00274, 0.00265, 0.00273, 0.00264, 0.00274, 0.00264, 0.00274, 0.0028, 0.00265, 0.00263, 0.00263, 0.00272, 0.00271, 0.00276, 0.00267, 0.00265, 0.00262, 0.00272, 0.00277, 0.00264, 0.00269, 0.00264, 0.00264, 0.00272, 0.00271, 0.00294, 0.00388, 0.00268, 0.00273, 0.00273, 0.00265, 0.00357, 0.00265, 0.00304, 0.00272, 0.00261, 0.00268, 0.0027, 0.00266, 0.00267, 0.00264, 0.00278, 0.00274, 0.00267, 0.00269, 0.00268, 0.0027, 0.00269, 0.0027, 0.00269, 0.0027, 0.00271, 0.00269, 0.00267, 0.0027, 0.00268, 0.0027, 0.00272, 0.00271, 0.0027, 0.00272, 0.00272, 0.00274, 0.00269, 0.00313, 0.00269, 0.00269, 0.00269, 0.00271, 0.00271, 0.00273, 0.00283, 0.0027, 0.00269, 0.00278, 0.00276, 0.00271, 0.00271, 0.0027, 0.0027, 0.00271, 0.00272, 0.00271, 0.00272, 0.00271, 0.00271, 0.00268, 0.00273, 0.00271, 0.00269, 0.0027, 0.00273, 0.00275, 0.00269, 0.00273, 0.00271, 0.00271, 0.0027, 0.00272, 0.00269, 0.00269, 0.00272, 0.00274, 0.00271, 0.00272, 0.00272, 0.0027, 0.0027, 0.00272, 0.0027, 0.00271, 0.00271, 0.00273, 0.00271, 0.00268, 0.0027, 0.00271, 0.00273, 0.00272, 0.0027, 0.00269, 0.00272, 0.00272, 0.0027, 0.00271]}, "optimizer-inner-step-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.0026, 0.00046, 0.00046, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00046, 0.00045, 0.00045, 0.00045, 0.00044, 0.00044, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00046, 0.00044, 0.00045, 0.00044, 0.00045, 0.00045, 0.00044, 0.00051, 0.00045, 0.00045, 0.00045, 0.00044, 0.00045, 0.00046, 0.00045, 0.00045, 0.00045, 0.00044, 0.00045, 0.00045, 0.00044, 0.00045, 0.00045, 0.00045, 0.00045, 0.00044, 0.00044, 0.00046, 0.00045, 0.00046, 0.00044, 0.00045, 0.00045, 0.00044, 0.00045, 0.00044, 0.00044, 0.00044, 0.00045, 0.00046, 0.00045, 0.00044, 0.00044, 0.00044, 0.00045, 0.00045, 0.00046, 0.00045, 0.00045, 0.00046, 0.00044, 0.00045, 0.00046, 0.00045, 0.00045, 0.00048, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00044, 0.00045, 0.00046, 0.00045, 0.00044, 0.00045, 0.00045, 0.00046, 0.00045, 0.00045, 0.00044, 0.00045, 0.00045, 0.00045, 0.00045, 0.00047, 0.00044, 0.00057, 0.00046, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00044, 0.00045, 0.0005, 0.00044, 0.00046, 0.00045, 0.00045, 0.00045, 0.00045, 0.00046, 0.00044, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 
0.00044, 0.00045, 0.00044, 0.00045, 0.00045, 0.00045, 0.00045, 0.00046, 0.00045, 0.00045, 0.00044, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00044, 0.00045, 0.00045, 0.00047, 0.00045, 0.00045, 0.00045, 0.00046, 0.00044, 0.00045, 0.00045, 0.00045, 0.00045, 0.00049, 0.00045, 0.00045, 0.00045, 0.00045, 0.00046, 0.00044, 0.00045, 0.00045, 0.00059, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00044, 0.00044, 0.00045, 0.00045, 0.00045, 0.00045, 0.00044, 0.00045, 0.00045, 0.00045, 0.00045, 0.00046, 0.00051, 0.00046, 0.00046, 0.00046, 0.00045, 0.00045, 0.00046, 0.00046, 0.00061, 0.00045, 0.00045, 0.00045, 0.00046, 0.00045, 0.00045, 0.00046, 0.00044, 0.00045, 0.00045, 0.00045, 0.00045, 0.00046, 0.00045, 0.00045, 0.00044, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00044, 0.00045, 0.00044, 0.00044, 0.00045, 0.00045, 0.00044, 0.00046, 0.00054, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00044, 0.00044, 0.00045, 0.00044, 0.00044, 0.00044, 0.00045, 0.00044, 0.00044, 0.00045, 0.00044, 0.00045, 0.00044, 0.00044, 0.00045, 0.00044, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00046, 0.00046, 0.00046, 0.00045, 0.00045, 0.00045, 0.00045, 0.00046, 0.00046, 0.00045, 0.00046, 0.00055, 0.00045, 0.00045, 0.00045, 0.00045, 0.00044, 0.00044, 0.00045, 0.00046, 0.00046, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00044, 0.00045, 0.00045, 0.00045, 0.00044, 0.00044, 0.00045, 0.00046, 0.00045, 0.00044, 0.00076, 0.00046, 0.00045, 0.00045, 0.00045, 0.00044, 0.00045, 0.00045, 0.00046, 0.00045, 0.00046, 0.00045, 0.00045, 0.00046, 0.00044, 0.00045, 0.00048, 0.00045, 0.00045, 0.00048, 0.00045, 0.00045, 0.00044, 0.00045, 0.00045, 0.00045, 0.00045, 0.00044, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00052, 0.0005, 0.00056, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00046, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00046, 0.00044, 0.00045, 0.00046, 0.00044, 0.00045, 0.00045, 0.00046, 0.00045, 0.00045, 0.00045, 0.00044, 0.00045, 0.00044, 0.00045, 0.00045, 0.00046, 0.00045, 0.00046, 0.00046, 0.00046, 0.00045, 0.00045, 0.00046, 0.00046, 0.00046, 0.00046, 0.00046, 0.00045, 0.00045, 0.00045, 0.00045]}, "optimizer-copy-main-to-model-params-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.00055, 0.00049, 0.0005, 0.0005, 0.00049, 0.0005, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.0005, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.0005, 0.0005, 0.00049, 0.0005, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.0005, 0.00049, 0.00051, 0.00049, 0.00049, 0.00049, 0.00066, 0.00049, 0.0005, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.0005, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.0005, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00048, 0.00049, 0.0005, 0.00049, 0.00049, 0.00068, 0.00049, 0.0005, 0.00049, 0.00049, 0.0005, 0.00049, 0.00049, 0.00049, 0.00049, 0.0005, 0.00049, 0.0005, 0.00049, 0.0005, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00067, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00063, 0.00049, 
0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.0005, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00048, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.0005, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00068, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00076, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.0005, 0.00049, 0.00049, 0.00049, 0.00049, 0.00051, 0.00049, 0.0005, 0.00049, 0.0005, 0.00049, 0.00052, 0.00049, 0.00066, 0.0005, 0.00049, 0.00049, 0.00049, 0.0005, 0.00049, 0.00049, 0.0005, 0.0005, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.0005, 0.0005, 0.0005, 0.00049, 0.00049, 0.0005, 0.0005, 0.0005, 0.00049, 0.00049, 0.00049, 0.00049, 0.00051, 0.0005, 0.0005, 0.00072, 0.00049, 0.0005, 0.00049, 0.00049, 0.0005, 0.00049, 0.0005, 0.00049, 0.00049, 0.0005, 0.00049, 0.00049, 0.0005, 0.0005, 0.00049, 0.0005, 0.0005, 0.0005, 0.0005, 0.0005, 0.0005, 0.00049, 0.0005, 0.00049, 0.0005, 0.00049, 0.00049, 0.00049, 0.0005, 0.00052, 0.0005, 0.00049, 0.00049, 0.00049, 0.0005, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00066, 0.0005, 0.00049, 0.0005, 0.00049, 0.0005, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00052, 0.0005, 0.00049, 0.0005, 0.00049, 0.00049, 0.0005, 0.00049, 0.00049, 0.00053, 0.00049, 0.00052, 0.00049, 0.00049, 0.00049, 0.00076, 0.00049, 0.0005, 0.00049, 0.0005, 0.00049, 0.00064, 0.0005, 0.00051, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.0005, 0.00066, 0.00049, 0.00051, 0.00063, 0.0005, 0.0005, 0.0005, 0.00049, 0.00049, 0.0005, 0.0005, 0.0005, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.0005, 0.0005, 0.0005, 0.00051, 0.00049, 0.00049, 0.0005, 0.0005, 0.0005, 0.00053, 0.0005, 0.00073, 0.00072, 0.00072, 0.00049, 0.0005, 0.00049, 0.00049, 0.00049, 0.0005, 0.00051, 0.00051, 0.0005, 0.00049, 0.0005, 0.0005, 0.0005, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.0005, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.0005, 0.00049, 0.00049, 0.0005, 0.0005, 0.0005, 0.0005, 0.0005, 0.0005, 0.0005, 0.0005, 0.0005, 0.0005, 0.0005, 0.0005, 0.0005, 0.0005, 0.0005, 0.0005, 0.0005, 0.00051, 0.0005, 0.0005, 0.0005, 0.00049, 0.0005]}, "optimizer-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [4.26785, 0.00472, 0.00469, 0.00468, 0.0047, 0.00469, 0.00466, 0.00479, 0.00473, 0.00465, 0.00467, 0.00466, 0.00467, 0.00467, 0.00464, 0.00466, 0.00468, 0.00461, 0.00465, 0.00464, 0.00469, 0.00469, 0.00464, 0.00465, 0.00473, 0.00473, 0.00467, 0.00463, 0.00464, 0.00465, 0.00468, 0.00467, 0.00464, 0.00516, 0.00466, 0.00468, 0.00465, 0.00465, 0.00465, 0.00469, 0.00466, 0.00464, 0.00465, 0.00462, 0.00463, 0.00466, 0.00466, 0.00464, 0.00465, 0.00466, 0.00468, 0.00483, 0.00473, 0.005, 0.00465, 0.00465, 0.00463, 0.00466, 0.00463, 0.00463, 0.00465, 0.00465, 0.00461, 0.00465, 0.00467, 0.00467, 0.00464, 0.00464, 0.00468, 0.00465, 0.00483, 0.00466, 0.0047, 0.00478, 0.00466, 0.00466, 0.00461, 0.00462, 0.00467, 0.00465, 0.00469, 0.00749, 0.00467, 0.00465, 0.00466, 0.00466, 0.00465, 0.00465, 0.00465, 0.00495, 0.00465, 0.00465, 0.00463, 0.00463, 0.00466, 0.00467, 0.00464, 0.00472, 0.00456, 0.00469, 0.00464, 0.00466, 0.0049, 0.00463, 0.00555, 0.00466, 0.00464, 0.00464, 0.00466, 0.00456, 0.00466, 0.0046, 0.00453, 0.00464, 
0.00465, 0.00461, 0.00466, 0.00495, 0.00466, 0.00467, 0.00463, 0.00461, 0.00463, 0.00465, 0.00458, 0.00465, 0.00467, 0.00464, 0.00466, 0.00467, 0.00456, 0.00464, 0.00465, 0.00464, 0.00465, 0.00462, 0.00462, 0.00464, 0.00466, 0.00465, 0.00464, 0.00465, 0.00463, 0.00456, 0.00455, 0.00464, 0.00462, 0.00466, 0.00464, 0.00466, 0.00461, 0.00462, 0.00463, 0.00464, 0.00468, 0.00465, 0.00462, 0.00463, 0.00466, 0.00465, 0.00472, 0.00464, 0.00465, 0.00477, 0.00511, 0.00469, 0.00467, 0.00467, 0.00468, 0.00471, 0.00465, 0.00468, 0.00465, 0.00522, 0.00464, 0.00465, 0.00466, 0.00465, 0.00464, 0.00465, 0.00465, 0.00466, 0.00467, 0.00466, 0.00464, 0.00475, 0.00467, 0.0048, 0.00468, 0.00466, 0.00466, 0.00467, 0.00478, 0.00466, 0.00469, 0.00465, 0.00466, 0.00465, 0.00499, 0.0047, 0.00568, 0.00465, 0.00465, 0.00466, 0.00466, 0.00541, 0.00464, 0.00465, 0.00465, 0.00465, 0.00463, 0.00465, 0.00469, 0.00464, 0.00473, 0.00463, 0.00466, 0.00474, 0.00466, 0.00465, 0.00464, 0.00467, 0.00464, 0.00466, 0.00464, 0.00462, 0.00464, 0.00466, 0.00463, 0.00467, 0.00467, 0.00542, 0.00468, 0.00466, 0.00465, 0.00465, 0.00467, 0.0047, 0.00463, 0.00461, 0.00466, 0.00468, 0.00464, 0.00466, 0.00467, 0.00468, 0.00467, 0.00465, 0.00467, 0.00468, 0.00465, 0.00469, 0.00468, 0.00468, 0.00464, 0.00466, 0.00467, 0.00464, 0.00464, 0.00461, 0.00462, 0.00463, 0.0047, 0.00464, 0.00489, 0.00464, 0.00469, 0.0046, 0.00459, 0.00459, 0.0046, 0.00459, 0.00472, 0.00501, 0.00458, 0.00468, 0.00465, 0.00469, 0.00461, 0.00469, 0.00458, 0.0047, 0.00478, 0.0046, 0.00464, 0.00461, 0.00468, 0.00468, 0.00476, 0.00469, 0.00461, 0.00457, 0.00469, 0.00472, 0.00468, 0.00464, 0.00467, 0.00461, 0.00467, 0.00463, 0.00558, 0.00601, 0.00464, 0.0047, 0.0047, 0.00459, 0.00574, 0.00463, 0.00519, 0.00467, 0.00462, 0.00464, 0.00469, 0.00461, 0.00476, 0.00462, 0.00501, 0.00471, 0.00465, 0.0049, 0.00465, 0.00465, 0.00465, 0.00465, 0.00462, 0.00466, 0.00466, 0.00465, 0.00463, 0.00464, 0.00464, 0.00465, 0.00468, 0.00466, 0.00465, 0.00469, 0.00468, 0.0047, 0.00466, 0.00514, 0.00464, 0.00465, 0.00469, 0.00468, 0.00511, 0.00511, 0.00571, 0.00469, 0.00467, 0.00473, 0.00471, 0.00465, 0.00469, 0.00466, 0.00464, 0.00465, 0.00468, 0.00467, 0.00468, 0.00465, 0.00464, 0.00464, 0.00468, 0.00467, 0.00464, 0.00464, 0.00467, 0.00472, 0.00466, 0.00466, 0.00473, 0.00466, 0.00465, 0.00468, 0.00463, 0.00465, 0.00465, 0.00469, 0.00467, 0.00465, 0.00469, 0.00464, 0.00467, 0.00468, 0.00468, 0.00467, 0.00468, 0.00469, 0.00467, 0.00465, 0.00466, 0.00468, 0.0047, 0.0047, 0.00469, 0.00467, 0.00475, 0.00469, 0.00466, 0.00467]}, "learning-rate": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 
3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05]}, "learning-rate vs samples": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 6e-05, 
6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05]}, "batch-size": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 
128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0]}, "batch-size vs samples": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0]}, "lm loss": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [10.84303, 10.87155, 10.85032, 10.81087, 10.64537, 10.63943, 10.42704, 10.13551, 9.93496, 9.83494, 
9.58592, 9.84757, 9.88552, 9.63097, 9.79022, 9.51147, 9.4606, 9.65582, 9.39007, 9.33886, 9.24978, 9.152, 9.18226, 9.00447, 9.19856, 9.06681, 9.16059, 9.16939, 9.30049, 8.98819, 8.92948, 9.0507, 9.0463, 8.66041, 8.72526, 8.75716, 8.69559, 8.74303, 8.66681, 8.77472, 8.67057, 8.8619, 8.84447, 8.50989, 8.39988, 8.43941, 8.49864, 8.39575, 8.4422, 8.59464, 8.37842, 8.20138, 8.236, 8.2319, 8.27672, 7.92273, 8.10152, 7.8984, 8.25217, 8.23541, 8.01089, 7.97596, 7.92706, 7.74403, 7.7485, 7.65015, 7.52079, 7.9112, 7.70347, 7.45605, 7.74759, 7.77568, 7.54533, 7.30357, 7.45723, 7.3426, 7.46645, 7.22831, 7.63649, 7.28211, 7.34866, 7.21221, 7.21132, 7.41795, 7.17177, 7.28168, 6.99581, 7.004, 7.04074, 7.1367, 6.82354, 6.98508, 7.08921, 6.99769, 6.87461, 6.75657, 6.99031, 7.05959, 6.70411, 6.5827, 6.72604, 6.74348, 6.73218, 6.73708, 6.65685, 6.4055, 6.63559, 6.61892, 6.44639, 6.62609, 6.74333, 6.61179, 6.7261, 6.69431, 6.62741, 6.50922, 6.59901, 6.40739, 6.6657, 6.24852, 6.25199, 6.30265, 6.39086, 6.34866, 6.4484, 6.29117, 6.33917, 6.23682, 6.20019, 6.39713, 6.32382, 6.32063, 6.16132, 6.15692, 6.23736, 6.38207, 6.20216, 6.14927, 6.18286, 6.11574, 6.06273, 6.07513, 6.25658, 6.40785, 6.25681, 6.2924, 6.09673, 6.17564, 6.00002, 6.02568, 5.95394, 6.24995, 6.18499, 5.96441, 5.78379, 6.12452, 5.8475, 6.10173, 5.78491, 6.16542, 6.14406, 6.08134, 5.92727, 6.11254, 5.94363, 6.20077, 5.89399, 5.7901, 5.78128, 5.68813, 6.01482, 5.99528, 6.06741, 5.89085, 6.03981, 5.96811, 5.99655, 5.98984, 5.94628, 5.83848, 5.9481, 5.61614, 5.7002, 5.88656, 5.83806, 5.86311, 5.75859, 5.83316, 5.72072, 5.55659, 5.71965, 5.61978, 5.82718, 5.59717, 5.70318, 5.70327, 5.89853, 5.63883, 5.84367, 5.73571, 5.86365, 5.32462, 5.89684, 5.87059, 5.85018, 5.40966, 5.40521, 5.6244, 5.59463, 5.48385, 5.57514, 5.67111, 5.47486, 5.74063, 5.50617, 5.58954, 5.62055, 5.61722, 5.51063, 5.6138, 5.67042, 5.67814, 5.58421, 5.65728, 5.36779, 5.67697, 5.62608, 5.41953, 5.57893, 5.62664, 5.55034, 5.33858, 5.53624, 5.48821, 5.48891, 5.37489, 5.5499, 5.60024, 5.39139, 5.51868, 5.4935, 5.33216, 5.50746, 5.41318, 5.44698, 5.31869, 5.06634, 5.48126, 5.57099, 5.71639, 5.41515, 5.60293, 5.63581, 5.23321, 5.27358, 5.3934, 5.40049, 5.32861, 5.49563, 5.18115, 5.29818, 5.24632, 5.377, 5.25164, 5.44247, 5.53356, 5.31175, 5.43649, 5.33683, 5.07482, 5.31199, 5.25123, 5.30045, 5.10952, 5.27365, 5.26615, 5.4733, 5.15569, 5.2676, 5.21227, 5.35586, 4.98451, 4.91017, 5.32431, 5.38997, 5.22667, 5.3209, 5.10232, 5.16141, 5.26239, 5.0658, 5.26091, 5.06389, 5.34895, 5.24827, 5.1463, 5.24113, 5.03942, 5.31795, 5.05285, 5.02784, 5.14139, 5.11164, 5.27303, 5.15115, 5.2757, 5.09401, 5.09338, 5.24504, 5.32369, 5.25347, 5.19226, 5.14165, 5.29079, 4.95338, 5.20578, 5.09105, 5.30122, 5.17357, 5.19235, 5.11365, 4.98113, 4.9916, 5.22149, 5.30937, 5.10092, 5.0529, 4.91086, 5.12305, 5.11531, 4.92812, 5.3389, 5.02814, 5.10063, 5.16722, 5.00342, 5.0656, 5.06853, 5.0, 5.08165, 5.16456, 4.98252, 5.1839, 4.93148, 4.92569, 5.06682, 4.99595, 4.90624, 4.77517, 4.94606, 5.11508, 5.01539, 5.01397, 5.3327, 4.96029, 4.9915, 5.04439, 4.80654, 4.73199, 4.99639, 5.04237, 4.8734, 4.95425, 5.04678, 5.02392, 4.81994, 4.89463, 4.90711, 4.83288, 4.74257, 5.01934, 4.75352, 5.20696, 4.79359, 4.99212, 4.73894, 4.7885, 4.82299, 4.65617, 4.65522, 4.84524, 4.81217, 4.79792, 4.92038, 4.88607, 4.92565, 4.7712, 4.88216, 4.73528, 4.92078, 4.96145, 4.87447, 4.71317, 4.78702, 4.90462, 4.71624, 4.86657, 4.69712, 4.69196, 4.64876]}, "lm loss vs samples": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": 
[10.84303, 10.87155, 10.85032, 10.81087, 10.64537, 10.63943, 10.42704, 10.13551, 9.93496, 9.83494, 9.58592, 9.84757, 9.88552, 9.63097, 9.79022, 9.51147, 9.4606, 9.65582, 9.39007, 9.33886, 9.24978, 9.152, 9.18226, 9.00447, 9.19856, 9.06681, 9.16059, 9.16939, 9.30049, 8.98819, 8.92948, 9.0507, 9.0463, 8.66041, 8.72526, 8.75716, 8.69559, 8.74303, 8.66681, 8.77472, 8.67057, 8.8619, 8.84447, 8.50989, 8.39988, 8.43941, 8.49864, 8.39575, 8.4422, 8.59464, 8.37842, 8.20138, 8.236, 8.2319, 8.27672, 7.92273, 8.10152, 7.8984, 8.25217, 8.23541, 8.01089, 7.97596, 7.92706, 7.74403, 7.7485, 7.65015, 7.52079, 7.9112, 7.70347, 7.45605, 7.74759, 7.77568, 7.54533, 7.30357, 7.45723, 7.3426, 7.46645, 7.22831, 7.63649, 7.28211, 7.34866, 7.21221, 7.21132, 7.41795, 7.17177, 7.28168, 6.99581, 7.004, 7.04074, 7.1367, 6.82354, 6.98508, 7.08921, 6.99769, 6.87461, 6.75657, 6.99031, 7.05959, 6.70411, 6.5827, 6.72604, 6.74348, 6.73218, 6.73708, 6.65685, 6.4055, 6.63559, 6.61892, 6.44639, 6.62609, 6.74333, 6.61179, 6.7261, 6.69431, 6.62741, 6.50922, 6.59901, 6.40739, 6.6657, 6.24852, 6.25199, 6.30265, 6.39086, 6.34866, 6.4484, 6.29117, 6.33917, 6.23682, 6.20019, 6.39713, 6.32382, 6.32063, 6.16132, 6.15692, 6.23736, 6.38207, 6.20216, 6.14927, 6.18286, 6.11574, 6.06273, 6.07513, 6.25658, 6.40785, 6.25681, 6.2924, 6.09673, 6.17564, 6.00002, 6.02568, 5.95394, 6.24995, 6.18499, 5.96441, 5.78379, 6.12452, 5.8475, 6.10173, 5.78491, 6.16542, 6.14406, 6.08134, 5.92727, 6.11254, 5.94363, 6.20077, 5.89399, 5.7901, 5.78128, 5.68813, 6.01482, 5.99528, 6.06741, 5.89085, 6.03981, 5.96811, 5.99655, 5.98984, 5.94628, 5.83848, 5.9481, 5.61614, 5.7002, 5.88656, 5.83806, 5.86311, 5.75859, 5.83316, 5.72072, 5.55659, 5.71965, 5.61978, 5.82718, 5.59717, 5.70318, 5.70327, 5.89853, 5.63883, 5.84367, 5.73571, 5.86365, 5.32462, 5.89684, 5.87059, 5.85018, 5.40966, 5.40521, 5.6244, 5.59463, 5.48385, 5.57514, 5.67111, 5.47486, 5.74063, 5.50617, 5.58954, 5.62055, 5.61722, 5.51063, 5.6138, 5.67042, 5.67814, 5.58421, 5.65728, 5.36779, 5.67697, 5.62608, 5.41953, 5.57893, 5.62664, 5.55034, 5.33858, 5.53624, 5.48821, 5.48891, 5.37489, 5.5499, 5.60024, 5.39139, 5.51868, 5.4935, 5.33216, 5.50746, 5.41318, 5.44698, 5.31869, 5.06634, 5.48126, 5.57099, 5.71639, 5.41515, 5.60293, 5.63581, 5.23321, 5.27358, 5.3934, 5.40049, 5.32861, 5.49563, 5.18115, 5.29818, 5.24632, 5.377, 5.25164, 5.44247, 5.53356, 5.31175, 5.43649, 5.33683, 5.07482, 5.31199, 5.25123, 5.30045, 5.10952, 5.27365, 5.26615, 5.4733, 5.15569, 5.2676, 5.21227, 5.35586, 4.98451, 4.91017, 5.32431, 5.38997, 5.22667, 5.3209, 5.10232, 5.16141, 5.26239, 5.0658, 5.26091, 5.06389, 5.34895, 5.24827, 5.1463, 5.24113, 5.03942, 5.31795, 5.05285, 5.02784, 5.14139, 5.11164, 5.27303, 5.15115, 5.2757, 5.09401, 5.09338, 5.24504, 5.32369, 5.25347, 5.19226, 5.14165, 5.29079, 4.95338, 5.20578, 5.09105, 5.30122, 5.17357, 5.19235, 5.11365, 4.98113, 4.9916, 5.22149, 5.30937, 5.10092, 5.0529, 4.91086, 5.12305, 5.11531, 4.92812, 5.3389, 5.02814, 5.10063, 5.16722, 5.00342, 5.0656, 5.06853, 5.0, 5.08165, 5.16456, 4.98252, 5.1839, 4.93148, 4.92569, 5.06682, 4.99595, 4.90624, 4.77517, 4.94606, 5.11508, 5.01539, 5.01397, 5.3327, 4.96029, 4.9915, 5.04439, 4.80654, 4.73199, 4.99639, 5.04237, 4.8734, 4.95425, 5.04678, 5.02392, 4.81994, 4.89463, 4.90711, 4.83288, 4.74257, 5.01934, 4.75352, 5.20696, 4.79359, 4.99212, 4.73894, 4.7885, 4.82299, 4.65617, 4.65522, 4.84524, 4.81217, 4.79792, 4.92038, 4.88607, 4.92565, 4.7712, 4.88216, 4.73528, 4.92078, 4.96145, 4.87447, 4.71317, 4.78702, 4.90462, 4.71624, 4.86657, 4.69712, 4.69196, 
4.64876]}, "loss-scale": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]}, "loss-scale vs samples": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 
1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]}, "grad-norm": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [13.92196, 13.29306, 13.8377, 12.64037, 11.97375, 9.45262, 6.78823, 6.89004, 5.94557, 4.54615, 4.13637, 2.82375, 2.38927, 2.34389, 2.05973, 2.22596, 2.14457, 1.88597, 2.17986, 2.06069, 2.12423, 2.1677, 2.0115, 2.21442, 1.98307, 2.0966, 1.90389, 1.86829, 1.92477, 2.13027, 2.09469, 2.11211, 1.95723, 2.18758, 2.38519, 2.04808, 2.04244, 1.85027, 1.9837, 1.78603, 2.12943, 1.83753, 1.73653, 1.84787, 1.96175, 1.78052, 1.76095, 1.7401, 1.76961, 1.54057, 1.76088, 1.7938, 1.76365, 1.83855, 1.58517, 1.79545, 1.7158, 1.81815, 1.53518, 1.48648, 1.68949, 1.4562, 1.8648, 1.85145, 1.61928, 1.6745, 1.65487, 1.55646, 1.47797, 1.6989, 1.43883, 1.43836, 1.46011, 1.39711, 1.37457, 1.48663, 1.40785, 1.35385, 1.34051, 1.27757, 1.35283, 1.29709, 1.2816, 1.30185, 1.24092, 1.29738, 1.41961, 1.34489, 1.44199, 1.06928, 1.09491, 1.16108, 1.14396, 1.33634, 1.03654, 1.30756, 1.08982, 1.27845, 0.98191, 1.37412, 1.30793, 1.21672, 1.05131, 1.25909, 1.09643, 1.13996, 1.20961, 1.09191, 1.24074, 0.97878, 1.18535, 0.97714, 0.95456, 1.10186, 1.24389, 1.07847, 1.01822, 1.2519, 1.18392, 1.42087, 1.00253, 1.23223, 1.05494, 1.02956, 0.95692, 1.27887, 1.54081, 1.2168, 1.18019, 1.34805, 0.93443, 1.06987, 1.00938, 1.19729, 1.32572, 1.18029, 1.39724, 1.01719, 1.76109, 1.21222, 1.26256, 1.31969, 1.1555, 0.93801, 0.99546, 1.01521, 1.36553, 1.55577, 1.11391, 1.2491, 1.45721, 1.65042, 1.60593, 1.30243, 1.29342, 2.04924, 1.3376, 1.21234, 1.37945, 1.79037, 1.23389, 1.08215, 1.31811, 1.12901, 1.35786, 1.8341, 1.46143, 1.31586, 1.39491, 1.24546, 1.26969, 1.25412, 1.27022, 1.43967, 1.14847, 1.3362, 1.91114, 1.35642, 1.06973, 1.20518, 1.11732, 1.73877, 1.36915, 1.34679, 1.25766, 1.64809, 1.37397, 1.17279, 1.169, 1.49772, 1.11509, 1.29145, 1.479, 1.60514, 1.12787, 1.20465, 1.52478, 1.37769, 1.40825, 1.40433, 1.19434, 1.52129, 1.49087, 1.60752, 1.51416, 1.37753, 1.49097, 1.59106, 1.33146, 1.56964, 1.54958, 1.2024, 1.29844, 1.28184, 1.63096, 1.29563, 1.41842, 1.57651, 1.29669, 1.23902, 1.51872, 1.34276, 1.28172, 1.67239, 1.39643, 1.57361, 1.69097, 1.37206, 1.81716, 1.3501, 1.2879, 1.45938, 1.9477, 1.77504, 2.56828, 1.55284, 1.34454, 1.21685, 1.65336, 1.29693, 2.2136, 1.28644, 1.78502, 1.52285, 1.47963, 1.65183, 1.23421, 1.41797, 1.5183, 1.31219, 1.29375, 1.3932, 1.5544, 1.2678, 1.61107, 1.43809, 1.9371, 1.64335, 1.38939, 1.24473, 1.15131, 1.26598, 1.37433, 1.20588, 1.22283, 1.31678, 1.40086, 1.53213, 1.35367, 1.43407, 1.41639, 1.25063, 1.37444, 1.20928, 1.40445, 1.48011, 1.49606, 1.43456, 1.4511, 1.51505, 1.49329, 1.32736, 1.34283, 1.56947, 1.3986, 1.38533, 1.4325, 1.36846, 1.40113, 1.40195, 1.41944, 1.73207, 1.35246, 1.98477, 1.75001, 1.59412, 1.33312, 1.55175, 1.45641, 1.40103, 1.32697, 1.19674, 1.19056, 1.56111, 1.64, 1.52329, 1.62982, 1.42489, 1.1143, 1.42326, 1.36052, 1.20749, 1.49372, 1.38211, 1.6856, 1.48198, 1.34985, 1.48241, 1.24509, 
1.40355, 1.44024, 1.31152, 1.30253, 1.59307, 1.35212, 1.78683, 1.61562, 1.61575, 1.46207, 1.29047, 1.55842, 1.39097, 1.35377, 1.50655, 1.67836, 1.37929, 1.32311, 1.35305, 1.77455, 1.48895, 1.40827, 1.23883, 1.35995, 1.46576, 1.39021, 1.55027, 1.27874, 1.53316, 1.30645, 1.32818, 1.41856, 1.40297, 1.19176, 1.73797, 1.28462, 1.46556, 1.31822, 1.27157, 1.29905, 1.43641, 1.37732, 1.32041, 1.45048, 1.30403, 1.12439, 1.41266, 1.49642, 1.41634, 1.48283, 1.73467, 1.90209, 1.41005, 1.66166, 1.51488, 1.35734, 1.47652, 1.40564, 1.6499, 1.41346, 1.24965, 1.34929, 1.35141, 1.18107, 1.30851, 1.17223, 1.29341, 1.38306, 1.247, 1.29013, 1.70946, 1.36584, 1.4061, 1.82813, 1.27073, 1.45088, 1.55944, 1.5925, 1.64727, 1.42815, 1.19955]}, "grad-norm vs samples": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [13.92196, 13.29306, 13.8377, 12.64037, 11.97375, 9.45262, 6.78823, 6.89004, 5.94557, 4.54615, 4.13637, 2.82375, 2.38927, 2.34389, 2.05973, 2.22596, 2.14457, 1.88597, 2.17986, 2.06069, 2.12423, 2.1677, 2.0115, 2.21442, 1.98307, 2.0966, 1.90389, 1.86829, 1.92477, 2.13027, 2.09469, 2.11211, 1.95723, 2.18758, 2.38519, 2.04808, 2.04244, 1.85027, 1.9837, 1.78603, 2.12943, 1.83753, 1.73653, 1.84787, 1.96175, 1.78052, 1.76095, 1.7401, 1.76961, 1.54057, 1.76088, 1.7938, 1.76365, 1.83855, 1.58517, 1.79545, 1.7158, 1.81815, 1.53518, 1.48648, 1.68949, 1.4562, 1.8648, 1.85145, 1.61928, 1.6745, 1.65487, 1.55646, 1.47797, 1.6989, 1.43883, 1.43836, 1.46011, 1.39711, 1.37457, 1.48663, 1.40785, 1.35385, 1.34051, 1.27757, 1.35283, 1.29709, 1.2816, 1.30185, 1.24092, 1.29738, 1.41961, 1.34489, 1.44199, 1.06928, 1.09491, 1.16108, 1.14396, 1.33634, 1.03654, 1.30756, 1.08982, 1.27845, 0.98191, 1.37412, 1.30793, 1.21672, 1.05131, 1.25909, 1.09643, 1.13996, 1.20961, 1.09191, 1.24074, 0.97878, 1.18535, 0.97714, 0.95456, 1.10186, 1.24389, 1.07847, 1.01822, 1.2519, 1.18392, 1.42087, 1.00253, 1.23223, 1.05494, 1.02956, 0.95692, 1.27887, 1.54081, 1.2168, 1.18019, 1.34805, 0.93443, 1.06987, 1.00938, 1.19729, 1.32572, 1.18029, 1.39724, 1.01719, 1.76109, 1.21222, 1.26256, 1.31969, 1.1555, 0.93801, 0.99546, 1.01521, 1.36553, 1.55577, 1.11391, 1.2491, 1.45721, 1.65042, 1.60593, 1.30243, 1.29342, 2.04924, 1.3376, 1.21234, 1.37945, 1.79037, 1.23389, 1.08215, 1.31811, 1.12901, 1.35786, 1.8341, 1.46143, 1.31586, 1.39491, 1.24546, 1.26969, 1.25412, 1.27022, 1.43967, 1.14847, 1.3362, 1.91114, 1.35642, 1.06973, 1.20518, 1.11732, 1.73877, 1.36915, 1.34679, 1.25766, 1.64809, 1.37397, 1.17279, 1.169, 1.49772, 1.11509, 1.29145, 1.479, 1.60514, 1.12787, 1.20465, 1.52478, 1.37769, 1.40825, 1.40433, 1.19434, 1.52129, 1.49087, 1.60752, 1.51416, 1.37753, 1.49097, 1.59106, 1.33146, 1.56964, 1.54958, 1.2024, 1.29844, 1.28184, 1.63096, 1.29563, 1.41842, 1.57651, 1.29669, 1.23902, 1.51872, 1.34276, 1.28172, 1.67239, 1.39643, 1.57361, 1.69097, 1.37206, 1.81716, 1.3501, 1.2879, 1.45938, 1.9477, 1.77504, 2.56828, 1.55284, 1.34454, 1.21685, 1.65336, 1.29693, 2.2136, 1.28644, 1.78502, 1.52285, 1.47963, 1.65183, 1.23421, 1.41797, 1.5183, 1.31219, 1.29375, 1.3932, 1.5544, 1.2678, 1.61107, 1.43809, 1.9371, 1.64335, 1.38939, 1.24473, 1.15131, 1.26598, 1.37433, 1.20588, 1.22283, 1.31678, 1.40086, 1.53213, 1.35367, 1.43407, 1.41639, 1.25063, 1.37444, 1.20928, 1.40445, 1.48011, 1.49606, 1.43456, 1.4511, 1.51505, 1.49329, 1.32736, 1.34283, 1.56947, 1.3986, 1.38533, 1.4325, 1.36846, 1.40113, 1.40195, 1.41944, 1.73207, 1.35246, 1.98477, 1.75001, 1.59412, 1.33312, 1.55175, 1.45641, 1.40103, 1.32697, 1.19674, 1.19056, 1.56111, 1.64, 1.52329, 1.62982, 1.42489, 
1.1143, 1.42326, 1.36052, 1.20749, 1.49372, 1.38211, 1.6856, 1.48198, 1.34985, 1.48241, 1.24509, 1.40355, 1.44024, 1.31152, 1.30253, 1.59307, 1.35212, 1.78683, 1.61562, 1.61575, 1.46207, 1.29047, 1.55842, 1.39097, 1.35377, 1.50655, 1.67836, 1.37929, 1.32311, 1.35305, 1.77455, 1.48895, 1.40827, 1.23883, 1.35995, 1.46576, 1.39021, 1.55027, 1.27874, 1.53316, 1.30645, 1.32818, 1.41856, 1.40297, 1.19176, 1.73797, 1.28462, 1.46556, 1.31822, 1.27157, 1.29905, 1.43641, 1.37732, 1.32041, 1.45048, 1.30403, 1.12439, 1.41266, 1.49642, 1.41634, 1.48283, 1.73467, 1.90209, 1.41005, 1.66166, 1.51488, 1.35734, 1.47652, 1.40564, 1.6499, 1.41346, 1.24965, 1.34929, 1.35141, 1.18107, 1.30851, 1.17223, 1.29341, 1.38306, 1.247, 1.29013, 1.70946, 1.36584, 1.4061, 1.82813, 1.27073, 1.45088, 1.55944, 1.5925, 1.64727, 1.42815, 1.19955]}, "num-zeros": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [65.0, 80.0, 81.0, 75.0, 72.0, 103.0, 108.0, 112.0, 107.0, 122.0, 99.0, 159.0, 148.0, 150.0, 167.0, 157.0, 165.0, 144.0, 182.0, 187.0, 180.0, 162.0, 181.0, 129.0, 189.0, 148.0, 195.0, 190.0, 137.0, 181.0, 151.0, 155.0, 152.0, 166.0, 152.0, 170.0, 160.0, 209.0, 168.0, 214.0, 166.0, 181.0, 190.0, 185.0, 161.0, 162.0, 169.0, 187.0, 184.0, 239.0, 225.0, 187.0, 190.0, 131.0, 187.0, 182.0, 159.0, 161.0, 248.0, 226.0, 201.0, 211.0, 174.0, 164.0, 168.0, 225.0, 202.0, 174.0, 223.0, 202.0, 243.0, 235.0, 180.0, 239.0, 219.0, 205.0, 210.0, 192.0, 216.0, 207.0, 209.0, 245.0, 217.0, 227.0, 212.0, 207.0, 191.0, 173.0, 196.0, 193.0, 194.0, 186.0, 203.0, 189.0, 210.0, 160.0, 204.0, 187.0, 189.0, 159.0, 168.0, 209.0, 181.0, 159.0, 173.0, 153.0, 175.0, 152.0, 147.0, 174.0, 180.0, 153.0, 176.0, 146.0, 165.0, 154.0, 147.0, 106.0, 147.0, 133.0, 174.0, 148.0, 152.0, 143.0, 173.0, 127.0, 116.0, 130.0, 127.0, 123.0, 143.0, 142.0, 146.0, 123.0, 131.0, 124.0, 138.0, 139.0, 109.0, 107.0, 130.0, 103.0, 121.0, 157.0, 131.0, 148.0, 139.0, 96.0, 120.0, 101.0, 96.0, 102.0, 102.0, 122.0, 105.0, 84.0, 114.0, 117.0, 95.0, 90.0, 106.0, 137.0, 136.0, 131.0, 122.0, 95.0, 111.0, 99.0, 117.0, 119.0, 129.0, 111.0, 104.0, 112.0, 108.0, 102.0, 88.0, 97.0, 120.0, 121.0, 124.0, 96.0, 126.0, 134.0, 122.0, 98.0, 97.0, 115.0, 102.0, 102.0, 128.0, 120.0, 104.0, 104.0, 97.0, 112.0, 104.0, 96.0, 117.0, 97.0, 136.0, 100.0, 92.0, 104.0, 95.0, 111.0, 97.0, 87.0, 108.0, 128.0, 94.0, 111.0, 106.0, 122.0, 99.0, 94.0, 110.0, 104.0, 116.0, 119.0, 114.0, 112.0, 104.0, 104.0, 108.0, 88.0, 105.0, 114.0, 103.0, 105.0, 96.0, 98.0, 92.0, 92.0, 91.0, 102.0, 119.0, 106.0, 86.0, 104.0, 60.0, 110.0, 92.0, 91.0, 80.0, 91.0, 114.0, 106.0, 80.0, 119.0, 117.0, 112.0, 114.0, 98.0, 102.0, 109.0, 101.0, 100.0, 102.0, 126.0, 124.0, 99.0, 112.0, 110.0, 129.0, 111.0, 99.0, 119.0, 101.0, 82.0, 110.0, 84.0, 95.0, 104.0, 96.0, 107.0, 83.0, 114.0, 105.0, 93.0, 104.0, 108.0, 94.0, 99.0, 104.0, 101.0, 88.0, 112.0, 101.0, 101.0, 108.0, 119.0, 118.0, 103.0, 100.0, 107.0, 94.0, 104.0, 118.0, 111.0, 115.0, 100.0, 114.0, 90.0, 110.0, 107.0, 90.0, 91.0, 145.0, 113.0, 112.0, 120.0, 101.0, 98.0, 97.0, 96.0, 109.0, 100.0, 115.0, 120.0, 120.0, 121.0, 128.0, 103.0, 94.0, 104.0, 110.0, 89.0, 102.0, 106.0, 113.0, 117.0, 113.0, 115.0, 93.0, 114.0, 119.0, 132.0, 82.0, 112.0, 105.0, 96.0, 124.0, 107.0, 108.0, 104.0, 145.0, 119.0, 124.0, 115.0, 116.0, 94.0, 130.0, 98.0, 115.0, 117.0, 120.0, 122.0, 122.0, 110.0, 108.0, 87.0, 117.0, 102.0, 123.0, 108.0, 123.0, 107.0, 99.0, 127.0, 94.0, 107.0, 72.0, 102.0, 86.0, 91.0, 94.0, 116.0, 106.0, 120.0, 127.0, 115.0, 124.0, 126.0, 129.0, 117.0, 112.0, 120.0, 119.0, 
126.0, 111.0, 119.0, 91.0, 102.0, 95.0, 118.0, 111.0, 99.0, 122.0, 125.0]}, "num-zeros vs samples": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [65.0, 80.0, 81.0, 75.0, 72.0, 103.0, 108.0, 112.0, 107.0, 122.0, 99.0, 159.0, 148.0, 150.0, 167.0, 157.0, 165.0, 144.0, 182.0, 187.0, 180.0, 162.0, 181.0, 129.0, 189.0, 148.0, 195.0, 190.0, 137.0, 181.0, 151.0, 155.0, 152.0, 166.0, 152.0, 170.0, 160.0, 209.0, 168.0, 214.0, 166.0, 181.0, 190.0, 185.0, 161.0, 162.0, 169.0, 187.0, 184.0, 239.0, 225.0, 187.0, 190.0, 131.0, 187.0, 182.0, 159.0, 161.0, 248.0, 226.0, 201.0, 211.0, 174.0, 164.0, 168.0, 225.0, 202.0, 174.0, 223.0, 202.0, 243.0, 235.0, 180.0, 239.0, 219.0, 205.0, 210.0, 192.0, 216.0, 207.0, 209.0, 245.0, 217.0, 227.0, 212.0, 207.0, 191.0, 173.0, 196.0, 193.0, 194.0, 186.0, 203.0, 189.0, 210.0, 160.0, 204.0, 187.0, 189.0, 159.0, 168.0, 209.0, 181.0, 159.0, 173.0, 153.0, 175.0, 152.0, 147.0, 174.0, 180.0, 153.0, 176.0, 146.0, 165.0, 154.0, 147.0, 106.0, 147.0, 133.0, 174.0, 148.0, 152.0, 143.0, 173.0, 127.0, 116.0, 130.0, 127.0, 123.0, 143.0, 142.0, 146.0, 123.0, 131.0, 124.0, 138.0, 139.0, 109.0, 107.0, 130.0, 103.0, 121.0, 157.0, 131.0, 148.0, 139.0, 96.0, 120.0, 101.0, 96.0, 102.0, 102.0, 122.0, 105.0, 84.0, 114.0, 117.0, 95.0, 90.0, 106.0, 137.0, 136.0, 131.0, 122.0, 95.0, 111.0, 99.0, 117.0, 119.0, 129.0, 111.0, 104.0, 112.0, 108.0, 102.0, 88.0, 97.0, 120.0, 121.0, 124.0, 96.0, 126.0, 134.0, 122.0, 98.0, 97.0, 115.0, 102.0, 102.0, 128.0, 120.0, 104.0, 104.0, 97.0, 112.0, 104.0, 96.0, 117.0, 97.0, 136.0, 100.0, 92.0, 104.0, 95.0, 111.0, 97.0, 87.0, 108.0, 128.0, 94.0, 111.0, 106.0, 122.0, 99.0, 94.0, 110.0, 104.0, 116.0, 119.0, 114.0, 112.0, 104.0, 104.0, 108.0, 88.0, 105.0, 114.0, 103.0, 105.0, 96.0, 98.0, 92.0, 92.0, 91.0, 102.0, 119.0, 106.0, 86.0, 104.0, 60.0, 110.0, 92.0, 91.0, 80.0, 91.0, 114.0, 106.0, 80.0, 119.0, 117.0, 112.0, 114.0, 98.0, 102.0, 109.0, 101.0, 100.0, 102.0, 126.0, 124.0, 99.0, 112.0, 110.0, 129.0, 111.0, 99.0, 119.0, 101.0, 82.0, 110.0, 84.0, 95.0, 104.0, 96.0, 107.0, 83.0, 114.0, 105.0, 93.0, 104.0, 108.0, 94.0, 99.0, 104.0, 101.0, 88.0, 112.0, 101.0, 101.0, 108.0, 119.0, 118.0, 103.0, 100.0, 107.0, 94.0, 104.0, 118.0, 111.0, 115.0, 100.0, 114.0, 90.0, 110.0, 107.0, 90.0, 91.0, 145.0, 113.0, 112.0, 120.0, 101.0, 98.0, 97.0, 96.0, 109.0, 100.0, 115.0, 120.0, 120.0, 121.0, 128.0, 103.0, 94.0, 104.0, 110.0, 89.0, 102.0, 106.0, 113.0, 117.0, 113.0, 115.0, 93.0, 114.0, 119.0, 132.0, 82.0, 112.0, 105.0, 96.0, 124.0, 107.0, 108.0, 104.0, 145.0, 119.0, 124.0, 115.0, 116.0, 94.0, 130.0, 98.0, 115.0, 117.0, 120.0, 122.0, 122.0, 110.0, 108.0, 87.0, 117.0, 102.0, 123.0, 108.0, 123.0, 107.0, 99.0, 127.0, 94.0, 107.0, 72.0, 102.0, 86.0, 91.0, 94.0, 116.0, 106.0, 120.0, 127.0, 115.0, 124.0, 126.0, 129.0, 117.0, 112.0, 120.0, 119.0, 126.0, 111.0, 119.0, 91.0, 102.0, 95.0, 118.0, 111.0, 99.0, 122.0, 125.0]}, "params-norm": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [180.0215, 180.0215, 180.0215, 180.0215, 180.02148, 180.02147, 180.02145, 180.02135, 180.02115, 180.02094, 180.01958, 180.01897, 180.01846, 180.01624, 180.01547, 180.01477, 180.01425, 180.01407, 180.0143, 180.01436, 180.01428, 180.01392, 180.01378, 180.01396, 180.01488, 180.0161, 180.0174, 180.01826, 180.0192, 180.0206, 180.02252, 180.02473, 180.0269, 180.02902, 180.03149, 180.0345, 180.03801, 180.04181, 180.04555, 180.04977, 180.05466, 180.05992, 180.06546, 180.0715, 180.07837, 180.08574, 180.09346, 180.10184, 180.11102, 180.12125, 180.13229, 180.14392, 180.15607, 
180.16887, 180.1824, 180.19664, 180.21123, 180.22636, 180.24251, 180.25967, 180.27742, 180.29587, 180.31598, 180.33707, 180.3582, 180.3808, 180.40411, 180.42862, 180.45422, 180.48024, 180.50642, 180.53325, 180.56082, 180.58878, 180.61742, 180.64685, 180.67635, 180.70671, 180.73753, 180.76909, 180.80096, 180.83255, 180.86522, 180.89883, 180.93253, 180.96713, 181.00252, 181.03773, 181.07297, 181.10829, 181.14496, 181.18279, 181.22028, 181.25752, 181.29439, 181.32959, 181.36458, 181.40088, 181.43741, 181.47369, 181.50917, 181.54332, 181.57774, 181.61334, 181.64902, 181.68596, 181.7242, 181.7617, 181.79843, 181.83513, 181.87192, 181.90961, 181.94727, 181.9857, 182.02441, 182.06326, 182.1035, 182.14424, 182.18398, 182.22302, 182.26132, 182.30066, 182.33942, 182.37904, 182.41917, 182.45876, 182.49632, 182.53271, 182.56963, 182.60735, 182.64554, 182.68359, 182.72183, 182.75928, 182.79482, 182.83173, 182.86961, 182.90521, 182.94044, 182.97412, 183.00899, 183.04352, 183.0809, 183.12045, 183.16031, 183.20035, 183.24016, 183.27913, 183.31721, 183.35562, 183.39336, 183.42928, 183.46495, 183.50055, 183.53683, 183.57225, 183.60655, 183.64061, 183.67566, 183.71036, 183.74536, 183.78122, 183.81776, 183.85562, 183.89389, 183.93182, 183.96855, 184.00623, 184.04614, 184.08539, 184.12434, 184.16336, 184.20358, 184.2431, 184.28152, 184.32024, 184.3553, 184.3905, 184.42917, 184.4704, 184.51273, 184.55392, 184.59485, 184.63615, 184.67656, 184.71397, 184.74928, 184.78352, 184.82126, 184.86098, 184.90076, 184.94235, 184.98337, 185.02277, 185.0623, 185.10294, 185.14499, 185.18594, 185.22719, 185.26956, 185.31255, 185.35408, 185.39359, 185.43069, 185.46863, 185.50841, 185.54842, 185.5876, 185.62738, 185.66747, 185.7076, 185.74796, 185.78799, 185.82808, 185.86952, 185.91144, 185.95245, 185.99278, 186.03255, 186.07283, 186.11411, 186.15575, 186.19742, 186.2375, 186.27637, 186.31621, 186.35637, 186.39667, 186.43544, 186.4731, 186.51167, 186.55107, 186.5916, 186.63014, 186.66568, 186.69972, 186.73563, 186.77632, 186.81931, 186.86119, 186.89891, 186.93753, 186.97639, 187.01602, 187.0556, 187.0981, 187.14053, 187.1834, 187.22716, 187.27185, 187.31763, 187.36372, 187.4113, 187.45898, 187.506, 187.55214, 187.59671, 187.64069, 187.68445, 187.73042, 187.77773, 187.82211, 187.86797, 187.91481, 187.96231, 188.00858, 188.05304, 188.09511, 188.13795, 188.1804, 188.22424, 188.27013, 188.31894, 188.36742, 188.41576, 188.4644, 188.51416, 188.56253, 188.60983, 188.65424, 188.69913, 188.7431, 188.78632, 188.83072, 188.87659, 188.92245, 188.96892, 189.01532, 189.06158, 189.10831, 189.15527, 189.20079, 189.2475, 189.29361, 189.33777, 189.38203, 189.42827, 189.47591, 189.52328, 189.57204, 189.62096, 189.6709, 189.72188, 189.77139, 189.81842, 189.8649, 189.91235, 189.95949, 190.0078, 190.05704, 190.10622, 190.15698, 190.20724, 190.25786, 190.30705, 190.35727, 190.40851, 190.45973, 190.51111, 190.56392, 190.61598, 190.66782, 190.7196, 190.77359, 190.82573, 190.87747, 190.92769, 190.97775, 191.02827, 191.07834, 191.12999, 191.17932, 191.22862, 191.27965, 191.33025, 191.38222, 191.433, 191.48625, 191.53882, 191.59085, 191.64409, 191.698, 191.7515, 191.8065, 191.86282, 191.91794, 191.97198, 192.02602, 192.07971, 192.1337, 192.18675, 192.24236, 192.29745, 192.35396, 192.40863, 192.46198, 192.51579, 192.57161, 192.62778, 192.68323, 192.73868, 192.79523, 192.85144, 192.9077, 192.96512, 193.02281, 193.07899, 193.13582, 193.19206, 193.24911, 193.30396, 193.35805, 193.41168, 193.46552, 193.52077, 193.57597, 193.63229, 193.68961, 193.74706, 
193.80554, 193.86365, 193.92087, 193.97789, 194.03809, 194.09793, 194.15579, 194.21254, 194.27122, 194.33063, 194.39035, 194.44989, 194.51079, 194.56964, 194.62762, 194.68622, 194.74329, 194.79973, 194.85442, 194.91043, 194.96838]}, "params-norm vs samples": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [180.0215, 180.0215, 180.0215, 180.0215, 180.02148, 180.02147, 180.02145, 180.02135, 180.02115, 180.02094, 180.01958, 180.01897, 180.01846, 180.01624, 180.01547, 180.01477, 180.01425, 180.01407, 180.0143, 180.01436, 180.01428, 180.01392, 180.01378, 180.01396, 180.01488, 180.0161, 180.0174, 180.01826, 180.0192, 180.0206, 180.02252, 180.02473, 180.0269, 180.02902, 180.03149, 180.0345, 180.03801, 180.04181, 180.04555, 180.04977, 180.05466, 180.05992, 180.06546, 180.0715, 180.07837, 180.08574, 180.09346, 180.10184, 180.11102, 180.12125, 180.13229, 180.14392, 180.15607, 180.16887, 180.1824, 180.19664, 180.21123, 180.22636, 180.24251, 180.25967, 180.27742, 180.29587, 180.31598, 180.33707, 180.3582, 180.3808, 180.40411, 180.42862, 180.45422, 180.48024, 180.50642, 180.53325, 180.56082, 180.58878, 180.61742, 180.64685, 180.67635, 180.70671, 180.73753, 180.76909, 180.80096, 180.83255, 180.86522, 180.89883, 180.93253, 180.96713, 181.00252, 181.03773, 181.07297, 181.10829, 181.14496, 181.18279, 181.22028, 181.25752, 181.29439, 181.32959, 181.36458, 181.40088, 181.43741, 181.47369, 181.50917, 181.54332, 181.57774, 181.61334, 181.64902, 181.68596, 181.7242, 181.7617, 181.79843, 181.83513, 181.87192, 181.90961, 181.94727, 181.9857, 182.02441, 182.06326, 182.1035, 182.14424, 182.18398, 182.22302, 182.26132, 182.30066, 182.33942, 182.37904, 182.41917, 182.45876, 182.49632, 182.53271, 182.56963, 182.60735, 182.64554, 182.68359, 182.72183, 182.75928, 182.79482, 182.83173, 182.86961, 182.90521, 182.94044, 182.97412, 183.00899, 183.04352, 183.0809, 183.12045, 183.16031, 183.20035, 183.24016, 183.27913, 183.31721, 183.35562, 183.39336, 183.42928, 183.46495, 183.50055, 183.53683, 183.57225, 183.60655, 183.64061, 183.67566, 183.71036, 183.74536, 183.78122, 183.81776, 183.85562, 183.89389, 183.93182, 183.96855, 184.00623, 184.04614, 184.08539, 184.12434, 184.16336, 184.20358, 184.2431, 184.28152, 184.32024, 184.3553, 184.3905, 184.42917, 184.4704, 184.51273, 184.55392, 184.59485, 184.63615, 184.67656, 184.71397, 184.74928, 184.78352, 184.82126, 184.86098, 184.90076, 184.94235, 184.98337, 185.02277, 185.0623, 185.10294, 185.14499, 185.18594, 185.22719, 185.26956, 185.31255, 185.35408, 185.39359, 185.43069, 185.46863, 185.50841, 185.54842, 185.5876, 185.62738, 185.66747, 185.7076, 185.74796, 185.78799, 185.82808, 185.86952, 185.91144, 185.95245, 185.99278, 186.03255, 186.07283, 186.11411, 186.15575, 186.19742, 186.2375, 186.27637, 186.31621, 186.35637, 186.39667, 186.43544, 186.4731, 186.51167, 186.55107, 186.5916, 186.63014, 186.66568, 186.69972, 186.73563, 186.77632, 186.81931, 186.86119, 186.89891, 186.93753, 186.97639, 187.01602, 187.0556, 187.0981, 187.14053, 187.1834, 187.22716, 187.27185, 187.31763, 187.36372, 187.4113, 187.45898, 187.506, 187.55214, 187.59671, 187.64069, 187.68445, 187.73042, 187.77773, 187.82211, 187.86797, 187.91481, 187.96231, 188.00858, 188.05304, 188.09511, 188.13795, 188.1804, 188.22424, 188.27013, 188.31894, 188.36742, 188.41576, 188.4644, 188.51416, 188.56253, 188.60983, 188.65424, 188.69913, 188.7431, 188.78632, 188.83072, 188.87659, 188.92245, 188.96892, 189.01532, 189.06158, 189.10831, 189.15527, 189.20079, 189.2475, 189.29361, 189.33777, 189.38203, 189.42827, 
189.47591, 189.52328, 189.57204, 189.62096, 189.6709, 189.72188, 189.77139, 189.81842, 189.8649, 189.91235, 189.95949, 190.0078, 190.05704, 190.10622, 190.15698, 190.20724, 190.25786, 190.30705, 190.35727, 190.40851, 190.45973, 190.51111, 190.56392, 190.61598, 190.66782, 190.7196, 190.77359, 190.82573, 190.87747, 190.92769, 190.97775, 191.02827, 191.07834, 191.12999, 191.17932, 191.22862, 191.27965, 191.33025, 191.38222, 191.433, 191.48625, 191.53882, 191.59085, 191.64409, 191.698, 191.7515, 191.8065, 191.86282, 191.91794, 191.97198, 192.02602, 192.07971, 192.1337, 192.18675, 192.24236, 192.29745, 192.35396, 192.40863, 192.46198, 192.51579, 192.57161, 192.62778, 192.68323, 192.73868, 192.79523, 192.85144, 192.9077, 192.96512, 193.02281, 193.07899, 193.13582, 193.19206, 193.24911, 193.30396, 193.35805, 193.41168, 193.46552, 193.52077, 193.57597, 193.63229, 193.68961, 193.74706, 193.80554, 193.86365, 193.92087, 193.97789, 194.03809, 194.09793, 194.15579, 194.21254, 194.27122, 194.33063, 194.39035, 194.44989, 194.51079, 194.56964, 194.62762, 194.68622, 194.74329, 194.79973, 194.85442, 194.91043, 194.96838]}, "iteration-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [25.9357, 1.58651, 1.57374, 1.5753, 1.57369, 1.58365, 1.58825, 1.58527, 1.58564, 1.5777, 1.58419, 1.58585, 1.58154, 1.58741, 1.59392, 1.59071, 1.59711, 1.6014, 1.60351, 1.59396, 1.5899, 1.59645, 1.58704, 1.58712, 1.60341, 1.58462, 1.5838, 1.58964, 1.5977, 1.5914, 1.59087, 1.59805, 1.5927, 1.59042, 1.57661, 1.58906, 1.58372, 1.5783, 1.662, 1.58247, 1.58561, 1.58497, 1.60619, 1.59828, 1.60708, 1.60788, 1.6018, 1.59949, 1.59104, 1.5968, 1.60548, 1.60125, 1.59943, 1.58135, 1.58089, 1.58389, 1.58725, 1.58116, 1.58404, 1.58902, 1.58673, 1.58415, 1.60076, 1.59392, 1.59498, 1.58949, 1.59688, 1.59686, 1.58746, 1.59881, 1.5919, 1.59305, 1.60935, 1.59895, 1.60324, 1.60238, 1.59829, 1.60008, 1.59605, 1.60176, 1.59396, 1.60186, 1.58731, 1.58171, 1.58397, 1.58802, 1.58792, 1.5888, 1.5989, 1.60961, 1.59174, 1.61116, 1.59839, 1.5987, 1.60266, 1.59894, 1.60234, 1.59759, 1.59588, 1.59656, 1.60095, 1.59247, 1.59334, 1.58581, 1.60076, 1.5966, 1.58958, 1.58303, 1.58777, 1.58897, 1.59327, 1.59617, 1.59379, 1.59354, 1.58468, 1.59116, 1.58522, 1.58052, 1.57531, 1.59285, 1.58327, 1.57928, 1.58856, 1.60734, 1.60047, 1.58954, 1.5887, 1.59365, 1.57967, 1.58675, 1.57718, 1.58018, 1.58698, 1.58486, 1.59903, 1.5922, 1.59084, 1.58453, 1.58231, 1.58267, 1.58483, 1.58037, 1.5909, 1.60252, 1.60356, 1.58876, 1.59367, 1.60171, 1.59771, 1.6032, 1.60106, 1.60184, 1.60827, 1.60637, 1.60548, 1.60525, 1.60212, 1.60506, 1.59982, 1.60509, 1.60647, 1.60886, 1.60014, 1.60931, 1.59824, 1.60157, 1.60774, 1.60732, 1.61218, 1.61074, 1.60769, 1.60031, 1.59568, 1.59819, 1.6096, 1.59367, 1.60494, 1.59917, 1.59747, 1.60124, 1.59771, 1.59534, 1.60201, 1.59851, 1.60069, 1.60225, 1.59775, 1.59041, 1.60108, 1.59759, 1.59096, 1.60191, 1.5962, 1.60086, 1.61379, 1.60436, 1.60606, 1.60163, 1.60378, 1.60305, 1.59492, 1.60456, 1.60034, 1.58872, 1.59577, 1.59654, 1.59711, 1.59749, 1.59808, 1.60144, 1.59512, 1.59382, 1.59822, 1.59585, 1.59994, 1.59286, 1.59958, 1.60154, 1.59764, 1.59284, 1.59867, 1.6049, 1.6004, 1.59909, 1.60488, 1.59532, 1.60133, 1.60538, 1.5991, 1.59608, 1.60992, 1.60101, 1.60144, 1.59775, 1.59962, 1.58809, 1.59851, 1.59204, 1.59492, 1.59647, 1.58928, 1.58595, 1.7535, 1.6478, 1.59827, 1.60514, 1.59426, 1.61414, 1.60982, 1.60735, 1.60866, 1.70147, 1.60416, 1.59248, 1.59525, 1.59344, 1.59499, 1.60459, 1.6003, 1.60341, 1.60801, 1.61343, 1.60596, 1.60611, 
1.60542, 1.60121, 1.59801, 1.59823, 1.59998, 1.59829, 1.59898, 1.59531, 1.60142, 1.60403, 1.59966, 1.60202, 1.59979, 1.60042, 1.59732, 1.60245, 1.60091, 1.5998, 1.60238, 1.59984, 1.60274, 1.60666, 1.60321, 1.6036, 1.6041, 1.59868, 1.6015, 1.60892, 1.60377, 1.60116, 1.60829, 1.60355, 1.60349, 1.60256, 1.60399, 1.60265, 1.60684, 1.60536, 1.61211, 1.60719, 1.6104, 1.59911, 1.59879, 1.61165, 1.60015, 1.6048, 1.59789, 1.60116, 1.60929, 1.60128, 1.60444, 1.6133, 1.59942, 1.6132, 1.60448, 1.58597, 1.58802, 1.59401, 1.58972, 1.59965, 1.60201, 1.59413, 1.60397, 1.60165, 1.59963, 1.60178, 1.59826, 1.60301, 1.6063, 1.60499, 1.6023, 1.60467, 1.6048, 1.59497, 1.61355, 1.60237, 1.60516, 1.60289, 1.60404, 1.60076, 1.59623, 1.60269, 1.60248, 1.60802, 1.60059, 1.70142, 1.61751, 1.60679, 1.7026, 1.60996, 1.6083, 1.61064, 1.61183, 1.62052, 1.61909, 1.61534, 1.61668, 1.6033, 1.60768, 1.60386, 1.61143, 1.60918, 1.59776, 1.60709, 1.60535, 1.60161, 1.60666, 1.60582, 1.60545, 1.6075, 1.60733, 1.61657, 1.62133, 1.60999, 1.61188, 1.61305, 1.6069, 1.61671, 1.61762, 1.62212, 1.61922, 1.6081, 1.60551, 1.61555, 1.61354, 1.61632, 1.61937, 1.6141, 1.60911, 1.614, 1.61245, 1.61194, 1.6115, 1.60534, 1.60841, 1.60561]}, "lm loss validation": {"start_step": 0, "end_step": 4, "step_interval": 5, "values": [5.60068]}, "lm loss validation vs samples": {"start_step": 0, "end_step": 4, "step_interval": 5, "values": [5.60068]}, "lm loss validation ppl": {"start_step": 0, "end_step": 4, "step_interval": 5, "values": [270.6116]}, "lm loss validation ppl vs samples": {"start_step": 0, "end_step": 4, "step_interval": 5, "values": [270.6116]}} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_fp8_tp_pp_sp/golden_values_lts.json b/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_fp8_tp_pp_sp/golden_values_lts.json new file mode 100644 index 0000000000..0af59da700 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_fp8_tp_pp_sp/golden_values_lts.json @@ -0,0 +1 @@ +{"forward-backward-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [21.65799, 1.57316, 1.56036, 1.56197, 1.56002, 1.57036, 1.57498, 1.57179, 1.57223, 1.56447, 1.57065, 1.57253, 1.56833, 1.57388, 1.58074, 1.57741, 1.58388, 1.58795, 1.5903, 1.58075, 1.57656, 1.58312, 1.57306, 1.57348, 1.58999, 1.57118, 1.56942, 1.57642, 1.58455, 1.57798, 1.57753, 1.5848, 1.57952, 1.57466, 1.5634, 1.5759, 1.57055, 1.56518, 1.64863, 1.56915, 1.57234, 1.57176, 1.59307, 1.58513, 1.59397, 1.59455, 1.58862, 1.58627, 1.57781, 1.5836, 1.59175, 1.58787, 1.58531, 1.56743, 1.56768, 1.57061, 1.57416, 1.56759, 1.5696, 1.57589, 1.57313, 1.571, 1.58684, 1.58081, 1.58172, 1.57572, 1.58332, 1.58369, 1.5742, 1.58521, 1.57857, 1.57985, 1.59598, 1.58564, 1.58954, 1.58921, 1.58516, 1.58693, 1.58278, 1.58855, 1.58036, 1.58425, 1.57404, 1.56846, 1.57061, 1.57471, 1.57444, 1.57552, 1.58566, 1.59602, 1.57809, 1.59795, 1.58523, 1.58552, 1.58948, 1.5857, 1.58918, 1.58406, 1.58274, 1.58292, 1.5878, 1.57929, 1.57852, 1.57229, 1.58645, 1.58337, 1.57647, 1.56993, 1.57461, 1.57583, 1.57981, 1.58228, 1.58026, 1.58041, 1.57147, 1.57774, 1.57198, 1.56711, 1.56216, 1.57948, 1.57013, 1.5652, 1.57538, 1.59385, 1.58672, 1.57603, 1.57508, 1.58044, 1.56643, 1.57319, 1.56412, 1.56703, 1.57342, 1.57169, 1.58538, 1.57905, 1.57735, 1.5713, 1.56908, 1.56945, 1.57129, 1.5672, 1.57775, 1.58937, 1.59019, 1.5751, 1.58049, 1.58855, 1.58446, 1.59003, 1.58787, 
1.58871, 1.59524, 1.59317, 1.59223, 1.59165, 1.58901, 1.59193, 1.5866, 1.59184, 1.59323, 1.59575, 1.58596, 1.59591, 1.58463, 1.58779, 1.59392, 1.59398, 1.59893, 1.5974, 1.59446, 1.58691, 1.58241, 1.58352, 1.59639, 1.58013, 1.59181, 1.58597, 1.58425, 1.58787, 1.58445, 1.58197, 1.58869, 1.5852, 1.58751, 1.5889, 1.58458, 1.57701, 1.58666, 1.584, 1.57776, 1.58858, 1.58222, 1.58721, 1.60018, 1.59115, 1.59271, 1.58842, 1.59023, 1.58933, 1.57882, 1.59135, 1.5868, 1.57554, 1.58258, 1.58243, 1.58389, 1.58426, 1.5849, 1.58819, 1.58199, 1.58031, 1.58504, 1.58277, 1.5863, 1.57949, 1.58628, 1.58781, 1.58443, 1.57924, 1.58531, 1.59139, 1.58724, 1.58582, 1.59165, 1.58221, 1.58782, 1.59196, 1.58549, 1.58279, 1.59669, 1.58729, 1.58776, 1.58434, 1.58643, 1.57486, 1.58484, 1.57875, 1.58178, 1.58296, 1.57564, 1.57269, 1.73935, 1.63419, 1.58507, 1.59194, 1.5809, 1.60067, 1.59666, 1.59408, 1.59512, 1.68832, 1.59093, 1.57923, 1.58167, 1.5802, 1.58149, 1.59105, 1.58674, 1.59021, 1.59488, 1.60007, 1.59231, 1.59296, 1.59159, 1.588, 1.58471, 1.58515, 1.58686, 1.58415, 1.58593, 1.58185, 1.58805, 1.59063, 1.58623, 1.58868, 1.5863, 1.58712, 1.58387, 1.58919, 1.58738, 1.58618, 1.58901, 1.58673, 1.5896, 1.59327, 1.58995, 1.59034, 1.59043, 1.58508, 1.58835, 1.59575, 1.59028, 1.58788, 1.59495, 1.59031, 1.58998, 1.58896, 1.59037, 1.58923, 1.59259, 1.59082, 1.59843, 1.59394, 1.59716, 1.58592, 1.58443, 1.59841, 1.58588, 1.59009, 1.58471, 1.58793, 1.59585, 1.58806, 1.59097, 1.59974, 1.58594, 1.59971, 1.5913, 1.5727, 1.57474, 1.58074, 1.57644, 1.58641, 1.58808, 1.58075, 1.5907, 1.58838, 1.58642, 1.58856, 1.58469, 1.58982, 1.59264, 1.59172, 1.58848, 1.59119, 1.59145, 1.58124, 1.60003, 1.58841, 1.59199, 1.58955, 1.59024, 1.58713, 1.58159, 1.58812, 1.58697, 1.59477, 1.58735, 1.68808, 1.60409, 1.59368, 1.68921, 1.59656, 1.59503, 1.59737, 1.5981, 1.6072, 1.60584, 1.60205, 1.60339, 1.59005, 1.59398, 1.59059, 1.5983, 1.59588, 1.58451, 1.59372, 1.59209, 1.58828, 1.59305, 1.59272, 1.59217, 1.59417, 1.59371, 1.60293, 1.6081, 1.59666, 1.59861, 1.59979, 1.59362, 1.60255, 1.60302, 1.60884, 1.60587, 1.5947, 1.59209, 1.60211, 1.60023, 1.60283, 1.60565, 1.6008, 1.5957, 1.60008, 1.59899, 1.59865, 1.59781, 1.59196, 1.59478, 1.59227]}, "forward-compute-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [6.22042, 0.7887, 0.79083, 0.78962, 0.78756, 0.78885, 0.8016, 0.80118, 0.79635, 0.79549, 0.79171, 0.803, 0.8016, 0.79277, 0.79347, 0.80205, 0.80724, 0.8102, 0.80595, 0.79227, 0.78683, 0.79736, 0.79666, 0.79876, 0.80245, 0.79592, 0.79874, 0.79753, 0.81164, 0.79672, 0.79701, 0.80746, 0.80543, 0.79696, 0.79511, 0.79932, 0.79557, 0.79429, 0.84751, 0.79126, 0.79445, 0.79427, 0.81209, 0.80591, 0.79877, 0.8166, 0.8125, 0.80956, 0.80732, 0.79604, 0.80371, 0.80021, 0.79673, 0.78625, 0.79742, 0.79855, 0.79833, 0.79792, 0.79392, 0.79627, 0.78993, 0.80003, 0.78776, 0.80568, 0.77968, 0.7912, 0.79925, 0.79922, 0.79071, 0.79884, 0.78877, 0.79858, 0.81252, 0.8067, 0.79219, 0.81833, 0.81779, 0.80094, 0.80137, 0.81945, 0.80719, 0.79232, 0.79516, 0.80871, 0.80104, 0.79685, 0.80162, 0.80637, 0.80248, 0.80857, 0.81037, 0.80869, 0.7965, 0.80743, 0.8098, 0.80128, 0.80589, 0.80206, 0.80032, 0.80015, 0.79522, 0.79329, 0.80165, 0.80384, 0.80062, 0.79949, 0.80381, 0.78559, 0.80393, 0.80321, 0.80107, 0.79216, 0.79542, 0.79246, 0.80303, 0.8106, 0.79065, 0.79761, 0.79846, 0.80131, 0.80281, 0.79732, 0.7963, 0.81465, 0.81139, 0.79778, 0.80117, 0.79101, 0.78623, 0.79644, 0.7976, 0.79653, 0.79953, 0.79765, 0.80015, 0.81095, 0.80579, 0.7998, 0.7917, 0.79794, 
0.79775, 0.79275, 0.80199, 0.81948, 0.81204, 0.79625, 0.79973, 0.79652, 0.80445, 0.80534, 0.80518, 0.79884, 0.81423, 0.80952, 0.81247, 0.80766, 0.80443, 0.81182, 0.80591, 0.81339, 0.80677, 0.79581, 0.79801, 0.81209, 0.7963, 0.79413, 0.8031, 0.80814, 0.80927, 0.81215, 0.81255, 0.79604, 0.80852, 0.80814, 0.81295, 0.80402, 0.81318, 0.8097, 0.80155, 0.81294, 0.81295, 0.80384, 0.81085, 0.80809, 0.81049, 0.81462, 0.81121, 0.80114, 0.81317, 0.8073, 0.80801, 0.81335, 0.81351, 0.81644, 0.8235, 0.8092, 0.81494, 0.80197, 0.80738, 0.80524, 0.80729, 0.81006, 0.81098, 0.8058, 0.81736, 0.81018, 0.81686, 0.81077, 0.81584, 0.81737, 0.81149, 0.81076, 0.81213, 0.8138, 0.81013, 0.80497, 0.82135, 0.81652, 0.81154, 0.81448, 0.81949, 0.81162, 0.81162, 0.80853, 0.81191, 0.81703, 0.8125, 0.80932, 0.80851, 0.79798, 0.81183, 0.80938, 0.80838, 0.81083, 0.81336, 0.81205, 0.81618, 0.80587, 0.81362, 0.81042, 0.80604, 0.80513, 0.95515, 0.83951, 0.81274, 0.80912, 0.80158, 0.81243, 0.81495, 0.81427, 0.81731, 0.90437, 0.812, 0.81127, 0.80335, 0.80701, 0.81174, 0.81789, 0.8062, 0.81818, 0.81364, 0.82457, 0.81861, 0.81831, 0.81451, 0.81624, 0.819, 0.81664, 0.81149, 0.81897, 0.82098, 0.80639, 0.82356, 0.81998, 0.82291, 0.8172, 0.81813, 0.82015, 0.82009, 0.8243, 0.82188, 0.82103, 0.81895, 0.8227, 0.81898, 0.81687, 0.82231, 0.82276, 0.82281, 0.81752, 0.81589, 0.81308, 0.81283, 0.8171, 0.82039, 0.81907, 0.81497, 0.81934, 0.81714, 0.8101, 0.8135, 0.81914, 0.82468, 0.81829, 0.82195, 0.81334, 0.81505, 0.83, 0.82284, 0.82566, 0.82499, 0.82531, 0.81828, 0.81665, 0.82509, 0.82012, 0.82215, 0.82179, 0.81542, 0.80285, 0.81044, 0.80469, 0.8102, 0.8158, 0.81485, 0.82051, 0.80883, 0.82724, 0.81536, 0.8108, 0.81338, 0.81843, 0.81932, 0.81808, 0.81079, 0.81136, 0.82409, 0.81369, 0.81194, 0.81256, 0.81683, 0.81111, 0.8172, 0.80945, 0.80932, 0.8134, 0.81086, 0.81202, 0.81131, 0.86018, 0.81312, 0.81026, 0.91292, 0.81781, 0.81732, 0.82904, 0.82523, 0.83411, 0.83407, 0.83166, 0.82856, 0.81239, 0.81494, 0.82555, 0.83157, 0.82113, 0.80701, 0.81497, 0.8215, 0.80867, 0.81134, 0.82362, 0.81971, 0.808, 0.80408, 0.81663, 0.82201, 0.81271, 0.82346, 0.82415, 0.81743, 0.8063, 0.80216, 0.80964, 0.8105, 0.8118, 0.81122, 0.81369, 0.81864, 0.82566, 0.81149, 0.80986, 0.81981, 0.81964, 0.82004, 0.80608, 0.81446, 0.81929, 0.8075, 0.80881]}, "backward-compute-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [4.62942, 0.75097, 0.74, 0.74537, 0.74999, 0.75094, 0.74822, 0.74322, 0.74143, 0.74188, 0.75087, 0.75511, 0.75059, 0.75125, 0.75555, 0.7505, 0.76577, 0.75929, 0.75813, 0.75798, 0.75777, 0.75449, 0.75219, 0.76004, 0.76606, 0.74726, 0.75154, 0.75719, 0.75304, 0.75913, 0.75194, 0.76105, 0.75155, 0.75361, 0.75194, 0.74863, 0.75344, 0.75699, 0.76125, 0.76168, 0.75845, 0.75545, 0.76173, 0.76702, 0.76538, 0.76769, 0.75666, 0.75657, 0.75518, 0.75767, 0.75791, 0.75998, 0.76253, 0.75636, 0.75269, 0.75165, 0.75005, 0.74953, 0.7487, 0.76173, 0.75616, 0.75523, 0.77089, 0.75678, 0.76, 0.7504, 0.7563, 0.75155, 0.75497, 0.74943, 0.75435, 0.75485, 0.76133, 0.75829, 0.75424, 0.74885, 0.75032, 0.76341, 0.76306, 0.75225, 0.74967, 0.75803, 0.74607, 0.74997, 0.75189, 0.75522, 0.75126, 0.75345, 0.75402, 0.76221, 0.75573, 0.75879, 0.7447, 0.75592, 0.75875, 0.76088, 0.76149, 0.75471, 0.75716, 0.7483, 0.75544, 0.7486, 0.75419, 0.75681, 0.75858, 0.76287, 0.75413, 0.75433, 0.75404, 0.75102, 0.75167, 0.75697, 0.75394, 0.75963, 0.75308, 0.75609, 0.74811, 0.74816, 0.74646, 0.74523, 0.74868, 0.74707, 0.74934, 0.7508, 0.76531, 0.76133, 0.75869, 0.75454, 0.74851, 0.74933, 
0.74654, 0.74315, 0.74234, 0.74764, 0.75289, 0.7578, 0.75618, 0.75315, 0.75232, 0.75728, 0.75011, 0.75412, 0.75242, 0.74889, 0.75119, 0.75527, 0.75085, 0.7583, 0.76477, 0.75215, 0.75071, 0.76072, 0.75986, 0.76825, 0.75337, 0.75661, 0.75384, 0.76056, 0.76054, 0.76494, 0.7674, 0.76549, 0.75611, 0.76183, 0.75053, 0.75482, 0.75715, 0.76983, 0.77042, 0.76028, 0.77021, 0.75151, 0.75914, 0.75118, 0.76133, 0.75325, 0.76558, 0.75951, 0.76119, 0.75926, 0.75073, 0.75384, 0.75883, 0.7634, 0.76168, 0.76652, 0.75731, 0.75344, 0.76068, 0.75369, 0.75137, 0.75963, 0.7697, 0.751, 0.77098, 0.75284, 0.75939, 0.75995, 0.75928, 0.75802, 0.75677, 0.76065, 0.75638, 0.75119, 0.76038, 0.75423, 0.75553, 0.75918, 0.75995, 0.75408, 0.76136, 0.74612, 0.75854, 0.75865, 0.7593, 0.75419, 0.75151, 0.75761, 0.76577, 0.75463, 0.74788, 0.75358, 0.76279, 0.76172, 0.76321, 0.75292, 0.75124, 0.75794, 0.76269, 0.76049, 0.75669, 0.7573, 0.75738, 0.75375, 0.76126, 0.75621, 0.75055, 0.75297, 0.75603, 0.75099, 0.75101, 0.74554, 0.83246, 0.7545, 0.75293, 0.75203, 0.75391, 0.7554, 0.75839, 0.75728, 0.76242, 0.75203, 0.75857, 0.7516, 0.75317, 0.75327, 0.75445, 0.7579, 0.753, 0.753, 0.75219, 0.75665, 0.75118, 0.75048, 0.74602, 0.74682, 0.75041, 0.74864, 0.75542, 0.74976, 0.74748, 0.75186, 0.75401, 0.75027, 0.74959, 0.75363, 0.74766, 0.75374, 0.751, 0.75381, 0.75069, 0.74504, 0.75077, 0.75083, 0.75402, 0.74825, 0.75092, 0.75145, 0.75314, 0.75502, 0.74951, 0.7579, 0.75347, 0.7511, 0.75538, 0.75696, 0.7579, 0.75511, 0.75693, 0.75306, 0.74836, 0.7533, 0.75717, 0.76271, 0.75482, 0.75341, 0.74896, 0.75096, 0.74632, 0.75083, 0.74516, 0.74075, 0.75065, 0.75718, 0.75375, 0.7557, 0.7462, 0.75504, 0.75655, 0.74982, 0.75081, 0.74949, 0.74808, 0.75239, 0.75544, 0.74273, 0.75537, 0.75449, 0.75109, 0.7469, 0.7528, 0.75193, 0.75171, 0.75366, 0.75959, 0.74847, 0.75215, 0.75052, 0.76098, 0.75632, 0.75747, 0.74845, 0.74437, 0.75406, 0.75357, 0.75105, 0.75484, 0.75765, 0.75917, 0.7582, 0.75622, 0.75762, 0.74952, 0.75592, 0.75778, 0.74829, 0.75888, 0.75085, 0.75064, 0.74667, 0.751, 0.75208, 0.75768, 0.74883, 0.75857, 0.7487, 0.75962, 0.76274, 0.75413, 0.75644, 0.75008, 0.75022, 0.75465, 0.76027, 0.75685, 0.7526, 0.7567, 0.75515, 0.75552, 0.75496, 0.75875, 0.76104, 0.77511, 0.77406, 0.768, 0.7781, 0.77247, 0.78055, 0.77825, 0.76677, 0.78188, 0.77415, 0.77114, 0.77225, 0.77049, 0.77717, 0.77115, 0.76807, 0.77259, 0.77472]}, "batch-generator-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [2.20334, 0.0143, 0.01667, 0.01326, 0.01295, 0.01293, 0.01334, 0.01436, 0.01318, 0.01437, 0.01301, 0.01378, 0.01472, 0.01468, 0.01314, 0.01281, 0.01302, 0.01378, 0.01285, 0.01444, 0.01432, 0.01486, 0.01305, 0.01348, 0.01674, 0.01301, 0.01444, 0.01426, 0.01437, 0.01321, 0.01305, 0.01316, 0.01395, 0.01333, 0.01301, 0.01363, 0.01284, 0.01423, 0.01642, 0.01753, 0.01691, 0.01476, 0.01495, 0.01652, 0.01707, 0.02019, 0.01642, 0.01534, 0.01555, 0.01455, 0.01613, 0.01682, 0.01611, 0.01302, 0.01316, 0.01386, 0.0152, 0.01835, 0.01342, 0.01579, 0.01295, 0.01372, 0.01717, 0.0153, 0.01567, 0.01348, 0.01623, 0.0153, 0.01466, 0.01622, 0.01222, 0.01602, 0.02111, 0.01556, 0.01731, 0.01708, 0.01773, 0.0175, 0.01682, 0.0175, 0.01625, 0.0172, 0.01748, 0.02121, 0.01676, 0.01653, 0.01683, 0.01767, 0.01788, 0.01764, 0.01715, 0.02209, 0.01681, 0.01797, 0.01754, 0.01797, 0.01781, 0.01828, 0.0179, 0.01691, 0.01823, 0.0176, 0.01724, 0.0166, 0.01718, 0.01732, 0.0149, 0.01363, 0.01477, 0.01454, 0.01309, 0.01297, 0.01408, 0.0145, 0.01297, 0.01965, 0.01506, 0.01303, 0.01404, 0.01373, 
0.01435, 0.01442, 0.01449, 0.01568, 0.01599, 0.01299, 0.01288, 0.01478, 0.01302, 0.01354, 0.01604, 0.01518, 0.01493, 0.01391, 0.01308, 0.01275, 0.01267, 0.01483, 0.0133, 0.01279, 0.01339, 0.01261, 0.01553, 0.01269, 0.0125, 0.01256, 0.01329, 0.0129, 0.01284, 0.01681, 0.01599, 0.01537, 0.0153, 0.01362, 0.01518, 0.01566, 0.01486, 0.01485, 0.01522, 0.01745, 0.01558, 0.01496, 0.01484, 0.01693, 0.01487, 0.01546, 0.02093, 0.01683, 0.01724, 0.01738, 0.01648, 0.01861, 0.01776, 0.01745, 0.01724, 0.01583, 0.02118, 0.01682, 0.01836, 0.02112, 0.01766, 0.0169, 0.01696, 0.01695, 0.01754, 0.01652, 0.0184, 0.0173, 0.01627, 0.01667, 0.01742, 0.01775, 0.01745, 0.01643, 0.01709, 0.01696, 0.01761, 0.01648, 0.01725, 0.01672, 0.21908, 0.01675, 0.01611, 0.01752, 0.01616, 0.01728, 0.01777, 0.0171, 0.01749, 0.01847, 0.01858, 0.01789, 0.01723, 0.01628, 0.01773, 0.01691, 0.01878, 0.01787, 0.0209, 0.01796, 0.01741, 0.01777, 0.01829, 0.01892, 0.01729, 0.01774, 0.01727, 0.02061, 0.01571, 0.01771, 0.01838, 0.01772, 0.0174, 0.01766, 0.01725, 0.01763, 0.01752, 0.01709, 0.01817, 0.02143, 0.0161, 0.01751, 0.09405, 0.06723, 0.01758, 0.01661, 0.02181, 0.02167, 0.01822, 0.01785, 0.01747, 0.01708, 0.01826, 0.01765, 0.01811, 0.01727, 0.01812, 0.01807, 0.01812, 0.01919, 0.01774, 0.01749, 0.01737, 0.01751, 0.01714, 0.02283, 0.01759, 0.01975, 0.02057, 0.01799, 0.01752, 0.01739, 0.01757, 0.01773, 0.01789, 0.01729, 0.01642, 0.01712, 0.0176, 0.01717, 0.01691, 0.01727, 0.01589, 0.01789, 0.0174, 0.0174, 0.01722, 0.01761, 0.01802, 0.0174, 0.02069, 0.0171, 0.01719, 0.01766, 0.01768, 0.01677, 0.01705, 0.01777, 0.01669, 0.02073, 0.01723, 0.01707, 0.01707, 0.01723, 0.01751, 0.01953, 0.0174, 0.0167, 0.01749, 0.01753, 0.01974, 0.01695, 0.01888, 0.01805, 0.01809, 0.01779, 0.0192, 0.01732, 0.01965, 0.01793, 0.01875, 0.01855, 0.01915, 0.01839, 0.01868, 0.01864, 0.01893, 0.01823, 0.01908, 0.01892, 0.01884, 0.01914, 0.02012, 0.01861, 0.02283, 0.01928, 0.01945, 0.01841, 0.01795, 0.01816, 0.0187, 0.01867, 0.01891, 0.02308, 0.0188, 0.01869, 0.01974, 0.02014, 0.02234, 0.0193, 0.01762, 0.01819, 0.0184, 0.01952, 0.01974, 0.01869, 0.0205, 0.018, 0.0183, 0.01719, 0.01915, 0.01879, 0.0194, 0.01781, 0.01856, 0.01773, 0.01734, 0.01914, 0.0169, 0.019, 0.01792, 0.01743, 0.02488, 0.01724, 0.01703, 0.01755, 0.01784, 0.01774, 0.01824, 0.01859, 0.02236, 0.01639, 0.0181, 0.01772, 0.01786, 0.01787, 0.01629, 0.01663, 0.01687, 0.01734, 0.01643, 0.0175, 0.0166, 0.01686, 0.0162, 0.01662, 0.02025, 0.01762, 0.01683, 0.01837]}, "forward-recv-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [5.65416, 0.02537, 0.02635, 0.02461, 0.02504, 0.02484, 0.02542, 0.02517, 0.02613, 0.02496, 0.02499, 0.02526, 0.02517, 0.02669, 0.02527, 0.02523, 0.02555, 0.02514, 0.02531, 0.02544, 0.02502, 0.02866, 0.02534, 0.02519, 0.02546, 0.02642, 0.02449, 0.02505, 0.02448, 0.02468, 0.02481, 0.02534, 0.02569, 0.02662, 0.02525, 0.02575, 0.02553, 0.02468, 0.02518, 0.02486, 0.02617, 0.0262, 0.02498, 0.02481, 0.02556, 0.02544, 0.02525, 0.02507, 0.02521, 0.02526, 0.02607, 0.02518, 0.02513, 0.02559, 0.02488, 0.02586, 0.02585, 0.02611, 0.02926, 0.02566, 0.02649, 0.02556, 0.02541, 0.02684, 0.0255, 0.02555, 0.0255, 0.0255, 0.02545, 0.02694, 0.02533, 0.02962, 0.02527, 0.02528, 0.02579, 0.02515, 0.02509, 0.02553, 0.02514, 0.02532, 0.02535, 0.02565, 0.02505, 0.02564, 0.02529, 0.02581, 0.02662, 0.02629, 0.02709, 0.02508, 0.0255, 0.02567, 0.02579, 0.0251, 0.02471, 0.02553, 0.02567, 0.02524, 0.02526, 0.02542, 0.02549, 0.02485, 0.0254, 0.02557, 0.02563, 0.02532, 0.02527, 0.02538, 0.02679, 
0.02564, 0.02917, 0.02565, 0.02736, 0.02515, 0.02504, 0.02493, 0.02534, 0.0255, 0.02468, 0.02576, 0.02535, 0.02502, 0.02542, 0.02937, 0.02618, 0.02564, 0.02552, 0.02493, 0.02464, 0.02534, 0.02541, 0.02506, 0.02906, 0.02585, 0.02551, 0.02458, 0.02524, 0.0254, 0.02487, 0.02705, 0.02476, 0.02422, 0.02846, 0.02862, 0.02919, 0.02491, 0.02528, 0.0255, 0.02536, 0.02481, 0.02663, 0.02537, 0.02529, 0.02555, 0.02495, 0.02532, 0.02892, 0.02477, 0.02508, 0.0255, 0.02505, 0.0255, 0.02603, 0.02601, 0.02543, 0.0257, 0.02514, 0.02658, 0.02696, 0.02519, 0.02558, 0.02777, 0.027, 0.02528, 0.02566, 0.02491, 0.02592, 0.02533, 0.02595, 0.0256, 0.02521, 0.02524, 0.02528, 0.02552, 0.02639, 0.02554, 0.02548, 0.02553, 0.02553, 0.02546, 0.02481, 0.02518, 0.02516, 0.02541, 0.02568, 0.02495, 0.02523, 0.02848, 0.02556, 0.02499, 0.022, 0.02884, 0.02809, 0.02537, 0.02485, 0.02541, 0.0241, 0.02529, 0.02531, 0.02522, 0.02532, 0.02491, 0.02523, 0.02501, 0.02691, 0.02738, 0.02935, 0.02585, 0.02542, 0.02516, 0.02571, 0.03013, 0.02563, 0.02483, 0.0253, 0.02509, 0.02525, 0.0255, 0.02513, 0.02517, 0.02489, 0.02524, 0.02485, 0.02507, 0.02536, 0.02583, 0.02534, 0.02509, 0.0251, 0.02531, 0.02518, 0.02475, 0.02917, 0.02567, 0.02587, 0.02568, 0.02609, 0.02628, 0.02622, 0.02564, 0.02497, 0.02578, 0.02549, 0.02526, 0.02494, 0.02571, 0.02582, 0.02631, 0.02647, 0.02581, 0.02643, 0.02664, 0.0263, 0.02556, 0.025, 0.02535, 0.02517, 0.02527, 0.0252, 0.02486, 0.02861, 0.02534, 0.02604, 0.02568, 0.02564, 0.02728, 0.02552, 0.02578, 0.02551, 0.02575, 0.02545, 0.02536, 0.02514, 0.02619, 0.02548, 0.02549, 0.02561, 0.02555, 0.02574, 0.02616, 0.02572, 0.02599, 0.02561, 0.02503, 0.02535, 0.02684, 0.02548, 0.02545, 0.02557, 0.02504, 0.02542, 0.0261, 0.02567, 0.02546, 0.0255, 0.02529, 0.02633, 0.03021, 0.0287, 0.0293, 0.0291, 0.03051, 0.03077, 0.02941, 0.03025, 0.02889, 0.02504, 0.02563, 0.02509, 0.02514, 0.02874, 0.02525, 0.02524, 0.02529, 0.02567, 0.02595, 0.02539, 0.02551, 0.02571, 0.02607, 0.02531, 0.02862, 0.02572, 0.02526, 0.02664, 0.02609, 0.02882, 0.02605, 0.02621, 0.02593, 0.02588, 0.02619, 0.02534, 0.02604, 0.02557, 0.02616, 0.02561, 0.02542, 0.02469, 0.02539, 0.02533, 0.02624, 0.02525, 0.02545, 0.02533, 0.02553, 0.02573, 0.02577, 0.0253, 0.02529, 0.02629, 0.02636, 0.02548, 0.02577, 0.0255, 0.02611, 0.02473, 0.02582, 0.02551, 0.02567, 0.0253, 0.02519, 0.0256, 0.02642, 0.02489, 0.02549, 0.02566, 0.0257, 0.02523, 0.02566, 0.02708, 0.02568, 0.025, 0.02826, 0.02772, 0.02446, 0.02415, 0.0242, 0.02452, 0.02402, 0.02491, 0.02511, 0.02443, 0.0247, 0.02457, 0.02433, 0.02427, 0.02485, 0.02473, 0.02411]}, "forward-send-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [2.82565, 0.00013, 0.00013, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00019, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00013, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00013, 0.00013, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00013, 0.00013, 0.00012, 0.00012, 0.00013, 0.00012, 0.00013, 0.00012, 0.00012, 0.00013, 0.00012, 0.00013, 0.00013, 0.00013, 0.00013, 0.00012, 0.00013, 0.00013, 0.00012, 0.00013, 0.00013, 0.00012, 0.00013, 0.00012, 0.00013, 0.00013, 0.00012, 0.00013, 0.00013, 0.00012, 0.00012, 0.00014, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00013, 0.00013, 0.00012, 0.00013, 0.00012, 0.00013, 0.00013, 0.00012, 0.00012, 0.00013, 0.00012, 
0.00012, 0.00012, 0.00013, 0.00013, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00015, 0.00012, 0.00013, 0.00013, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00013, 0.00013, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00013, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00013, 0.00014, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00013, 0.00013, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00013, 0.00013, 0.00013, 0.00013, 0.00013, 0.00012, 0.00013, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00018, 0.00012, 0.00012, 0.00013, 0.00013, 0.00012, 0.00013, 0.00012, 0.00013, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00013, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00013, 0.00012, 0.00013, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00014, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00013, 0.00013, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00013, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00014, 0.00013, 0.00012, 0.00013, 0.00013, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00013, 0.00013, 0.00012, 0.00013, 0.00013, 0.00013, 0.00013, 0.00013, 0.00014, 0.00013, 0.00013, 0.00013, 0.00012, 0.00012, 0.00013, 0.00012, 0.00013, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00014, 0.00014, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00013, 0.00013, 0.00013, 0.00013, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00013, 0.00012, 0.00013, 0.00012, 0.00012, 0.00013, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012]}, "backward-recv-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.02047, 0.0283, 0.02457, 0.02402, 0.02376, 0.02455, 0.02368, 0.02489, 0.03547, 0.02397, 0.02483, 0.02383, 0.02354, 0.02677, 0.02403, 0.02404, 0.02385, 0.02413, 0.02382, 0.02401, 0.02447, 0.02418, 0.02565, 0.02458, 0.02399, 0.02426, 0.02371, 0.02373, 0.02497, 0.02531, 0.02428, 0.02424, 0.02812, 0.02847, 0.02391, 0.0276, 0.02414, 0.02342, 0.02403, 0.0241, 0.02246, 0.0239, 0.02373, 0.02354, 0.024, 0.02551, 0.02523, 0.02434, 0.02333, 0.02695, 0.02802, 0.03335, 0.024, 0.02415, 0.02428, 0.0235, 0.02721, 0.02385, 0.02396, 0.02372, 0.02372, 0.02589, 0.02448, 0.02657, 0.02807, 0.02364, 0.02407, 0.02393, 0.02278, 0.02609, 0.02324, 0.02406, 0.02392, 0.02575, 0.02435, 0.02335, 0.02423, 0.02688, 0.02482, 0.02464, 0.0283, 0.02798, 0.02454, 
0.02403, 0.02385, 0.02375, 0.024, 0.02436, 0.02658, 0.02418, 0.02444, 0.02438, 0.02772, 0.02445, 0.02469, 0.02482, 0.025, 0.0236, 0.02423, 0.02583, 0.02383, 0.02532, 0.02443, 0.02397, 0.02832, 0.02453, 0.02425, 0.02386, 0.02401, 0.02329, 0.02374, 0.02459, 0.02345, 0.02812, 0.02257, 0.02428, 0.03159, 0.02496, 0.02394, 0.02407, 0.02348, 0.02404, 0.0242, 0.02606, 0.02405, 0.02413, 0.02672, 0.02751, 0.02579, 0.02343, 0.02459, 0.02392, 0.02467, 0.02321, 0.02966, 0.02406, 0.02342, 0.02901, 0.02438, 0.02338, 0.02418, 0.02428, 0.02389, 0.02408, 0.02451, 0.02382, 0.02778, 0.02307, 0.02734, 0.02437, 0.02405, 0.02422, 0.02458, 0.02387, 0.02398, 0.02622, 0.0253, 0.02883, 0.02608, 0.02311, 0.02341, 0.0239, 0.02486, 0.02775, 0.02913, 0.02946, 0.03162, 0.03164, 0.03243, 0.02904, 0.03427, 0.02606, 0.02427, 0.02426, 0.02481, 0.02533, 0.02412, 0.02331, 0.02327, 0.02433, 0.02456, 0.02446, 0.02307, 0.02419, 0.02354, 0.02436, 0.02445, 0.02378, 0.02468, 0.02434, 0.02455, 0.02741, 0.02293, 0.02633, 0.02903, 0.02671, 0.02326, 0.0238, 0.02369, 0.02323, 0.02472, 0.02363, 0.02637, 0.02415, 0.0239, 0.02407, 0.02419, 0.0237, 0.02387, 0.02419, 0.02417, 0.02427, 0.02439, 0.02456, 0.02399, 0.02419, 0.0259, 0.02715, 0.02432, 0.02384, 0.02406, 0.02463, 0.02389, 0.02404, 0.02528, 0.02496, 0.0241, 0.02492, 0.02586, 0.02752, 0.02936, 0.02831, 0.02641, 0.02748, 0.02535, 0.0236, 0.02441, 0.02391, 0.02402, 0.02375, 0.02392, 0.02658, 0.02281, 0.02404, 0.02443, 0.02393, 0.02425, 0.02565, 0.02492, 0.02922, 0.02822, 0.02695, 0.02827, 0.02425, 0.02791, 0.02429, 0.02507, 0.02421, 0.02448, 0.02504, 0.02444, 0.02428, 0.02484, 0.02431, 0.0247, 0.02476, 0.02429, 0.02826, 0.02806, 0.02466, 0.02444, 0.02446, 0.02398, 0.0246, 0.02694, 0.02743, 0.02754, 0.02821, 0.02752, 0.02768, 0.02846, 0.02827, 0.02821, 0.02757, 0.02781, 0.03032, 0.0282, 0.02767, 0.02766, 0.02791, 0.02891, 0.02728, 0.02724, 0.02826, 0.02818, 0.0275, 0.02704, 0.02768, 0.02881, 0.02841, 0.02812, 0.02758, 0.02852, 0.02732, 0.02863, 0.0247, 0.02488, 0.02405, 0.02493, 0.02485, 0.025, 0.02485, 0.0248, 0.02492, 0.02512, 0.02464, 0.02467, 0.02816, 0.02752, 0.02469, 0.02368, 0.02464, 0.02438, 0.02448, 0.02474, 0.0246, 0.0247, 0.02471, 0.02492, 0.02452, 0.02459, 0.02436, 0.02461, 0.02714, 0.02468, 0.02624, 0.02941, 0.02449, 0.02703, 0.02762, 0.0284, 0.02681, 0.02872, 0.02442, 0.02456, 0.02406, 0.02457, 0.02358, 0.02347, 0.02871, 0.03113, 0.02849, 0.02643, 0.02442, 0.02499, 0.02477, 0.02568, 0.02464, 0.02487, 0.02408, 0.0248, 0.0262, 0.02523, 0.02571, 0.02565, 0.02504, 0.02409, 0.02564, 0.02393, 0.02423, 0.02644, 0.0241, 0.02354, 0.02445, 0.02479, 0.02481, 0.02499, 0.02444, 0.02433, 0.02438, 0.02439, 0.02468, 0.02426, 0.02465, 0.02263, 0.02673, 0.0262, 0.02622, 0.02641, 0.0272, 0.02655, 0.02722, 0.02659, 0.02705, 0.02744, 0.02687, 0.02797, 0.02579, 0.0241, 0.02442]}, "backward-send-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.00019, 0.00019, 0.00016, 0.0002, 0.00018, 0.00018, 0.00016, 0.00018, 0.00022, 0.00017, 0.00018, 0.00017, 0.00018, 0.00016, 0.00017, 0.00017, 0.00018, 0.00017, 0.00016, 0.00016, 0.00019, 0.00019, 0.00016, 0.00018, 0.00016, 0.00016, 0.00018, 0.00016, 0.00017, 0.00018, 0.00016, 0.00019, 0.00018, 0.00016, 0.00019, 0.00016, 0.00016, 0.00018, 0.00018, 0.00018, 0.0002, 0.00018, 0.00018, 0.00017, 0.00016, 0.00018, 0.00017, 0.00017, 0.00018, 0.00021, 0.00019, 0.00018, 0.00019, 0.00018, 0.00016, 0.00017, 0.00018, 0.00018, 0.00018, 0.00016, 0.00018, 0.00018, 0.00016, 0.00018, 0.0002, 0.00018, 0.00018, 0.00017, 0.00018, 0.00016, 0.00016, 
0.00019, 0.00021, 0.00017, 0.00016, 0.00016, 0.00018, 0.00018, 0.00019, 0.00018, 0.00019, 0.00019, 0.00019, 0.00016, 0.00018, 0.00021, 0.00017, 0.00018, 0.00017, 0.00018, 0.00016, 0.00016, 0.00018, 0.00017, 0.00016, 0.00018, 0.00036, 0.00016, 0.00022, 0.00016, 0.00016, 0.00019, 0.00019, 0.00018, 0.00017, 0.00016, 0.00018, 0.00016, 0.00018, 0.00018, 0.00016, 0.00017, 0.00018, 0.00019, 0.00018, 0.00018, 0.00032, 0.00018, 0.00018, 0.00016, 0.00021, 0.00018, 0.00017, 0.00018, 0.00018, 0.00018, 0.00016, 0.00018, 0.00017, 0.00018, 0.00018, 0.00018, 0.00021, 0.00016, 0.00019, 0.00019, 0.00018, 0.00017, 0.00017, 0.00018, 0.00018, 0.00018, 0.00018, 0.00019, 0.00018, 0.00016, 0.00018, 0.00019, 0.00018, 0.00021, 0.00016, 0.00017, 0.00016, 0.00016, 0.00017, 0.0002, 0.00018, 0.00017, 0.00018, 0.00017, 0.00018, 0.00019, 0.00018, 0.00018, 0.00018, 0.00019, 0.00027, 0.00031, 0.00017, 0.00017, 0.00016, 0.00016, 0.00019, 0.00018, 0.00016, 0.00018, 0.00018, 0.0002, 0.00018, 0.00018, 0.00018, 0.00018, 0.00018, 0.00016, 0.00018, 0.00018, 0.00019, 0.00018, 0.00019, 0.00018, 0.0002, 0.0002, 0.00018, 0.00017, 0.00019, 0.00018, 0.00018, 0.00018, 0.00016, 0.00019, 0.00016, 0.00018, 0.00018, 0.00018, 0.00016, 0.00018, 0.00019, 0.00018, 0.00018, 0.00016, 0.00018, 0.00018, 0.00018, 0.00017, 0.00017, 0.00019, 0.00019, 0.00018, 0.00019, 0.00018, 0.00016, 0.00018, 0.00018, 0.00016, 0.0002, 0.00016, 0.00018, 0.00016, 0.00017, 0.00018, 0.00018, 0.00019, 0.00018, 0.00016, 0.00018, 0.00019, 0.00018, 0.00016, 0.00016, 0.00018, 0.00016, 0.00016, 0.00016, 0.00019, 0.00018, 0.00018, 0.00018, 0.00018, 0.00019, 0.00019, 0.00016, 0.00018, 0.00017, 0.00019, 0.00037, 0.00017, 0.00017, 0.00019, 0.00018, 0.00018, 0.00016, 0.00018, 0.00018, 0.00018, 0.00019, 0.0002, 0.00016, 0.00018, 0.00029, 0.00019, 0.0002, 0.00018, 0.00016, 0.00016, 0.00016, 0.00016, 0.00018, 0.00018, 0.00018, 0.00017, 0.00037, 0.00019, 0.00019, 0.00018, 0.00018, 0.00018, 0.0002, 0.00016, 0.00018, 0.00029, 0.00017, 0.00024, 0.00016, 0.00019, 0.00016, 0.00017, 0.00035, 0.00036, 0.00017, 0.00016, 0.0002, 0.00034, 0.0002, 0.00016, 0.00017, 0.0002, 0.00016, 0.00018, 0.00018, 0.00016, 0.00017, 0.00017, 0.00016, 0.00017, 0.00018, 0.00018, 0.00018, 0.00025, 0.00018, 0.00016, 0.00016, 0.00016, 0.00017, 0.00017, 0.00018, 0.00016, 0.00017, 0.00018, 0.00017, 0.00016, 0.00018, 0.00018, 0.00017, 0.00018, 0.00018, 0.00018, 0.00017, 0.00016, 0.00016, 0.00019, 0.00017, 0.00018, 0.00019, 0.00016, 0.00018, 0.00018, 0.00019, 0.00016, 0.00016, 0.00019, 0.00017, 0.00019, 0.00017, 0.00017, 0.00018, 0.00017, 0.00016, 0.00018, 0.00016, 0.00018, 0.00019, 0.00016, 0.00017, 0.00019, 0.00016, 0.00017, 0.00016, 0.00016, 0.00019, 0.00017, 0.00018, 0.00018, 0.00018, 0.00016, 0.00019, 0.00018, 0.00017, 0.00018, 0.00016, 0.00018, 0.0002, 0.00017, 0.00016, 0.00017, 0.00017, 0.00018, 0.00018, 0.00018, 0.00016, 0.00019, 0.00018, 0.00018, 0.00018, 0.00019, 0.00016, 0.00016, 0.00017, 0.00018, 0.00018, 0.00016]}, "forward-send-backward-recv-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [7.72045, 0.09004, 0.10467, 0.09849, 0.09238, 0.09943, 0.10332, 0.10911, 0.10563, 0.10498, 0.10272, 0.10382, 0.10192, 0.10289, 0.10891, 0.10722, 0.1057, 0.11565, 0.11445, 0.10746, 0.11354, 0.10514, 0.10376, 0.08937, 0.09262, 0.08764, 0.08288, 0.09035, 0.09702, 0.09008, 0.09616, 0.09645, 0.09564, 0.08936, 0.08325, 0.08878, 0.08887, 0.08097, 0.16157, 0.08262, 0.08896, 0.09145, 0.09803, 0.08184, 0.09702, 0.0971, 0.09683, 0.09764, 0.08935, 0.0971, 0.10578, 0.09846, 0.10251, 0.08742, 0.08778, 0.08971, 
0.09353, 0.08897, 0.09, 0.08803, 0.08686, 0.08756, 0.09058, 0.08647, 0.08759, 0.09747, 0.10439, 0.10521, 0.09647, 0.10904, 0.09397, 0.09736, 0.10653, 0.0936, 0.10631, 0.1059, 0.10256, 0.09952, 0.09927, 0.10519, 0.10149, 0.09551, 0.10221, 0.10051, 0.09736, 0.09577, 0.0979, 0.09361, 0.09726, 0.10742, 0.0922, 0.10792, 0.10335, 0.10219, 0.1015, 0.09685, 0.09726, 0.10184, 0.09792, 0.10191, 0.1005, 0.10051, 0.09742, 0.09427, 0.09441, 0.08885, 0.09704, 0.09172, 0.09714, 0.09629, 0.10183, 0.09676, 0.09562, 0.09133, 0.09003, 0.10068, 0.09125, 0.0941, 0.09629, 0.10409, 0.09294, 0.09359, 0.10104, 0.10583, 0.09162, 0.08569, 0.08813, 0.093, 0.08756, 0.10008, 0.09688, 0.1054, 0.10747, 0.10112, 0.10023, 0.10296, 0.09747, 0.0945, 0.09503, 0.09075, 0.10094, 0.09821, 0.10359, 0.11126, 0.11094, 0.10686, 0.10472, 0.10387, 0.09679, 0.10627, 0.11005, 0.10858, 0.10916, 0.10819, 0.11254, 0.11227, 0.1067, 0.10979, 0.10635, 0.10862, 0.11093, 0.10588, 0.1078, 0.11054, 0.10333, 0.10314, 0.11111, 0.10133, 0.10064, 0.10338, 0.09919, 0.10252, 0.10368, 0.10692, 0.11169, 0.10373, 0.1082, 0.11025, 0.09905, 0.10905, 0.11343, 0.10499, 0.10807, 0.10315, 0.09841, 0.10583, 0.10804, 0.09746, 0.10771, 0.10609, 0.10625, 0.1058, 0.10401, 0.10832, 0.10595, 0.10705, 0.11742, 0.10139, 0.10969, 0.09952, 0.10696, 0.11066, 0.10165, 0.10114, 0.10538, 0.10594, 0.11402, 0.10492, 0.10645, 0.11173, 0.10848, 0.11309, 0.10714, 0.10786, 0.10722, 0.10193, 0.11309, 0.0997, 0.10535, 0.10927, 0.11186, 0.11523, 0.10176, 0.11174, 0.10738, 0.10339, 0.10818, 0.10428, 0.10357, 0.102, 0.11031, 0.10504, 0.10603, 0.10464, 0.10777, 0.10003, 0.11154, 0.10215, 0.10884, 0.1135, 0.10294, 0.10521, 0.18146, 0.15513, 0.10795, 0.10192, 0.09492, 0.1123, 0.11068, 0.10753, 0.10062, 0.20176, 0.10053, 0.10546, 0.10178, 0.10047, 0.10162, 0.10317, 0.10396, 0.10664, 0.11601, 0.12091, 0.11596, 0.11321, 0.11757, 0.11585, 0.1102, 0.10582, 0.10902, 0.11204, 0.11498, 0.11048, 0.11561, 0.12266, 0.11204, 0.10563, 0.11232, 0.10806, 0.10523, 0.11245, 0.10857, 0.10998, 0.10637, 0.11004, 0.10832, 0.1137, 0.11249, 0.1137, 0.11325, 0.10714, 0.10913, 0.11342, 0.10767, 0.11168, 0.1127, 0.10979, 0.10867, 0.10899, 0.11074, 0.10988, 0.11196, 0.11045, 0.10625, 0.10876, 0.11621, 0.10786, 0.11166, 0.1137, 0.1159, 0.12034, 0.12688, 0.13086, 0.12051, 0.11583, 0.12425, 0.12785, 0.11994, 0.1156, 0.11305, 0.1064, 0.11037, 0.11458, 0.10783, 0.11267, 0.11832, 0.11674, 0.12221, 0.11896, 0.11355, 0.12228, 0.11929, 0.11934, 0.11071, 0.11311, 0.12323, 0.11815, 0.1124, 0.10574, 0.10714, 0.11404, 0.1155, 0.11749, 0.11507, 0.11217, 0.11336, 0.11724, 0.11529, 0.11873, 0.11413, 0.11342, 0.11662, 0.11253, 0.21031, 0.1153, 0.11949, 0.12203, 0.12384, 0.12782, 0.12363, 0.12548, 0.12785, 0.11974, 0.12339, 0.11698, 0.1138, 0.11801, 0.11508, 0.12193, 0.1161, 0.11722, 0.11675, 0.12016, 0.12149, 0.12239, 0.12005, 0.12773, 0.12921, 0.11853, 0.11824, 0.12298, 0.11989, 0.12376, 0.12606, 0.12268, 0.12167, 0.11886, 0.10748, 0.11973, 0.11767, 0.12515, 0.11708, 0.11935, 0.12016, 0.12159, 0.11803, 0.11151, 0.11606, 0.11651, 0.12057, 0.10879]}, "backward-send-forward-recv-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [3.17241, 0.01112, 0.01172, 0.00869, 0.00901, 0.01001, 0.01115, 0.00794, 0.00798, 0.0109, 0.01029, 0.01093, 0.01077, 0.01317, 0.01259, 0.00838, 0.01022, 0.00884, 0.01678, 0.0152, 0.00915, 0.00886, 0.00872, 0.00978, 0.01165, 0.00864, 0.01118, 0.01286, 0.00996, 0.0125, 0.01039, 0.01705, 0.00824, 0.00886, 0.00817, 0.00863, 0.0105, 0.00871, 0.08171, 0.01193, 0.01314, 0.01206, 0.01407, 
0.01071, 0.01251, 0.01179, 0.01146, 0.00929, 0.01052, 0.01215, 0.0084, 0.00818, 0.00939, 0.0111, 0.00825, 0.01008, 0.01023, 0.00961, 0.0079, 0.01198, 0.0144, 0.00802, 0.01242, 0.00847, 0.01011, 0.00724, 0.00808, 0.0078, 0.00899, 0.00896, 0.00949, 0.00922, 0.01098, 0.01, 0.01342, 0.00965, 0.00844, 0.01778, 0.01504, 0.00876, 0.01126, 0.01156, 0.00994, 0.00745, 0.01045, 0.01139, 0.01102, 0.01004, 0.01044, 0.01421, 0.01363, 0.0147, 0.01748, 0.01497, 0.01481, 0.01661, 0.00933, 0.01088, 0.01211, 0.01187, 0.0114, 0.01087, 0.00985, 0.01082, 0.01058, 0.01129, 0.00882, 0.01084, 0.00902, 0.0079, 0.01036, 0.01589, 0.01561, 0.01591, 0.00899, 0.01108, 0.00841, 0.01003, 0.00851, 0.00882, 0.00846, 0.00785, 0.01152, 0.00747, 0.01326, 0.01202, 0.01211, 0.01078, 0.00952, 0.00873, 0.00881, 0.00874, 0.00915, 0.00875, 0.01297, 0.01552, 0.0151, 0.01016, 0.00992, 0.01251, 0.01115, 0.01149, 0.00982, 0.01462, 0.01529, 0.0145, 0.01056, 0.01488, 0.01365, 0.01448, 0.00917, 0.0134, 0.01205, 0.01572, 0.0126, 0.01488, 0.01305, 0.01335, 0.0138, 0.0164, 0.01209, 0.01237, 0.01442, 0.01402, 0.01277, 0.01318, 0.01188, 0.0129, 0.01144, 0.01322, 0.01297, 0.0121, 0.01209, 0.01029, 0.01079, 0.01249, 0.01233, 0.0121, 0.01022, 0.0128, 0.01174, 0.01218, 0.01303, 0.01323, 0.01318, 0.01287, 0.00961, 0.01202, 0.0124, 0.00992, 0.00876, 0.00935, 0.01319, 0.01636, 0.01632, 0.01494, 0.01298, 0.01614, 0.01406, 0.01537, 0.01153, 0.01115, 0.01271, 0.0107, 0.01222, 0.01248, 0.01198, 0.01383, 0.01146, 0.01187, 0.01068, 0.01125, 0.00998, 0.01224, 0.01454, 0.01162, 0.00956, 0.01122, 0.0154, 0.01199, 0.01342, 0.01294, 0.01456, 0.01293, 0.01589, 0.01161, 0.01349, 0.01587, 0.0161, 0.01506, 0.01604, 0.01245, 0.01415, 0.01038, 0.01375, 0.01225, 0.01179, 0.01138, 0.01149, 0.0114, 0.01157, 0.01201, 0.09678, 0.06875, 0.01665, 0.01943, 0.01672, 0.01779, 0.01975, 0.01513, 0.01188, 0.01383, 0.01055, 0.01209, 0.01624, 0.01171, 0.01034, 0.00943, 0.0124, 0.01104, 0.01002, 0.00883, 0.01064, 0.01032, 0.00949, 0.01005, 0.01087, 0.01209, 0.01055, 0.00979, 0.00997, 0.01044, 0.01106, 0.01088, 0.01076, 0.01045, 0.01152, 0.01085, 0.0105, 0.01114, 0.01146, 0.01082, 0.01229, 0.01175, 0.01162, 0.01101, 0.01116, 0.01256, 0.01128, 0.01152, 0.0107, 0.00988, 0.0095, 0.01009, 0.01045, 0.01003, 0.00992, 0.01213, 0.01087, 0.01368, 0.00953, 0.01064, 0.01243, 0.01214, 0.01155, 0.01008, 0.00976, 0.01033, 0.00912, 0.0081, 0.00967, 0.01116, 0.00911, 0.00921, 0.00997, 0.01136, 0.01025, 0.01241, 0.01273, 0.01327, 0.01109, 0.01279, 0.01226, 0.0121, 0.01061, 0.01401, 0.0134, 0.01432, 0.01133, 0.01394, 0.01414, 0.01459, 0.01155, 0.01481, 0.01262, 0.01169, 0.01079, 0.01328, 0.01375, 0.01229, 0.01428, 0.01132, 0.0128, 0.01126, 0.01216, 0.01314, 0.01251, 0.01231, 0.01489, 0.10504, 0.01146, 0.01181, 0.10182, 0.00974, 0.01066, 0.01245, 0.01188, 0.01268, 0.01247, 0.01243, 0.0136, 0.0116, 0.01212, 0.01459, 0.01641, 0.0161, 0.01189, 0.01301, 0.01594, 0.01101, 0.01209, 0.0146, 0.01388, 0.01439, 0.01206, 0.01364, 0.01212, 0.01313, 0.01581, 0.01511, 0.01362, 0.01411, 0.0139, 0.01423, 0.01307, 0.01509, 0.01644, 0.01567, 0.01653, 0.01601, 0.0161, 0.01324, 0.01587, 0.01735, 0.01691, 0.01574, 0.01699, 0.01222, 0.01273, 0.0119]}, "layernorm-grads-all-reduce-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.00124, 0.00087, 0.00088, 0.00087, 0.00086, 0.00085, 0.00085, 0.00085, 0.00098, 0.00088, 0.00087, 0.00087, 0.00087, 0.00088, 0.00085, 0.00085, 0.00086, 0.00082, 0.00084, 0.00083, 0.00103, 0.00352, 0.00085, 0.00084, 0.00084, 0.00089, 0.00086, 0.00084, 0.00085, 0.00084, 0.00085, 
0.00087, 0.00085, 0.00085, 0.00086, 0.00086, 0.00084, 0.00086, 0.00086, 0.00085, 0.00087, 0.00086, 0.00085, 0.00087, 0.00084, 0.00086, 0.00085, 0.00084, 0.00167, 0.00083, 0.00086, 0.00111, 0.00108, 0.00101, 0.00084, 0.00085, 0.00085, 0.00086, 0.00084, 0.00084, 0.00086, 0.00083, 0.00083, 0.00083, 0.00111, 0.0009, 0.00086, 0.00088, 0.00086, 0.00084, 0.00086, 0.00084, 0.00091, 0.00085, 0.00084, 0.00087, 0.00083, 0.00083, 0.00241, 0.00085, 0.00086, 0.00109, 0.00086, 0.00085, 0.00085, 0.00084, 0.00084, 0.00084, 0.00084, 0.00092, 0.00087, 0.00083, 0.00087, 0.00532, 0.00083, 0.00085, 0.00101, 0.00113, 0.0011, 0.00089, 0.00088, 0.00086, 0.00113, 0.00084, 0.00122, 0.00087, 0.00086, 0.00085, 0.00086, 0.00088, 0.00085, 0.00088, 0.0031, 0.00085, 0.00087, 0.00085, 0.001, 0.00116, 0.00088, 0.00088, 0.00086, 0.00085, 0.00085, 0.00084, 0.00426, 0.00086, 0.00086, 0.00116, 0.00089, 0.00087, 0.00087, 0.00085, 0.00085, 0.00084, 0.00087, 0.00084, 0.00084, 0.0009, 0.00108, 0.00085, 0.00085, 0.00086, 0.00086, 0.00088, 0.00084, 0.00085, 0.00084, 0.00104, 0.00087, 0.00104, 0.00084, 0.00083, 0.00084, 0.00086, 0.00086, 0.00087, 0.00084, 0.00083, 0.00086, 0.00218, 0.00084, 0.004, 0.00086, 0.00087, 0.00087, 0.00105, 0.00103, 0.00103, 0.00107, 0.00089, 0.00107, 0.00114, 0.00113, 0.00085, 0.00107, 0.00086, 0.00089, 0.00088, 0.00089, 0.00086, 0.00085, 0.00085, 0.00086, 0.00088, 0.00087, 0.00085, 0.00086, 0.00087, 0.00085, 0.00085, 0.00087, 0.00089, 0.00085, 0.00088, 0.00087, 0.00086, 0.00241, 0.00085, 0.00084, 0.00087, 0.00099, 0.001, 0.00108, 0.00085, 0.00084, 0.00086, 0.00085, 0.00088, 0.00085, 0.00085, 0.00084, 0.00086, 0.00088, 0.00084, 0.00085, 0.00087, 0.00087, 0.00087, 0.00111, 0.00086, 0.00085, 0.00086, 0.00086, 0.00084, 0.00083, 0.00084, 0.00083, 0.00088, 0.00084, 0.00085, 0.0011, 0.0011, 0.00116, 0.00089, 0.00115, 0.00087, 0.00378, 0.00087, 0.00085, 0.00085, 0.0009, 0.00086, 0.00089, 0.00086, 0.00085, 0.00085, 0.00084, 0.00087, 0.00086, 0.00086, 0.00104, 0.00088, 0.00085, 0.00115, 0.00106, 0.00088, 0.00086, 0.00106, 0.00086, 0.00087, 0.00086, 0.0026, 0.00449, 0.00471, 0.00277, 0.00087, 0.00088, 0.00085, 0.00107, 0.0011, 0.00118, 0.00086, 0.00089, 0.00084, 0.00084, 0.00084, 0.00085, 0.00087, 0.00108, 0.0011, 0.00098, 0.00109, 0.00111, 0.0011, 0.0011, 0.0011, 0.0011, 0.00111, 0.00111, 0.00107, 0.0011, 0.00103, 0.00103, 0.00111, 0.00112, 0.00109, 0.00106, 0.00108, 0.00103, 0.00103, 0.00111, 0.00102, 0.00112, 0.00112, 0.00111, 0.00112, 0.00109, 0.00329, 0.00093, 0.00085, 0.00089, 0.00085, 0.00089, 0.00087, 0.00086, 0.00536, 0.0011, 0.00111, 0.00111, 0.00116, 0.00086, 0.00084, 0.00087, 0.0009, 0.00085, 0.00084, 0.00087, 0.00086, 0.00087, 0.00086, 0.00084, 0.00085, 0.00088, 0.00086, 0.00086, 0.00417, 0.00088, 0.00121, 0.00085, 0.00085, 0.00085, 0.00085, 0.00095, 0.00116, 0.00086, 0.00086, 0.00086, 0.00499, 0.00318, 0.00107, 0.00371, 0.00087, 0.00089, 0.00087, 0.00086, 0.00085, 0.00084, 0.00084, 0.00086, 0.00083, 0.00088, 0.00085, 0.00085, 0.00087, 0.00085, 0.00087, 0.00086, 0.00086, 0.00087, 0.00085, 0.00084, 0.00085, 0.00085, 0.00086, 0.00086, 0.00085, 0.00084, 0.00088, 0.00086, 0.00085, 0.00086, 0.00085, 0.0009, 0.00095, 0.00448, 0.00088, 0.00088, 0.00089, 0.00089, 0.00086, 0.00087, 0.00087, 0.0009, 0.00086, 0.00086, 0.00088, 0.00087, 0.00088, 0.0009, 0.00101]}, "embedding-grads-all-reduce-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.00038, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00033, 0.00032, 
0.00032, 0.00031, 0.00032, 0.00031, 0.00032, 0.00032, 0.00032, 0.00032, 0.00031, 0.00032, 0.00032, 0.00032, 0.00032, 0.00031, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00031, 0.00032, 0.00032, 0.00031, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00031, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00033, 0.00031, 0.00032, 0.00032, 0.00032, 0.00032, 0.00031, 0.00031, 0.00031, 0.00032, 0.00032, 0.00031, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00033, 0.00031, 0.00032, 0.00032, 0.00034, 0.00032, 0.00032, 0.00031, 0.00031, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00031, 0.00034, 0.00031, 0.00031, 0.00032, 0.00032, 0.00032, 0.00031, 0.00032, 0.00033, 0.00032, 0.00032, 0.00031, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00034, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00031, 0.00032, 0.00031, 0.00032, 0.00032, 0.00031, 0.00032, 0.00031, 0.00032, 0.00031, 0.00032, 0.00032, 0.00032, 0.00032, 0.00033, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00031, 0.00031, 0.00032, 0.00034, 0.00032, 0.00033, 0.00032, 0.00032, 0.00032, 0.00031, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00033, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00031, 0.00032, 0.00031, 0.00032, 0.00033, 0.00032, 0.00031, 0.00032, 0.00031, 0.00032, 0.00032, 0.00032, 0.00031, 0.00032, 0.00032, 0.00031, 0.00031, 0.00032, 0.00031, 0.00032, 0.00032, 0.00031, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00031, 0.00032, 0.00031, 0.00031, 0.00032, 0.00031, 0.00032, 0.00031, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00031, 0.00032, 0.00033, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00031, 0.00032, 0.00032, 0.00032, 0.00033, 0.00032, 0.00032, 0.00032, 0.00031, 0.00032, 0.00032, 0.00031, 0.00032, 0.00032, 0.00031, 0.00032, 0.00031, 0.00034, 0.00032, 0.00032, 0.00032, 0.00032, 0.00031, 0.00031, 0.00032, 0.00032, 0.00032, 0.00033, 0.00032, 0.00031, 0.00032, 0.00032, 0.00031, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00031, 0.00031, 0.00033, 0.00031, 0.00032, 0.00031, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00031, 0.00031, 0.00033, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00031, 0.00032, 0.00032, 0.00032, 0.00031, 0.00032, 0.00031, 0.00032, 0.00032, 0.00033, 0.00033, 0.00031, 0.00032, 0.00031, 0.00031, 0.00032, 0.00031, 0.00031, 0.00032, 0.00032, 0.00032, 0.00032, 0.00033, 0.00032, 0.00031, 0.00032, 0.00032, 0.00032, 0.00031, 0.00032, 0.00032, 0.00031, 0.00031, 0.00032, 0.00032, 0.00033, 0.00032, 0.00034, 0.00032, 0.00032, 0.00031, 0.00032, 0.00034, 0.00034, 0.00032, 0.00033, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00031, 0.00032, 0.00032, 0.00032, 0.00031, 0.00032, 0.00032, 0.00032, 0.00032, 0.00033, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00031, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032]}, "all-grads-sync-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [2.1656, 
0.00059, 0.0006, 0.0006, 0.00059, 0.00062, 0.0006, 0.00059, 0.00058, 0.0006, 0.00059, 0.00058, 0.00059, 0.00059, 0.0006, 0.00058, 0.00059, 0.00058, 0.00059, 0.00058, 0.00059, 0.00065, 0.00064, 0.00063, 0.00059, 0.00059, 0.0006, 0.00058, 0.00059, 0.00059, 0.00059, 0.00059, 0.00061, 0.0006, 0.00058, 0.00064, 0.00058, 0.00058, 0.0006, 0.0006, 0.00058, 0.00058, 0.00059, 0.00059, 0.00063, 0.00059, 0.00058, 0.00058, 0.00059, 0.00059, 0.00059, 0.00059, 0.00063, 0.00059, 0.00059, 0.00059, 0.00058, 0.00059, 0.00064, 0.00058, 0.0006, 0.00059, 0.00059, 0.00059, 0.0006, 0.00059, 0.00059, 0.00059, 0.00059, 0.00058, 0.0006, 0.00058, 0.0006, 0.00059, 0.0006, 0.0006, 0.00057, 0.00059, 0.0006, 0.00058, 0.00059, 0.00059, 0.00064, 0.00058, 0.00059, 0.00063, 0.00059, 0.00058, 0.00059, 0.0006, 0.00059, 0.00058, 0.00058, 0.00058, 0.00059, 0.00058, 0.00058, 0.00059, 0.00058, 0.00059, 0.00059, 0.00059, 0.00059, 0.00063, 0.00059, 0.00059, 0.00059, 0.00057, 0.00058, 0.00059, 0.00058, 0.00062, 0.00059, 0.00059, 0.00059, 0.00058, 0.00059, 0.0006, 0.00058, 0.0006, 0.00058, 0.00062, 0.00059, 0.00063, 0.0006, 0.00059, 0.00059, 0.0006, 0.00059, 0.00059, 0.00059, 0.00058, 0.00063, 0.00058, 0.00059, 0.00058, 0.00058, 0.00058, 0.0006, 0.00058, 0.00058, 0.00059, 0.00059, 0.00059, 0.00058, 0.00057, 0.00059, 0.00058, 0.00059, 0.00059, 0.00058, 0.00058, 0.0006, 0.00058, 0.00058, 0.00059, 0.00058, 0.00059, 0.00059, 0.00058, 0.00058, 0.00059, 0.00059, 0.0006, 0.00063, 0.00059, 0.00059, 0.00058, 0.00059, 0.00062, 0.00062, 0.00058, 0.00059, 0.00059, 0.00058, 0.00059, 0.00059, 0.00058, 0.00059, 0.00058, 0.00058, 0.00059, 0.00059, 0.00059, 0.00058, 0.00059, 0.00059, 0.00058, 0.00059, 0.00059, 0.00058, 0.00059, 0.00059, 0.00058, 0.00058, 0.00059, 0.00074, 0.00059, 0.00059, 0.00059, 0.0006, 0.0006, 0.0006, 0.00058, 0.00058, 0.00058, 0.00059, 0.00059, 0.00059, 0.00059, 0.00058, 0.00059, 0.00059, 0.00059, 0.00058, 0.00059, 0.00058, 0.00064, 0.00059, 0.00063, 0.00059, 0.00059, 0.0006, 0.00058, 0.00058, 0.00058, 0.00058, 0.00059, 0.00058, 0.0006, 0.0006, 0.00059, 0.00058, 0.00058, 0.00057, 0.00058, 0.0006, 0.00059, 0.00058, 0.00058, 0.00059, 0.0006, 0.00058, 0.00058, 0.00059, 0.00058, 0.00057, 0.00058, 0.00059, 0.00058, 0.00059, 0.00059, 0.00059, 0.00058, 0.00058, 0.00058, 0.00059, 0.0006, 0.00058, 0.00065, 0.00059, 0.00062, 0.00058, 0.00057, 0.00061, 0.00059, 0.00059, 0.00058, 0.0006, 0.00063, 0.00059, 0.00058, 0.00059, 0.00058, 0.00062, 0.00059, 0.00058, 0.00059, 0.00058, 0.00058, 0.00058, 0.00058, 0.0006, 0.0006, 0.00059, 0.00058, 0.00059, 0.0006, 0.00058, 0.00059, 0.00059, 0.00059, 0.00059, 0.00058, 0.00058, 0.00058, 0.00059, 0.00064, 0.00058, 0.00057, 0.00059, 0.00058, 0.00059, 0.00057, 0.00058, 0.00058, 0.00058, 0.00059, 0.00059, 0.00064, 0.00058, 0.00058, 0.00059, 0.00058, 0.00058, 0.00058, 0.00057, 0.00059, 0.00058, 0.00059, 0.00058, 0.00058, 0.00058, 0.00059, 0.00059, 0.00058, 0.00058, 0.00058, 0.00059, 0.00059, 0.00059, 0.00059, 0.00059, 0.00058, 0.00058, 0.00063, 0.00058, 0.00063, 0.00059, 0.0006, 0.00057, 0.00058, 0.00059, 0.00059, 0.00059, 0.00059, 0.00062, 0.00062, 0.00058, 0.00057, 0.00058, 0.00059, 0.00059, 0.00059, 0.00059, 0.00059, 0.00059, 0.00059, 0.00059, 0.00059, 0.00059, 0.00059, 0.00059, 0.0006, 0.0006, 0.00058, 0.00058, 0.00059, 0.00063, 0.00057, 0.00059, 0.00063, 0.00059, 0.00059, 0.00059, 0.00058, 0.00058, 0.00058, 0.00059, 0.00059, 0.00063, 0.00059, 0.00059, 0.00059, 0.00059, 0.0006, 0.00059, 0.00059, 0.00059, 0.00059, 0.00059, 0.00059, 0.00058, 0.00059, 0.00058, 0.00059, 0.00059, 0.00058, 0.00059, 0.00059]}, 
"optimizer-copy-to-main-grad-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.00016, 0.00012, 0.00011, 0.00011, 0.00011, 0.00011, 0.00012, 0.00011, 0.00012, 0.00011, 0.00011, 0.00011, 0.0001, 0.00011, 0.0001, 0.00011, 0.00011, 0.0001, 0.00011, 0.0001, 0.00011, 0.00011, 0.00011, 0.00011, 0.00011, 0.00011, 0.00011, 0.0001, 0.00011, 0.00011, 0.00011, 0.00012, 0.00011, 0.0001, 0.0001, 0.00011, 0.00011, 0.00012, 0.00011, 0.00011, 0.00011, 0.00011, 0.00011, 0.00011, 0.0001, 0.00011, 0.0001, 0.0001, 0.00011, 0.0001, 0.00011, 0.0001, 0.00011, 0.00011, 0.0001, 0.00011, 0.00011, 0.00011, 0.00011, 0.00011, 0.00011, 0.0001, 0.00011, 0.0001, 0.00011, 0.00011, 0.0001, 0.0001, 0.00011, 0.00011, 0.00011, 0.0001, 0.00012, 0.00011, 0.00011, 0.00012, 0.0001, 0.00011, 0.00011, 0.00011, 0.00011, 0.00011, 0.00011, 0.00011, 0.00011, 0.00011, 0.00011, 0.00011, 0.0001, 0.00012, 0.00011, 0.00012, 0.0001, 0.0001, 0.00011, 0.00011, 0.0001, 0.00011, 0.00011, 0.00011, 0.00011, 0.00011, 0.00011, 0.0001, 0.00011, 0.00011, 0.00011, 0.0001, 0.0001, 0.00012, 0.00011, 0.00011, 0.00011, 0.00011, 0.00011, 0.0001, 0.0001, 0.00011, 0.0001, 0.00011, 0.0001, 0.0001, 0.00011, 0.00011, 0.00011, 0.00011, 0.0001, 0.0001, 0.00011, 0.00011, 0.00011, 0.0001, 0.0001, 0.0001, 0.00011, 0.00011, 0.00011, 0.0001, 0.00012, 0.00011, 0.00011, 0.00011, 0.0001, 0.00011, 0.00011, 0.00011, 0.00011, 0.0001, 0.00011, 0.00011, 0.0001, 0.0001, 0.00012, 0.00012, 0.00011, 0.0001, 0.0001, 0.0001, 0.00011, 0.00011, 0.0001, 0.00011, 0.00011, 0.00011, 0.00011, 0.00011, 0.00011, 0.00011, 0.00011, 0.00011, 0.00011, 0.00011, 0.00011, 0.00011, 0.00012, 0.00011, 0.0001, 0.0001, 0.00011, 0.00011, 0.00011, 0.00011, 0.0001, 0.00011, 0.00011, 0.0001, 0.0001, 0.0001, 0.00011, 0.00011, 0.00012, 0.00011, 0.0001, 0.0001, 0.00011, 0.0001, 0.00011, 0.00011, 0.00011, 0.00011, 0.00012, 0.0001, 0.00011, 0.00011, 0.00011, 0.00011, 0.0001, 0.00011, 0.0001, 0.0001, 0.0001, 0.0001, 0.00011, 0.0001, 0.0001, 0.0001, 0.0001, 0.00011, 0.00011, 0.0001, 0.00011, 0.00011, 0.00011, 0.00011, 0.0001, 0.00011, 0.0001, 0.00011, 0.0001, 0.0001, 0.0001, 0.0001, 0.00011, 0.0001, 0.0001, 0.0001, 0.0001, 0.00011, 0.0001, 0.0001, 0.0001, 0.0001, 0.00011, 0.00011, 0.0001, 0.0001, 0.0001, 0.0001, 0.00011, 0.00011, 0.0001, 0.00011, 0.0001, 0.0001, 0.0001, 0.00011, 0.00011, 0.00011, 0.0001, 0.0001, 0.00011, 0.00011, 0.00011, 0.00012, 0.0001, 0.00011, 0.00011, 0.00011, 0.0001, 0.0001, 0.00011, 0.00011, 0.00011, 0.0001, 0.0001, 0.0001, 0.0001, 0.00011, 0.0001, 0.0001, 0.0001, 0.0001, 0.00011, 0.0001, 0.00012, 0.00011, 0.00011, 0.00011, 0.0001, 0.0001, 0.0001, 0.0001, 0.00011, 0.0001, 0.00012, 0.0001, 0.00011, 0.00011, 0.0001, 0.0001, 0.0001, 0.0001, 0.00011, 0.00011, 0.00011, 0.0001, 0.00011, 0.00011, 0.00011, 0.0001, 0.00011, 0.0001, 0.00011, 0.00011, 0.00019, 0.0001, 0.0001, 0.00011, 0.00011, 0.00011, 0.0001, 0.0001, 0.00011, 0.0001, 0.0001, 0.0001, 0.00011, 0.00011, 0.0001, 0.0001, 0.0001, 0.00011, 0.00011, 0.00011, 0.0001, 0.00011, 0.00011, 0.00011, 0.00011, 0.00011, 0.0001, 0.0001, 0.00011, 0.00011, 0.00011, 0.0001, 0.00011, 0.00011, 0.00011, 0.0001, 0.00011, 0.0001, 0.00012, 0.00011, 0.0001, 0.0001, 0.0001, 0.00011, 0.00011, 0.0001, 0.0001, 0.0001, 0.00011, 0.00011, 0.0001, 0.0001, 0.0001, 0.00011, 0.00011, 0.0001, 0.0001, 0.0001, 0.00011, 0.00011, 0.0001, 0.0001, 0.0001, 0.0001, 0.00011, 0.0001, 0.00011, 0.0001, 0.00011, 0.00011, 0.00011, 0.00011, 0.00011, 0.00011, 0.00011, 0.00011, 0.00011, 0.00011, 0.00011, 0.00011, 0.00011, 0.00011, 0.0001, 0.00011, 0.00011, 
0.00011]}, "optimizer-clip-main-grad-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [4.25848, 0.00058, 0.00058, 0.00057, 0.00057, 0.00058, 0.00058, 0.00057, 0.00057, 0.00058, 0.00057, 0.00057, 0.00056, 0.00056, 0.00057, 0.00056, 0.00059, 0.00056, 0.00056, 0.00055, 0.00056, 0.00057, 0.00056, 0.00056, 0.00056, 0.00057, 0.00056, 0.00056, 0.00057, 0.00056, 0.00057, 0.00056, 0.00055, 0.00055, 0.00057, 0.00057, 0.00058, 0.00055, 0.00056, 0.00056, 0.00056, 0.00055, 0.00057, 0.00056, 0.00056, 0.00056, 0.00058, 0.00056, 0.00055, 0.00056, 0.00056, 0.00057, 0.00057, 0.0006, 0.00056, 0.00056, 0.00055, 0.00056, 0.00056, 0.00055, 0.00056, 0.00055, 0.00055, 0.00056, 0.00057, 0.00057, 0.00056, 0.00056, 0.00056, 0.00057, 0.00059, 0.00056, 0.00058, 0.00056, 0.00056, 0.00057, 0.00055, 0.00055, 0.00056, 0.00056, 0.00056, 0.00071, 0.00056, 0.00056, 0.00057, 0.00057, 0.00055, 0.00056, 0.00055, 0.0006, 0.00055, 0.00056, 0.00055, 0.00055, 0.00057, 0.00055, 0.00055, 0.00057, 0.00046, 0.00057, 0.00057, 0.00057, 0.00056, 0.00055, 0.00071, 0.00056, 0.00056, 0.00057, 0.00057, 0.00047, 0.00056, 0.00048, 0.00046, 0.00056, 0.00057, 0.00055, 0.00055, 0.00056, 0.00055, 0.00057, 0.00056, 0.00056, 0.00056, 0.00056, 0.00046, 0.00056, 0.00055, 0.00055, 0.00056, 0.00058, 0.00045, 0.00056, 0.00057, 0.00055, 0.00057, 0.00055, 0.00055, 0.00055, 0.00056, 0.00056, 0.00055, 0.00055, 0.00057, 0.00046, 0.00046, 0.00056, 0.00056, 0.00056, 0.00056, 0.00056, 0.00055, 0.00055, 0.00055, 0.00055, 0.00056, 0.00057, 0.00055, 0.00055, 0.00057, 0.00057, 0.00064, 0.00056, 0.00056, 0.00057, 0.00057, 0.00055, 0.00056, 0.00055, 0.00055, 0.00056, 0.00056, 0.00057, 0.00056, 0.00056, 0.00056, 0.00056, 0.00055, 0.00055, 0.00056, 0.00055, 0.00055, 0.00056, 0.00057, 0.00056, 0.00057, 0.00056, 0.00055, 0.00058, 0.00056, 0.00056, 0.00056, 0.00056, 0.00055, 0.00056, 0.00056, 0.00055, 0.00056, 0.00056, 0.00055, 0.00057, 0.00077, 0.00056, 0.00056, 0.00057, 0.00056, 0.00057, 0.00056, 0.00057, 0.00056, 0.00056, 0.00055, 0.00056, 0.00058, 0.00055, 0.00056, 0.00055, 0.00056, 0.00056, 0.00056, 0.00056, 0.00056, 0.00056, 0.00056, 0.00056, 0.00056, 0.00055, 0.00055, 0.00054, 0.00055, 0.00055, 0.00056, 0.00062, 0.00058, 0.00055, 0.00056, 0.00056, 0.00056, 0.00056, 0.00055, 0.00055, 0.00055, 0.00056, 0.00056, 0.00055, 0.00057, 0.00057, 0.00056, 0.00055, 0.00055, 0.00055, 0.00055, 0.00058, 0.00055, 0.00056, 0.00056, 0.00056, 0.00055, 0.00056, 0.00055, 0.00056, 0.00056, 0.00057, 0.00057, 0.00061, 0.00057, 0.00057, 0.00056, 0.00057, 0.00055, 0.00056, 0.00056, 0.00056, 0.00058, 0.00056, 0.00057, 0.00055, 0.0006, 0.00056, 0.00057, 0.00056, 0.00055, 0.00056, 0.00056, 0.00057, 0.00057, 0.00055, 0.00056, 0.00056, 0.0006, 0.00063, 0.00057, 0.00056, 0.00056, 0.00057, 0.00058, 0.00056, 0.00059, 0.00057, 0.00056, 0.00055, 0.00056, 0.00064, 0.00056, 0.00056, 0.00057, 0.00056, 0.00056, 0.00057, 0.00068, 0.00056, 0.00056, 0.00056, 0.00058, 0.00056, 0.00059, 0.00056, 0.00055, 0.00057, 0.00057, 0.00055, 0.00057, 0.00056, 0.00057, 0.00057, 0.00056, 0.00056, 0.00055, 0.00057, 0.00057, 0.00055, 0.00056, 0.00057, 0.00056, 0.00056, 0.00056, 0.00057, 0.00056, 0.00056, 0.00058, 0.00056, 0.00055, 0.00056, 0.00056, 0.00057, 0.00056, 0.00056, 0.00076, 0.00058, 0.00057, 0.00057, 0.00056, 0.00056, 0.00057, 0.00056, 0.00056, 0.00056, 0.00057, 0.00057, 0.00056, 0.00055, 0.00055, 0.00057, 0.00056, 0.00056, 0.00056, 0.00055, 0.00056, 0.00057, 0.00056, 0.00055, 0.00061, 0.00056, 0.00055, 0.00056, 0.00055, 0.00056, 0.00056, 0.00055, 0.00057, 0.00055, 0.00055, 0.00056, 0.00057, 
0.00056, 0.00057, 0.00056, 0.00056, 0.00056, 0.00057, 0.00057, 0.00057, 0.00057, 0.00057, 0.00057, 0.00057, 0.00056, 0.00056, 0.00056, 0.00056, 0.00056]}, "optimizer-count-zeros-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.00381, 0.00273, 0.0027, 0.0027, 0.00273, 0.00271, 0.00267, 0.00283, 0.00274, 0.00269, 0.0027, 0.00269, 0.00272, 0.00273, 0.0027, 0.0027, 0.00269, 0.00268, 0.0027, 0.0027, 0.00273, 0.00272, 0.00268, 0.0027, 0.00278, 0.00278, 0.00271, 0.00269, 0.00268, 0.0027, 0.00271, 0.00271, 0.00269, 0.00273, 0.00271, 0.0027, 0.00267, 0.00269, 0.0027, 0.00271, 0.00271, 0.00269, 0.00269, 0.00267, 0.00269, 0.00269, 0.00269, 0.0027, 0.0027, 0.00271, 0.00271, 0.00288, 0.00277, 0.00297, 0.0027, 0.00269, 0.00268, 0.00269, 0.00268, 0.00269, 0.00269, 0.0027, 0.00268, 0.0027, 0.00272, 0.00269, 0.0027, 0.00271, 0.00273, 0.0027, 0.00284, 0.0027, 0.00271, 0.00282, 0.0027, 0.00268, 0.00268, 0.00268, 0.0027, 0.0027, 0.00272, 0.00496, 0.0027, 0.00268, 0.00269, 0.00269, 0.00271, 0.00269, 0.00271, 0.00292, 0.0027, 0.00269, 0.00269, 0.00268, 0.00269, 0.00271, 0.00271, 0.00275, 0.00271, 0.00271, 0.00268, 0.00271, 0.00291, 0.00269, 0.00286, 0.00271, 0.00269, 0.00269, 0.00271, 0.00269, 0.0027, 0.00272, 0.00269, 0.00267, 0.00268, 0.00269, 0.00272, 0.00269, 0.00272, 0.0027, 0.00268, 0.00268, 0.00269, 0.0027, 0.00269, 0.0027, 0.00272, 0.0027, 0.00271, 0.00269, 0.00273, 0.0027, 0.0027, 0.0027, 0.00268, 0.00269, 0.0027, 0.00272, 0.00271, 0.00271, 0.00269, 0.0027, 0.00267, 0.00271, 0.00269, 0.00268, 0.00268, 0.0027, 0.00269, 0.00269, 0.00267, 0.0027, 0.00268, 0.00269, 0.0027, 0.0027, 0.00269, 0.00269, 0.00268, 0.00269, 0.00269, 0.00269, 0.00269, 0.00281, 0.0028, 0.00273, 0.00272, 0.00273, 0.00273, 0.00274, 0.00271, 0.00272, 0.0027, 0.00271, 0.0027, 0.00271, 0.00273, 0.00271, 0.00269, 0.00271, 0.00272, 0.00272, 0.00272, 0.0027, 0.00269, 0.00281, 0.00272, 0.00282, 0.00271, 0.0027, 0.00269, 0.00272, 0.00273, 0.00271, 0.00269, 0.0027, 0.0027, 0.00269, 0.00271, 0.00271, 0.00282, 0.00271, 0.00269, 0.00271, 0.0027, 0.00313, 0.0027, 0.00269, 0.00271, 0.00271, 0.0027, 0.0027, 0.00271, 0.00269, 0.00278, 0.00269, 0.00272, 0.00278, 0.00271, 0.0027, 0.00269, 0.00271, 0.0027, 0.0027, 0.0027, 0.00269, 0.00271, 0.00271, 0.00269, 0.00272, 0.00271, 0.00296, 0.00271, 0.00271, 0.0027, 0.00271, 0.00271, 0.00275, 0.00269, 0.00267, 0.00271, 0.00274, 0.00267, 0.00271, 0.0027, 0.00273, 0.00272, 0.00271, 0.00271, 0.00273, 0.00272, 0.0027, 0.00274, 0.00273, 0.0027, 0.00272, 0.00271, 0.0027, 0.00271, 0.00265, 0.00264, 0.00264, 0.00273, 0.00262, 0.00291, 0.00266, 0.00273, 0.00265, 0.00265, 0.00263, 0.00265, 0.00264, 0.00274, 0.00272, 0.00262, 0.00274, 0.00265, 0.00273, 0.00264, 0.00274, 0.00264, 0.00274, 0.0028, 0.00265, 0.00263, 0.00263, 0.00272, 0.00271, 0.00276, 0.00267, 0.00265, 0.00262, 0.00272, 0.00277, 0.00264, 0.00269, 0.00264, 0.00264, 0.00272, 0.00271, 0.00294, 0.00388, 0.00268, 0.00273, 0.00273, 0.00265, 0.00357, 0.00265, 0.00304, 0.00272, 0.00261, 0.00268, 0.0027, 0.00266, 0.00267, 0.00264, 0.00278, 0.00274, 0.00267, 0.00269, 0.00268, 0.0027, 0.00269, 0.0027, 0.00269, 0.0027, 0.00271, 0.00269, 0.00267, 0.0027, 0.00268, 0.0027, 0.00272, 0.00271, 0.0027, 0.00272, 0.00272, 0.00274, 0.00269, 0.00313, 0.00269, 0.00269, 0.00269, 0.00271, 0.00271, 0.00273, 0.00283, 0.0027, 0.00269, 0.00278, 0.00276, 0.00271, 0.00271, 0.0027, 0.0027, 0.00271, 0.00272, 0.00271, 0.00272, 0.00271, 0.00271, 0.00268, 0.00273, 0.00271, 0.00269, 0.0027, 0.00273, 0.00275, 0.00269, 0.00273, 0.00271, 0.00271, 0.0027, 0.00272, 
0.00269, 0.00269, 0.00272, 0.00274, 0.00271, 0.00272, 0.00272, 0.0027, 0.0027, 0.00272, 0.0027, 0.00271, 0.00271, 0.00273, 0.00271, 0.00268, 0.0027, 0.00271, 0.00273, 0.00272, 0.0027, 0.00269, 0.00272, 0.00272, 0.0027, 0.00271]}, "optimizer-inner-step-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.0026, 0.00046, 0.00046, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00046, 0.00045, 0.00045, 0.00045, 0.00044, 0.00044, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00046, 0.00044, 0.00045, 0.00044, 0.00045, 0.00045, 0.00044, 0.00051, 0.00045, 0.00045, 0.00045, 0.00044, 0.00045, 0.00046, 0.00045, 0.00045, 0.00045, 0.00044, 0.00045, 0.00045, 0.00044, 0.00045, 0.00045, 0.00045, 0.00045, 0.00044, 0.00044, 0.00046, 0.00045, 0.00046, 0.00044, 0.00045, 0.00045, 0.00044, 0.00045, 0.00044, 0.00044, 0.00044, 0.00045, 0.00046, 0.00045, 0.00044, 0.00044, 0.00044, 0.00045, 0.00045, 0.00046, 0.00045, 0.00045, 0.00046, 0.00044, 0.00045, 0.00046, 0.00045, 0.00045, 0.00048, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00044, 0.00045, 0.00046, 0.00045, 0.00044, 0.00045, 0.00045, 0.00046, 0.00045, 0.00045, 0.00044, 0.00045, 0.00045, 0.00045, 0.00045, 0.00047, 0.00044, 0.00057, 0.00046, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00044, 0.00045, 0.0005, 0.00044, 0.00046, 0.00045, 0.00045, 0.00045, 0.00045, 0.00046, 0.00044, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00044, 0.00045, 0.00044, 0.00045, 0.00045, 0.00045, 0.00045, 0.00046, 0.00045, 0.00045, 0.00044, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00044, 0.00045, 0.00045, 0.00047, 0.00045, 0.00045, 0.00045, 0.00046, 0.00044, 0.00045, 0.00045, 0.00045, 0.00045, 0.00049, 0.00045, 0.00045, 0.00045, 0.00045, 0.00046, 0.00044, 0.00045, 0.00045, 0.00059, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00044, 0.00044, 0.00045, 0.00045, 0.00045, 0.00045, 0.00044, 0.00045, 0.00045, 0.00045, 0.00045, 0.00046, 0.00051, 0.00046, 0.00046, 0.00046, 0.00045, 0.00045, 0.00046, 0.00046, 0.00061, 0.00045, 0.00045, 0.00045, 0.00046, 0.00045, 0.00045, 0.00046, 0.00044, 0.00045, 0.00045, 0.00045, 0.00045, 0.00046, 0.00045, 0.00045, 0.00044, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00044, 0.00045, 0.00044, 0.00044, 0.00045, 0.00045, 0.00044, 0.00046, 0.00054, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00044, 0.00044, 0.00045, 0.00044, 0.00044, 0.00044, 0.00045, 0.00044, 0.00044, 0.00045, 0.00044, 0.00045, 0.00044, 0.00044, 0.00045, 0.00044, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00046, 0.00046, 0.00046, 0.00045, 0.00045, 0.00045, 0.00045, 0.00046, 0.00046, 0.00045, 0.00046, 0.00055, 0.00045, 0.00045, 0.00045, 0.00045, 0.00044, 0.00044, 0.00045, 0.00046, 0.00046, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00044, 0.00045, 0.00045, 0.00045, 0.00044, 0.00044, 0.00045, 0.00046, 0.00045, 0.00044, 0.00076, 0.00046, 0.00045, 0.00045, 0.00045, 0.00044, 0.00045, 0.00045, 0.00046, 0.00045, 0.00046, 0.00045, 0.00045, 0.00046, 0.00044, 0.00045, 0.00048, 0.00045, 0.00045, 0.00048, 0.00045, 0.00045, 0.00044, 0.00045, 0.00045, 0.00045, 0.00045, 0.00044, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00052, 0.0005, 0.00056, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00046, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 
0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00046, 0.00044, 0.00045, 0.00046, 0.00044, 0.00045, 0.00045, 0.00046, 0.00045, 0.00045, 0.00045, 0.00044, 0.00045, 0.00044, 0.00045, 0.00045, 0.00046, 0.00045, 0.00046, 0.00046, 0.00046, 0.00045, 0.00045, 0.00046, 0.00046, 0.00046, 0.00046, 0.00046, 0.00045, 0.00045, 0.00045, 0.00045]}, "optimizer-copy-main-to-model-params-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.00055, 0.00049, 0.0005, 0.0005, 0.00049, 0.0005, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.0005, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.0005, 0.0005, 0.00049, 0.0005, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.0005, 0.00049, 0.00051, 0.00049, 0.00049, 0.00049, 0.00066, 0.00049, 0.0005, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.0005, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.0005, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00048, 0.00049, 0.0005, 0.00049, 0.00049, 0.00068, 0.00049, 0.0005, 0.00049, 0.00049, 0.0005, 0.00049, 0.00049, 0.00049, 0.00049, 0.0005, 0.00049, 0.0005, 0.00049, 0.0005, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00067, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00063, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.0005, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00048, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.0005, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00068, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00076, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.0005, 0.00049, 0.00049, 0.00049, 0.00049, 0.00051, 0.00049, 0.0005, 0.00049, 0.0005, 0.00049, 0.00052, 0.00049, 0.00066, 0.0005, 0.00049, 0.00049, 0.00049, 0.0005, 0.00049, 0.00049, 0.0005, 0.0005, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.0005, 0.0005, 0.0005, 0.00049, 0.00049, 0.0005, 0.0005, 0.0005, 0.00049, 0.00049, 0.00049, 0.00049, 0.00051, 0.0005, 0.0005, 0.00072, 0.00049, 0.0005, 0.00049, 0.00049, 0.0005, 0.00049, 0.0005, 0.00049, 0.00049, 0.0005, 0.00049, 0.00049, 0.0005, 0.0005, 0.00049, 0.0005, 0.0005, 0.0005, 0.0005, 0.0005, 0.0005, 0.00049, 0.0005, 0.00049, 0.0005, 0.00049, 0.00049, 0.00049, 0.0005, 0.00052, 0.0005, 0.00049, 0.00049, 0.00049, 0.0005, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00066, 0.0005, 0.00049, 0.0005, 0.00049, 0.0005, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00052, 0.0005, 0.00049, 0.0005, 0.00049, 0.00049, 0.0005, 0.00049, 0.00049, 0.00053, 0.00049, 0.00052, 0.00049, 0.00049, 0.00049, 0.00076, 0.00049, 0.0005, 0.00049, 0.0005, 0.00049, 0.00064, 0.0005, 0.00051, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.0005, 0.00066, 0.00049, 0.00051, 0.00063, 0.0005, 0.0005, 0.0005, 0.00049, 0.00049, 0.0005, 0.0005, 0.0005, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.0005, 0.0005, 0.0005, 0.00051, 0.00049, 0.00049, 0.0005, 0.0005, 0.0005, 0.00053, 0.0005, 0.00073, 0.00072, 0.00072, 0.00049, 0.0005, 
0.00049, 0.00049, 0.00049, 0.0005, 0.00051, 0.00051, 0.0005, 0.00049, 0.0005, 0.0005, 0.0005, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.0005, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.00049, 0.0005, 0.00049, 0.00049, 0.0005, 0.0005, 0.0005, 0.0005, 0.0005, 0.0005, 0.0005, 0.0005, 0.0005, 0.0005, 0.0005, 0.0005, 0.0005, 0.0005, 0.0005, 0.0005, 0.0005, 0.00051, 0.0005, 0.0005, 0.0005, 0.00049, 0.0005]}, "optimizer-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [4.26785, 0.00472, 0.00469, 0.00468, 0.0047, 0.00469, 0.00466, 0.00479, 0.00473, 0.00465, 0.00467, 0.00466, 0.00467, 0.00467, 0.00464, 0.00466, 0.00468, 0.00461, 0.00465, 0.00464, 0.00469, 0.00469, 0.00464, 0.00465, 0.00473, 0.00473, 0.00467, 0.00463, 0.00464, 0.00465, 0.00468, 0.00467, 0.00464, 0.00516, 0.00466, 0.00468, 0.00465, 0.00465, 0.00465, 0.00469, 0.00466, 0.00464, 0.00465, 0.00462, 0.00463, 0.00466, 0.00466, 0.00464, 0.00465, 0.00466, 0.00468, 0.00483, 0.00473, 0.005, 0.00465, 0.00465, 0.00463, 0.00466, 0.00463, 0.00463, 0.00465, 0.00465, 0.00461, 0.00465, 0.00467, 0.00467, 0.00464, 0.00464, 0.00468, 0.00465, 0.00483, 0.00466, 0.0047, 0.00478, 0.00466, 0.00466, 0.00461, 0.00462, 0.00467, 0.00465, 0.00469, 0.00749, 0.00467, 0.00465, 0.00466, 0.00466, 0.00465, 0.00465, 0.00465, 0.00495, 0.00465, 0.00465, 0.00463, 0.00463, 0.00466, 0.00467, 0.00464, 0.00472, 0.00456, 0.00469, 0.00464, 0.00466, 0.0049, 0.00463, 0.00555, 0.00466, 0.00464, 0.00464, 0.00466, 0.00456, 0.00466, 0.0046, 0.00453, 0.00464, 0.00465, 0.00461, 0.00466, 0.00495, 0.00466, 0.00467, 0.00463, 0.00461, 0.00463, 0.00465, 0.00458, 0.00465, 0.00467, 0.00464, 0.00466, 0.00467, 0.00456, 0.00464, 0.00465, 0.00464, 0.00465, 0.00462, 0.00462, 0.00464, 0.00466, 0.00465, 0.00464, 0.00465, 0.00463, 0.00456, 0.00455, 0.00464, 0.00462, 0.00466, 0.00464, 0.00466, 0.00461, 0.00462, 0.00463, 0.00464, 0.00468, 0.00465, 0.00462, 0.00463, 0.00466, 0.00465, 0.00472, 0.00464, 0.00465, 0.00477, 0.00511, 0.00469, 0.00467, 0.00467, 0.00468, 0.00471, 0.00465, 0.00468, 0.00465, 0.00522, 0.00464, 0.00465, 0.00466, 0.00465, 0.00464, 0.00465, 0.00465, 0.00466, 0.00467, 0.00466, 0.00464, 0.00475, 0.00467, 0.0048, 0.00468, 0.00466, 0.00466, 0.00467, 0.00478, 0.00466, 0.00469, 0.00465, 0.00466, 0.00465, 0.00499, 0.0047, 0.00568, 0.00465, 0.00465, 0.00466, 0.00466, 0.00541, 0.00464, 0.00465, 0.00465, 0.00465, 0.00463, 0.00465, 0.00469, 0.00464, 0.00473, 0.00463, 0.00466, 0.00474, 0.00466, 0.00465, 0.00464, 0.00467, 0.00464, 0.00466, 0.00464, 0.00462, 0.00464, 0.00466, 0.00463, 0.00467, 0.00467, 0.00542, 0.00468, 0.00466, 0.00465, 0.00465, 0.00467, 0.0047, 0.00463, 0.00461, 0.00466, 0.00468, 0.00464, 0.00466, 0.00467, 0.00468, 0.00467, 0.00465, 0.00467, 0.00468, 0.00465, 0.00469, 0.00468, 0.00468, 0.00464, 0.00466, 0.00467, 0.00464, 0.00464, 0.00461, 0.00462, 0.00463, 0.0047, 0.00464, 0.00489, 0.00464, 0.00469, 0.0046, 0.00459, 0.00459, 0.0046, 0.00459, 0.00472, 0.00501, 0.00458, 0.00468, 0.00465, 0.00469, 0.00461, 0.00469, 0.00458, 0.0047, 0.00478, 0.0046, 0.00464, 0.00461, 0.00468, 0.00468, 0.00476, 0.00469, 0.00461, 0.00457, 0.00469, 0.00472, 0.00468, 0.00464, 0.00467, 0.00461, 0.00467, 0.00463, 0.00558, 0.00601, 0.00464, 0.0047, 0.0047, 0.00459, 0.00574, 0.00463, 0.00519, 0.00467, 0.00462, 0.00464, 0.00469, 0.00461, 0.00476, 0.00462, 0.00501, 0.00471, 0.00465, 0.0049, 0.00465, 0.00465, 0.00465, 0.00465, 0.00462, 0.00466, 0.00466, 0.00465, 0.00463, 0.00464, 0.00464, 0.00465, 0.00468, 0.00466, 0.00465, 0.00469, 0.00468, 0.0047, 0.00466, 
0.00514, 0.00464, 0.00465, 0.00469, 0.00468, 0.00511, 0.00511, 0.00571, 0.00469, 0.00467, 0.00473, 0.00471, 0.00465, 0.00469, 0.00466, 0.00464, 0.00465, 0.00468, 0.00467, 0.00468, 0.00465, 0.00464, 0.00464, 0.00468, 0.00467, 0.00464, 0.00464, 0.00467, 0.00472, 0.00466, 0.00466, 0.00473, 0.00466, 0.00465, 0.00468, 0.00463, 0.00465, 0.00465, 0.00469, 0.00467, 0.00465, 0.00469, 0.00464, 0.00467, 0.00468, 0.00468, 0.00467, 0.00468, 0.00469, 0.00467, 0.00465, 0.00466, 0.00468, 0.0047, 0.0047, 0.00469, 0.00467, 0.00475, 0.00469, 0.00466, 0.00467]}, "learning-rate": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05]}, "learning-rate vs samples": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 
0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05]}, "batch-size": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 
128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0]}, "batch-size vs samples": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 
128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0]}, "lm loss": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [10.84303, 10.87155, 10.85032, 10.81087, 10.64537, 10.63943, 10.42704, 10.13551, 9.93496, 9.83494, 9.58592, 9.84757, 9.88552, 9.63097, 9.79022, 9.51147, 9.4606, 9.65582, 9.39007, 9.33886, 9.24978, 9.152, 9.18226, 9.00447, 9.19856, 9.06681, 9.16059, 9.16939, 9.30049, 8.98819, 8.92948, 9.0507, 9.0463, 8.66041, 8.72526, 8.75716, 8.69559, 8.74303, 8.66681, 8.77472, 8.67057, 8.8619, 8.84447, 8.50989, 8.39988, 8.43941, 8.49864, 8.39575, 8.4422, 8.59464, 8.37842, 8.20138, 8.236, 8.2319, 8.27672, 7.92273, 8.10152, 7.8984, 8.25217, 8.23541, 8.01089, 7.97596, 7.92706, 7.74403, 7.7485, 7.65015, 7.52079, 7.9112, 7.70347, 7.45605, 7.74759, 7.77568, 7.54533, 7.30357, 7.45723, 7.3426, 7.46645, 7.22831, 7.63649, 7.28211, 7.34866, 7.21221, 7.21132, 7.41795, 7.17177, 7.28168, 6.99581, 7.004, 7.04074, 7.1367, 6.82354, 6.98508, 7.08921, 6.99769, 6.87461, 6.75657, 6.99031, 7.05959, 6.70411, 6.5827, 6.72604, 6.74348, 6.73218, 6.73708, 6.65685, 6.4055, 6.63559, 6.61892, 6.44639, 6.62609, 6.74333, 6.61179, 6.7261, 6.69431, 6.62741, 6.50922, 6.59901, 6.40739, 6.6657, 6.24852, 6.25199, 6.30265, 6.39086, 6.34866, 6.4484, 6.29117, 6.33917, 6.23682, 6.20019, 6.39713, 6.32382, 6.32063, 6.16132, 6.15692, 6.23736, 6.38207, 6.20216, 6.14927, 6.18286, 6.11574, 6.06273, 6.07513, 6.25658, 6.40785, 6.25681, 6.2924, 6.09673, 6.17564, 6.00002, 6.02568, 5.95394, 6.24995, 6.18499, 5.96441, 5.78379, 6.12452, 5.8475, 6.10173, 5.78491, 6.16542, 6.14406, 6.08134, 5.92727, 6.11254, 5.94363, 6.20077, 5.89399, 5.7901, 5.78128, 5.68813, 6.01482, 5.99528, 6.06741, 5.89085, 6.03981, 5.96811, 5.99655, 5.98984, 5.94628, 5.83848, 5.9481, 5.61614, 5.7002, 5.88656, 5.83806, 5.86311, 5.75859, 5.83316, 5.72072, 5.55659, 5.71965, 5.61978, 5.82718, 5.59717, 5.70318, 5.70327, 5.89853, 5.63883, 5.84367, 5.73571, 5.86365, 5.32462, 5.89684, 5.87059, 5.85018, 5.40966, 5.40521, 5.6244, 5.59463, 5.48385, 5.57514, 5.67111, 5.47486, 5.74063, 5.50617, 5.58954, 5.62055, 5.61722, 5.51063, 5.6138, 5.67042, 5.67814, 5.58421, 5.65728, 5.36779, 5.67697, 5.62608, 5.41953, 5.57893, 5.62664, 5.55034, 5.33858, 5.53624, 5.48821, 5.48891, 5.37489, 
5.5499, 5.60024, 5.39139, 5.51868, 5.4935, 5.33216, 5.50746, 5.41318, 5.44698, 5.31869, 5.06634, 5.48126, 5.57099, 5.71639, 5.41515, 5.60293, 5.63581, 5.23321, 5.27358, 5.3934, 5.40049, 5.32861, 5.49563, 5.18115, 5.29818, 5.24632, 5.377, 5.25164, 5.44247, 5.53356, 5.31175, 5.43649, 5.33683, 5.07482, 5.31199, 5.25123, 5.30045, 5.10952, 5.27365, 5.26615, 5.4733, 5.15569, 5.2676, 5.21227, 5.35586, 4.98451, 4.91017, 5.32431, 5.38997, 5.22667, 5.3209, 5.10232, 5.16141, 5.26239, 5.0658, 5.26091, 5.06389, 5.34895, 5.24827, 5.1463, 5.24113, 5.03942, 5.31795, 5.05285, 5.02784, 5.14139, 5.11164, 5.27303, 5.15115, 5.2757, 5.09401, 5.09338, 5.24504, 5.32369, 5.25347, 5.19226, 5.14165, 5.29079, 4.95338, 5.20578, 5.09105, 5.30122, 5.17357, 5.19235, 5.11365, 4.98113, 4.9916, 5.22149, 5.30937, 5.10092, 5.0529, 4.91086, 5.12305, 5.11531, 4.92812, 5.3389, 5.02814, 5.10063, 5.16722, 5.00342, 5.0656, 5.06853, 5.0, 5.08165, 5.16456, 4.98252, 5.1839, 4.93148, 4.92569, 5.06682, 4.99595, 4.90624, 4.77517, 4.94606, 5.11508, 5.01539, 5.01397, 5.3327, 4.96029, 4.9915, 5.04439, 4.80654, 4.73199, 4.99639, 5.04237, 4.8734, 4.95425, 5.04678, 5.02392, 4.81994, 4.89463, 4.90711, 4.83288, 4.74257, 5.01934, 4.75352, 5.20696, 4.79359, 4.99212, 4.73894, 4.7885, 4.82299, 4.65617, 4.65522, 4.84524, 4.81217, 4.79792, 4.92038, 4.88607, 4.92565, 4.7712, 4.88216, 4.73528, 4.92078, 4.96145, 4.87447, 4.71317, 4.78702, 4.90462, 4.71624, 4.86657, 4.69712, 4.69196, 4.64876]}, "lm loss vs samples": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [10.84303, 10.87155, 10.85032, 10.81087, 10.64537, 10.63943, 10.42704, 10.13551, 9.93496, 9.83494, 9.58592, 9.84757, 9.88552, 9.63097, 9.79022, 9.51147, 9.4606, 9.65582, 9.39007, 9.33886, 9.24978, 9.152, 9.18226, 9.00447, 9.19856, 9.06681, 9.16059, 9.16939, 9.30049, 8.98819, 8.92948, 9.0507, 9.0463, 8.66041, 8.72526, 8.75716, 8.69559, 8.74303, 8.66681, 8.77472, 8.67057, 8.8619, 8.84447, 8.50989, 8.39988, 8.43941, 8.49864, 8.39575, 8.4422, 8.59464, 8.37842, 8.20138, 8.236, 8.2319, 8.27672, 7.92273, 8.10152, 7.8984, 8.25217, 8.23541, 8.01089, 7.97596, 7.92706, 7.74403, 7.7485, 7.65015, 7.52079, 7.9112, 7.70347, 7.45605, 7.74759, 7.77568, 7.54533, 7.30357, 7.45723, 7.3426, 7.46645, 7.22831, 7.63649, 7.28211, 7.34866, 7.21221, 7.21132, 7.41795, 7.17177, 7.28168, 6.99581, 7.004, 7.04074, 7.1367, 6.82354, 6.98508, 7.08921, 6.99769, 6.87461, 6.75657, 6.99031, 7.05959, 6.70411, 6.5827, 6.72604, 6.74348, 6.73218, 6.73708, 6.65685, 6.4055, 6.63559, 6.61892, 6.44639, 6.62609, 6.74333, 6.61179, 6.7261, 6.69431, 6.62741, 6.50922, 6.59901, 6.40739, 6.6657, 6.24852, 6.25199, 6.30265, 6.39086, 6.34866, 6.4484, 6.29117, 6.33917, 6.23682, 6.20019, 6.39713, 6.32382, 6.32063, 6.16132, 6.15692, 6.23736, 6.38207, 6.20216, 6.14927, 6.18286, 6.11574, 6.06273, 6.07513, 6.25658, 6.40785, 6.25681, 6.2924, 6.09673, 6.17564, 6.00002, 6.02568, 5.95394, 6.24995, 6.18499, 5.96441, 5.78379, 6.12452, 5.8475, 6.10173, 5.78491, 6.16542, 6.14406, 6.08134, 5.92727, 6.11254, 5.94363, 6.20077, 5.89399, 5.7901, 5.78128, 5.68813, 6.01482, 5.99528, 6.06741, 5.89085, 6.03981, 5.96811, 5.99655, 5.98984, 5.94628, 5.83848, 5.9481, 5.61614, 5.7002, 5.88656, 5.83806, 5.86311, 5.75859, 5.83316, 5.72072, 5.55659, 5.71965, 5.61978, 5.82718, 5.59717, 5.70318, 5.70327, 5.89853, 5.63883, 5.84367, 5.73571, 5.86365, 5.32462, 5.89684, 5.87059, 5.85018, 5.40966, 5.40521, 5.6244, 5.59463, 5.48385, 5.57514, 5.67111, 5.47486, 5.74063, 5.50617, 5.58954, 5.62055, 5.61722, 5.51063, 5.6138, 5.67042, 5.67814, 5.58421, 5.65728, 5.36779, 5.67697, 
5.62608, 5.41953, 5.57893, 5.62664, 5.55034, 5.33858, 5.53624, 5.48821, 5.48891, 5.37489, 5.5499, 5.60024, 5.39139, 5.51868, 5.4935, 5.33216, 5.50746, 5.41318, 5.44698, 5.31869, 5.06634, 5.48126, 5.57099, 5.71639, 5.41515, 5.60293, 5.63581, 5.23321, 5.27358, 5.3934, 5.40049, 5.32861, 5.49563, 5.18115, 5.29818, 5.24632, 5.377, 5.25164, 5.44247, 5.53356, 5.31175, 5.43649, 5.33683, 5.07482, 5.31199, 5.25123, 5.30045, 5.10952, 5.27365, 5.26615, 5.4733, 5.15569, 5.2676, 5.21227, 5.35586, 4.98451, 4.91017, 5.32431, 5.38997, 5.22667, 5.3209, 5.10232, 5.16141, 5.26239, 5.0658, 5.26091, 5.06389, 5.34895, 5.24827, 5.1463, 5.24113, 5.03942, 5.31795, 5.05285, 5.02784, 5.14139, 5.11164, 5.27303, 5.15115, 5.2757, 5.09401, 5.09338, 5.24504, 5.32369, 5.25347, 5.19226, 5.14165, 5.29079, 4.95338, 5.20578, 5.09105, 5.30122, 5.17357, 5.19235, 5.11365, 4.98113, 4.9916, 5.22149, 5.30937, 5.10092, 5.0529, 4.91086, 5.12305, 5.11531, 4.92812, 5.3389, 5.02814, 5.10063, 5.16722, 5.00342, 5.0656, 5.06853, 5.0, 5.08165, 5.16456, 4.98252, 5.1839, 4.93148, 4.92569, 5.06682, 4.99595, 4.90624, 4.77517, 4.94606, 5.11508, 5.01539, 5.01397, 5.3327, 4.96029, 4.9915, 5.04439, 4.80654, 4.73199, 4.99639, 5.04237, 4.8734, 4.95425, 5.04678, 5.02392, 4.81994, 4.89463, 4.90711, 4.83288, 4.74257, 5.01934, 4.75352, 5.20696, 4.79359, 4.99212, 4.73894, 4.7885, 4.82299, 4.65617, 4.65522, 4.84524, 4.81217, 4.79792, 4.92038, 4.88607, 4.92565, 4.7712, 4.88216, 4.73528, 4.92078, 4.96145, 4.87447, 4.71317, 4.78702, 4.90462, 4.71624, 4.86657, 4.69712, 4.69196, 4.64876]}, "loss-scale": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 
1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]}, "loss-scale vs samples": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]}, "grad-norm": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [13.92196, 13.29306, 13.8377, 12.64037, 11.97375, 9.45262, 6.78823, 6.89004, 5.94557, 4.54615, 4.13637, 2.82375, 2.38927, 2.34389, 2.05973, 2.22596, 2.14457, 1.88597, 2.17986, 2.06069, 2.12423, 2.1677, 2.0115, 2.21442, 1.98307, 2.0966, 1.90389, 1.86829, 1.92477, 2.13027, 2.09469, 2.11211, 1.95723, 2.18758, 2.38519, 2.04808, 2.04244, 1.85027, 1.9837, 1.78603, 2.12943, 1.83753, 1.73653, 1.84787, 1.96175, 1.78052, 1.76095, 1.7401, 1.76961, 1.54057, 1.76088, 1.7938, 1.76365, 1.83855, 1.58517, 1.79545, 1.7158, 1.81815, 1.53518, 1.48648, 1.68949, 1.4562, 1.8648, 1.85145, 1.61928, 1.6745, 1.65487, 1.55646, 1.47797, 1.6989, 1.43883, 1.43836, 1.46011, 1.39711, 1.37457, 1.48663, 1.40785, 1.35385, 1.34051, 1.27757, 1.35283, 1.29709, 1.2816, 1.30185, 1.24092, 1.29738, 1.41961, 1.34489, 1.44199, 1.06928, 1.09491, 1.16108, 1.14396, 1.33634, 1.03654, 1.30756, 1.08982, 1.27845, 0.98191, 1.37412, 1.30793, 1.21672, 1.05131, 1.25909, 1.09643, 1.13996, 1.20961, 1.09191, 1.24074, 0.97878, 1.18535, 0.97714, 0.95456, 1.10186, 1.24389, 1.07847, 1.01822, 1.2519, 1.18392, 1.42087, 1.00253, 1.23223, 1.05494, 1.02956, 0.95692, 1.27887, 1.54081, 1.2168, 1.18019, 1.34805, 0.93443, 1.06987, 1.00938, 1.19729, 1.32572, 1.18029, 1.39724, 1.01719, 1.76109, 1.21222, 1.26256, 1.31969, 1.1555, 0.93801, 0.99546, 1.01521, 
1.36553, 1.55577, 1.11391, 1.2491, 1.45721, 1.65042, 1.60593, 1.30243, 1.29342, 2.04924, 1.3376, 1.21234, 1.37945, 1.79037, 1.23389, 1.08215, 1.31811, 1.12901, 1.35786, 1.8341, 1.46143, 1.31586, 1.39491, 1.24546, 1.26969, 1.25412, 1.27022, 1.43967, 1.14847, 1.3362, 1.91114, 1.35642, 1.06973, 1.20518, 1.11732, 1.73877, 1.36915, 1.34679, 1.25766, 1.64809, 1.37397, 1.17279, 1.169, 1.49772, 1.11509, 1.29145, 1.479, 1.60514, 1.12787, 1.20465, 1.52478, 1.37769, 1.40825, 1.40433, 1.19434, 1.52129, 1.49087, 1.60752, 1.51416, 1.37753, 1.49097, 1.59106, 1.33146, 1.56964, 1.54958, 1.2024, 1.29844, 1.28184, 1.63096, 1.29563, 1.41842, 1.57651, 1.29669, 1.23902, 1.51872, 1.34276, 1.28172, 1.67239, 1.39643, 1.57361, 1.69097, 1.37206, 1.81716, 1.3501, 1.2879, 1.45938, 1.9477, 1.77504, 2.56828, 1.55284, 1.34454, 1.21685, 1.65336, 1.29693, 2.2136, 1.28644, 1.78502, 1.52285, 1.47963, 1.65183, 1.23421, 1.41797, 1.5183, 1.31219, 1.29375, 1.3932, 1.5544, 1.2678, 1.61107, 1.43809, 1.9371, 1.64335, 1.38939, 1.24473, 1.15131, 1.26598, 1.37433, 1.20588, 1.22283, 1.31678, 1.40086, 1.53213, 1.35367, 1.43407, 1.41639, 1.25063, 1.37444, 1.20928, 1.40445, 1.48011, 1.49606, 1.43456, 1.4511, 1.51505, 1.49329, 1.32736, 1.34283, 1.56947, 1.3986, 1.38533, 1.4325, 1.36846, 1.40113, 1.40195, 1.41944, 1.73207, 1.35246, 1.98477, 1.75001, 1.59412, 1.33312, 1.55175, 1.45641, 1.40103, 1.32697, 1.19674, 1.19056, 1.56111, 1.64, 1.52329, 1.62982, 1.42489, 1.1143, 1.42326, 1.36052, 1.20749, 1.49372, 1.38211, 1.6856, 1.48198, 1.34985, 1.48241, 1.24509, 1.40355, 1.44024, 1.31152, 1.30253, 1.59307, 1.35212, 1.78683, 1.61562, 1.61575, 1.46207, 1.29047, 1.55842, 1.39097, 1.35377, 1.50655, 1.67836, 1.37929, 1.32311, 1.35305, 1.77455, 1.48895, 1.40827, 1.23883, 1.35995, 1.46576, 1.39021, 1.55027, 1.27874, 1.53316, 1.30645, 1.32818, 1.41856, 1.40297, 1.19176, 1.73797, 1.28462, 1.46556, 1.31822, 1.27157, 1.29905, 1.43641, 1.37732, 1.32041, 1.45048, 1.30403, 1.12439, 1.41266, 1.49642, 1.41634, 1.48283, 1.73467, 1.90209, 1.41005, 1.66166, 1.51488, 1.35734, 1.47652, 1.40564, 1.6499, 1.41346, 1.24965, 1.34929, 1.35141, 1.18107, 1.30851, 1.17223, 1.29341, 1.38306, 1.247, 1.29013, 1.70946, 1.36584, 1.4061, 1.82813, 1.27073, 1.45088, 1.55944, 1.5925, 1.64727, 1.42815, 1.19955]}, "grad-norm vs samples": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [13.92196, 13.29306, 13.8377, 12.64037, 11.97375, 9.45262, 6.78823, 6.89004, 5.94557, 4.54615, 4.13637, 2.82375, 2.38927, 2.34389, 2.05973, 2.22596, 2.14457, 1.88597, 2.17986, 2.06069, 2.12423, 2.1677, 2.0115, 2.21442, 1.98307, 2.0966, 1.90389, 1.86829, 1.92477, 2.13027, 2.09469, 2.11211, 1.95723, 2.18758, 2.38519, 2.04808, 2.04244, 1.85027, 1.9837, 1.78603, 2.12943, 1.83753, 1.73653, 1.84787, 1.96175, 1.78052, 1.76095, 1.7401, 1.76961, 1.54057, 1.76088, 1.7938, 1.76365, 1.83855, 1.58517, 1.79545, 1.7158, 1.81815, 1.53518, 1.48648, 1.68949, 1.4562, 1.8648, 1.85145, 1.61928, 1.6745, 1.65487, 1.55646, 1.47797, 1.6989, 1.43883, 1.43836, 1.46011, 1.39711, 1.37457, 1.48663, 1.40785, 1.35385, 1.34051, 1.27757, 1.35283, 1.29709, 1.2816, 1.30185, 1.24092, 1.29738, 1.41961, 1.34489, 1.44199, 1.06928, 1.09491, 1.16108, 1.14396, 1.33634, 1.03654, 1.30756, 1.08982, 1.27845, 0.98191, 1.37412, 1.30793, 1.21672, 1.05131, 1.25909, 1.09643, 1.13996, 1.20961, 1.09191, 1.24074, 0.97878, 1.18535, 0.97714, 0.95456, 1.10186, 1.24389, 1.07847, 1.01822, 1.2519, 1.18392, 1.42087, 1.00253, 1.23223, 1.05494, 1.02956, 0.95692, 1.27887, 1.54081, 1.2168, 1.18019, 1.34805, 0.93443, 1.06987, 1.00938, 1.19729, 1.32572, 
1.18029, 1.39724, 1.01719, 1.76109, 1.21222, 1.26256, 1.31969, 1.1555, 0.93801, 0.99546, 1.01521, 1.36553, 1.55577, 1.11391, 1.2491, 1.45721, 1.65042, 1.60593, 1.30243, 1.29342, 2.04924, 1.3376, 1.21234, 1.37945, 1.79037, 1.23389, 1.08215, 1.31811, 1.12901, 1.35786, 1.8341, 1.46143, 1.31586, 1.39491, 1.24546, 1.26969, 1.25412, 1.27022, 1.43967, 1.14847, 1.3362, 1.91114, 1.35642, 1.06973, 1.20518, 1.11732, 1.73877, 1.36915, 1.34679, 1.25766, 1.64809, 1.37397, 1.17279, 1.169, 1.49772, 1.11509, 1.29145, 1.479, 1.60514, 1.12787, 1.20465, 1.52478, 1.37769, 1.40825, 1.40433, 1.19434, 1.52129, 1.49087, 1.60752, 1.51416, 1.37753, 1.49097, 1.59106, 1.33146, 1.56964, 1.54958, 1.2024, 1.29844, 1.28184, 1.63096, 1.29563, 1.41842, 1.57651, 1.29669, 1.23902, 1.51872, 1.34276, 1.28172, 1.67239, 1.39643, 1.57361, 1.69097, 1.37206, 1.81716, 1.3501, 1.2879, 1.45938, 1.9477, 1.77504, 2.56828, 1.55284, 1.34454, 1.21685, 1.65336, 1.29693, 2.2136, 1.28644, 1.78502, 1.52285, 1.47963, 1.65183, 1.23421, 1.41797, 1.5183, 1.31219, 1.29375, 1.3932, 1.5544, 1.2678, 1.61107, 1.43809, 1.9371, 1.64335, 1.38939, 1.24473, 1.15131, 1.26598, 1.37433, 1.20588, 1.22283, 1.31678, 1.40086, 1.53213, 1.35367, 1.43407, 1.41639, 1.25063, 1.37444, 1.20928, 1.40445, 1.48011, 1.49606, 1.43456, 1.4511, 1.51505, 1.49329, 1.32736, 1.34283, 1.56947, 1.3986, 1.38533, 1.4325, 1.36846, 1.40113, 1.40195, 1.41944, 1.73207, 1.35246, 1.98477, 1.75001, 1.59412, 1.33312, 1.55175, 1.45641, 1.40103, 1.32697, 1.19674, 1.19056, 1.56111, 1.64, 1.52329, 1.62982, 1.42489, 1.1143, 1.42326, 1.36052, 1.20749, 1.49372, 1.38211, 1.6856, 1.48198, 1.34985, 1.48241, 1.24509, 1.40355, 1.44024, 1.31152, 1.30253, 1.59307, 1.35212, 1.78683, 1.61562, 1.61575, 1.46207, 1.29047, 1.55842, 1.39097, 1.35377, 1.50655, 1.67836, 1.37929, 1.32311, 1.35305, 1.77455, 1.48895, 1.40827, 1.23883, 1.35995, 1.46576, 1.39021, 1.55027, 1.27874, 1.53316, 1.30645, 1.32818, 1.41856, 1.40297, 1.19176, 1.73797, 1.28462, 1.46556, 1.31822, 1.27157, 1.29905, 1.43641, 1.37732, 1.32041, 1.45048, 1.30403, 1.12439, 1.41266, 1.49642, 1.41634, 1.48283, 1.73467, 1.90209, 1.41005, 1.66166, 1.51488, 1.35734, 1.47652, 1.40564, 1.6499, 1.41346, 1.24965, 1.34929, 1.35141, 1.18107, 1.30851, 1.17223, 1.29341, 1.38306, 1.247, 1.29013, 1.70946, 1.36584, 1.4061, 1.82813, 1.27073, 1.45088, 1.55944, 1.5925, 1.64727, 1.42815, 1.19955]}, "num-zeros": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [65.0, 80.0, 81.0, 75.0, 72.0, 103.0, 108.0, 112.0, 107.0, 122.0, 99.0, 159.0, 148.0, 150.0, 167.0, 157.0, 165.0, 144.0, 182.0, 187.0, 180.0, 162.0, 181.0, 129.0, 189.0, 148.0, 195.0, 190.0, 137.0, 181.0, 151.0, 155.0, 152.0, 166.0, 152.0, 170.0, 160.0, 209.0, 168.0, 214.0, 166.0, 181.0, 190.0, 185.0, 161.0, 162.0, 169.0, 187.0, 184.0, 239.0, 225.0, 187.0, 190.0, 131.0, 187.0, 182.0, 159.0, 161.0, 248.0, 226.0, 201.0, 211.0, 174.0, 164.0, 168.0, 225.0, 202.0, 174.0, 223.0, 202.0, 243.0, 235.0, 180.0, 239.0, 219.0, 205.0, 210.0, 192.0, 216.0, 207.0, 209.0, 245.0, 217.0, 227.0, 212.0, 207.0, 191.0, 173.0, 196.0, 193.0, 194.0, 186.0, 203.0, 189.0, 210.0, 160.0, 204.0, 187.0, 189.0, 159.0, 168.0, 209.0, 181.0, 159.0, 173.0, 153.0, 175.0, 152.0, 147.0, 174.0, 180.0, 153.0, 176.0, 146.0, 165.0, 154.0, 147.0, 106.0, 147.0, 133.0, 174.0, 148.0, 152.0, 143.0, 173.0, 127.0, 116.0, 130.0, 127.0, 123.0, 143.0, 142.0, 146.0, 123.0, 131.0, 124.0, 138.0, 139.0, 109.0, 107.0, 130.0, 103.0, 121.0, 157.0, 131.0, 148.0, 139.0, 96.0, 120.0, 101.0, 96.0, 102.0, 102.0, 122.0, 105.0, 84.0, 114.0, 117.0, 95.0, 90.0, 106.0, 
137.0, 136.0, 131.0, 122.0, 95.0, 111.0, 99.0, 117.0, 119.0, 129.0, 111.0, 104.0, 112.0, 108.0, 102.0, 88.0, 97.0, 120.0, 121.0, 124.0, 96.0, 126.0, 134.0, 122.0, 98.0, 97.0, 115.0, 102.0, 102.0, 128.0, 120.0, 104.0, 104.0, 97.0, 112.0, 104.0, 96.0, 117.0, 97.0, 136.0, 100.0, 92.0, 104.0, 95.0, 111.0, 97.0, 87.0, 108.0, 128.0, 94.0, 111.0, 106.0, 122.0, 99.0, 94.0, 110.0, 104.0, 116.0, 119.0, 114.0, 112.0, 104.0, 104.0, 108.0, 88.0, 105.0, 114.0, 103.0, 105.0, 96.0, 98.0, 92.0, 92.0, 91.0, 102.0, 119.0, 106.0, 86.0, 104.0, 60.0, 110.0, 92.0, 91.0, 80.0, 91.0, 114.0, 106.0, 80.0, 119.0, 117.0, 112.0, 114.0, 98.0, 102.0, 109.0, 101.0, 100.0, 102.0, 126.0, 124.0, 99.0, 112.0, 110.0, 129.0, 111.0, 99.0, 119.0, 101.0, 82.0, 110.0, 84.0, 95.0, 104.0, 96.0, 107.0, 83.0, 114.0, 105.0, 93.0, 104.0, 108.0, 94.0, 99.0, 104.0, 101.0, 88.0, 112.0, 101.0, 101.0, 108.0, 119.0, 118.0, 103.0, 100.0, 107.0, 94.0, 104.0, 118.0, 111.0, 115.0, 100.0, 114.0, 90.0, 110.0, 107.0, 90.0, 91.0, 145.0, 113.0, 112.0, 120.0, 101.0, 98.0, 97.0, 96.0, 109.0, 100.0, 115.0, 120.0, 120.0, 121.0, 128.0, 103.0, 94.0, 104.0, 110.0, 89.0, 102.0, 106.0, 113.0, 117.0, 113.0, 115.0, 93.0, 114.0, 119.0, 132.0, 82.0, 112.0, 105.0, 96.0, 124.0, 107.0, 108.0, 104.0, 145.0, 119.0, 124.0, 115.0, 116.0, 94.0, 130.0, 98.0, 115.0, 117.0, 120.0, 122.0, 122.0, 110.0, 108.0, 87.0, 117.0, 102.0, 123.0, 108.0, 123.0, 107.0, 99.0, 127.0, 94.0, 107.0, 72.0, 102.0, 86.0, 91.0, 94.0, 116.0, 106.0, 120.0, 127.0, 115.0, 124.0, 126.0, 129.0, 117.0, 112.0, 120.0, 119.0, 126.0, 111.0, 119.0, 91.0, 102.0, 95.0, 118.0, 111.0, 99.0, 122.0, 125.0]}, "num-zeros vs samples": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [65.0, 80.0, 81.0, 75.0, 72.0, 103.0, 108.0, 112.0, 107.0, 122.0, 99.0, 159.0, 148.0, 150.0, 167.0, 157.0, 165.0, 144.0, 182.0, 187.0, 180.0, 162.0, 181.0, 129.0, 189.0, 148.0, 195.0, 190.0, 137.0, 181.0, 151.0, 155.0, 152.0, 166.0, 152.0, 170.0, 160.0, 209.0, 168.0, 214.0, 166.0, 181.0, 190.0, 185.0, 161.0, 162.0, 169.0, 187.0, 184.0, 239.0, 225.0, 187.0, 190.0, 131.0, 187.0, 182.0, 159.0, 161.0, 248.0, 226.0, 201.0, 211.0, 174.0, 164.0, 168.0, 225.0, 202.0, 174.0, 223.0, 202.0, 243.0, 235.0, 180.0, 239.0, 219.0, 205.0, 210.0, 192.0, 216.0, 207.0, 209.0, 245.0, 217.0, 227.0, 212.0, 207.0, 191.0, 173.0, 196.0, 193.0, 194.0, 186.0, 203.0, 189.0, 210.0, 160.0, 204.0, 187.0, 189.0, 159.0, 168.0, 209.0, 181.0, 159.0, 173.0, 153.0, 175.0, 152.0, 147.0, 174.0, 180.0, 153.0, 176.0, 146.0, 165.0, 154.0, 147.0, 106.0, 147.0, 133.0, 174.0, 148.0, 152.0, 143.0, 173.0, 127.0, 116.0, 130.0, 127.0, 123.0, 143.0, 142.0, 146.0, 123.0, 131.0, 124.0, 138.0, 139.0, 109.0, 107.0, 130.0, 103.0, 121.0, 157.0, 131.0, 148.0, 139.0, 96.0, 120.0, 101.0, 96.0, 102.0, 102.0, 122.0, 105.0, 84.0, 114.0, 117.0, 95.0, 90.0, 106.0, 137.0, 136.0, 131.0, 122.0, 95.0, 111.0, 99.0, 117.0, 119.0, 129.0, 111.0, 104.0, 112.0, 108.0, 102.0, 88.0, 97.0, 120.0, 121.0, 124.0, 96.0, 126.0, 134.0, 122.0, 98.0, 97.0, 115.0, 102.0, 102.0, 128.0, 120.0, 104.0, 104.0, 97.0, 112.0, 104.0, 96.0, 117.0, 97.0, 136.0, 100.0, 92.0, 104.0, 95.0, 111.0, 97.0, 87.0, 108.0, 128.0, 94.0, 111.0, 106.0, 122.0, 99.0, 94.0, 110.0, 104.0, 116.0, 119.0, 114.0, 112.0, 104.0, 104.0, 108.0, 88.0, 105.0, 114.0, 103.0, 105.0, 96.0, 98.0, 92.0, 92.0, 91.0, 102.0, 119.0, 106.0, 86.0, 104.0, 60.0, 110.0, 92.0, 91.0, 80.0, 91.0, 114.0, 106.0, 80.0, 119.0, 117.0, 112.0, 114.0, 98.0, 102.0, 109.0, 101.0, 100.0, 102.0, 126.0, 124.0, 99.0, 112.0, 110.0, 129.0, 111.0, 99.0, 119.0, 101.0, 82.0, 110.0, 
84.0, 95.0, 104.0, 96.0, 107.0, 83.0, 114.0, 105.0, 93.0, 104.0, 108.0, 94.0, 99.0, 104.0, 101.0, 88.0, 112.0, 101.0, 101.0, 108.0, 119.0, 118.0, 103.0, 100.0, 107.0, 94.0, 104.0, 118.0, 111.0, 115.0, 100.0, 114.0, 90.0, 110.0, 107.0, 90.0, 91.0, 145.0, 113.0, 112.0, 120.0, 101.0, 98.0, 97.0, 96.0, 109.0, 100.0, 115.0, 120.0, 120.0, 121.0, 128.0, 103.0, 94.0, 104.0, 110.0, 89.0, 102.0, 106.0, 113.0, 117.0, 113.0, 115.0, 93.0, 114.0, 119.0, 132.0, 82.0, 112.0, 105.0, 96.0, 124.0, 107.0, 108.0, 104.0, 145.0, 119.0, 124.0, 115.0, 116.0, 94.0, 130.0, 98.0, 115.0, 117.0, 120.0, 122.0, 122.0, 110.0, 108.0, 87.0, 117.0, 102.0, 123.0, 108.0, 123.0, 107.0, 99.0, 127.0, 94.0, 107.0, 72.0, 102.0, 86.0, 91.0, 94.0, 116.0, 106.0, 120.0, 127.0, 115.0, 124.0, 126.0, 129.0, 117.0, 112.0, 120.0, 119.0, 126.0, 111.0, 119.0, 91.0, 102.0, 95.0, 118.0, 111.0, 99.0, 122.0, 125.0]}, "params-norm": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [180.0215, 180.0215, 180.0215, 180.0215, 180.02148, 180.02147, 180.02145, 180.02135, 180.02115, 180.02094, 180.01958, 180.01897, 180.01846, 180.01624, 180.01547, 180.01477, 180.01425, 180.01407, 180.0143, 180.01436, 180.01428, 180.01392, 180.01378, 180.01396, 180.01488, 180.0161, 180.0174, 180.01826, 180.0192, 180.0206, 180.02252, 180.02473, 180.0269, 180.02902, 180.03149, 180.0345, 180.03801, 180.04181, 180.04555, 180.04977, 180.05466, 180.05992, 180.06546, 180.0715, 180.07837, 180.08574, 180.09346, 180.10184, 180.11102, 180.12125, 180.13229, 180.14392, 180.15607, 180.16887, 180.1824, 180.19664, 180.21123, 180.22636, 180.24251, 180.25967, 180.27742, 180.29587, 180.31598, 180.33707, 180.3582, 180.3808, 180.40411, 180.42862, 180.45422, 180.48024, 180.50642, 180.53325, 180.56082, 180.58878, 180.61742, 180.64685, 180.67635, 180.70671, 180.73753, 180.76909, 180.80096, 180.83255, 180.86522, 180.89883, 180.93253, 180.96713, 181.00252, 181.03773, 181.07297, 181.10829, 181.14496, 181.18279, 181.22028, 181.25752, 181.29439, 181.32959, 181.36458, 181.40088, 181.43741, 181.47369, 181.50917, 181.54332, 181.57774, 181.61334, 181.64902, 181.68596, 181.7242, 181.7617, 181.79843, 181.83513, 181.87192, 181.90961, 181.94727, 181.9857, 182.02441, 182.06326, 182.1035, 182.14424, 182.18398, 182.22302, 182.26132, 182.30066, 182.33942, 182.37904, 182.41917, 182.45876, 182.49632, 182.53271, 182.56963, 182.60735, 182.64554, 182.68359, 182.72183, 182.75928, 182.79482, 182.83173, 182.86961, 182.90521, 182.94044, 182.97412, 183.00899, 183.04352, 183.0809, 183.12045, 183.16031, 183.20035, 183.24016, 183.27913, 183.31721, 183.35562, 183.39336, 183.42928, 183.46495, 183.50055, 183.53683, 183.57225, 183.60655, 183.64061, 183.67566, 183.71036, 183.74536, 183.78122, 183.81776, 183.85562, 183.89389, 183.93182, 183.96855, 184.00623, 184.04614, 184.08539, 184.12434, 184.16336, 184.20358, 184.2431, 184.28152, 184.32024, 184.3553, 184.3905, 184.42917, 184.4704, 184.51273, 184.55392, 184.59485, 184.63615, 184.67656, 184.71397, 184.74928, 184.78352, 184.82126, 184.86098, 184.90076, 184.94235, 184.98337, 185.02277, 185.0623, 185.10294, 185.14499, 185.18594, 185.22719, 185.26956, 185.31255, 185.35408, 185.39359, 185.43069, 185.46863, 185.50841, 185.54842, 185.5876, 185.62738, 185.66747, 185.7076, 185.74796, 185.78799, 185.82808, 185.86952, 185.91144, 185.95245, 185.99278, 186.03255, 186.07283, 186.11411, 186.15575, 186.19742, 186.2375, 186.27637, 186.31621, 186.35637, 186.39667, 186.43544, 186.4731, 186.51167, 186.55107, 186.5916, 186.63014, 186.66568, 186.69972, 186.73563, 186.77632, 186.81931, 
186.86119, 186.89891, 186.93753, 186.97639, 187.01602, 187.0556, 187.0981, 187.14053, 187.1834, 187.22716, 187.27185, 187.31763, 187.36372, 187.4113, 187.45898, 187.506, 187.55214, 187.59671, 187.64069, 187.68445, 187.73042, 187.77773, 187.82211, 187.86797, 187.91481, 187.96231, 188.00858, 188.05304, 188.09511, 188.13795, 188.1804, 188.22424, 188.27013, 188.31894, 188.36742, 188.41576, 188.4644, 188.51416, 188.56253, 188.60983, 188.65424, 188.69913, 188.7431, 188.78632, 188.83072, 188.87659, 188.92245, 188.96892, 189.01532, 189.06158, 189.10831, 189.15527, 189.20079, 189.2475, 189.29361, 189.33777, 189.38203, 189.42827, 189.47591, 189.52328, 189.57204, 189.62096, 189.6709, 189.72188, 189.77139, 189.81842, 189.8649, 189.91235, 189.95949, 190.0078, 190.05704, 190.10622, 190.15698, 190.20724, 190.25786, 190.30705, 190.35727, 190.40851, 190.45973, 190.51111, 190.56392, 190.61598, 190.66782, 190.7196, 190.77359, 190.82573, 190.87747, 190.92769, 190.97775, 191.02827, 191.07834, 191.12999, 191.17932, 191.22862, 191.27965, 191.33025, 191.38222, 191.433, 191.48625, 191.53882, 191.59085, 191.64409, 191.698, 191.7515, 191.8065, 191.86282, 191.91794, 191.97198, 192.02602, 192.07971, 192.1337, 192.18675, 192.24236, 192.29745, 192.35396, 192.40863, 192.46198, 192.51579, 192.57161, 192.62778, 192.68323, 192.73868, 192.79523, 192.85144, 192.9077, 192.96512, 193.02281, 193.07899, 193.13582, 193.19206, 193.24911, 193.30396, 193.35805, 193.41168, 193.46552, 193.52077, 193.57597, 193.63229, 193.68961, 193.74706, 193.80554, 193.86365, 193.92087, 193.97789, 194.03809, 194.09793, 194.15579, 194.21254, 194.27122, 194.33063, 194.39035, 194.44989, 194.51079, 194.56964, 194.62762, 194.68622, 194.74329, 194.79973, 194.85442, 194.91043, 194.96838]}, "params-norm vs samples": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [180.0215, 180.0215, 180.0215, 180.0215, 180.02148, 180.02147, 180.02145, 180.02135, 180.02115, 180.02094, 180.01958, 180.01897, 180.01846, 180.01624, 180.01547, 180.01477, 180.01425, 180.01407, 180.0143, 180.01436, 180.01428, 180.01392, 180.01378, 180.01396, 180.01488, 180.0161, 180.0174, 180.01826, 180.0192, 180.0206, 180.02252, 180.02473, 180.0269, 180.02902, 180.03149, 180.0345, 180.03801, 180.04181, 180.04555, 180.04977, 180.05466, 180.05992, 180.06546, 180.0715, 180.07837, 180.08574, 180.09346, 180.10184, 180.11102, 180.12125, 180.13229, 180.14392, 180.15607, 180.16887, 180.1824, 180.19664, 180.21123, 180.22636, 180.24251, 180.25967, 180.27742, 180.29587, 180.31598, 180.33707, 180.3582, 180.3808, 180.40411, 180.42862, 180.45422, 180.48024, 180.50642, 180.53325, 180.56082, 180.58878, 180.61742, 180.64685, 180.67635, 180.70671, 180.73753, 180.76909, 180.80096, 180.83255, 180.86522, 180.89883, 180.93253, 180.96713, 181.00252, 181.03773, 181.07297, 181.10829, 181.14496, 181.18279, 181.22028, 181.25752, 181.29439, 181.32959, 181.36458, 181.40088, 181.43741, 181.47369, 181.50917, 181.54332, 181.57774, 181.61334, 181.64902, 181.68596, 181.7242, 181.7617, 181.79843, 181.83513, 181.87192, 181.90961, 181.94727, 181.9857, 182.02441, 182.06326, 182.1035, 182.14424, 182.18398, 182.22302, 182.26132, 182.30066, 182.33942, 182.37904, 182.41917, 182.45876, 182.49632, 182.53271, 182.56963, 182.60735, 182.64554, 182.68359, 182.72183, 182.75928, 182.79482, 182.83173, 182.86961, 182.90521, 182.94044, 182.97412, 183.00899, 183.04352, 183.0809, 183.12045, 183.16031, 183.20035, 183.24016, 183.27913, 183.31721, 183.35562, 183.39336, 183.42928, 183.46495, 183.50055, 183.53683, 183.57225, 183.60655, 
183.64061, 183.67566, 183.71036, 183.74536, 183.78122, 183.81776, 183.85562, 183.89389, 183.93182, 183.96855, 184.00623, 184.04614, 184.08539, 184.12434, 184.16336, 184.20358, 184.2431, 184.28152, 184.32024, 184.3553, 184.3905, 184.42917, 184.4704, 184.51273, 184.55392, 184.59485, 184.63615, 184.67656, 184.71397, 184.74928, 184.78352, 184.82126, 184.86098, 184.90076, 184.94235, 184.98337, 185.02277, 185.0623, 185.10294, 185.14499, 185.18594, 185.22719, 185.26956, 185.31255, 185.35408, 185.39359, 185.43069, 185.46863, 185.50841, 185.54842, 185.5876, 185.62738, 185.66747, 185.7076, 185.74796, 185.78799, 185.82808, 185.86952, 185.91144, 185.95245, 185.99278, 186.03255, 186.07283, 186.11411, 186.15575, 186.19742, 186.2375, 186.27637, 186.31621, 186.35637, 186.39667, 186.43544, 186.4731, 186.51167, 186.55107, 186.5916, 186.63014, 186.66568, 186.69972, 186.73563, 186.77632, 186.81931, 186.86119, 186.89891, 186.93753, 186.97639, 187.01602, 187.0556, 187.0981, 187.14053, 187.1834, 187.22716, 187.27185, 187.31763, 187.36372, 187.4113, 187.45898, 187.506, 187.55214, 187.59671, 187.64069, 187.68445, 187.73042, 187.77773, 187.82211, 187.86797, 187.91481, 187.96231, 188.00858, 188.05304, 188.09511, 188.13795, 188.1804, 188.22424, 188.27013, 188.31894, 188.36742, 188.41576, 188.4644, 188.51416, 188.56253, 188.60983, 188.65424, 188.69913, 188.7431, 188.78632, 188.83072, 188.87659, 188.92245, 188.96892, 189.01532, 189.06158, 189.10831, 189.15527, 189.20079, 189.2475, 189.29361, 189.33777, 189.38203, 189.42827, 189.47591, 189.52328, 189.57204, 189.62096, 189.6709, 189.72188, 189.77139, 189.81842, 189.8649, 189.91235, 189.95949, 190.0078, 190.05704, 190.10622, 190.15698, 190.20724, 190.25786, 190.30705, 190.35727, 190.40851, 190.45973, 190.51111, 190.56392, 190.61598, 190.66782, 190.7196, 190.77359, 190.82573, 190.87747, 190.92769, 190.97775, 191.02827, 191.07834, 191.12999, 191.17932, 191.22862, 191.27965, 191.33025, 191.38222, 191.433, 191.48625, 191.53882, 191.59085, 191.64409, 191.698, 191.7515, 191.8065, 191.86282, 191.91794, 191.97198, 192.02602, 192.07971, 192.1337, 192.18675, 192.24236, 192.29745, 192.35396, 192.40863, 192.46198, 192.51579, 192.57161, 192.62778, 192.68323, 192.73868, 192.79523, 192.85144, 192.9077, 192.96512, 193.02281, 193.07899, 193.13582, 193.19206, 193.24911, 193.30396, 193.35805, 193.41168, 193.46552, 193.52077, 193.57597, 193.63229, 193.68961, 193.74706, 193.80554, 193.86365, 193.92087, 193.97789, 194.03809, 194.09793, 194.15579, 194.21254, 194.27122, 194.33063, 194.39035, 194.44989, 194.51079, 194.56964, 194.62762, 194.68622, 194.74329, 194.79973, 194.85442, 194.91043, 194.96838]}, "iteration-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [25.9357, 1.58651, 1.57374, 1.5753, 1.57369, 1.58365, 1.58825, 1.58527, 1.58564, 1.5777, 1.58419, 1.58585, 1.58154, 1.58741, 1.59392, 1.59071, 1.59711, 1.6014, 1.60351, 1.59396, 1.5899, 1.59645, 1.58704, 1.58712, 1.60341, 1.58462, 1.5838, 1.58964, 1.5977, 1.5914, 1.59087, 1.59805, 1.5927, 1.59042, 1.57661, 1.58906, 1.58372, 1.5783, 1.662, 1.58247, 1.58561, 1.58497, 1.60619, 1.59828, 1.60708, 1.60788, 1.6018, 1.59949, 1.59104, 1.5968, 1.60548, 1.60125, 1.59943, 1.58135, 1.58089, 1.58389, 1.58725, 1.58116, 1.58404, 1.58902, 1.58673, 1.58415, 1.60076, 1.59392, 1.59498, 1.58949, 1.59688, 1.59686, 1.58746, 1.59881, 1.5919, 1.59305, 1.60935, 1.59895, 1.60324, 1.60238, 1.59829, 1.60008, 1.59605, 1.60176, 1.59396, 1.60186, 1.58731, 1.58171, 1.58397, 1.58802, 1.58792, 1.5888, 1.5989, 1.60961, 1.59174, 1.61116, 1.59839, 
1.5987, 1.60266, 1.59894, 1.60234, 1.59759, 1.59588, 1.59656, 1.60095, 1.59247, 1.59334, 1.58581, 1.60076, 1.5966, 1.58958, 1.58303, 1.58777, 1.58897, 1.59327, 1.59617, 1.59379, 1.59354, 1.58468, 1.59116, 1.58522, 1.58052, 1.57531, 1.59285, 1.58327, 1.57928, 1.58856, 1.60734, 1.60047, 1.58954, 1.5887, 1.59365, 1.57967, 1.58675, 1.57718, 1.58018, 1.58698, 1.58486, 1.59903, 1.5922, 1.59084, 1.58453, 1.58231, 1.58267, 1.58483, 1.58037, 1.5909, 1.60252, 1.60356, 1.58876, 1.59367, 1.60171, 1.59771, 1.6032, 1.60106, 1.60184, 1.60827, 1.60637, 1.60548, 1.60525, 1.60212, 1.60506, 1.59982, 1.60509, 1.60647, 1.60886, 1.60014, 1.60931, 1.59824, 1.60157, 1.60774, 1.60732, 1.61218, 1.61074, 1.60769, 1.60031, 1.59568, 1.59819, 1.6096, 1.59367, 1.60494, 1.59917, 1.59747, 1.60124, 1.59771, 1.59534, 1.60201, 1.59851, 1.60069, 1.60225, 1.59775, 1.59041, 1.60108, 1.59759, 1.59096, 1.60191, 1.5962, 1.60086, 1.61379, 1.60436, 1.60606, 1.60163, 1.60378, 1.60305, 1.59492, 1.60456, 1.60034, 1.58872, 1.59577, 1.59654, 1.59711, 1.59749, 1.59808, 1.60144, 1.59512, 1.59382, 1.59822, 1.59585, 1.59994, 1.59286, 1.59958, 1.60154, 1.59764, 1.59284, 1.59867, 1.6049, 1.6004, 1.59909, 1.60488, 1.59532, 1.60133, 1.60538, 1.5991, 1.59608, 1.60992, 1.60101, 1.60144, 1.59775, 1.59962, 1.58809, 1.59851, 1.59204, 1.59492, 1.59647, 1.58928, 1.58595, 1.7535, 1.6478, 1.59827, 1.60514, 1.59426, 1.61414, 1.60982, 1.60735, 1.60866, 1.70147, 1.60416, 1.59248, 1.59525, 1.59344, 1.59499, 1.60459, 1.6003, 1.60341, 1.60801, 1.61343, 1.60596, 1.60611, 1.60542, 1.60121, 1.59801, 1.59823, 1.59998, 1.59829, 1.59898, 1.59531, 1.60142, 1.60403, 1.59966, 1.60202, 1.59979, 1.60042, 1.59732, 1.60245, 1.60091, 1.5998, 1.60238, 1.59984, 1.60274, 1.60666, 1.60321, 1.6036, 1.6041, 1.59868, 1.6015, 1.60892, 1.60377, 1.60116, 1.60829, 1.60355, 1.60349, 1.60256, 1.60399, 1.60265, 1.60684, 1.60536, 1.61211, 1.60719, 1.6104, 1.59911, 1.59879, 1.61165, 1.60015, 1.6048, 1.59789, 1.60116, 1.60929, 1.60128, 1.60444, 1.6133, 1.59942, 1.6132, 1.60448, 1.58597, 1.58802, 1.59401, 1.58972, 1.59965, 1.60201, 1.59413, 1.60397, 1.60165, 1.59963, 1.60178, 1.59826, 1.60301, 1.6063, 1.60499, 1.6023, 1.60467, 1.6048, 1.59497, 1.61355, 1.60237, 1.60516, 1.60289, 1.60404, 1.60076, 1.59623, 1.60269, 1.60248, 1.60802, 1.60059, 1.70142, 1.61751, 1.60679, 1.7026, 1.60996, 1.6083, 1.61064, 1.61183, 1.62052, 1.61909, 1.61534, 1.61668, 1.6033, 1.60768, 1.60386, 1.61143, 1.60918, 1.59776, 1.60709, 1.60535, 1.60161, 1.60666, 1.60582, 1.60545, 1.6075, 1.60733, 1.61657, 1.62133, 1.60999, 1.61188, 1.61305, 1.6069, 1.61671, 1.61762, 1.62212, 1.61922, 1.6081, 1.60551, 1.61555, 1.61354, 1.61632, 1.61937, 1.6141, 1.60911, 1.614, 1.61245, 1.61194, 1.6115, 1.60534, 1.60841, 1.60561]}, "lm loss validation": {"start_step": 0, "end_step": 4, "step_interval": 5, "values": [5.60068]}, "lm loss validation vs samples": {"start_step": 0, "end_step": 4, "step_interval": 5, "values": [5.60068]}, "lm loss validation ppl": {"start_step": 0, "end_step": 4, "step_interval": 5, "values": [270.6116]}, "lm loss validation ppl vs samples": {"start_step": 0, "end_step": 4, "step_interval": 5, "values": [270.6116]}} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_fp8_tp_pp_sp/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_fp8_tp_pp_sp/model_config.yaml new file mode 100644 index 0000000000..61edc36fbe --- /dev/null +++ 
b/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_fp8_tp_pp_sp/model_config.yaml @@ -0,0 +1,53 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NVTE_FUSED_ATTN: 0 + NCCL_ALGO: Tree + CUBLAS_WORKSPACE_CONFIG: :4096:8 +MODEL_ARGS: + --num-layers: 12 + --hidden-size: 512 + --num-attention-heads: 8 + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --tensorboard-dir: ${TENSORBOARD_PATH} + --micro-batch-size: 2 + --global-batch-size: 128 + --seq-length: 1024 + --max-position-embeddings: 1024 + --train-iters: 2000 + --timing-log-level: 2 + --lr-decay-iters: 320000 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --vocab-file: ${DATA_PATH}/bpe/vocab.json + --merge-file: ${DATA_PATH}/bpe/merges.txt + --split: 949,50,1 + --distributed-backend: nccl + --lr: 0.00015 + --lr-decay-style: cosine + --min-lr: 1.0e-5 + --weight-decay: 1e-2 + --clip-grad: 1.0 + --lr-warmup-fraction: .01 + --log-interval: 1 + --save-interval: 10000 + --eval-interval: 1000 + --eval-iters: 10 + --transformer-impl: transformer_engine + --tensor-model-parallel-size: 2 + --pipeline-model-parallel-size: 2 + --sequence-parallel: true + --deterministic-mode: true + --no-gradient-accumulation-fusion: true + --fp8-format: hybrid + --fp8-amax-history-len: 1024 + --fp8-amax-compute-algo: max + --attention-softmax-in-fp32: true + --ckpt-format: torch_dist + --data-cache-path: ${DATA_CACHE_PATH} + --bf16: true +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_native_fp8_tp_pp_sp/golden_values_dev.json b/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_native_fp8_tp_pp_sp/golden_values_dev.json new file mode 100644 index 0000000000..6009b31b8c --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_native_fp8_tp_pp_sp/golden_values_dev.json @@ -0,0 +1 @@ +{"forward-backward-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [21.99065, 1.57716, 1.57152, 1.59321, 1.5863, 1.57478, 1.58558, 1.58758, 1.58479, 1.57895, 1.58493, 1.59324, 1.58233, 1.56795, 1.58055, 1.56688, 1.5678, 1.56791, 1.63783, 1.57453, 1.56285, 1.57599, 1.58307, 1.58147, 1.58884, 1.58487, 1.58844, 1.57964, 1.57929, 1.58802, 1.58073, 1.58122, 1.58449, 1.66952, 1.599, 1.60139, 1.74695, 1.60068, 1.58989, 1.59432, 1.59007, 1.58621, 1.5794, 1.58271, 1.58306, 1.59677, 1.59288, 1.58275, 1.67584, 1.59164, 1.60367, 1.5998, 1.60414, 1.59758, 1.58882, 1.60425, 1.59006, 1.58295, 1.58588, 1.60115, 1.59394, 1.6001, 1.59693, 1.5944, 1.59722, 1.60347, 1.59248, 1.67877, 1.59416, 1.59784, 1.61277, 1.59908, 1.59639, 1.5935, 1.59862, 1.61381, 1.60093, 1.59916, 1.59139, 1.59544, 1.60373, 1.59931, 1.59729, 1.58924, 1.59278, 1.60393, 1.59751, 1.59588, 1.597, 1.5921, 1.60557, 1.5915, 1.59296, 1.59099, 1.58952, 1.59785, 1.59236, 1.59138, 1.58196, 1.68409, 1.59552, 1.60388, 1.59454, 1.58942, 1.58688, 1.59613, 1.60092, 1.59976, 1.59462, 1.60601, 1.59966, 1.59879, 1.59803, 1.59743, 1.60087, 1.60123, 1.60561, 1.59721, 1.60002, 1.59717, 1.60267, 1.60202, 1.58969, 1.5937, 1.59501, 1.59729, 1.6055, 1.59373, 1.59552, 1.59903, 1.60628, 1.59959, 1.60033, 1.59523, 1.59534, 1.59886, 1.59989, 1.59127, 1.60846, 1.60265, 1.6054, 1.59487, 1.59192, 1.58491, 1.59173, 1.59624, 1.60184, 1.59635, 1.60701, 1.59973, 1.59592, 1.58783, 
1.59596, 1.59257, 1.60207, 1.59766, 1.59014, 1.59147, 1.58958, 1.58849, 1.59599, 1.59796, 1.59187, 1.59629, 1.59167, 1.59103, 1.58381, 1.59206, 1.58888, 1.5904, 1.58555, 1.59114, 1.58539, 1.58566, 1.5894, 1.58315, 1.57556, 1.5798, 1.57936, 1.59144, 1.59188, 1.58985, 1.58744, 1.57959, 1.57707, 1.58114, 1.57447, 1.58757, 1.58393, 1.5814, 1.58214, 1.56869, 1.59904, 1.58832, 1.58446, 1.5886, 1.5964, 1.59995, 1.58984, 1.58458, 1.57848, 1.58262, 1.58372, 1.58511, 1.57472, 1.58482, 1.57884, 1.57655, 1.57371, 1.56768, 1.58436, 1.57434, 1.58546, 1.57895, 1.58824, 1.58943, 1.58534, 1.58931, 1.58768, 1.67183, 1.5994, 1.59551, 1.58731, 1.58941, 1.59427, 1.59768, 1.58889, 1.5907, 1.58959, 1.58719, 1.59215, 1.5863, 1.59281, 1.59155, 1.58447, 1.58437, 1.5847, 1.58696, 1.59622, 1.58517, 1.59019, 1.60434, 1.59968, 1.5969, 1.59751, 1.59456, 1.6066, 1.59805, 1.59315, 1.59835, 1.60342, 1.62288, 1.59735, 1.59455, 1.59386, 1.5899, 1.60537, 1.58935, 1.59479, 1.5931, 1.59564, 1.61221, 1.59658, 1.59741, 1.60139, 1.59726, 1.60686, 1.59462, 1.59958, 1.59653, 1.59254, 1.60457, 1.59551, 1.59428, 1.60093, 1.5944, 1.60142, 1.59772, 1.58999, 1.59811, 1.59342, 1.59459, 1.59229, 1.59446, 1.59758, 1.59514, 1.59376, 1.60015, 1.59289, 1.60569, 1.59243, 1.59995, 1.60277, 1.58962, 1.59704, 1.59408, 1.58742, 1.59956, 1.5946, 1.59711, 1.59521, 1.60094, 1.60537, 1.59472, 1.60512, 1.59709, 1.59942, 1.60326, 1.59747, 1.59643, 1.60252, 1.59668, 1.5978, 1.59291, 1.60286, 1.59494, 1.60307, 1.6023, 1.61125, 1.60608, 1.60499, 1.60013, 1.60294, 1.59839, 1.59445, 1.59771, 1.59912, 1.59625, 1.60071, 1.592, 1.59986, 1.59715, 1.59092, 1.5888, 1.58483, 1.58369, 1.58578, 1.58892, 1.58607, 1.57772, 1.58567, 1.58058, 1.57579, 1.58081, 1.57885, 1.57944, 1.5775, 1.57886, 1.58441, 1.64955, 1.57793, 1.57628, 1.57996, 1.60901, 1.5979, 1.59148, 1.58504, 1.58873, 1.61471, 1.61412, 1.59947, 1.59781, 1.59535, 1.61042, 1.60213, 1.59684, 1.59637, 1.59781, 1.60971, 1.59714, 1.58835, 1.59658, 1.5958, 1.5924, 1.59655, 1.59597, 1.60519, 1.60003, 1.61195, 1.61366, 1.6023, 1.60659, 1.59405, 1.60115, 1.6049, 1.6052, 1.60253, 1.59948, 1.5816, 1.59621, 1.58755, 1.59445, 1.59719, 1.59069, 1.60911, 1.59481, 1.59684, 1.60214, 1.59905, 1.60381]}, "forward-compute-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [6.16126, 0.78048, 0.77638, 0.78285, 0.77945, 0.7768, 0.78398, 0.78215, 0.7833, 0.77542, 0.78468, 0.78711, 0.78251, 0.76662, 0.76894, 0.76826, 0.77171, 0.76847, 0.83221, 0.7706, 0.76442, 0.77548, 0.77966, 0.76518, 0.7854, 0.7799, 0.77136, 0.76634, 0.78834, 0.77019, 0.78986, 0.77045, 0.78652, 0.87018, 0.80011, 0.7944, 0.94182, 0.79666, 0.78564, 0.78708, 0.78355, 0.78735, 0.78535, 0.79227, 0.79173, 0.79116, 0.79578, 0.78576, 0.88058, 0.78541, 0.7905, 0.80177, 0.80159, 0.79536, 0.78436, 0.80424, 0.79113, 0.78133, 0.79513, 0.79725, 0.78505, 0.80445, 0.7974, 0.80505, 0.80566, 0.79011, 0.78303, 0.8828, 0.7992, 0.80046, 0.79496, 0.80104, 0.80208, 0.78598, 0.79918, 0.79817, 0.80692, 0.79948, 0.79832, 0.80065, 0.79953, 0.80613, 0.80349, 0.79995, 0.80406, 0.8022, 0.80453, 0.80228, 0.8056, 0.79734, 0.80242, 0.78707, 0.79319, 0.80876, 0.78925, 0.79762, 0.79177, 0.81095, 0.78559, 0.87702, 0.80826, 0.80874, 0.79998, 0.78873, 0.79623, 0.80044, 0.7965, 0.80088, 0.80451, 0.80617, 0.80803, 0.80736, 0.80357, 0.80072, 0.80574, 0.80861, 0.80081, 0.80256, 0.8016, 0.80416, 0.80062, 0.79705, 0.79613, 0.7934, 0.79423, 0.79439, 0.79639, 0.79437, 0.80375, 0.79641, 0.8075, 0.79693, 0.80388, 0.79802, 0.79685, 0.80158, 0.79875, 0.79886, 0.80926, 0.81104, 0.80752, 
0.80381, 0.79608, 0.7893, 0.78982, 0.79582, 0.79985, 0.79486, 0.8058, 0.79802, 0.79424, 0.79685, 0.79506, 0.79473, 0.79858, 0.79203, 0.79193, 0.79375, 0.79263, 0.78662, 0.78983, 0.79242, 0.78834, 0.78866, 0.78847, 0.79475, 0.78474, 0.78928, 0.78727, 0.7942, 0.78678, 0.78404, 0.7855, 0.78669, 0.7807, 0.79077, 0.78107, 0.78201, 0.78183, 0.80216, 0.79952, 0.79773, 0.7904, 0.78485, 0.7784, 0.78943, 0.78644, 0.78928, 0.79161, 0.79481, 0.79068, 0.78383, 0.79727, 0.78767, 0.79378, 0.79855, 0.79573, 0.79906, 0.79796, 0.78811, 0.77833, 0.78832, 0.79352, 0.78682, 0.78545, 0.78929, 0.78422, 0.78978, 0.78901, 0.78354, 0.78883, 0.78807, 0.79656, 0.79382, 0.79009, 0.79261, 0.79204, 0.79399, 0.79138, 0.87044, 0.79415, 0.78856, 0.7904, 0.7891, 0.78842, 0.79047, 0.78866, 0.78816, 0.78669, 0.78557, 0.78863, 0.79242, 0.79337, 0.78575, 0.78866, 0.78509, 0.78346, 0.78462, 0.78704, 0.78025, 0.78234, 0.78547, 0.78832, 0.78406, 0.79176, 0.78752, 0.79148, 0.7926, 0.78905, 0.79623, 0.79876, 0.80189, 0.79329, 0.78938, 0.78571, 0.79206, 0.79022, 0.78916, 0.79198, 0.78965, 0.78841, 0.79706, 0.79681, 0.79422, 0.79582, 0.7978, 0.7929, 0.79692, 0.79951, 0.79613, 0.78441, 0.78081, 0.78582, 0.78913, 0.79294, 0.7902, 0.78677, 0.79445, 0.79001, 0.79247, 0.78884, 0.78757, 0.79082, 0.79372, 0.79339, 0.79117, 0.79464, 0.79238, 0.78456, 0.80253, 0.7832, 0.79582, 0.78585, 0.78817, 0.7996, 0.80334, 0.80038, 0.78266, 0.79835, 0.80583, 0.7884, 0.803, 0.7964, 0.7803, 0.80771, 0.78154, 0.78737, 0.78425, 0.79511, 0.79935, 0.79899, 0.80031, 0.79737, 0.7882, 0.78726, 0.80196, 0.78826, 0.79069, 0.79987, 0.80053, 0.79658, 0.80868, 0.78979, 0.79176, 0.80466, 0.79718, 0.80577, 0.78989, 0.78977, 0.79845, 0.80176, 0.79513, 0.79765, 0.78377, 0.78605, 0.7817, 0.78486, 0.78251, 0.782, 0.77773, 0.78515, 0.78532, 0.7826, 0.78594, 0.7847, 0.78814, 0.78399, 0.78924, 0.78495, 0.85297, 0.78501, 0.78455, 0.78521, 0.79499, 0.78326, 0.78572, 0.78491, 0.78588, 0.79342, 0.79911, 0.79939, 0.79997, 0.78403, 0.79216, 0.80483, 0.79356, 0.79564, 0.79104, 0.79195, 0.79461, 0.79321, 0.78786, 0.79505, 0.78766, 0.78873, 0.7989, 0.79328, 0.79827, 0.79828, 0.79999, 0.80446, 0.80505, 0.79428, 0.80603, 0.80135, 0.79708, 0.78828, 0.78401, 0.78511, 0.79061, 0.7807, 0.78293, 0.7859, 0.78918, 0.79204, 0.7906, 0.79616, 0.79381, 0.7949, 0.79715]}, "backward-compute-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [4.59311, 0.76076, 0.76217, 0.75984, 0.7615, 0.75659, 0.76053, 0.7532, 0.76274, 0.76117, 0.76101, 0.76233, 0.76144, 0.75668, 0.76922, 0.75609, 0.75913, 0.76116, 0.76025, 0.76541, 0.75884, 0.75825, 0.75703, 0.766, 0.76226, 0.76154, 0.76489, 0.76817, 0.75764, 0.76666, 0.76075, 0.75889, 0.75671, 0.76413, 0.76441, 0.76109, 0.75862, 0.76306, 0.74826, 0.75641, 0.74619, 0.74555, 0.74425, 0.74896, 0.74343, 0.75132, 0.74633, 0.74611, 0.74624, 0.74486, 0.75681, 0.756, 0.75967, 0.7522, 0.74699, 0.75759, 0.75126, 0.74675, 0.75177, 0.75405, 0.7585, 0.75155, 0.75405, 0.75102, 0.75148, 0.75893, 0.74911, 0.74587, 0.75218, 0.74921, 0.76638, 0.74462, 0.7501, 0.7496, 0.74661, 0.7608, 0.75236, 0.74756, 0.74835, 0.74741, 0.75597, 0.74513, 0.75335, 0.74569, 0.74992, 0.75987, 0.73959, 0.74426, 0.7594, 0.74595, 0.75601, 0.74294, 0.74297, 0.75107, 0.74798, 0.75807, 0.74348, 0.75472, 0.74211, 0.7499, 0.7459, 0.75376, 0.74383, 0.74411, 0.74537, 0.74321, 0.75045, 0.74449, 0.75823, 0.74876, 0.74922, 0.75592, 0.75588, 0.75204, 0.74904, 0.74934, 0.76179, 0.74708, 0.74898, 0.7495, 0.749, 0.75109, 0.75134, 0.74604, 0.74742, 0.74319, 0.75078, 0.74752, 0.75245, 0.74673, 
0.75517, 0.75235, 0.74881, 0.74945, 0.75053, 0.74903, 0.75641, 0.74336, 0.76521, 0.75829, 0.75724, 0.75492, 0.7561, 0.75292, 0.74603, 0.75381, 0.74787, 0.75257, 0.76831, 0.74923, 0.75133, 0.74595, 0.75539, 0.74856, 0.75247, 0.75168, 0.74839, 0.75531, 0.74901, 0.75107, 0.75151, 0.75163, 0.75496, 0.75207, 0.75274, 0.75371, 0.75218, 0.75324, 0.75429, 0.74775, 0.75082, 0.74975, 0.75003, 0.74514, 0.74798, 0.7422, 0.74955, 0.74687, 0.74432, 0.76318, 0.76862, 0.75695, 0.75138, 0.74947, 0.74824, 0.74949, 0.74673, 0.76097, 0.75456, 0.75612, 0.74619, 0.74667, 0.75557, 0.75602, 0.74867, 0.74532, 0.75908, 0.75984, 0.75566, 0.75544, 0.74912, 0.74344, 0.74466, 0.743, 0.74211, 0.75391, 0.74844, 0.74322, 0.7419, 0.7391, 0.75107, 0.74688, 0.74472, 0.74867, 0.74188, 0.75312, 0.75735, 0.75298, 0.75011, 0.83767, 0.75688, 0.7468, 0.75125, 0.75873, 0.75439, 0.76222, 0.74909, 0.75114, 0.74996, 0.74891, 0.75631, 0.75529, 0.75222, 0.74576, 0.74916, 0.74348, 0.7422, 0.74917, 0.74763, 0.74945, 0.74253, 0.75781, 0.74585, 0.75081, 0.75209, 0.75165, 0.7532, 0.75146, 0.75199, 0.75085, 0.75606, 0.76797, 0.74123, 0.75583, 0.7498, 0.74976, 0.76018, 0.74891, 0.74315, 0.74567, 0.74733, 0.76326, 0.74371, 0.74843, 0.74397, 0.74563, 0.76375, 0.74742, 0.7484, 0.75035, 0.74757, 0.75381, 0.7431, 0.74767, 0.74383, 0.74076, 0.75278, 0.75322, 0.74717, 0.74642, 0.74435, 0.74553, 0.75415, 0.75172, 0.74406, 0.74946, 0.74845, 0.7471, 0.74058, 0.74992, 0.74948, 0.74994, 0.75938, 0.75195, 0.75199, 0.75277, 0.74398, 0.75468, 0.74625, 0.74009, 0.75462, 0.74436, 0.75709, 0.75842, 0.75583, 0.75652, 0.75955, 0.75822, 0.74976, 0.74693, 0.7489, 0.7484, 0.74876, 0.75623, 0.75485, 0.75131, 0.75086, 0.75519, 0.7563, 0.75201, 0.74461, 0.75083, 0.75104, 0.7491, 0.74353, 0.74963, 0.74824, 0.75106, 0.75407, 0.74618, 0.7523, 0.75149, 0.74913, 0.74663, 0.74746, 0.7482, 0.74592, 0.74512, 0.75269, 0.74881, 0.75383, 0.74575, 0.74092, 0.74646, 0.74972, 0.75151, 0.74727, 0.74596, 0.75029, 0.74634, 0.74441, 0.75077, 0.76193, 0.7811, 0.76201, 0.76484, 0.77016, 0.76471, 0.76985, 0.76565, 0.75567, 0.76091, 0.76601, 0.7782, 0.76131, 0.75676, 0.76458, 0.76377, 0.77738, 0.75801, 0.75902, 0.762, 0.75749, 0.75518, 0.75814, 0.7671, 0.76157, 0.76399, 0.77689, 0.76899, 0.76062, 0.76435, 0.76315, 0.75948, 0.77408, 0.75612, 0.76269, 0.75559, 0.76227, 0.77122, 0.76094, 0.76349, 0.7582, 0.75871, 0.77745, 0.76055, 0.76243, 0.76016, 0.76322, 0.76742]}, "batch-generator-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [2.19292, 0.01741, 0.01488, 0.01641, 0.01712, 0.01701, 0.01724, 0.01612, 0.01735, 0.01689, 0.01449, 0.01795, 0.01495, 0.01541, 0.01502, 0.01516, 0.01428, 0.01451, 0.01769, 0.01847, 0.0169, 0.01788, 0.01813, 0.01751, 0.01774, 0.01679, 0.01619, 0.01655, 0.01654, 0.01696, 0.0174, 0.0185, 0.01671, 0.01581, 0.01697, 0.01627, 0.02111, 0.01585, 0.0176, 0.01783, 0.01799, 0.01548, 0.01578, 0.01602, 0.01539, 0.01659, 0.01748, 0.01708, 0.01454, 0.01909, 0.01622, 0.01722, 0.01943, 0.01822, 0.01639, 0.01887, 0.0157, 0.01802, 0.01601, 0.01682, 0.01679, 0.01666, 0.01696, 0.01447, 0.01725, 0.01735, 0.01643, 0.01884, 0.01609, 0.0185, 0.0184, 0.01703, 0.01561, 0.01899, 0.01693, 0.01673, 0.01557, 0.02037, 0.01648, 0.02182, 0.01581, 0.01883, 0.01486, 0.01422, 0.01602, 0.0206, 0.01692, 0.01644, 0.01443, 0.0164, 0.01772, 0.01699, 0.01792, 0.01841, 0.01616, 0.01914, 0.01786, 0.01399, 0.01385, 0.01298, 0.01984, 0.01393, 0.01641, 0.01237, 0.01672, 0.01523, 0.01481, 0.01312, 0.01514, 0.0141, 0.01688, 0.01659, 0.01531, 0.01306, 0.01415, 0.01307, 0.01504, 0.01566, 
0.01521, 0.01304, 0.0151, 0.01337, 0.01578, 0.01428, 0.01733, 0.01324, 0.01568, 0.01651, 0.01314, 0.01407, 0.01374, 0.01429, 0.01421, 0.01802, 0.01439, 0.01347, 0.01541, 0.01301, 0.01489, 0.01769, 0.01406, 0.01394, 0.01544, 0.01425, 0.01399, 0.01414, 0.01541, 0.01538, 0.01478, 0.01476, 0.01498, 0.01626, 0.01614, 0.01516, 0.0146, 0.02163, 0.01496, 0.01399, 0.0156, 0.01517, 0.01657, 0.01525, 0.02091, 0.01583, 0.01574, 0.01726, 0.01555, 0.01523, 0.01459, 0.01318, 0.01563, 0.01531, 0.01592, 0.01602, 0.01375, 0.01616, 0.01854, 0.0199, 0.01523, 0.01384, 0.01396, 0.01413, 0.01587, 0.01384, 0.01554, 0.01277, 0.0125, 0.01321, 0.01511, 0.01439, 0.01651, 0.01382, 0.01689, 0.01614, 0.01571, 0.01361, 0.01704, 0.01534, 0.01385, 0.01423, 0.20705, 0.01218, 0.01233, 0.01727, 0.01275, 0.01244, 0.01327, 0.01272, 0.01371, 0.01665, 0.01392, 0.01222, 0.01222, 0.01188, 0.01265, 0.01482, 0.01632, 0.01649, 0.01702, 0.10117, 0.01844, 0.01611, 0.01574, 0.01967, 0.01779, 0.0181, 0.01873, 0.01598, 0.01615, 0.0136, 0.01405, 0.0131, 0.01348, 0.01358, 0.01592, 0.01254, 0.01772, 0.01503, 0.01408, 0.01322, 0.01435, 0.0158, 0.01713, 0.01512, 0.01582, 0.01578, 0.01584, 0.01532, 0.01652, 0.01516, 0.01295, 0.01398, 0.01359, 0.01339, 0.01358, 0.01304, 0.01422, 0.01314, 0.01282, 0.01422, 0.01411, 0.01529, 0.01575, 0.01454, 0.01377, 0.01423, 0.0158, 0.0128, 0.01659, 0.0174, 0.01592, 0.01617, 0.01462, 0.01415, 0.01495, 0.01263, 0.01928, 0.01701, 0.01799, 0.01302, 0.01537, 0.01683, 0.01358, 0.01378, 0.01553, 0.01478, 0.01516, 0.01864, 0.01487, 0.0145, 0.01315, 0.0163, 0.01453, 0.01978, 0.01808, 0.01337, 0.01516, 0.01483, 0.0141, 0.01325, 0.01391, 0.01431, 0.01452, 0.01452, 0.01284, 0.01318, 0.01339, 0.01336, 0.01442, 0.01234, 0.01424, 0.01284, 0.01762, 0.01661, 0.01281, 0.01962, 0.01329, 0.01356, 0.01369, 0.01291, 0.01345, 0.01577, 0.01307, 0.01371, 0.01245, 0.0144, 0.01266, 0.01493, 0.01942, 0.01384, 0.01403, 0.01338, 0.01325, 0.01563, 0.0138, 0.01307, 0.01453, 0.0157, 0.01517, 0.01449, 0.01345, 0.01482, 0.01389, 0.01533, 0.01504, 0.01529, 0.01484, 0.01361, 0.01578, 0.01436, 0.01584, 0.01282, 0.01395, 0.01777, 0.01465, 0.01446, 0.01422, 0.01426, 0.01624, 0.01786, 0.01661, 0.01321, 0.01562, 0.016, 0.0161, 0.01445, 0.01562, 0.01697, 0.01694, 0.01328, 0.01308, 0.01623, 0.01535, 0.01156, 0.01359, 0.01294, 0.01787, 0.01354, 0.01547, 0.01746, 0.01479, 0.01512, 0.0137, 0.01697, 0.01836, 0.0165, 0.01597, 0.01426, 0.01481, 0.01758, 0.01613, 0.01995, 0.01744, 0.01619, 0.02014, 0.01917, 0.01834, 0.02092, 0.0156, 0.01825]}, "forward-recv-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [5.93081, 0.02344, 0.02331, 0.02309, 0.02318, 0.02288, 0.02295, 0.02315, 0.02278, 0.02311, 0.02303, 0.02319, 0.02297, 0.02355, 0.0232, 0.02307, 0.02294, 0.02279, 0.02348, 0.02322, 0.02312, 0.02338, 0.02754, 0.02903, 0.02328, 0.02314, 0.02339, 0.02314, 0.02316, 0.02611, 0.02298, 0.02317, 0.02368, 0.02303, 0.02318, 0.0236, 0.02624, 0.02329, 0.02423, 0.02403, 0.02326, 0.02356, 0.02358, 0.02322, 0.02307, 0.02339, 0.02352, 0.02314, 0.02321, 0.02319, 0.02427, 0.02732, 0.02447, 0.02413, 0.02414, 0.02384, 0.02448, 0.02435, 0.0243, 0.02437, 0.02392, 0.02395, 0.02424, 0.0244, 0.02386, 0.02399, 0.02583, 0.02402, 0.02381, 0.02363, 0.02384, 0.02415, 0.02408, 0.02332, 0.02351, 0.02417, 0.02341, 0.02374, 0.0239, 0.02359, 0.02348, 0.02367, 0.02309, 0.02341, 0.02304, 0.02341, 0.02349, 0.02339, 0.02324, 0.02343, 0.02447, 0.02397, 0.02425, 0.02336, 0.02357, 0.02378, 0.02358, 0.02333, 0.02324, 0.02381, 0.02363, 0.02361, 0.02379, 0.023, 0.02331, 0.02406, 
0.02303, 0.02381, 0.02338, 0.0233, 0.02375, 0.02361, 0.02338, 0.0254, 0.02366, 0.02346, 0.02319, 0.0231, 0.02322, 0.02336, 0.02359, 0.02301, 0.0232, 0.0231, 0.02325, 0.02535, 0.02543, 0.0249, 0.0258, 0.02421, 0.02631, 0.02569, 0.02546, 0.02523, 0.02374, 0.02369, 0.02287, 0.02328, 0.02335, 0.02342, 0.02348, 0.02584, 0.02846, 0.02333, 0.02325, 0.02317, 0.02344, 0.02362, 0.02449, 0.02398, 0.02331, 0.02313, 0.02338, 0.02374, 0.02377, 0.02343, 0.02294, 0.02316, 0.02278, 0.02313, 0.02341, 0.02344, 0.02325, 0.02347, 0.02341, 0.02425, 0.0234, 0.0236, 0.02348, 0.02328, 0.02322, 0.02797, 0.02349, 0.02368, 0.02483, 0.02541, 0.02365, 0.02349, 0.02286, 0.02337, 0.02361, 0.02351, 0.02501, 0.02329, 0.02303, 0.02332, 0.02369, 0.02402, 0.02326, 0.02743, 0.02371, 0.02333, 0.02452, 0.02852, 0.02423, 0.02431, 0.02363, 0.02347, 0.0234, 0.02355, 0.0171, 0.02364, 0.02374, 0.02365, 0.02307, 0.02279, 0.02328, 0.02362, 0.0233, 0.02395, 0.02325, 0.02349, 0.0286, 0.02347, 0.02365, 0.02351, 0.02314, 0.02283, 0.02321, 0.02365, 0.02339, 0.02363, 0.02445, 0.0234, 0.023, 0.02306, 0.02312, 0.0258, 0.02371, 0.02351, 0.02414, 0.02516, 0.02398, 0.02387, 0.02789, 0.02332, 0.02291, 0.02319, 0.02382, 0.02362, 0.02352, 0.0236, 0.02482, 0.02336, 0.02343, 0.02386, 0.02373, 0.02332, 0.02345, 0.02366, 0.02371, 0.02383, 0.02391, 0.02309, 0.02396, 0.0237, 0.02358, 0.02332, 0.02354, 0.0237, 0.02431, 0.02339, 0.02333, 0.02358, 0.02566, 0.02353, 0.02329, 0.02355, 0.02334, 0.02388, 0.02322, 0.02748, 0.02759, 0.02327, 0.02777, 0.02798, 0.0238, 0.02318, 0.02324, 0.02335, 0.02358, 0.02398, 0.02384, 0.02417, 0.02338, 0.02373, 0.02324, 0.02322, 0.02308, 0.02335, 0.02824, 0.02882, 0.02297, 0.02325, 0.02282, 0.02322, 0.02355, 0.02322, 0.02216, 0.02334, 0.02367, 0.02317, 0.0235, 0.02347, 0.02352, 0.02303, 0.02358, 0.02344, 0.02281, 0.02283, 0.02317, 0.02298, 0.02317, 0.02316, 0.02391, 0.02343, 0.02303, 0.02332, 0.02335, 0.02338, 0.02344, 0.0231, 0.02322, 0.02326, 0.02319, 0.02352, 0.02355, 0.02458, 0.02323, 0.02296, 0.02379, 0.02609, 0.02363, 0.02342, 0.02402, 0.02329, 0.02315, 0.02333, 0.02366, 0.02341, 0.02336, 0.02367, 0.02372, 0.02313, 0.02316, 0.02322, 0.0229, 0.02346, 0.02318, 0.02345, 0.0231, 0.02329, 0.0234, 0.02416, 0.02352, 0.0233, 0.02333, 0.02358, 0.02304, 0.0234, 0.02373, 0.02367, 0.02364, 0.02394, 0.02331, 0.02361, 0.02549, 0.02611, 0.02307, 0.02307, 0.02339, 0.02305, 0.02337, 0.02343, 0.02331, 0.02306, 0.02371, 0.02326, 0.02401, 0.02338, 0.02329, 0.02355, 0.02339, 0.02318, 0.02379, 0.02372, 0.02332, 0.02367, 0.02321, 0.02384, 0.0232, 0.02419, 0.02337, 0.02355, 0.0235, 0.02303, 0.02314, 0.02384, 0.02385, 0.02327]}, "forward-send-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [2.86591, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00011, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00014, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00013, 0.00012, 0.00013, 0.00013, 0.00013, 0.00013, 0.00012, 0.00012, 0.00015, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 
0.00012, 0.00013, 0.00012, 0.00013, 0.00012, 0.00012, 0.00013, 0.00013, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00013, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00013, 0.00012, 0.00011, 0.00013, 0.00012, 0.00013, 0.00013, 0.00013, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00013, 0.00012, 0.00012, 0.00013, 0.00012, 0.00013, 0.00013, 0.00013, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00013, 0.00014, 0.00012, 0.00013, 0.00013, 0.00013, 0.00012, 0.00013, 0.00012, 0.00012, 0.00013, 0.00013, 0.00012, 0.00012, 0.00014, 0.00012, 0.00013, 0.00013, 0.00012, 0.00013, 0.00012, 0.00014, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00013, 0.00012, 0.00012, 0.00012, 0.00013, 0.00013, 0.00012, 0.00013, 0.00012, 0.00012, 0.00013, 0.00012, 0.00013, 0.00013, 0.00012, 0.00012, 0.00016, 0.00013, 0.00013, 0.00012, 0.00012, 0.00013, 0.00013, 0.0001, 0.00013, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00013, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00013, 0.00013, 0.00013, 0.00012, 0.00012, 0.00019, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00013, 0.00014, 0.00012, 0.00013, 0.00012, 0.00013, 0.00013, 0.00012, 0.00013, 0.00012, 0.00013, 0.00013, 0.00013, 0.00013, 0.00013, 0.00012, 0.00012, 0.00014, 0.00012, 0.00012, 0.00013, 0.00013, 0.00021, 0.00017, 0.00013, 0.00016, 0.00019, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00013, 0.00013, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00015, 0.00017, 0.00012, 0.00012, 0.00013, 0.00012, 0.00011, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00011, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00013, 0.00014, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00014, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00016, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00013, 0.00013, 0.00012]}, "backward-recv-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.02324, 0.02473, 0.02485, 0.0257, 0.02421, 0.02511, 0.02424, 0.02512, 0.02482, 0.02484, 0.02503, 0.02501, 0.02497, 0.02408, 0.02453, 0.02476, 0.02472, 0.0245, 0.02469, 0.0238, 0.02472, 0.02383, 0.02443, 0.02414, 0.02458, 0.02427, 0.02418, 0.02518, 0.02515, 0.02471, 0.02487, 0.02507, 0.0252, 0.04234, 0.02563, 0.02482, 0.02527, 0.0252, 0.02511, 0.02616, 0.02552, 0.02553, 0.02507, 0.0247, 0.02488, 0.02838, 0.02802, 0.0284, 0.02834, 0.02994, 0.02821, 0.02845, 0.02966, 0.02456, 0.02638, 0.02786, 0.02477, 0.02529, 0.02816, 0.0278, 0.024, 0.02485, 0.02472, 0.02443, 0.02679, 0.02889, 0.02923, 0.02446, 0.02467, 0.02491, 0.02448, 0.02524, 0.0247, 0.02381, 0.02482, 0.02267, 0.02554, 0.02506, 0.02479, 0.02511, 
0.02493, 0.02473, 0.02445, 0.02465, 0.02466, 0.02435, 0.02438, 0.02454, 0.02703, 0.02859, 0.02838, 0.02463, 0.02457, 0.02449, 0.02484, 0.02427, 0.02489, 0.02919, 0.02783, 0.02446, 0.02864, 0.02839, 0.02885, 0.02916, 0.02535, 0.02922, 0.02859, 0.02867, 0.02674, 0.02913, 0.02404, 0.02357, 0.02473, 0.02426, 0.0237, 0.02368, 0.02461, 0.02449, 0.02432, 0.02416, 0.02668, 0.0259, 0.02394, 0.02449, 0.0245, 0.02639, 0.02567, 0.02428, 0.02416, 0.0239, 0.0246, 0.0245, 0.02396, 0.02903, 0.02872, 0.02891, 0.0242, 0.0248, 0.02619, 0.02586, 0.02476, 0.02646, 0.02366, 0.02382, 0.02621, 0.02353, 0.02399, 0.02459, 0.02528, 0.02408, 0.0246, 0.02424, 0.028, 0.02928, 0.02952, 0.02881, 0.02431, 0.02457, 0.02417, 0.02444, 0.02498, 0.02401, 0.02303, 0.02437, 0.02609, 0.02618, 0.0244, 0.02636, 0.02449, 0.02888, 0.0291, 0.02963, 0.02433, 0.02789, 0.03263, 0.03258, 0.02856, 0.02595, 0.02508, 0.02561, 0.02568, 0.02893, 0.02364, 0.02454, 0.02431, 0.02431, 0.02435, 0.02361, 0.02447, 0.02415, 0.02557, 0.02442, 0.02388, 0.02473, 0.02836, 0.02932, 0.02902, 0.02464, 0.02588, 0.02525, 0.02855, 0.02485, 0.03232, 0.02798, 0.02376, 0.02448, 0.02369, 0.02397, 0.02417, 0.02554, 0.02412, 0.02385, 0.02386, 0.02939, 0.02461, 0.02396, 0.02522, 0.02468, 0.02408, 0.02344, 0.02381, 0.02444, 0.02442, 0.02457, 0.02446, 0.02491, 0.02474, 0.02468, 0.02463, 0.02469, 0.02618, 0.02458, 0.0243, 0.02465, 0.02436, 0.0246, 0.02381, 0.02431, 0.02492, 0.02438, 0.0239, 0.02778, 0.03263, 0.03015, 0.02489, 0.02497, 0.02827, 0.02851, 0.02831, 0.02923, 0.02893, 0.02474, 0.02501, 0.02434, 0.02523, 0.02437, 0.02557, 0.02446, 0.02462, 0.02479, 0.02496, 0.02454, 0.02469, 0.02509, 0.02486, 0.02485, 0.02426, 0.02434, 0.025, 0.02506, 0.02464, 0.02457, 0.02548, 0.0244, 0.025, 0.02478, 0.0246, 0.025, 0.02481, 0.02465, 0.02469, 0.02502, 0.02443, 0.02451, 0.025, 0.02468, 0.02437, 0.02501, 0.02475, 0.02536, 0.02455, 0.02462, 0.02512, 0.02448, 0.0247, 0.02447, 0.02432, 0.02473, 0.02472, 0.02439, 0.02441, 0.02485, 0.02461, 0.02454, 0.02434, 0.02462, 0.02469, 0.02464, 0.02438, 0.02452, 0.02463, 0.02444, 0.02442, 0.02471, 0.02629, 0.02488, 0.02491, 0.02465, 0.02437, 0.02469, 0.02484, 0.02511, 0.02481, 0.02578, 0.02498, 0.02521, 0.02506, 0.02571, 0.02539, 0.02521, 0.02412, 0.0257, 0.02473, 0.02452, 0.02527, 0.0256, 0.02517, 0.02489, 0.0251, 0.02453, 0.02495, 0.02483, 0.02495, 0.02445, 0.02472, 0.02508, 0.02487, 0.02471, 0.02495, 0.02544, 0.02447, 0.025, 0.02531, 0.02509, 0.02923, 0.02837, 0.02804, 0.02863, 0.03514, 0.02454, 0.02525, 0.02518, 0.02502, 0.02481, 0.02521, 0.02523, 0.02482, 0.02487, 0.02487, 0.02585, 0.02467, 0.02474, 0.02498, 0.02461, 0.02536, 0.02543, 0.02452, 0.02512, 0.02501, 0.02421, 0.02508, 0.02507, 0.02588, 0.02699, 0.02457, 0.02568, 0.0256, 0.02542, 0.02475, 0.02461, 0.02444, 0.0296, 0.02899, 0.02863, 0.02732, 0.02767, 0.02899, 0.02482, 0.02467, 0.02404]}, "backward-send-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.00015, 0.00019, 0.00016, 0.00019, 0.00017, 0.00018, 0.00018, 0.00018, 0.00018, 0.00018, 0.00018, 0.00018, 0.00018, 0.00017, 0.00023, 0.00016, 0.00017, 0.00017, 0.00018, 0.00019, 0.00018, 0.00018, 0.00017, 0.00018, 0.00016, 0.00018, 0.00016, 0.00018, 0.00016, 0.00016, 0.00018, 0.00016, 0.00017, 0.00016, 0.00016, 0.00018, 0.00016, 0.00016, 0.00016, 0.00016, 0.00016, 0.00016, 0.00016, 0.00017, 0.00018, 0.0002, 0.00016, 0.00019, 0.00017, 0.00021, 0.00016, 0.00018, 0.00019, 0.00016, 0.00017, 0.00017, 0.00018, 0.0002, 0.00016, 0.00016, 0.00016, 0.00016, 0.00019, 0.00017, 0.00017, 0.00018, 0.00019, 0.00017, 
0.00019, 0.00016, 0.00017, 0.00018, 0.00017, 0.00019, 0.00016, 0.00018, 0.00018, 0.00018, 0.00016, 0.00018, 0.00018, 0.00017, 0.00016, 0.00018, 0.00016, 0.00018, 0.00017, 0.00018, 0.00018, 0.00018, 0.00019, 0.00018, 0.00017, 0.00018, 0.00018, 0.00016, 0.00018, 0.00016, 0.00024, 0.00017, 0.00018, 0.00016, 0.00016, 0.00019, 0.00019, 0.00018, 0.00026, 0.00017, 0.00018, 0.00018, 0.00019, 0.00018, 0.00018, 0.00018, 0.00016, 0.00018, 0.00018, 0.00017, 0.00018, 0.00017, 0.00016, 0.00016, 0.00018, 0.00019, 0.00017, 0.00018, 0.00018, 0.00018, 0.00018, 0.00017, 0.00018, 0.00019, 0.00018, 0.00018, 0.00018, 0.00021, 0.00016, 0.00018, 0.00019, 0.00022, 0.00017, 0.00016, 0.00017, 0.00017, 0.00018, 0.00018, 0.00018, 0.00018, 0.00016, 0.00018, 0.00018, 0.00018, 0.00018, 0.00018, 0.00018, 0.00018, 0.00018, 0.00018, 0.00016, 0.00019, 0.00018, 0.00018, 0.00017, 0.00019, 0.00017, 0.00019, 0.00018, 0.00018, 0.00016, 0.00017, 0.00016, 0.00016, 0.00018, 0.00017, 0.00016, 0.00029, 0.00017, 0.00019, 0.0002, 0.00016, 0.00019, 0.00032, 0.00019, 0.00016, 0.00018, 0.00018, 0.00016, 0.00018, 0.00023, 0.00018, 0.00018, 0.00018, 0.00017, 0.00019, 0.00018, 0.00016, 0.00018, 0.00017, 0.00018, 0.00018, 0.00018, 0.00019, 0.00018, 0.0002, 0.00016, 0.0002, 0.00018, 0.00018, 0.00018, 0.00016, 0.00018, 0.00016, 0.00016, 0.00018, 0.00016, 0.00017, 0.00019, 0.00018, 0.00016, 0.00019, 0.00022, 0.00016, 0.00018, 0.00018, 0.00018, 0.00018, 0.00018, 0.00018, 0.00018, 0.00017, 0.00016, 0.00019, 0.00018, 0.00018, 0.00016, 0.00018, 0.00019, 0.00016, 0.00018, 0.00016, 0.00017, 0.00017, 0.00026, 0.00016, 0.00016, 0.00019, 0.00018, 0.00016, 0.00018, 0.00018, 0.00016, 0.00018, 0.00016, 0.00016, 0.00018, 0.00018, 0.00018, 0.00016, 0.00017, 0.00017, 0.00016, 0.00019, 0.00018, 0.00017, 0.00016, 0.00018, 0.00016, 0.00016, 0.00016, 0.00018, 0.00016, 0.00019, 0.00019, 0.00017, 0.00018, 0.00018, 0.00018, 0.00018, 0.00018, 0.00018, 0.00019, 0.00016, 0.00018, 0.00018, 0.00022, 0.00016, 0.00018, 0.00018, 0.00017, 0.00016, 0.00018, 0.00018, 0.00016, 0.00018, 0.00018, 0.00016, 0.00018, 0.00018, 0.00016, 0.00016, 0.00018, 0.00018, 0.00018, 0.00018, 0.00016, 0.00019, 0.00016, 0.00018, 0.00017, 0.00017, 0.00016, 0.00018, 0.00018, 0.00018, 0.00019, 0.00018, 0.00016, 0.00016, 0.00017, 0.00016, 0.00018, 0.00016, 0.00017, 0.00019, 0.00017, 0.00018, 0.00019, 0.00019, 0.00018, 0.00016, 0.00016, 0.00017, 0.00018, 0.00016, 0.00019, 0.00016, 0.00016, 0.00016, 0.00016, 0.00016, 0.00019, 0.00016, 0.00018, 0.00018, 0.00018, 0.00018, 0.00018, 0.00018, 0.00016, 0.00016, 0.0002, 0.00018, 0.00018, 0.00019, 0.00019, 0.00018, 0.00018, 0.0003, 0.00016, 0.00018, 0.00018, 0.00016, 0.00019, 0.00018, 0.00019, 0.00016, 0.00016, 0.00016, 0.00018, 0.00019, 0.00018, 0.00018, 0.00017, 0.00018, 0.00019, 0.00017, 0.00016, 0.00018, 0.00017, 0.00018, 0.00018, 0.00017, 0.00018, 0.00016, 0.00016, 0.00018, 0.00019, 0.00017, 0.00018, 0.00018, 0.00017, 0.00016, 0.00035, 0.00022, 0.00019, 0.00018, 0.00018, 0.00017, 0.00016, 0.00017]}, "forward-send-backward-recv-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [7.52895, 0.10767, 0.10288, 0.12221, 0.10839, 0.10916, 0.11683, 0.11949, 0.11244, 0.10662, 0.11634, 0.12145, 0.11448, 0.10239, 0.10115, 0.10144, 0.10622, 0.1006, 0.1586, 0.10078, 0.09436, 0.10994, 0.11246, 0.10473, 0.11165, 0.11062, 0.10864, 0.10698, 0.11094, 0.1123, 0.11651, 0.11274, 0.11336, 0.17984, 0.1238, 0.12939, 0.27709, 0.1391, 0.13093, 0.12511, 0.13066, 0.1225, 0.11928, 0.11852, 0.12105, 0.1235, 0.12183, 0.11095, 0.20461, 0.11574, 0.12325, 0.12774, 
0.1342, 0.12396, 0.11854, 0.1264, 0.11539, 0.11273, 0.1179, 0.13162, 0.11525, 0.13348, 0.13, 0.12472, 0.13424, 0.1156, 0.11969, 0.21123, 0.12519, 0.12897, 0.136, 0.13444, 0.12965, 0.12283, 0.13807, 0.13035, 0.12784, 0.13095, 0.12328, 0.12278, 0.1242, 0.13846, 0.1251, 0.11622, 0.12258, 0.12174, 0.12831, 0.12841, 0.12632, 0.11745, 0.12732, 0.12029, 0.13155, 0.12567, 0.11834, 0.12549, 0.12416, 0.12349, 0.11452, 0.20614, 0.12415, 0.11944, 0.12148, 0.11366, 0.12373, 0.12834, 0.11722, 0.11892, 0.11557, 0.12715, 0.12886, 0.12057, 0.12682, 0.12601, 0.13364, 0.12815, 0.12626, 0.1317, 0.12917, 0.12301, 0.12818, 0.12239, 0.12231, 0.12391, 0.12264, 0.1209, 0.12986, 0.12429, 0.11971, 0.12228, 0.12907, 0.12399, 0.12889, 0.11751, 0.11734, 0.11985, 0.12419, 0.11939, 0.12896, 0.13183, 0.13356, 0.12001, 0.12131, 0.11604, 0.11794, 0.12429, 0.1355, 0.12631, 0.13817, 0.12757, 0.12565, 0.12479, 0.12459, 0.11863, 0.12603, 0.11965, 0.11957, 0.11941, 0.12277, 0.12152, 0.13238, 0.12899, 0.12039, 0.12936, 0.12185, 0.12027, 0.11834, 0.12565, 0.12003, 0.12064, 0.11734, 0.11796, 0.11982, 0.11829, 0.11018, 0.11427, 0.10291, 0.11078, 0.11775, 0.12251, 0.11736, 0.12288, 0.11757, 0.10965, 0.1101, 0.1111, 0.10524, 0.11035, 0.1194, 0.10687, 0.1104, 0.1029, 0.11414, 0.11835, 0.11073, 0.10671, 0.11471, 0.11713, 0.11142, 0.11427, 0.10551, 0.11576, 0.10811, 0.12352, 0.11089, 0.10827, 0.11418, 0.11243, 0.11291, 0.10774, 0.10575, 0.10895, 0.11133, 0.10168, 0.11589, 0.11188, 0.11403, 0.12083, 0.12527, 0.20209, 0.12301, 0.12835, 0.1167, 0.12035, 0.12158, 0.11749, 0.11785, 0.11663, 0.11859, 0.11189, 0.11229, 0.11518, 0.1205, 0.11283, 0.11679, 0.11705, 0.11627, 0.12181, 0.12372, 0.12191, 0.12006, 0.1168, 0.12252, 0.11718, 0.12814, 0.12688, 0.12696, 0.12607, 0.12079, 0.13508, 0.13166, 0.13101, 0.12769, 0.12321, 0.12875, 0.12726, 0.12271, 0.12496, 0.13106, 0.12712, 0.12831, 0.11758, 0.13314, 0.13148, 0.13269, 0.13383, 0.1235, 0.1316, 0.14168, 0.13684, 0.12388, 0.11908, 0.12703, 0.12329, 0.12975, 0.12484, 0.11743, 0.13142, 0.12276, 0.12584, 0.12278, 0.12351, 0.12006, 0.1275, 0.12997, 0.12275, 0.12374, 0.1258, 0.12674, 0.1382, 0.11985, 0.12902, 0.11699, 0.12694, 0.12671, 0.12528, 0.12577, 0.12335, 0.12793, 0.12913, 0.12309, 0.13132, 0.12457, 0.12253, 0.11803, 0.11645, 0.12181, 0.12507, 0.12528, 0.12214, 0.12812, 0.12471, 0.11918, 0.12456, 0.12769, 0.12304, 0.12153, 0.11907, 0.13148, 0.13103, 0.13068, 0.13318, 0.12552, 0.12933, 0.13261, 0.12839, 0.13023, 0.12205, 0.12863, 0.12765, 0.12548, 0.12592, 0.12495, 0.12574, 0.12193, 0.12065, 0.12433, 0.12257, 0.11243, 0.11188, 0.11552, 0.11773, 0.11637, 0.1131, 0.11535, 0.11323, 0.11728, 0.11383, 0.11656, 0.18458, 0.11533, 0.1158, 0.11306, 0.12884, 0.12649, 0.12032, 0.11208, 0.11803, 0.13436, 0.14069, 0.12596, 0.12808, 0.12036, 0.127, 0.12774, 0.12746, 0.13166, 0.1288, 0.11946, 0.12914, 0.12045, 0.1215, 0.117, 0.11498, 0.11583, 0.11774, 0.12264, 0.12134, 0.12257, 0.12649, 0.1233, 0.12733, 0.11514, 0.12185, 0.12051, 0.13736, 0.13171, 0.13031, 0.11491, 0.11951, 0.10565, 0.11503, 0.1165, 0.11394, 0.11312, 0.11865, 0.11953, 0.12351, 0.12231, 0.12042]}, "backward-send-forward-recv-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [3.33774, 0.00722, 0.00727, 0.01025, 0.00728, 0.00714, 0.00814, 0.00897, 0.00966, 0.00746, 0.00801, 0.00911, 0.00716, 0.01132, 0.00906, 0.00969, 0.00832, 0.01171, 0.00765, 0.00889, 0.00886, 0.01056, 0.00822, 0.01186, 0.00789, 0.00921, 0.01483, 0.01149, 0.00732, 0.00899, 0.00802, 0.00967, 0.01211, 0.00836, 0.00778, 0.0097, 0.00744, 0.00738, 0.00799, 
0.00783, 0.00895, 0.00733, 0.00808, 0.00821, 0.00953, 0.00947, 0.00803, 0.00716, 0.0083, 0.01092, 0.01169, 0.01197, 0.01099, 0.0139, 0.01319, 0.01223, 0.00743, 0.01124, 0.01269, 0.01365, 0.01106, 0.01186, 0.01247, 0.01377, 0.01372, 0.00895, 0.00817, 0.0122, 0.00886, 0.01409, 0.01218, 0.0116, 0.01184, 0.01054, 0.0083, 0.01112, 0.01398, 0.01443, 0.01304, 0.01159, 0.01508, 0.01227, 0.01243, 0.00996, 0.01336, 0.0103, 0.0121, 0.00939, 0.01351, 0.0109, 0.0119, 0.00743, 0.01152, 0.01082, 0.0077, 0.013, 0.00863, 0.01128, 0.00747, 0.10318, 0.00737, 0.01277, 0.0074, 0.00766, 0.00929, 0.00731, 0.00777, 0.00773, 0.01305, 0.01203, 0.01277, 0.01218, 0.01038, 0.01189, 0.01149, 0.01182, 0.01209, 0.0087, 0.01115, 0.0143, 0.01389, 0.01471, 0.01226, 0.01046, 0.01269, 0.01445, 0.0131, 0.01159, 0.01285, 0.01374, 0.01248, 0.01373, 0.01412, 0.01487, 0.01463, 0.0142, 0.01491, 0.01425, 0.01332, 0.01294, 0.01394, 0.01396, 0.01223, 0.01179, 0.01522, 0.01396, 0.01383, 0.01262, 0.0137, 0.01453, 0.01605, 0.01203, 0.01365, 0.01102, 0.01296, 0.01149, 0.01352, 0.0141, 0.01337, 0.01015, 0.01142, 0.01244, 0.01056, 0.01302, 0.0136, 0.01251, 0.014, 0.01398, 0.01294, 0.01334, 0.01177, 0.01235, 0.01091, 0.01036, 0.01476, 0.01084, 0.01117, 0.01139, 0.01169, 0.01222, 0.01155, 0.0115, 0.01538, 0.01662, 0.01196, 0.01265, 0.01353, 0.0155, 0.01451, 0.01302, 0.01135, 0.01115, 0.01301, 0.01401, 0.01239, 0.01337, 0.0134, 0.01449, 0.01454, 0.01499, 0.02199, 0.01511, 0.01449, 0.01437, 0.01499, 0.01473, 0.01696, 0.01373, 0.01165, 0.01224, 0.01255, 0.01026, 0.01816, 0.01732, 0.01392, 0.01205, 0.01326, 0.012, 0.0125, 0.09407, 0.01373, 0.01234, 0.01352, 0.01298, 0.01393, 0.01293, 0.01272, 0.01269, 0.00988, 0.01398, 0.01371, 0.01512, 0.00926, 0.01203, 0.00886, 0.01072, 0.01094, 0.01129, 0.01236, 0.01167, 0.01127, 0.0134, 0.01164, 0.01227, 0.01086, 0.01128, 0.01424, 0.01338, 0.01286, 0.01139, 0.0124, 0.01253, 0.01306, 0.0104, 0.01044, 0.00925, 0.01349, 0.0106, 0.01304, 0.013, 0.01652, 0.01247, 0.01259, 0.01119, 0.01241, 0.01609, 0.01301, 0.01673, 0.01245, 0.01358, 0.01293, 0.01395, 0.01222, 0.01281, 0.01194, 0.01332, 0.01097, 0.01369, 0.01398, 0.0117, 0.01357, 0.0128, 0.01277, 0.01159, 0.01226, 0.01271, 0.0131, 0.01357, 0.0123, 0.01025, 0.01114, 0.01335, 0.01274, 0.00948, 0.01342, 0.01348, 0.01171, 0.01274, 0.01313, 0.01262, 0.01167, 0.00993, 0.01158, 0.0107, 0.01309, 0.01347, 0.015, 0.01426, 0.01127, 0.01224, 0.0128, 0.01251, 0.01492, 0.01369, 0.01553, 0.01256, 0.01398, 0.01419, 0.01663, 0.01442, 0.01314, 0.01126, 0.01132, 0.01161, 0.01215, 0.01208, 0.01721, 0.01103, 0.01311, 0.00802, 0.01029, 0.01351, 0.00888, 0.01039, 0.00882, 0.00933, 0.00881, 0.00926, 0.01082, 0.01021, 0.00961, 0.01001, 0.00836, 0.00918, 0.01044, 0.01016, 0.00966, 0.00991, 0.01218, 0.07892, 0.00899, 0.01009, 0.01201, 0.00867, 0.01068, 0.01049, 0.01158, 0.01334, 0.0109, 0.01304, 0.00961, 0.01538, 0.01469, 0.01646, 0.00905, 0.01059, 0.01386, 0.01332, 0.01461, 0.01223, 0.01253, 0.0166, 0.01015, 0.01471, 0.01602, 0.01097, 0.01225, 0.01068, 0.01085, 0.01135, 0.00802, 0.00878, 0.01148, 0.01009, 0.00941, 0.00919, 0.01177, 0.00968, 0.01046, 0.00955, 0.01107, 0.00923, 0.00916, 0.00864, 0.01069, 0.01075, 0.00939, 0.01202, 0.00876, 0.01073]}, "layernorm-grads-all-reduce-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.0012, 0.00075, 0.00074, 0.00352, 0.00166, 0.00076, 0.00077, 0.00076, 0.00319, 0.00077, 0.00076, 0.00445, 0.00077, 0.00075, 0.00153, 0.00077, 0.00076, 0.00076, 0.00076, 0.00077, 0.00076, 0.00075, 0.00076, 0.00075, 0.00077, 0.00075, 0.00077, 
0.00075, 0.00077, 0.00077, 0.00075, 0.00076, 0.00076, 0.00076, 0.00076, 0.00076, 0.00077, 0.00076, 0.00076, 0.00077, 0.00078, 0.00076, 0.00077, 0.00076, 0.00076, 0.00429, 0.00076, 0.00076, 0.00076, 0.00078, 0.00077, 0.00077, 0.00078, 0.00077, 0.0008, 0.00079, 0.00079, 0.00077, 0.00078, 0.00078, 0.00079, 0.00519, 0.00079, 0.00078, 0.00077, 0.00078, 0.00079, 0.00079, 0.00079, 0.00077, 0.00079, 0.00079, 0.00079, 0.00078, 0.00078, 0.00078, 0.00077, 0.00079, 0.00079, 0.00079, 0.00078, 0.00078, 0.00077, 0.00078, 0.00077, 0.00078, 0.00078, 0.00078, 0.00078, 0.00078, 0.00083, 0.00306, 0.00078, 0.00076, 0.00078, 0.00078, 0.00078, 0.00077, 0.00078, 0.0008, 0.00079, 0.00079, 0.00077, 0.00079, 0.00078, 0.00078, 0.00081, 0.00335, 0.00078, 0.00079, 0.0008, 0.00078, 0.00079, 0.00079, 0.00078, 0.00077, 0.00079, 0.00078, 0.00079, 0.0008, 0.00078, 0.00078, 0.00078, 0.00078, 0.00077, 0.00078, 0.00078, 0.00078, 0.00078, 0.00077, 0.00077, 0.00078, 0.00078, 0.00077, 0.00077, 0.00078, 0.00077, 0.00078, 0.00077, 0.00078, 0.00077, 0.00077, 0.00077, 0.00079, 0.00078, 0.00078, 0.00077, 0.00078, 0.00078, 0.00078, 0.00078, 0.00077, 0.00077, 0.00078, 0.00078, 0.00079, 0.00086, 0.00079, 0.00078, 0.00079, 0.00078, 0.00078, 0.00078, 0.00078, 0.00078, 0.00078, 0.00078, 0.00077, 0.0008, 0.0008, 0.00079, 0.00078, 0.00079, 0.00078, 0.00078, 0.00082, 0.00081, 0.00083, 0.00078, 0.00077, 0.00079, 0.00082, 0.0008, 0.00077, 0.00076, 0.00077, 0.00078, 0.00077, 0.00078, 0.00077, 0.00077, 0.00077, 0.00078, 0.00077, 0.00077, 0.00077, 0.00077, 0.00078, 0.00078, 0.00077, 0.00077, 0.00078, 0.00082, 0.00083, 0.00078, 0.00078, 0.00078, 0.00077, 0.00078, 0.00078, 0.00077, 0.00078, 0.00079, 0.00078, 0.00452, 0.00077, 0.00078, 0.00077, 0.00077, 0.0008, 0.00078, 0.00079, 0.00079, 0.00078, 0.00223, 0.00078, 0.00077, 0.00077, 0.00079, 0.00078, 0.00078, 0.00078, 0.00295, 0.00077, 0.00077, 0.00077, 0.00077, 0.00077, 0.00076, 0.00077, 0.0042, 0.00081, 0.00079, 0.00087, 0.00078, 0.00078, 0.00078, 0.00078, 0.00076, 0.00078, 0.0008, 0.00076, 0.00079, 0.00077, 0.00078, 0.00077, 0.00077, 0.00077, 0.00078, 0.00078, 0.00077, 0.00078, 0.00077, 0.00077, 0.00078, 0.00077, 0.00078, 0.00078, 0.00078, 0.00078, 0.00076, 0.00076, 0.00077, 0.00077, 0.00077, 0.00077, 0.00078, 0.00079, 0.00085, 0.00078, 0.00078, 0.00077, 0.00079, 0.00079, 0.00079, 0.00077, 0.00078, 0.00078, 0.00078, 0.00078, 0.00078, 0.00077, 0.00078, 0.00078, 0.00077, 0.00078, 0.00077, 0.00077, 0.00077, 0.00079, 0.00079, 0.00078, 0.00078, 0.00077, 0.00078, 0.00078, 0.00078, 0.00079, 0.00077, 0.00078, 0.00078, 0.00077, 0.00077, 0.00078, 0.00077, 0.00077, 0.00079, 0.00079, 0.00077, 0.00077, 0.00078, 0.00077, 0.00078, 0.00078, 0.00078, 0.00079, 0.00078, 0.00078, 0.00077, 0.00078, 0.00078, 0.00077, 0.00079, 0.00078, 0.00077, 0.00079, 0.00078, 0.00078, 0.00077, 0.00077, 0.0008, 0.00078, 0.00078, 0.00079, 0.00077, 0.00079, 0.00077, 0.00077, 0.00077, 0.00079, 0.00078, 0.00078, 0.00078, 0.00083, 0.0009, 0.00079, 0.00082, 0.0008, 0.0008, 0.00078, 0.00077, 0.00077, 0.00078, 0.00078, 0.00079, 0.00078, 0.00077, 0.00078, 0.00078, 0.00078, 0.00078, 0.00078, 0.00077, 0.0008, 0.00079, 0.00078, 0.00078, 0.00078, 0.00078, 0.00077, 0.00078, 0.00078, 0.00078, 0.00078, 0.00078, 0.00078, 0.00077, 0.00084, 0.00077, 0.00077, 0.00077, 0.0008, 0.00078, 0.00078, 0.00077, 0.00078, 0.00153, 0.00078, 0.00078, 0.00076]}, "embedding-grads-all-reduce-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.00036, 0.00032, 0.00032, 0.00032, 0.00031, 0.00032, 0.00033, 0.00032, 0.00032, 0.00031, 0.00032, 0.00034, 
0.00032, 0.00031, 0.00037, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00031, 0.00032, 0.00031, 0.00032, 0.00032, 0.00032, 0.00031, 0.00032, 0.00034, 0.00032, 0.00033, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00031, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00031, 0.00032, 0.00032, 0.00032, 0.00032, 0.00034, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00031, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00031, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00033, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00033, 0.00032, 0.00032, 0.00032, 0.00033, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00033, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00031, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00031, 0.00032, 0.00031, 0.00033, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00033, 0.00032, 0.00032, 0.00032, 0.00031, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00031, 0.00032, 0.00032, 0.00031, 0.00033, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00031, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00031, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00031, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00031, 0.00032, 0.00031, 0.00032, 0.00032, 0.00031, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00033, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00031, 0.00032, 0.00031, 0.00032, 0.00032, 0.00032, 0.00032, 0.00031, 0.00032, 0.00032, 0.00031, 0.00031, 0.00032, 0.00032, 0.00032, 0.00031, 0.00034, 0.00032, 0.00032, 0.00032, 0.00031, 0.00032, 0.00032, 0.00031, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00031, 0.00031, 0.00032, 0.00031, 0.00031, 0.00031, 0.00032, 0.00032, 0.00032, 0.00032, 0.00031, 0.00034, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00031, 0.00034, 0.00032, 0.00032, 0.00031, 0.00032, 0.00032, 0.00031, 0.00032, 0.00032, 0.00032, 0.00031, 0.00032, 0.00032, 0.00032, 0.00031, 0.00032, 0.00031, 0.00032, 0.00031, 0.00032, 0.00032, 0.00031, 0.00032, 0.00032, 0.00032, 0.00032, 0.00031, 0.00032, 0.00032, 0.00031, 0.00032, 0.00032, 0.00031, 0.00032, 0.00031, 0.00032, 0.00031, 0.00032, 0.00032, 0.00031, 0.00032, 0.00032, 0.00031, 0.00032, 0.00031, 0.00031, 0.00032, 0.00031, 0.00031, 0.00032, 0.00032, 0.00032, 0.00032, 0.00031, 0.00032, 0.00031, 0.00032, 0.00032, 0.00031, 0.00031, 0.00032, 0.00032, 0.00032, 0.00032, 0.00033, 0.00031, 0.00032, 0.00031, 0.00031, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00031, 0.00032, 0.00032, 0.00031, 0.00032, 0.00032, 0.00032, 0.00032, 0.00031, 0.00032, 0.00032, 0.00031, 0.00032, 0.00032, 0.00032, 0.00031, 0.00031, 0.00032, 0.00032, 0.00032, 0.00031, 0.00032, 0.00032, 0.00032, 0.00031, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00033, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00033, 0.00032, 0.00032, 0.00031, 0.00032, 0.00031, 0.00032, 0.00032, 0.00031, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00031, 0.00032, 0.00032, 0.00032, 0.00031, 0.00032, 0.00031, 0.00031, 0.00031, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00031]}, "all-grads-sync-time": {"start_step": 0, "end_step": 2000, 
"step_interval": 5, "values": [2.22391, 0.00071, 0.00073, 0.0009, 0.00073, 0.00075, 0.00074, 0.00093, 0.00097, 0.00072, 0.00071, 0.00084, 0.00088, 0.00075, 0.00086, 0.00072, 0.00072, 0.00071, 0.00072, 0.00073, 0.00072, 0.00072, 0.00073, 0.00073, 0.00072, 0.00072, 0.00072, 0.00072, 0.00071, 0.0007, 0.00072, 0.00071, 0.00072, 0.00072, 0.00071, 0.00071, 0.00074, 0.00072, 0.00074, 0.00073, 0.00073, 0.00075, 0.00074, 0.00072, 0.00072, 0.00073, 0.0009, 0.00081, 0.00071, 0.00073, 0.00073, 0.00071, 0.00074, 0.00084, 0.00072, 0.00072, 0.00083, 0.00072, 0.00073, 0.00072, 0.0009, 0.00072, 0.00072, 0.00072, 0.00074, 0.00072, 0.00073, 0.00073, 0.00073, 0.00072, 0.00074, 0.00075, 0.00072, 0.00073, 0.00073, 0.00072, 0.00073, 0.00074, 0.00073, 0.00072, 0.00073, 0.00074, 0.00073, 0.00074, 0.00073, 0.00073, 0.00073, 0.00072, 0.00072, 0.00071, 0.00074, 0.00093, 0.00074, 0.00072, 0.00072, 0.00072, 0.00072, 0.00069, 0.00084, 0.00071, 0.00073, 0.00073, 0.0008, 0.00086, 0.00098, 0.00092, 0.00099, 0.00087, 0.00096, 0.00093, 0.00073, 0.00074, 0.00072, 0.00072, 0.00072, 0.00074, 0.00072, 0.00072, 0.00072, 0.00073, 0.00073, 0.00073, 0.00072, 0.00073, 0.00072, 0.00073, 0.00073, 0.00072, 0.00073, 0.00077, 0.00075, 0.00074, 0.00087, 0.00072, 0.00073, 0.00072, 0.00073, 0.00082, 0.00081, 0.00074, 0.00074, 0.00073, 0.00072, 0.00072, 0.00074, 0.00073, 0.00071, 0.00075, 0.00076, 0.00072, 0.00085, 0.00072, 0.00073, 0.00072, 0.00074, 0.00082, 0.00097, 0.00073, 0.00072, 0.00072, 0.00073, 0.00073, 0.00073, 0.00072, 0.00072, 0.00073, 0.00073, 0.00073, 0.00077, 0.00072, 0.00073, 0.00086, 0.00087, 0.00073, 0.00093, 0.00084, 0.00097, 0.00089, 0.00074, 0.00074, 0.00087, 0.00093, 0.00087, 0.00073, 0.00072, 0.00074, 0.00072, 0.00074, 0.00074, 0.00074, 0.00073, 0.00072, 0.00093, 0.00074, 0.00073, 0.00075, 0.00085, 0.00073, 0.00072, 0.00072, 0.00073, 0.00092, 0.00074, 0.00088, 0.00073, 0.00074, 0.00073, 0.00073, 0.00072, 0.00072, 0.00075, 0.00073, 0.00072, 0.00081, 0.00073, 0.00073, 0.00071, 0.00072, 0.00071, 0.00071, 0.00072, 0.00074, 0.00072, 0.00073, 0.00093, 0.00072, 0.00074, 0.00072, 0.00073, 0.00071, 0.00074, 0.00074, 0.00087, 0.00086, 0.00072, 0.00072, 0.00074, 0.00072, 0.00074, 0.00072, 0.00079, 0.00095, 0.00083, 0.00071, 0.00093, 0.00088, 0.00072, 0.00072, 0.00073, 0.00071, 0.00075, 0.00091, 0.00072, 0.00071, 0.00072, 0.00073, 0.0007, 0.00072, 0.00074, 0.00072, 0.00074, 0.00073, 0.00075, 0.00073, 0.00073, 0.00072, 0.00073, 0.00073, 0.00071, 0.00074, 0.00072, 0.00071, 0.00071, 0.00073, 0.00072, 0.00073, 0.00073, 0.00071, 0.00074, 0.00072, 0.00073, 0.00073, 0.0007, 0.00072, 0.00072, 0.00072, 0.00073, 0.00074, 0.00072, 0.00074, 0.00073, 0.00073, 0.00074, 0.0007, 0.00072, 0.00072, 0.00073, 0.00074, 0.00071, 0.00073, 0.00072, 0.00071, 0.00073, 0.00071, 0.00073, 0.00072, 0.00074, 0.00071, 0.00073, 0.00071, 0.00073, 0.00073, 0.00071, 0.0007, 0.00072, 0.00072, 0.00073, 0.00072, 0.00071, 0.00072, 0.00073, 0.00074, 0.00071, 0.00074, 0.00071, 0.00073, 0.00072, 0.00073, 0.00073, 0.00071, 0.00073, 0.00072, 0.00073, 0.00074, 0.00074, 0.00071, 0.00072, 0.00072, 0.00074, 0.00072, 0.00073, 0.00072, 0.00074, 0.00072, 0.00073, 0.00073, 0.00073, 0.00073, 0.00074, 0.00074, 0.00075, 0.00072, 0.00073, 0.00097, 0.00103, 0.00091, 0.00097, 0.00092, 0.00088, 0.00072, 0.00071, 0.00073, 0.00074, 0.00073, 0.00075, 0.0007, 0.00072, 0.00072, 0.00072, 0.00071, 0.00073, 0.00072, 0.00074, 0.00072, 0.00073, 0.00074, 0.00073, 0.00074, 0.00073, 0.00072, 0.00073, 0.00074, 0.00074, 0.00072, 0.00075, 0.0007, 0.00072, 0.00076, 0.00073, 0.00072, 0.00072, 0.00094, 
0.00082, 0.00087, 0.00071, 0.00071, 0.00096, 0.00083, 0.00089, 0.00089]}, "params-all-gather-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.00024, 0.00025, 0.00024, 0.00043, 0.00027, 0.00024, 0.00024, 0.00024, 0.00035, 0.00024, 0.00024, 0.0004, 0.00025, 0.00024, 0.0003, 0.00025, 0.00024, 0.00024, 0.00024, 0.00025, 0.00024, 0.00025, 0.00024, 0.00024, 0.00024, 0.00024, 0.00024, 0.00024, 0.00024, 0.00024, 0.00024, 0.00024, 0.00025, 0.00025, 0.00026, 0.00024, 0.00024, 0.00025, 0.00024, 0.00025, 0.00025, 0.00024, 0.00024, 0.00024, 0.00024, 0.0003, 0.00025, 0.00025, 0.00025, 0.00025, 0.00042, 0.00025, 0.00027, 0.00025, 0.00048, 0.00025, 0.00026, 0.00025, 0.00025, 0.00026, 0.00026, 0.00056, 0.00026, 0.00043, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00024, 0.00025, 0.00026, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00024, 0.00025, 0.00024, 0.00025, 0.00025, 0.00024, 0.00025, 0.00025, 0.00033, 0.00025, 0.00026, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00026, 0.00025, 0.00025, 0.00025, 0.00025, 0.00028, 0.00043, 0.00026, 0.00034, 0.0003, 0.00025, 0.0003, 0.00024, 0.00025, 0.00026, 0.00026, 0.00024, 0.00025, 0.00024, 0.00025, 0.00025, 0.00024, 0.00025, 0.00025, 0.00024, 0.00026, 0.00024, 0.00024, 0.00025, 0.00025, 0.00025, 0.00024, 0.00025, 0.00025, 0.00024, 0.00024, 0.00025, 0.00025, 0.00025, 0.00024, 0.00025, 0.00024, 0.00025, 0.00026, 0.00024, 0.00024, 0.00025, 0.00028, 0.00025, 0.00025, 0.00025, 0.00025, 0.00028, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00026, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00027, 0.00025, 0.00025, 0.00026, 0.00026, 0.00027, 0.00025, 0.00026, 0.00025, 0.00026, 0.00046, 0.00025, 0.00025, 0.00025, 0.00025, 0.00045, 0.00025, 0.00025, 0.00025, 0.00024, 0.00025, 0.00025, 0.00024, 0.00027, 0.00025, 0.00025, 0.00025, 0.00025, 0.00024, 0.00025, 0.00025, 0.00025, 0.00024, 0.00024, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00026, 0.00024, 0.00025, 0.00025, 0.00025, 0.00025, 0.00024, 0.00024, 0.00043, 0.00024, 0.00025, 0.00025, 0.00025, 0.00024, 0.00025, 0.00025, 0.00025, 0.00025, 0.00032, 0.0005, 0.00025, 0.00024, 0.0005, 0.00038, 0.00024, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00042, 0.00025, 0.0004, 0.00026, 0.00025, 0.00025, 0.00025, 0.00025, 0.00026, 0.00025, 0.00027, 0.00025, 0.00026, 0.00025, 0.00026, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00026, 0.00025, 0.00026, 0.00026, 0.00025, 0.00025, 0.00025, 0.00025, 0.00024, 0.00024, 0.00025, 0.00025, 0.00025, 0.00024, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00024, 0.00025, 0.00025, 0.00025, 0.00025, 0.00024, 0.00025, 0.00039, 0.00029, 0.00026, 0.00025, 0.00025, 0.00033, 0.00025, 0.00025, 0.00026, 0.00026, 0.00027, 0.00033, 0.00025, 0.00024, 0.00025, 0.00025, 0.00025, 0.00024, 0.00025, 0.00025, 0.00025, 0.00024, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00024, 0.00025, 0.00025, 0.00025, 0.00024, 0.00025, 0.00026, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00024, 0.00024, 0.00025, 0.00025, 0.00024, 0.00024, 0.00024, 0.00026, 0.00024, 0.00025, 0.00025, 0.00025, 0.00025, 0.00024, 0.00024, 0.00024, 0.00024, 0.00025, 0.00025, 0.00044, 0.00044, 0.00046, 0.00041, 0.00047, 0.00026, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00024, 0.00025, 0.00025, 0.00026, 0.00024, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 
0.00025, 0.00024, 0.00025, 0.00025, 0.00026, 0.00025, 0.00026, 0.00025, 0.00025, 0.00026, 0.00025, 0.00025, 0.00024, 0.00043, 0.00026, 0.00053, 0.00025, 0.00026, 0.00025, 0.00028, 0.00042, 0.00025, 0.00025]}, "optimizer-copy-to-main-grad-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.00041, 0.00039, 0.00039, 0.00041, 0.00042, 0.0004, 0.00041, 0.0004, 0.0004, 0.0004, 0.0004, 0.00054, 0.0004, 0.0004, 0.00056, 0.00042, 0.0004, 0.00041, 0.00041, 0.00041, 0.00041, 0.00041, 0.00042, 0.0004, 0.0004, 0.00041, 0.00041, 0.00041, 0.0004, 0.00041, 0.0004, 0.00041, 0.00041, 0.00041, 0.0004, 0.00041, 0.00042, 0.00041, 0.00042, 0.00041, 0.00042, 0.00042, 0.0004, 0.00041, 0.00042, 0.00042, 0.0004, 0.00041, 0.00043, 0.00041, 0.00042, 0.00041, 0.00042, 0.00042, 0.00043, 0.00042, 0.00042, 0.00042, 0.00043, 0.00042, 0.00041, 0.00041, 0.00042, 0.00042, 0.00043, 0.00042, 0.00043, 0.00042, 0.00042, 0.00041, 0.00042, 0.00042, 0.00042, 0.00041, 0.00042, 0.00042, 0.00042, 0.00043, 0.00043, 0.00043, 0.0004, 0.00041, 0.00043, 0.00042, 0.00042, 0.00043, 0.00042, 0.00043, 0.00042, 0.00042, 0.00048, 0.00042, 0.00041, 0.00042, 0.00042, 0.00042, 0.00043, 0.00044, 0.00042, 0.00042, 0.00041, 0.0004, 0.00041, 0.00041, 0.00041, 0.00041, 0.00041, 0.00042, 0.00042, 0.00038, 0.0004, 0.00043, 0.00041, 0.00043, 0.00041, 0.0004, 0.0004, 0.0004, 0.00041, 0.00042, 0.00041, 0.00042, 0.00041, 0.00041, 0.0004, 0.00041, 0.00041, 0.00041, 0.00041, 0.00041, 0.00041, 0.00041, 0.00041, 0.00043, 0.00043, 0.00041, 0.00041, 0.00041, 0.00041, 0.00042, 0.00038, 0.0004, 0.00039, 0.00041, 0.00042, 0.00043, 0.00038, 0.00038, 0.0004, 0.00042, 0.0004, 0.0004, 0.0004, 0.00041, 0.00041, 0.0004, 0.00045, 0.00041, 0.00041, 0.0004, 0.00043, 0.00042, 0.00042, 0.00042, 0.00041, 0.00042, 0.00042, 0.00042, 0.00041, 0.00041, 0.00041, 0.00041, 0.0004, 0.00041, 0.00041, 0.0004, 0.00041, 0.00041, 0.0004, 0.00041, 0.00041, 0.0004, 0.00041, 0.00041, 0.0004, 0.00041, 0.00042, 0.00041, 0.0004, 0.00041, 0.00042, 0.00041, 0.00041, 0.0004, 0.00041, 0.0004, 0.00041, 0.00043, 0.0004, 0.00042, 0.00042, 0.00043, 0.00041, 0.00041, 0.00041, 0.00041, 0.00041, 0.00043, 0.00042, 0.00041, 0.00038, 0.00042, 0.00041, 0.00041, 0.00041, 0.00042, 0.00041, 0.00041, 0.00042, 0.00041, 0.0004, 0.00041, 0.00041, 0.00041, 0.00041, 0.00041, 0.00041, 0.00042, 0.00043, 0.00042, 0.00041, 0.00041, 0.00041, 0.00041, 0.00041, 0.00043, 0.00041, 0.0004, 0.00042, 0.00041, 0.00041, 0.00042, 0.00041, 0.00041, 0.00042, 0.00042, 0.0004, 0.00041, 0.00041, 0.00041, 0.00046, 0.00043, 0.00043, 0.00042, 0.00042, 0.00042, 0.00042, 0.00043, 0.00042, 0.00041, 0.00043, 0.00043, 0.00039, 0.00043, 0.00042, 0.00042, 0.00043, 0.00042, 0.00042, 0.00042, 0.00043, 0.0004, 0.00042, 0.0004, 0.00043, 0.00041, 0.00042, 0.00042, 0.00043, 0.00041, 0.00041, 0.00041, 0.00042, 0.00042, 0.00042, 0.00041, 0.00043, 0.00042, 0.0004, 0.00043, 0.00041, 0.00042, 0.00041, 0.00041, 0.00043, 0.00042, 0.00042, 0.00043, 0.00042, 0.00042, 0.00041, 0.00041, 0.00041, 0.00041, 0.00041, 0.00042, 0.00041, 0.00042, 0.00042, 0.00042, 0.00042, 0.00042, 0.00042, 0.00043, 0.00041, 0.00042, 0.00042, 0.00043, 0.00044, 0.00043, 0.00041, 0.00041, 0.00042, 0.00042, 0.00041, 0.00043, 0.00041, 0.00042, 0.00041, 0.00042, 0.00041, 0.00039, 0.00041, 0.00042, 0.00042, 0.00041, 0.00042, 0.00042, 0.00042, 0.00042, 0.00041, 0.00041, 0.00042, 0.00041, 0.00041, 0.00043, 0.00042, 0.00042, 0.00042, 0.00041, 0.00041, 0.00042, 0.00043, 0.00041, 0.00041, 0.00041, 0.00042, 0.00043, 0.00042, 0.00042, 0.00044, 0.00043, 0.00042, 0.00041, 0.00042, 
0.00041, 0.00043, 0.00041, 0.00044, 0.0004, 0.00042, 0.00042, 0.00041, 0.00042, 0.00042, 0.00043, 0.00042, 0.00041, 0.00041, 0.00041, 0.00042, 0.00041, 0.00041, 0.00042, 0.00041, 0.0004, 0.00041, 0.00041, 0.00041, 0.00042, 0.00041, 0.0004, 0.00052, 0.00042, 0.00042, 0.00042, 0.0004, 0.00042, 0.00041, 0.00041]}, "optimizer-clip-main-grad-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.02442, 0.00047, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00046, 0.00045, 0.00045, 0.00044, 0.00046, 0.00045, 0.00046, 0.00069, 0.00045, 0.00045, 0.00046, 0.00046, 0.00045, 0.00045, 0.00046, 0.00044, 0.00045, 0.00045, 0.00045, 0.00045, 0.00046, 0.00045, 0.00044, 0.00044, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00046, 0.00045, 0.00045, 0.00045, 0.00044, 0.00045, 0.00044, 0.00045, 0.00045, 0.00045, 0.00046, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.0005, 0.00046, 0.00045, 0.00044, 0.00047, 0.00046, 0.00045, 0.00053, 0.00046, 0.00045, 0.00045, 0.00045, 0.00046, 0.00044, 0.00045, 0.00046, 0.00045, 0.00047, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00044, 0.00045, 0.00045, 0.00044, 0.00045, 0.00046, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00052, 0.00045, 0.00047, 0.00046, 0.00039, 0.00045, 0.00045, 0.00046, 0.00046, 0.00046, 0.00045, 0.00045, 0.00045, 0.00044, 0.0004, 0.00046, 0.00044, 0.0004, 0.00046, 0.00044, 0.0004, 0.0004, 0.0004, 0.00041, 0.00047, 0.00046, 0.0004, 0.00046, 0.00045, 0.00045, 0.00039, 0.00045, 0.00047, 0.00045, 0.0004, 0.00045, 0.00044, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00046, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00047, 0.00044, 0.00045, 0.00044, 0.00045, 0.00045, 0.00046, 0.00044, 0.00044, 0.00045, 0.00045, 0.00045, 0.00047, 0.00045, 0.00046, 0.00045, 0.00045, 0.00049, 0.00046, 0.00046, 0.00045, 0.00045, 0.00045, 0.00044, 0.00045, 0.00045, 0.00045, 0.00045, 0.00046, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00046, 0.00045, 0.00045, 0.00046, 0.00046, 0.00047, 0.00045, 0.00045, 0.00045, 0.00045, 0.00046, 0.00047, 0.00048, 0.00047, 0.00046, 0.00045, 0.00045, 0.00046, 0.00045, 0.00045, 0.00047, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00047, 0.00045, 0.00046, 0.00046, 0.00045, 0.00045, 0.00046, 0.00046, 0.00058, 0.00047, 0.00044, 0.00046, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00044, 0.00045, 0.00045, 0.00046, 0.00045, 0.00054, 0.00045, 0.00045, 0.00045, 0.00046, 0.00044, 0.00045, 0.00047, 0.00045, 0.00045, 0.00045, 0.00045, 0.00046, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00046, 0.00045, 0.00045, 0.00045, 0.00045, 0.00046, 0.00045, 0.00051, 0.00046, 0.00045, 0.00045, 0.00045, 0.00046, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00046, 0.00044, 0.00046, 0.00046, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00044, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00044, 0.00045, 0.00048, 0.00045, 0.00045, 0.00044, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00046, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00046, 0.00045, 0.00046, 0.00048, 0.00044, 0.00044, 0.00045, 0.00045, 0.00044, 0.00046, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00046, 0.00045, 0.00045, 0.00046, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00046, 0.00045, 0.00045, 0.00045, 0.00045, 0.00046, 0.00045, 0.00045, 0.00047, 0.00045, 0.00045, 0.00045, 0.00045, 0.00046, 0.00045, 0.00045, 0.00046, 
0.00046, 0.00046, 0.00046, 0.00045, 0.00046, 0.00047, 0.00045, 0.00057, 0.00046, 0.00045, 0.00045, 0.00045, 0.00046, 0.00047, 0.00047, 0.00045, 0.00046, 0.00045, 0.00045, 0.00044, 0.00046, 0.00046, 0.00045, 0.00045, 0.00047, 0.00047, 0.00045, 0.00045, 0.00045, 0.00046, 0.00045, 0.00046, 0.00045, 0.00046, 0.00045, 0.00045, 0.00046, 0.00045, 0.00046, 0.00045, 0.00045, 0.00046, 0.00059, 0.00045, 0.00047, 0.00045, 0.00046, 0.00045, 0.00045, 0.00045]}, "optimizer-count-zeros-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.00264, 0.00186, 0.00189, 0.00186, 0.00191, 0.00186, 0.00187, 0.00189, 0.0019, 0.00189, 0.00189, 0.002, 0.00187, 0.00201, 0.0019, 0.00186, 0.00187, 0.00185, 0.00187, 0.00187, 0.00186, 0.00186, 0.00187, 0.00186, 0.00187, 0.00189, 0.00189, 0.00185, 0.00188, 0.00186, 0.00187, 0.00188, 0.00188, 0.00186, 0.00188, 0.00187, 0.00189, 0.00185, 0.00189, 0.00189, 0.00187, 0.00186, 0.00186, 0.00189, 0.00188, 0.00186, 0.00186, 0.0019, 0.00186, 0.00187, 0.00188, 0.00186, 0.00213, 0.00189, 0.00185, 0.00186, 0.00188, 0.00189, 0.00186, 0.00185, 0.00187, 0.00186, 0.00186, 0.00186, 0.00186, 0.00186, 0.00185, 0.00186, 0.00187, 0.00186, 0.00186, 0.00189, 0.00188, 0.0019, 0.00189, 0.00187, 0.00187, 0.00188, 0.00186, 0.00187, 0.00187, 0.00188, 0.00186, 0.00186, 0.00186, 0.00185, 0.00186, 0.00186, 0.00187, 0.00186, 0.00217, 0.0019, 0.00195, 0.00188, 0.00187, 0.00188, 0.00188, 0.00186, 0.00188, 0.00186, 0.00188, 0.00188, 0.00186, 0.00187, 0.00188, 0.00185, 0.00208, 0.00187, 0.00187, 0.00186, 0.00185, 0.00185, 0.00188, 0.00185, 0.00186, 0.00186, 0.00186, 0.00186, 0.00186, 0.00186, 0.00187, 0.00185, 0.00185, 0.00188, 0.00186, 0.00185, 0.00188, 0.00186, 0.00186, 0.00184, 0.00187, 0.00186, 0.00189, 0.00186, 0.00185, 0.0019, 0.00187, 0.00186, 0.00186, 0.00186, 0.00186, 0.00186, 0.00189, 0.00187, 0.0019, 0.00186, 0.00186, 0.00187, 0.00188, 0.00185, 0.00186, 0.00186, 0.00189, 0.00186, 0.00187, 0.00187, 0.00203, 0.00186, 0.00186, 0.00188, 0.00187, 0.00186, 0.00188, 0.00184, 0.00185, 0.00186, 0.00187, 0.00185, 0.00186, 0.00187, 0.00188, 0.00198, 0.00198, 0.00186, 0.00185, 0.00187, 0.00188, 0.00186, 0.00188, 0.00185, 0.00185, 0.00187, 0.00187, 0.00186, 0.00185, 0.00185, 0.00187, 0.00186, 0.00186, 0.00187, 0.00187, 0.00185, 0.00187, 0.00187, 0.00186, 0.00185, 0.00186, 0.00187, 0.00188, 0.00191, 0.00186, 0.00188, 0.00188, 0.00187, 0.00188, 0.00187, 0.00188, 0.00186, 0.00187, 0.0019, 0.00187, 0.00187, 0.00186, 0.00187, 0.00187, 0.00186, 0.0019, 0.00188, 0.00187, 0.0019, 0.0019, 0.00191, 0.00191, 0.00186, 0.00187, 0.00188, 0.00187, 0.00186, 0.00188, 0.00188, 0.00189, 0.00189, 0.00188, 0.00188, 0.00189, 0.00189, 0.00189, 0.00186, 0.00191, 0.00189, 0.00187, 0.00186, 0.0019, 0.00188, 0.00188, 0.00187, 0.00188, 0.0019, 0.00189, 0.0019, 0.00219, 0.00189, 0.0019, 0.00187, 0.00188, 0.00187, 0.00187, 0.00188, 0.00188, 0.00187, 0.00186, 0.00189, 0.00188, 0.00188, 0.00188, 0.00188, 0.00188, 0.00189, 0.00188, 0.00216, 0.00188, 0.00189, 0.00188, 0.00189, 0.00189, 0.00189, 0.00187, 0.00187, 0.00188, 0.00188, 0.00199, 0.00187, 0.00201, 0.00189, 0.00187, 0.00191, 0.00189, 0.00187, 0.00188, 0.00188, 0.00189, 0.00246, 0.00272, 0.00189, 0.00189, 0.00189, 0.00288, 0.00189, 0.00187, 0.00189, 0.00189, 0.0019, 0.0019, 0.00188, 0.0019, 0.0019, 0.00191, 0.0019, 0.0019, 0.0019, 0.00191, 0.00191, 0.00189, 0.00189, 0.0019, 0.0019, 0.00189, 0.00188, 0.00188, 0.0019, 0.00197, 0.00187, 0.00189, 0.00188, 0.00189, 0.00187, 0.0019, 0.00187, 0.00189, 0.00188, 0.00189, 0.00188, 0.00187, 0.00187, 0.00188, 0.0019, 0.00187, 
0.00188, 0.00188, 0.00188, 0.00191, 0.00216, 0.00186, 0.00188, 0.00189, 0.00189, 0.00187, 0.00189, 0.0019, 0.00187, 0.00189, 0.00187, 0.00199, 0.00189, 0.00188, 0.00187, 0.00187, 0.00188, 0.00189, 0.00188, 0.00188, 0.00188, 0.00188, 0.00187, 0.00188, 0.00188, 0.00188, 0.00189, 0.00188, 0.00188, 0.0019, 0.00187, 0.00189, 0.00189, 0.00188, 0.00189, 0.00188, 0.00188, 0.00188, 0.00189, 0.00186, 0.00189, 0.00187, 0.00189, 0.0019, 0.0019, 0.00194, 0.00189, 0.00187, 0.00187, 0.00189, 0.00189, 0.002, 0.00187, 0.00187, 0.00189, 0.00187, 0.00188, 0.00189, 0.00195]}, "optimizer-inner-step-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.00219, 0.00036, 0.00035, 0.00037, 0.00037, 0.00039, 0.00038, 0.00037, 0.00037, 0.00038, 0.00037, 0.0004, 0.00038, 0.00038, 0.00047, 0.00037, 0.00038, 0.00038, 0.00037, 0.00037, 0.00037, 0.00039, 0.00038, 0.00037, 0.00039, 0.00037, 0.00038, 0.00038, 0.00037, 0.00037, 0.00037, 0.00038, 0.00038, 0.00038, 0.00037, 0.00037, 0.00038, 0.00039, 0.00038, 0.00038, 0.00038, 0.00039, 0.00038, 0.00038, 0.00038, 0.00038, 0.00038, 0.00038, 0.00037, 0.00038, 0.00037, 0.00038, 0.00037, 0.00039, 0.00039, 0.00038, 0.00038, 0.00038, 0.00038, 0.00038, 0.00038, 0.00037, 0.00038, 0.00038, 0.00038, 0.00039, 0.00038, 0.00038, 0.00037, 0.00038, 0.0004, 0.00039, 0.0004, 0.00038, 0.00039, 0.00039, 0.00039, 0.00039, 0.00038, 0.00038, 0.00037, 0.00038, 0.00039, 0.00039, 0.00038, 0.00038, 0.00039, 0.00039, 0.00038, 0.00038, 0.00044, 0.00039, 0.0004, 0.00039, 0.00038, 0.00038, 0.00038, 0.00038, 0.00038, 0.00039, 0.00039, 0.00038, 0.00038, 0.00039, 0.00038, 0.00039, 0.00038, 0.00038, 0.00039, 0.00037, 0.00039, 0.00038, 0.00038, 0.00038, 0.00038, 0.00038, 0.00038, 0.00038, 0.00039, 0.0004, 0.00038, 0.00038, 0.00039, 0.00039, 0.0004, 0.00039, 0.00038, 0.00038, 0.00039, 0.00039, 0.00038, 0.00038, 0.00038, 0.00039, 0.00038, 0.00039, 0.00039, 0.00038, 0.00039, 0.00039, 0.00037, 0.00039, 0.00037, 0.00038, 0.00041, 0.00037, 0.00037, 0.00038, 0.00038, 0.00038, 0.00038, 0.00039, 0.00038, 0.00038, 0.00039, 0.00038, 0.0004, 0.00038, 0.00038, 0.00038, 0.00039, 0.00039, 0.00038, 0.00038, 0.0004, 0.00038, 0.0004, 0.00038, 0.00038, 0.0004, 0.00038, 0.00038, 0.00038, 0.00038, 0.00039, 0.00038, 0.00039, 0.00038, 0.00038, 0.00038, 0.00038, 0.00039, 0.00038, 0.00038, 0.00038, 0.00038, 0.00038, 0.00038, 0.00039, 0.00038, 0.00038, 0.0004, 0.00038, 0.00038, 0.00038, 0.00038, 0.00037, 0.00038, 0.00039, 0.00038, 0.00038, 0.00039, 0.00039, 0.00038, 0.00038, 0.00038, 0.00039, 0.00038, 0.00039, 0.00038, 0.00038, 0.00039, 0.00039, 0.00038, 0.00039, 0.00038, 0.00038, 0.00038, 0.00039, 0.00038, 0.00037, 0.00038, 0.00038, 0.00037, 0.00038, 0.00038, 0.00037, 0.00037, 0.00039, 0.00038, 0.00038, 0.00037, 0.00038, 0.00038, 0.00038, 0.00038, 0.00038, 0.00038, 0.00038, 0.00039, 0.00038, 0.00038, 0.00038, 0.00039, 0.00039, 0.00038, 0.00037, 0.00037, 0.00038, 0.00038, 0.00043, 0.00037, 0.00038, 0.00038, 0.00037, 0.00038, 0.00038, 0.00038, 0.00038, 0.00037, 0.00038, 0.00037, 0.00037, 0.00038, 0.00037, 0.00039, 0.00037, 0.00037, 0.00039, 0.00038, 0.00038, 0.00038, 0.00038, 0.00039, 0.00038, 0.00038, 0.0004, 0.0004, 0.00038, 0.00039, 0.00038, 0.00038, 0.00039, 0.0004, 0.00038, 0.00038, 0.00038, 0.00039, 0.00038, 0.00038, 0.00038, 0.00039, 0.00039, 0.00038, 0.00038, 0.00038, 0.00038, 0.00038, 0.00039, 0.00038, 0.00038, 0.00039, 0.00038, 0.00037, 0.00038, 0.00039, 0.00039, 0.00038, 0.00037, 0.00038, 0.00038, 0.00038, 0.00038, 0.00038, 0.00038, 0.00038, 0.00038, 0.00039, 0.00039, 0.00038, 0.00038, 0.00038, 0.00038, 
0.00038, 0.00038, 0.00039, 0.00038, 0.00038, 0.00038, 0.00038, 0.00038, 0.00037, 0.00039, 0.00038, 0.00038, 0.00038, 0.00039, 0.0004, 0.00039, 0.00038, 0.00038, 0.00041, 0.0004, 0.00039, 0.00038, 0.0004, 0.00038, 0.00038, 0.00038, 0.00038, 0.00038, 0.00039, 0.00038, 0.00039, 0.00039, 0.00039, 0.00038, 0.00038, 0.00039, 0.00038, 0.00038, 0.0004, 0.00038, 0.00038, 0.00038, 0.00039, 0.00038, 0.00038, 0.00038, 0.00039, 0.00038, 0.00038, 0.00038, 0.00039, 0.00038, 0.00038, 0.00038, 0.00039, 0.00039, 0.00038, 0.00039, 0.00038, 0.00039, 0.00038, 0.00039, 0.00038, 0.00041, 0.00039, 0.00039, 0.00041, 0.00038, 0.00038, 0.00052, 0.00038, 0.00039, 0.00038, 0.00038, 0.00038, 0.00038, 0.00038]}, "optimizer-copy-main-to-model-params-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.00097, 0.00085, 0.00083, 0.00104, 0.00084, 0.00083, 0.00084, 0.00085, 0.00085, 0.00084, 0.00083, 0.00085, 0.00083, 0.00085, 0.00178, 0.00084, 0.00084, 0.00084, 0.00084, 0.00084, 0.00085, 0.00085, 0.00085, 0.00083, 0.00082, 0.00083, 0.00084, 0.00083, 0.00084, 0.00084, 0.00083, 0.00084, 0.00083, 0.00086, 0.00085, 0.00085, 0.00084, 0.00084, 0.00085, 0.00085, 0.00084, 0.00084, 0.00084, 0.00085, 0.00085, 0.00084, 0.00085, 0.00085, 0.00084, 0.00085, 0.00118, 0.00086, 0.00087, 0.00086, 0.00108, 0.00085, 0.00085, 0.00084, 0.00084, 0.00083, 0.00085, 0.00109, 0.00084, 0.00083, 0.00084, 0.00086, 0.00085, 0.00086, 0.00085, 0.00085, 0.00085, 0.00086, 0.00085, 0.00084, 0.00087, 0.00085, 0.00087, 0.00084, 0.00086, 0.00085, 0.00085, 0.00084, 0.00085, 0.00084, 0.00085, 0.00084, 0.00085, 0.00087, 0.00085, 0.00087, 0.00096, 0.00085, 0.00085, 0.00086, 0.00084, 0.00085, 0.00086, 0.00083, 0.00085, 0.00084, 0.00083, 0.00084, 0.00084, 0.00083, 0.00084, 0.00084, 0.00085, 0.00083, 0.00083, 0.00083, 0.00083, 0.00084, 0.00083, 0.00084, 0.00083, 0.00083, 0.00085, 0.00084, 0.00083, 0.00084, 0.00083, 0.00084, 0.00083, 0.00084, 0.00084, 0.00083, 0.00084, 0.00086, 0.00084, 0.00083, 0.00084, 0.00084, 0.00083, 0.00085, 0.00085, 0.00084, 0.00083, 0.00086, 0.00086, 0.00084, 0.00085, 0.00083, 0.00084, 0.00084, 0.00083, 0.00084, 0.00083, 0.00083, 0.00083, 0.00084, 0.00085, 0.00085, 0.00083, 0.00084, 0.00083, 0.00083, 0.00094, 0.00084, 0.00084, 0.00083, 0.00084, 0.00084, 0.00083, 0.00085, 0.00084, 0.00084, 0.00083, 0.00085, 0.00084, 0.00084, 0.00083, 0.00085, 0.00084, 0.00083, 0.00085, 0.00083, 0.00083, 0.00085, 0.00083, 0.00084, 0.00098, 0.00085, 0.00084, 0.00085, 0.00083, 0.00083, 0.00084, 0.00085, 0.00085, 0.00085, 0.00084, 0.00084, 0.00084, 0.00084, 0.00083, 0.00085, 0.00085, 0.00084, 0.00087, 0.00084, 0.00083, 0.00084, 0.00085, 0.00084, 0.00084, 0.00084, 0.00085, 0.00086, 0.00086, 0.00083, 0.00083, 0.00083, 0.00085, 0.00084, 0.00085, 0.00084, 0.00084, 0.00085, 0.00084, 0.00084, 0.00084, 0.00084, 0.00083, 0.00082, 0.00084, 0.00109, 0.00084, 0.00084, 0.00084, 0.00084, 0.00084, 0.00083, 0.00083, 0.00085, 0.00085, 0.00084, 0.00084, 0.00085, 0.00084, 0.00085, 0.00083, 0.00085, 0.00084, 0.00084, 0.00084, 0.00085, 0.00085, 0.00085, 0.00084, 0.00083, 0.00093, 0.00084, 0.00083, 0.00085, 0.00084, 0.00084, 0.00084, 0.00084, 0.00084, 0.00085, 0.00085, 0.00083, 0.00084, 0.00084, 0.00083, 0.00085, 0.00086, 0.00085, 0.00083, 0.00085, 0.00085, 0.00084, 0.00085, 0.00084, 0.00084, 0.00085, 0.00085, 0.00085, 0.00084, 0.00085, 0.00083, 0.00084, 0.00083, 0.00084, 0.00085, 0.00083, 0.00084, 0.00086, 0.00086, 0.00085, 0.00084, 0.00102, 0.00089, 0.00085, 0.00084, 0.00084, 0.00084, 0.00084, 0.00085, 0.00085, 0.00084, 0.00086, 0.00096, 0.00083, 0.00085, 
0.00084, 0.00084, 0.00085, 0.00084, 0.00084, 0.00083, 0.00085, 0.00084, 0.00085, 0.00085, 0.00083, 0.00084, 0.00085, 0.00085, 0.00084, 0.00086, 0.00084, 0.00084, 0.00083, 0.00095, 0.00084, 0.00084, 0.00086, 0.00085, 0.00084, 0.00085, 0.00084, 0.00084, 0.00086, 0.00085, 0.00085, 0.00085, 0.00084, 0.00083, 0.00087, 0.00084, 0.00093, 0.00085, 0.00084, 0.00084, 0.00085, 0.00083, 0.00083, 0.00084, 0.00083, 0.00085, 0.00086, 0.00084, 0.00113, 0.00084, 0.00083, 0.00084, 0.00103, 0.00085, 0.00084, 0.00087, 0.00084, 0.00084, 0.00084, 0.00083, 0.00084, 0.00086, 0.00084, 0.00084, 0.00082, 0.00085, 0.00085, 0.00083, 0.00084, 0.00084, 0.00084, 0.00084, 0.00085, 0.00084, 0.00084, 0.00082, 0.00085, 0.00084, 0.00083, 0.00084, 0.00085, 0.00094, 0.00085, 0.00085, 0.00086, 0.00116, 0.00084, 0.00137, 0.00084, 0.00083, 0.00084, 0.00084, 0.00104, 0.00085, 0.00083]}, "optimizer-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.03257, 0.00561, 0.00555, 0.00673, 0.00567, 0.00562, 0.00561, 0.00563, 0.00577, 0.00565, 0.00561, 0.00611, 0.00562, 0.00577, 0.00929, 0.00564, 0.00561, 0.00562, 0.0056, 0.00562, 0.0056, 0.00563, 0.00563, 0.00561, 0.00559, 0.00561, 0.00563, 0.00561, 0.00562, 0.00557, 0.0056, 0.00562, 0.00562, 0.00563, 0.00562, 0.00562, 0.00568, 0.00562, 0.00565, 0.00566, 0.00566, 0.00565, 0.0056, 0.00567, 0.00567, 0.00569, 0.00566, 0.00568, 0.00565, 0.00563, 0.00698, 0.00565, 0.00598, 0.0057, 0.00701, 0.00568, 0.00567, 0.00565, 0.00567, 0.00568, 0.00563, 0.00767, 0.00563, 0.00608, 0.00566, 0.00565, 0.00568, 0.00565, 0.00565, 0.00567, 0.00566, 0.00571, 0.00568, 0.00567, 0.00567, 0.00565, 0.00569, 0.00575, 0.00565, 0.00565, 0.00562, 0.00577, 0.00568, 0.00567, 0.00563, 0.00564, 0.00565, 0.0057, 0.00565, 0.00567, 0.00638, 0.00578, 0.00578, 0.00572, 0.0056, 0.00567, 0.00571, 0.00565, 0.00565, 0.00567, 0.00563, 0.00563, 0.00563, 0.00563, 0.00562, 0.00635, 0.00583, 0.00568, 0.00584, 0.00555, 0.00577, 0.00559, 0.0056, 0.00558, 0.00584, 0.00561, 0.00557, 0.00564, 0.00562, 0.00566, 0.00555, 0.00562, 0.00565, 0.00566, 0.00559, 0.0056, 0.00561, 0.00566, 0.00564, 0.00561, 0.00563, 0.00564, 0.00564, 0.00565, 0.00564, 0.00568, 0.00564, 0.00565, 0.00566, 0.00568, 0.00554, 0.00562, 0.00556, 0.00562, 0.0057, 0.00565, 0.00583, 0.00554, 0.00562, 0.00561, 0.00564, 0.00571, 0.00563, 0.00563, 0.00565, 0.0056, 0.00607, 0.00565, 0.00564, 0.00564, 0.00565, 0.00565, 0.00563, 0.00564, 0.00563, 0.00566, 0.00564, 0.00565, 0.00565, 0.00567, 0.00565, 0.00576, 0.00575, 0.00563, 0.00566, 0.00658, 0.00565, 0.00564, 0.00568, 0.00562, 0.00663, 0.00565, 0.00564, 0.00564, 0.00562, 0.00563, 0.00568, 0.00566, 0.00565, 0.00564, 0.00565, 0.00563, 0.00565, 0.00561, 0.00564, 0.00563, 0.00562, 0.00564, 0.00568, 0.00568, 0.00567, 0.00567, 0.00569, 0.00566, 0.0056, 0.00564, 0.00567, 0.00567, 0.00586, 0.00568, 0.00555, 0.00567, 0.00562, 0.00558, 0.00585, 0.00563, 0.00566, 0.00565, 0.00565, 0.00566, 0.00559, 0.00566, 0.00566, 0.00561, 0.00573, 0.00721, 0.00562, 0.00564, 0.00593, 0.00595, 0.00563, 0.00564, 0.00566, 0.00567, 0.00565, 0.00569, 0.00564, 0.00566, 0.00568, 0.00566, 0.00578, 0.00588, 0.0064, 0.00571, 0.00566, 0.00564, 0.00565, 0.00567, 0.00566, 0.00564, 0.00643, 0.00566, 0.00567, 0.00564, 0.00601, 0.00563, 0.00566, 0.00566, 0.00566, 0.00563, 0.00566, 0.00565, 0.00557, 0.00567, 0.00564, 0.00566, 0.00565, 0.00566, 0.00564, 0.00596, 0.00567, 0.00562, 0.00565, 0.00566, 0.00564, 0.00564, 0.00569, 0.00568, 0.00569, 0.00569, 0.00575, 0.00567, 0.00583, 0.00568, 0.00566, 0.00566, 0.00567, 0.00566, 0.00567, 0.00566, 0.00564, 
0.00689, 0.00665, 0.00563, 0.00566, 0.00566, 0.00685, 0.00566, 0.00565, 0.00567, 0.00567, 0.00574, 0.00611, 0.00563, 0.00565, 0.00569, 0.00568, 0.00568, 0.00568, 0.0057, 0.00566, 0.00569, 0.00567, 0.0057, 0.00566, 0.00569, 0.00564, 0.00565, 0.00568, 0.00569, 0.00571, 0.00564, 0.00566, 0.00565, 0.0058, 0.00566, 0.00565, 0.00564, 0.00566, 0.00566, 0.00567, 0.00556, 0.00565, 0.00568, 0.00564, 0.00567, 0.00566, 0.00566, 0.00566, 0.00566, 0.00565, 0.00622, 0.00564, 0.00563, 0.00565, 0.0058, 0.00565, 0.00563, 0.00567, 0.00564, 0.00566, 0.00569, 0.00579, 0.0071, 0.00625, 0.00661, 0.00596, 0.00708, 0.00571, 0.00566, 0.00572, 0.0057, 0.00565, 0.00566, 0.00568, 0.00566, 0.00569, 0.00565, 0.00568, 0.00558, 0.00572, 0.00566, 0.00564, 0.00571, 0.00569, 0.00569, 0.00567, 0.00567, 0.00564, 0.00569, 0.00563, 0.0057, 0.00565, 0.00567, 0.00569, 0.00565, 0.00602, 0.00567, 0.00566, 0.00568, 0.00691, 0.00568, 0.00824, 0.00567, 0.00569, 0.00565, 0.00566, 0.00689, 0.00567, 0.00569]}, "learning-rate": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 
8e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05]}, "learning-rate vs samples": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05]}, "batch-size": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 
128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0]}, "batch-size vs samples": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 
128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0]}, "lm loss": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [10.84303, 10.86032, 10.84988, 10.84755, 10.76639, 10.77411, 10.67857, 10.53004, 10.38397, 10.29666, 9.92036, 10.03609, 10.04286, 9.75368, 9.87024, 9.57458, 9.50956, 9.70645, 9.43156, 9.37511, 9.284, 9.18283, 9.20684, 9.02346, 9.21677, 9.08417, 9.17277, 9.18323, 9.31569, 9.00474, 8.94547, 9.06044, 9.05792, 8.66708, 8.73014, 8.76017, 8.69512, 8.74237, 8.66438, 8.77103, 8.66577, 8.85394, 8.83642, 8.49824, 8.38764, 8.42876, 8.48638, 8.38112, 8.42721, 8.57916, 8.36213, 8.18555, 8.21868, 8.21376, 8.25912, 7.90597, 8.08558, 7.88018, 8.23297, 8.21565, 7.99013, 7.95413, 7.90374, 7.72213, 7.72557, 7.62784, 7.49843, 7.88783, 7.68211, 7.43256, 7.72606, 7.75519, 7.5254, 7.28466, 7.43748, 7.32478, 7.44941, 7.21198, 7.61949, 7.26498, 7.33394, 7.19595, 7.19608, 7.40347, 7.15606, 7.26585, 6.98127, 6.98967, 7.02701, 7.12404, 6.81114, 6.9732, 7.07844, 6.98715, 6.86379, 6.74535, 6.97969, 7.04992, 6.69473, 6.57332, 6.71755, 6.73627, 6.72482, 6.72951, 6.64965, 6.39869, 6.62934, 6.6128, 6.44062, 6.62092, 6.73782, 6.60642, 6.72099, 6.69098, 6.62325, 6.50501, 6.59411, 6.40344, 6.66286, 6.24475, 6.24827, 6.29959, 6.38833, 6.34649, 6.44604, 6.28662, 6.33306, 6.23143, 6.1945, 6.39075, 6.31833, 6.31606, 6.15661, 6.15059, 6.23078, 6.37677, 6.19418, 6.14556, 6.174, 6.10964, 6.05825, 6.06794, 6.25281, 6.40554, 6.25551, 6.29757, 6.09544, 6.1725, 6.00218, 6.02712, 5.95524, 6.25067, 6.1861, 5.96596, 5.78395, 6.12333, 5.84793, 6.10088, 5.78605, 6.16305, 6.14324, 6.08193, 5.9272, 6.11128, 5.94147, 6.19288, 5.88909, 5.78652, 5.77759, 5.68182, 6.00901, 5.99171, 6.064, 5.887, 6.03556, 5.96156, 5.98678, 5.98309, 5.94332, 5.83241, 5.94309, 5.60951, 5.69435, 5.88169, 5.83567, 5.85447, 5.75902, 
5.83004, 5.71739, 5.55081, 5.71567, 5.61507, 5.82158, 5.59427, 5.70169, 5.70024, 5.89399, 5.63586, 5.84189, 5.73395, 5.86128, 5.31906, 5.89065, 5.8668, 5.84568, 5.40705, 5.40162, 5.61805, 5.58944, 5.47887, 5.57169, 5.66894, 5.46961, 5.737, 5.50292, 5.58399, 5.61697, 5.61602, 5.50714, 5.6077, 5.6651, 5.67541, 5.58049, 5.65548, 5.36443, 5.67256, 5.62445, 5.41886, 5.57712, 5.62171, 5.55213, 5.34421, 5.53498, 5.48095, 5.4778, 5.37859, 5.55337, 5.60077, 5.38946, 5.5161, 5.4845, 5.3308, 5.503, 5.40661, 5.44202, 5.3156, 5.06608, 5.47488, 5.56633, 5.71203, 5.41237, 5.602, 5.6336, 5.23514, 5.26957, 5.38908, 5.39646, 5.32832, 5.49536, 5.18302, 5.2973, 5.24699, 5.3738, 5.2533, 5.4419, 5.53407, 5.31248, 5.43315, 5.33688, 5.07446, 5.3117, 5.25312, 5.30184, 5.11129, 5.27552, 5.26324, 5.47224, 5.15822, 5.26777, 5.21213, 5.35617, 4.98409, 4.9122, 5.32204, 5.39135, 5.22909, 5.3223, 5.10207, 5.16342, 5.26324, 5.06816, 5.26642, 5.06638, 5.34472, 5.24739, 5.15433, 5.24748, 5.04399, 5.32024, 5.05488, 5.02871, 5.1457, 5.11299, 5.27264, 5.15675, 5.28106, 5.09695, 5.09458, 5.25141, 5.32789, 5.25804, 5.19731, 5.14154, 5.29133, 4.95279, 5.2099, 5.09154, 5.30528, 5.17547, 5.19246, 5.11436, 4.986, 4.99619, 5.22741, 5.31255, 5.10417, 5.06172, 4.91443, 5.12691, 5.1217, 4.93205, 5.34318, 5.02802, 5.10574, 5.17142, 5.00778, 5.07028, 5.0728, 4.99912, 5.08403, 5.16803, 4.98253, 5.18553, 4.93609, 4.93034, 5.06451, 5.00328, 4.9143, 4.78254, 4.9515, 5.1248, 5.02128, 5.01937, 5.34246, 4.96515, 4.99654, 5.05289, 4.816, 4.74072, 4.99878, 5.04752, 4.87941, 4.96151, 5.05319, 5.02704, 4.8254, 4.8992, 4.91046, 4.83957, 4.74493, 5.01861, 4.76013, 5.21014, 4.79858, 5.00113, 4.74548, 4.79219, 4.82659, 4.65777, 4.66208, 4.84897, 4.81474, 4.80913, 4.92799, 4.89236, 4.93339, 4.77993, 4.89168, 4.7432, 4.92229, 4.96619, 4.88011, 4.71273, 4.7931, 4.91139, 4.72229, 4.87421, 4.70468, 4.69956, 4.65227]}, "lm loss vs samples": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [10.84303, 10.86032, 10.84988, 10.84755, 10.76639, 10.77411, 10.67857, 10.53004, 10.38397, 10.29666, 9.92036, 10.03609, 10.04286, 9.75368, 9.87024, 9.57458, 9.50956, 9.70645, 9.43156, 9.37511, 9.284, 9.18283, 9.20684, 9.02346, 9.21677, 9.08417, 9.17277, 9.18323, 9.31569, 9.00474, 8.94547, 9.06044, 9.05792, 8.66708, 8.73014, 8.76017, 8.69512, 8.74237, 8.66438, 8.77103, 8.66577, 8.85394, 8.83642, 8.49824, 8.38764, 8.42876, 8.48638, 8.38112, 8.42721, 8.57916, 8.36213, 8.18555, 8.21868, 8.21376, 8.25912, 7.90597, 8.08558, 7.88018, 8.23297, 8.21565, 7.99013, 7.95413, 7.90374, 7.72213, 7.72557, 7.62784, 7.49843, 7.88783, 7.68211, 7.43256, 7.72606, 7.75519, 7.5254, 7.28466, 7.43748, 7.32478, 7.44941, 7.21198, 7.61949, 7.26498, 7.33394, 7.19595, 7.19608, 7.40347, 7.15606, 7.26585, 6.98127, 6.98967, 7.02701, 7.12404, 6.81114, 6.9732, 7.07844, 6.98715, 6.86379, 6.74535, 6.97969, 7.04992, 6.69473, 6.57332, 6.71755, 6.73627, 6.72482, 6.72951, 6.64965, 6.39869, 6.62934, 6.6128, 6.44062, 6.62092, 6.73782, 6.60642, 6.72099, 6.69098, 6.62325, 6.50501, 6.59411, 6.40344, 6.66286, 6.24475, 6.24827, 6.29959, 6.38833, 6.34649, 6.44604, 6.28662, 6.33306, 6.23143, 6.1945, 6.39075, 6.31833, 6.31606, 6.15661, 6.15059, 6.23078, 6.37677, 6.19418, 6.14556, 6.174, 6.10964, 6.05825, 6.06794, 6.25281, 6.40554, 6.25551, 6.29757, 6.09544, 6.1725, 6.00218, 6.02712, 5.95524, 6.25067, 6.1861, 5.96596, 5.78395, 6.12333, 5.84793, 6.10088, 5.78605, 6.16305, 6.14324, 6.08193, 5.9272, 6.11128, 5.94147, 6.19288, 5.88909, 5.78652, 5.77759, 5.68182, 6.00901, 5.99171, 6.064, 5.887, 6.03556, 5.96156, 
5.98678, 5.98309, 5.94332, 5.83241, 5.94309, 5.60951, 5.69435, 5.88169, 5.83567, 5.85447, 5.75902, 5.83004, 5.71739, 5.55081, 5.71567, 5.61507, 5.82158, 5.59427, 5.70169, 5.70024, 5.89399, 5.63586, 5.84189, 5.73395, 5.86128, 5.31906, 5.89065, 5.8668, 5.84568, 5.40705, 5.40162, 5.61805, 5.58944, 5.47887, 5.57169, 5.66894, 5.46961, 5.737, 5.50292, 5.58399, 5.61697, 5.61602, 5.50714, 5.6077, 5.6651, 5.67541, 5.58049, 5.65548, 5.36443, 5.67256, 5.62445, 5.41886, 5.57712, 5.62171, 5.55213, 5.34421, 5.53498, 5.48095, 5.4778, 5.37859, 5.55337, 5.60077, 5.38946, 5.5161, 5.4845, 5.3308, 5.503, 5.40661, 5.44202, 5.3156, 5.06608, 5.47488, 5.56633, 5.71203, 5.41237, 5.602, 5.6336, 5.23514, 5.26957, 5.38908, 5.39646, 5.32832, 5.49536, 5.18302, 5.2973, 5.24699, 5.3738, 5.2533, 5.4419, 5.53407, 5.31248, 5.43315, 5.33688, 5.07446, 5.3117, 5.25312, 5.30184, 5.11129, 5.27552, 5.26324, 5.47224, 5.15822, 5.26777, 5.21213, 5.35617, 4.98409, 4.9122, 5.32204, 5.39135, 5.22909, 5.3223, 5.10207, 5.16342, 5.26324, 5.06816, 5.26642, 5.06638, 5.34472, 5.24739, 5.15433, 5.24748, 5.04399, 5.32024, 5.05488, 5.02871, 5.1457, 5.11299, 5.27264, 5.15675, 5.28106, 5.09695, 5.09458, 5.25141, 5.32789, 5.25804, 5.19731, 5.14154, 5.29133, 4.95279, 5.2099, 5.09154, 5.30528, 5.17547, 5.19246, 5.11436, 4.986, 4.99619, 5.22741, 5.31255, 5.10417, 5.06172, 4.91443, 5.12691, 5.1217, 4.93205, 5.34318, 5.02802, 5.10574, 5.17142, 5.00778, 5.07028, 5.0728, 4.99912, 5.08403, 5.16803, 4.98253, 5.18553, 4.93609, 4.93034, 5.06451, 5.00328, 4.9143, 4.78254, 4.9515, 5.1248, 5.02128, 5.01937, 5.34246, 4.96515, 4.99654, 5.05289, 4.816, 4.74072, 4.99878, 5.04752, 4.87941, 4.96151, 5.05319, 5.02704, 4.8254, 4.8992, 4.91046, 4.83957, 4.74493, 5.01861, 4.76013, 5.21014, 4.79858, 5.00113, 4.74548, 4.79219, 4.82659, 4.65777, 4.66208, 4.84897, 4.81474, 4.80913, 4.92799, 4.89236, 4.93339, 4.77993, 4.89168, 4.7432, 4.92229, 4.96619, 4.88011, 4.71273, 4.7931, 4.91139, 4.72229, 4.87421, 4.70468, 4.69956, 4.65227]}, "loss-scale": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 
1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]}, "loss-scale vs samples": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]}, "grad-norm": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [13.92196, 13.64105, 14.19575, 13.10329, 13.56093, 11.06924, 10.32704, 12.58903, 11.89406, 9.6749, 7.04626, 4.0336, 3.15187, 2.82418, 2.35804, 2.43442, 2.16004, 1.97461, 2.14035, 2.12249, 2.20138, 2.2657, 2.05671, 2.22896, 1.95829, 2.02503, 1.88632, 1.84693, 1.87101, 2.18322, 2.10962, 1.97689, 1.94956, 2.15482, 2.33059, 2.0713, 2.06596, 1.83468, 1.98146, 1.78906, 2.08095, 1.74031, 1.73584, 1.83223, 1.93635, 1.78517, 1.74533, 1.74989, 1.72773, 1.51419, 1.74951, 1.76214, 1.76755, 1.83739, 1.54724, 1.80208, 1.67454, 1.80868, 1.51645, 1.42949, 1.65422, 1.43167, 1.74384, 1.82674, 1.56795, 1.61973, 1.62231, 1.51322, 1.4269, 1.55439, 1.3649, 1.40671, 1.47679, 1.40979, 1.35488, 1.43798, 1.41114, 1.34745, 1.32431, 1.23395, 1.36576, 1.22914, 1.25372, 1.35028, 1.23455, 1.29297, 1.37717, 1.26373, 1.37004, 1.08995, 1.10379, 1.10875, 1.15108, 1.26523, 0.89985, 1.39001, 
1.10735, 1.30884, 1.00577, 1.31705, 1.15922, 1.16049, 1.08293, 1.30514, 0.98385, 1.11074, 1.1592, 0.9745, 1.26156, 1.13226, 0.98984, 0.97441, 0.96023, 0.94898, 1.04337, 1.04095, 0.96044, 1.19634, 1.26146, 1.4137, 0.97849, 1.01274, 1.06643, 1.01496, 0.94459, 1.13752, 1.02579, 1.05074, 1.22247, 1.26548, 1.04774, 1.44863, 1.15549, 1.15597, 1.19734, 1.2287, 1.25743, 1.88802, 1.76897, 1.48112, 1.4651, 1.39709, 1.38654, 1.09404, 1.62425, 1.69258, 1.31425, 1.11912, 1.16099, 1.18343, 1.29282, 1.58176, 1.59702, 1.35711, 1.25116, 1.93028, 1.26411, 1.16234, 1.73045, 1.37516, 1.21056, 1.1698, 1.36362, 1.31019, 1.41174, 1.1141, 1.35444, 1.27655, 1.56101, 1.26438, 1.09582, 1.27416, 1.41508, 1.54422, 1.36323, 1.24407, 1.29014, 1.18935, 1.13176, 1.03122, 1.33001, 1.37077, 1.14753, 1.11258, 1.66325, 1.11887, 1.76805, 1.40233, 1.37783, 1.50291, 1.27142, 1.30216, 1.29887, 1.46138, 1.55382, 1.23876, 1.8076, 1.40113, 1.63396, 1.55057, 1.08699, 1.24471, 1.22211, 1.14251, 1.26485, 1.45246, 1.55789, 1.71804, 1.37054, 1.61527, 1.57346, 1.43675, 1.26103, 1.17063, 1.56904, 1.17977, 1.4408, 1.72049, 1.50941, 1.30391, 1.34373, 1.32377, 1.27909, 1.56247, 1.31671, 1.38601, 1.61151, 1.49478, 1.75857, 1.27914, 1.31454, 2.08285, 1.65152, 1.54337, 1.46369, 1.68505, 1.74708, 1.34813, 1.53151, 1.36655, 1.5068, 1.33926, 1.42092, 1.39573, 1.3088, 1.90711, 1.46652, 1.29613, 1.44842, 1.30354, 1.28453, 1.49548, 1.47812, 1.39914, 1.32083, 1.19715, 1.79989, 1.43253, 1.35222, 1.42532, 1.23793, 1.41904, 1.21814, 1.25683, 1.2335, 1.46238, 1.48727, 1.4808, 1.33354, 1.33662, 1.26457, 1.31807, 1.46217, 1.35853, 1.55295, 1.20988, 1.50233, 1.51611, 1.48328, 1.32591, 1.35903, 1.25739, 1.45462, 1.40772, 1.52784, 1.49325, 1.48176, 1.41498, 1.37099, 1.4565, 1.35995, 1.85538, 1.22436, 1.50223, 1.62834, 2.02006, 1.60123, 1.72187, 1.44841, 1.22003, 1.2907, 1.31733, 1.13053, 1.33575, 1.57284, 1.47894, 1.41277, 1.40064, 1.30099, 1.35607, 1.52515, 1.48522, 1.31187, 1.24496, 1.36995, 1.60389, 1.24009, 1.55027, 1.2329, 1.34795, 1.32343, 1.38946, 1.27338, 1.46297, 1.50613, 1.56272, 1.67908, 1.41893, 1.40655, 1.34016, 1.79612, 1.52344, 1.31538, 1.82889, 1.5317, 1.18989, 1.44241, 1.33335, 1.49631, 1.45109, 1.41567, 1.28181, 1.28831, 1.39113, 1.42151, 1.1475, 1.49249, 1.42727, 1.4635, 1.13088, 1.41, 1.30719, 1.30003, 1.92172, 1.44667, 1.42061, 1.31137, 1.5365, 1.46596, 1.30019, 1.53226, 1.21709, 1.36071, 1.47588, 1.10067, 1.46261, 1.69979, 1.33386, 1.3067, 1.50275, 1.48945, 1.4021, 1.56615, 1.59437, 1.41693, 1.52987, 1.27517, 1.55287, 1.38137, 1.28009, 1.33198, 1.29291, 1.40497, 1.25603, 1.18811, 1.37138, 1.43758, 1.46419, 1.4718, 1.35085, 1.22463, 1.2576, 1.44724, 1.32087, 1.61352, 1.4648, 1.47154, 1.80709, 1.41366, 1.12723]}, "grad-norm vs samples": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [13.92196, 13.64105, 14.19575, 13.10329, 13.56093, 11.06924, 10.32704, 12.58903, 11.89406, 9.6749, 7.04626, 4.0336, 3.15187, 2.82418, 2.35804, 2.43442, 2.16004, 1.97461, 2.14035, 2.12249, 2.20138, 2.2657, 2.05671, 2.22896, 1.95829, 2.02503, 1.88632, 1.84693, 1.87101, 2.18322, 2.10962, 1.97689, 1.94956, 2.15482, 2.33059, 2.0713, 2.06596, 1.83468, 1.98146, 1.78906, 2.08095, 1.74031, 1.73584, 1.83223, 1.93635, 1.78517, 1.74533, 1.74989, 1.72773, 1.51419, 1.74951, 1.76214, 1.76755, 1.83739, 1.54724, 1.80208, 1.67454, 1.80868, 1.51645, 1.42949, 1.65422, 1.43167, 1.74384, 1.82674, 1.56795, 1.61973, 1.62231, 1.51322, 1.4269, 1.55439, 1.3649, 1.40671, 1.47679, 1.40979, 1.35488, 1.43798, 1.41114, 1.34745, 1.32431, 1.23395, 1.36576, 1.22914, 1.25372, 
1.35028, 1.23455, 1.29297, 1.37717, 1.26373, 1.37004, 1.08995, 1.10379, 1.10875, 1.15108, 1.26523, 0.89985, 1.39001, 1.10735, 1.30884, 1.00577, 1.31705, 1.15922, 1.16049, 1.08293, 1.30514, 0.98385, 1.11074, 1.1592, 0.9745, 1.26156, 1.13226, 0.98984, 0.97441, 0.96023, 0.94898, 1.04337, 1.04095, 0.96044, 1.19634, 1.26146, 1.4137, 0.97849, 1.01274, 1.06643, 1.01496, 0.94459, 1.13752, 1.02579, 1.05074, 1.22247, 1.26548, 1.04774, 1.44863, 1.15549, 1.15597, 1.19734, 1.2287, 1.25743, 1.88802, 1.76897, 1.48112, 1.4651, 1.39709, 1.38654, 1.09404, 1.62425, 1.69258, 1.31425, 1.11912, 1.16099, 1.18343, 1.29282, 1.58176, 1.59702, 1.35711, 1.25116, 1.93028, 1.26411, 1.16234, 1.73045, 1.37516, 1.21056, 1.1698, 1.36362, 1.31019, 1.41174, 1.1141, 1.35444, 1.27655, 1.56101, 1.26438, 1.09582, 1.27416, 1.41508, 1.54422, 1.36323, 1.24407, 1.29014, 1.18935, 1.13176, 1.03122, 1.33001, 1.37077, 1.14753, 1.11258, 1.66325, 1.11887, 1.76805, 1.40233, 1.37783, 1.50291, 1.27142, 1.30216, 1.29887, 1.46138, 1.55382, 1.23876, 1.8076, 1.40113, 1.63396, 1.55057, 1.08699, 1.24471, 1.22211, 1.14251, 1.26485, 1.45246, 1.55789, 1.71804, 1.37054, 1.61527, 1.57346, 1.43675, 1.26103, 1.17063, 1.56904, 1.17977, 1.4408, 1.72049, 1.50941, 1.30391, 1.34373, 1.32377, 1.27909, 1.56247, 1.31671, 1.38601, 1.61151, 1.49478, 1.75857, 1.27914, 1.31454, 2.08285, 1.65152, 1.54337, 1.46369, 1.68505, 1.74708, 1.34813, 1.53151, 1.36655, 1.5068, 1.33926, 1.42092, 1.39573, 1.3088, 1.90711, 1.46652, 1.29613, 1.44842, 1.30354, 1.28453, 1.49548, 1.47812, 1.39914, 1.32083, 1.19715, 1.79989, 1.43253, 1.35222, 1.42532, 1.23793, 1.41904, 1.21814, 1.25683, 1.2335, 1.46238, 1.48727, 1.4808, 1.33354, 1.33662, 1.26457, 1.31807, 1.46217, 1.35853, 1.55295, 1.20988, 1.50233, 1.51611, 1.48328, 1.32591, 1.35903, 1.25739, 1.45462, 1.40772, 1.52784, 1.49325, 1.48176, 1.41498, 1.37099, 1.4565, 1.35995, 1.85538, 1.22436, 1.50223, 1.62834, 2.02006, 1.60123, 1.72187, 1.44841, 1.22003, 1.2907, 1.31733, 1.13053, 1.33575, 1.57284, 1.47894, 1.41277, 1.40064, 1.30099, 1.35607, 1.52515, 1.48522, 1.31187, 1.24496, 1.36995, 1.60389, 1.24009, 1.55027, 1.2329, 1.34795, 1.32343, 1.38946, 1.27338, 1.46297, 1.50613, 1.56272, 1.67908, 1.41893, 1.40655, 1.34016, 1.79612, 1.52344, 1.31538, 1.82889, 1.5317, 1.18989, 1.44241, 1.33335, 1.49631, 1.45109, 1.41567, 1.28181, 1.28831, 1.39113, 1.42151, 1.1475, 1.49249, 1.42727, 1.4635, 1.13088, 1.41, 1.30719, 1.30003, 1.92172, 1.44667, 1.42061, 1.31137, 1.5365, 1.46596, 1.30019, 1.53226, 1.21709, 1.36071, 1.47588, 1.10067, 1.46261, 1.69979, 1.33386, 1.3067, 1.50275, 1.48945, 1.4021, 1.56615, 1.59437, 1.41693, 1.52987, 1.27517, 1.55287, 1.38137, 1.28009, 1.33198, 1.29291, 1.40497, 1.25603, 1.18811, 1.37138, 1.43758, 1.46419, 1.4718, 1.35085, 1.22463, 1.2576, 1.44724, 1.32087, 1.61352, 1.4648, 1.47154, 1.80709, 1.41366, 1.12723]}, "num-zeros": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [65.0, 71.0, 74.0, 78.0, 68.0, 65.0, 79.0, 104.0, 95.0, 118.0, 116.0, 161.0, 141.0, 148.0, 182.0, 146.0, 164.0, 199.0, 174.0, 205.0, 166.0, 167.0, 186.0, 158.0, 195.0, 179.0, 188.0, 208.0, 187.0, 145.0, 145.0, 146.0, 156.0, 175.0, 132.0, 180.0, 177.0, 205.0, 172.0, 159.0, 158.0, 175.0, 153.0, 203.0, 196.0, 170.0, 185.0, 179.0, 140.0, 227.0, 198.0, 165.0, 172.0, 149.0, 199.0, 213.0, 179.0, 157.0, 255.0, 240.0, 186.0, 191.0, 164.0, 186.0, 208.0, 229.0, 213.0, 198.0, 198.0, 178.0, 246.0, 222.0, 177.0, 236.0, 193.0, 215.0, 226.0, 205.0, 251.0, 226.0, 224.0, 245.0, 219.0, 205.0, 198.0, 190.0, 171.0, 191.0, 171.0, 187.0, 182.0, 207.0, 233.0, 
201.0, 220.0, 152.0, 216.0, 194.0, 175.0, 157.0, 165.0, 188.0, 163.0, 163.0, 160.0, 155.0, 160.0, 167.0, 144.0, 190.0, 194.0, 143.0, 153.0, 175.0, 158.0, 147.0, 166.0, 115.0, 142.0, 141.0, 117.0, 131.0, 132.0, 130.0, 164.0, 131.0, 136.0, 129.0, 150.0, 146.0, 133.0, 96.0, 139.0, 119.0, 108.0, 124.0, 109.0, 114.0, 113.0, 123.0, 125.0, 129.0, 99.0, 159.0, 109.0, 115.0, 127.0, 128.0, 101.0, 122.0, 118.0, 113.0, 110.0, 107.0, 112.0, 89.0, 107.0, 118.0, 89.0, 101.0, 127.0, 125.0, 111.0, 110.0, 121.0, 125.0, 111.0, 123.0, 109.0, 116.0, 118.0, 107.0, 87.0, 105.0, 121.0, 111.0, 127.0, 128.0, 116.0, 128.0, 116.0, 112.0, 135.0, 122.0, 106.0, 97.0, 100.0, 121.0, 94.0, 117.0, 124.0, 93.0, 116.0, 99.0, 114.0, 107.0, 96.0, 105.0, 102.0, 84.0, 138.0, 100.0, 100.0, 115.0, 133.0, 101.0, 99.0, 105.0, 116.0, 109.0, 100.0, 109.0, 120.0, 131.0, 107.0, 110.0, 111.0, 98.0, 118.0, 97.0, 122.0, 115.0, 121.0, 114.0, 91.0, 86.0, 116.0, 85.0, 79.0, 99.0, 97.0, 89.0, 103.0, 78.0, 108.0, 107.0, 78.0, 101.0, 99.0, 96.0, 119.0, 87.0, 98.0, 113.0, 112.0, 101.0, 78.0, 125.0, 101.0, 102.0, 137.0, 85.0, 97.0, 96.0, 119.0, 119.0, 93.0, 84.0, 94.0, 91.0, 132.0, 108.0, 113.0, 98.0, 127.0, 102.0, 88.0, 93.0, 124.0, 102.0, 99.0, 97.0, 99.0, 85.0, 103.0, 94.0, 108.0, 116.0, 103.0, 114.0, 105.0, 123.0, 122.0, 94.0, 104.0, 101.0, 103.0, 109.0, 115.0, 117.0, 125.0, 81.0, 115.0, 112.0, 116.0, 100.0, 108.0, 105.0, 97.0, 101.0, 105.0, 98.0, 124.0, 98.0, 101.0, 103.0, 123.0, 124.0, 122.0, 115.0, 102.0, 115.0, 116.0, 122.0, 111.0, 88.0, 99.0, 95.0, 112.0, 122.0, 131.0, 110.0, 112.0, 96.0, 108.0, 100.0, 103.0, 106.0, 119.0, 104.0, 102.0, 97.0, 125.0, 93.0, 117.0, 133.0, 112.0, 137.0, 110.0, 104.0, 120.0, 115.0, 111.0, 118.0, 113.0, 100.0, 125.0, 108.0, 109.0, 122.0, 99.0, 128.0, 105.0, 112.0, 122.0, 112.0, 114.0, 109.0, 108.0, 111.0, 113.0, 114.0, 105.0, 101.0, 110.0, 104.0, 112.0, 114.0, 109.0, 92.0, 111.0, 102.0, 91.0, 119.0, 111.0, 95.0, 107.0, 97.0, 115.0, 108.0, 124.0, 118.0, 123.0, 119.0, 122.0, 112.0, 106.0, 101.0, 93.0, 116.0, 123.0, 112.0, 120.0, 87.0, 102.0, 116.0, 113.0, 118.0, 135.0, 110.0]}, "num-zeros vs samples": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [65.0, 71.0, 74.0, 78.0, 68.0, 65.0, 79.0, 104.0, 95.0, 118.0, 116.0, 161.0, 141.0, 148.0, 182.0, 146.0, 164.0, 199.0, 174.0, 205.0, 166.0, 167.0, 186.0, 158.0, 195.0, 179.0, 188.0, 208.0, 187.0, 145.0, 145.0, 146.0, 156.0, 175.0, 132.0, 180.0, 177.0, 205.0, 172.0, 159.0, 158.0, 175.0, 153.0, 203.0, 196.0, 170.0, 185.0, 179.0, 140.0, 227.0, 198.0, 165.0, 172.0, 149.0, 199.0, 213.0, 179.0, 157.0, 255.0, 240.0, 186.0, 191.0, 164.0, 186.0, 208.0, 229.0, 213.0, 198.0, 198.0, 178.0, 246.0, 222.0, 177.0, 236.0, 193.0, 215.0, 226.0, 205.0, 251.0, 226.0, 224.0, 245.0, 219.0, 205.0, 198.0, 190.0, 171.0, 191.0, 171.0, 187.0, 182.0, 207.0, 233.0, 201.0, 220.0, 152.0, 216.0, 194.0, 175.0, 157.0, 165.0, 188.0, 163.0, 163.0, 160.0, 155.0, 160.0, 167.0, 144.0, 190.0, 194.0, 143.0, 153.0, 175.0, 158.0, 147.0, 166.0, 115.0, 142.0, 141.0, 117.0, 131.0, 132.0, 130.0, 164.0, 131.0, 136.0, 129.0, 150.0, 146.0, 133.0, 96.0, 139.0, 119.0, 108.0, 124.0, 109.0, 114.0, 113.0, 123.0, 125.0, 129.0, 99.0, 159.0, 109.0, 115.0, 127.0, 128.0, 101.0, 122.0, 118.0, 113.0, 110.0, 107.0, 112.0, 89.0, 107.0, 118.0, 89.0, 101.0, 127.0, 125.0, 111.0, 110.0, 121.0, 125.0, 111.0, 123.0, 109.0, 116.0, 118.0, 107.0, 87.0, 105.0, 121.0, 111.0, 127.0, 128.0, 116.0, 128.0, 116.0, 112.0, 135.0, 122.0, 106.0, 97.0, 100.0, 121.0, 94.0, 117.0, 124.0, 93.0, 116.0, 99.0, 114.0, 107.0, 96.0, 105.0, 
102.0, 84.0, 138.0, 100.0, 100.0, 115.0, 133.0, 101.0, 99.0, 105.0, 116.0, 109.0, 100.0, 109.0, 120.0, 131.0, 107.0, 110.0, 111.0, 98.0, 118.0, 97.0, 122.0, 115.0, 121.0, 114.0, 91.0, 86.0, 116.0, 85.0, 79.0, 99.0, 97.0, 89.0, 103.0, 78.0, 108.0, 107.0, 78.0, 101.0, 99.0, 96.0, 119.0, 87.0, 98.0, 113.0, 112.0, 101.0, 78.0, 125.0, 101.0, 102.0, 137.0, 85.0, 97.0, 96.0, 119.0, 119.0, 93.0, 84.0, 94.0, 91.0, 132.0, 108.0, 113.0, 98.0, 127.0, 102.0, 88.0, 93.0, 124.0, 102.0, 99.0, 97.0, 99.0, 85.0, 103.0, 94.0, 108.0, 116.0, 103.0, 114.0, 105.0, 123.0, 122.0, 94.0, 104.0, 101.0, 103.0, 109.0, 115.0, 117.0, 125.0, 81.0, 115.0, 112.0, 116.0, 100.0, 108.0, 105.0, 97.0, 101.0, 105.0, 98.0, 124.0, 98.0, 101.0, 103.0, 123.0, 124.0, 122.0, 115.0, 102.0, 115.0, 116.0, 122.0, 111.0, 88.0, 99.0, 95.0, 112.0, 122.0, 131.0, 110.0, 112.0, 96.0, 108.0, 100.0, 103.0, 106.0, 119.0, 104.0, 102.0, 97.0, 125.0, 93.0, 117.0, 133.0, 112.0, 137.0, 110.0, 104.0, 120.0, 115.0, 111.0, 118.0, 113.0, 100.0, 125.0, 108.0, 109.0, 122.0, 99.0, 128.0, 105.0, 112.0, 122.0, 112.0, 114.0, 109.0, 108.0, 111.0, 113.0, 114.0, 105.0, 101.0, 110.0, 104.0, 112.0, 114.0, 109.0, 92.0, 111.0, 102.0, 91.0, 119.0, 111.0, 95.0, 107.0, 97.0, 115.0, 108.0, 124.0, 118.0, 123.0, 119.0, 122.0, 112.0, 106.0, 101.0, 93.0, 116.0, 123.0, 112.0, 120.0, 87.0, 102.0, 116.0, 113.0, 118.0, 135.0, 110.0]}, "params-norm": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [180.02467, 180.43758, 180.43758, 180.43758, 180.43777, 180.43793, 180.43831, 180.43793, 180.43802, 180.43829, 180.42958, 180.42429, 180.42331, 180.41566, 180.40419, 180.39563, 180.38898, 180.38272, 180.37767, 180.36685, 180.3624, 180.35443, 180.34731, 180.34276, 180.34059, 180.33824, 180.33578, 180.33348, 180.33122, 180.32733, 180.32587, 180.33919, 180.33737, 180.34363, 180.342, 180.33069, 180.32681, 180.32585, 180.32092, 180.32108, 180.32138, 180.32286, 180.32475, 180.31996, 180.32315, 180.32661, 180.32353, 180.32631, 180.3324, 180.33495, 180.34297, 180.3517, 180.3622, 180.37233, 180.38312, 180.39899, 180.40675, 180.41966, 180.43352, 180.4433, 180.45685, 180.47159, 180.48726, 180.50627, 180.52652, 180.54735, 180.56612, 180.58878, 180.60876, 180.63284, 180.65726, 180.68228, 180.70798, 180.73802, 180.77544, 180.79449, 180.82513, 180.85388, 180.88318, 180.90952, 180.93996, 180.9704, 181.00171, 181.03206, 181.06531, 181.1013, 181.13477, 181.15898, 181.19191, 181.22948, 181.26605, 181.30205, 181.33176, 181.36722, 181.40222, 181.43898, 181.4686, 181.50232, 181.53323, 181.56693, 181.60017, 181.63365, 181.66275, 181.69737, 181.73155, 181.76347, 181.8042, 181.83623, 181.86909, 181.90247, 181.93695, 181.96951, 182.00578, 182.04301, 182.07603, 182.11412, 182.15521, 182.18857, 182.22928, 182.26672, 182.3042, 182.34148, 182.37926, 182.41901, 182.45923, 182.49518, 182.53793, 182.57965, 182.61847, 182.65536, 182.6929, 182.72876, 182.76958, 182.80853, 182.85202, 182.88937, 182.92555, 182.96187, 182.99063, 183.02582, 183.05833, 183.08974, 183.12651, 183.16095, 183.19424, 183.233, 183.26149, 183.29265, 183.32909, 183.36882, 183.40269, 183.43456, 183.47014, 183.51022, 183.54683, 183.57953, 183.61252, 183.64738, 183.68155, 183.71558, 183.75716, 183.79567, 183.83615, 183.87654, 183.9173, 183.9584, 184.00073, 184.04141, 184.08711, 184.12192, 184.16089, 184.19904, 184.23912, 184.27597, 184.31317, 184.35162, 184.39233, 184.43021, 184.46562, 184.50061, 184.54076, 184.5798, 184.62137, 184.66426, 184.70601, 184.74544, 184.7812, 184.8163, 184.85382, 184.89362, 184.9332, 184.9715, 185.00937, 
185.05093, 185.09132, 185.12502, 185.16487, 185.20316, 185.24188, 185.27464, 185.31422, 185.35551, 185.3972, 185.43919, 185.47906, 185.52074, 185.56161, 185.60054, 185.64554, 185.68713, 185.72649, 185.76546, 185.80576, 185.84767, 185.89198, 185.9361, 185.98022, 186.01895, 186.05711, 186.10294, 186.13905, 186.17926, 186.22005, 186.25861, 186.29631, 186.33633, 186.37819, 186.41498, 186.452, 186.48996, 186.52638, 186.56227, 186.59106, 186.62415, 186.66559, 186.70592, 186.74504, 186.78651, 186.83006, 186.87518, 186.91788, 186.96049, 187.00543, 187.05008, 187.09511, 187.13741, 187.17758, 187.21588, 187.25984, 187.30086, 187.34575, 187.39095, 187.43542, 187.4792, 187.51852, 187.56268, 187.60396, 187.64711, 187.68872, 187.73135, 187.77692, 187.81973, 187.86543, 187.91296, 187.96025, 188.00529, 188.04802, 188.0909, 188.13518, 188.18434, 188.22716, 188.27409, 188.32169, 188.36803, 188.41319, 188.45816, 188.50641, 188.54868, 188.59381, 188.6367, 188.68343, 188.72693, 188.77374, 188.8172, 188.86154, 188.90767, 188.95059, 188.99326, 189.04083, 189.08832, 189.13934, 189.1855, 189.2296, 189.27489, 189.32558, 189.36694, 189.41133, 189.45744, 189.50322, 189.54796, 189.59531, 189.6389, 189.68634, 189.73462, 189.78769, 189.83501, 189.88196, 189.92941, 189.97726, 190.02953, 190.08095, 190.13335, 190.18449, 190.23326, 190.28383, 190.33415, 190.38512, 190.43832, 190.49026, 190.5453, 190.59666, 190.65088, 190.70216, 190.75441, 190.80804, 190.85649, 190.90819, 190.957, 191.00778, 191.05713, 191.10803, 191.15628, 191.20445, 191.25539, 191.30585, 191.35631, 191.40929, 191.46144, 191.5153, 191.5732, 191.6273, 191.67821, 191.73494, 191.79005, 191.84462, 191.89845, 191.95538, 192.01093, 192.06554, 192.1189, 192.17081, 192.2244, 192.2774, 192.33224, 192.38445, 192.44177, 192.49707, 192.55254, 192.60464, 192.65576, 192.70808, 192.76437, 192.82317, 192.88344, 192.93953, 192.99843, 193.05219, 193.1062, 193.16641, 193.22375, 193.28175, 193.3349, 193.39145, 193.44878, 193.50717, 193.55751, 193.61333, 193.66898, 193.72675, 193.79041, 193.84534, 193.90236, 193.96567, 194.0249, 194.08501, 194.14468, 194.2052, 194.2684, 194.32666, 194.38776, 194.44768, 194.50999, 194.57324, 194.63622, 194.69333, 194.74876, 194.80455, 194.86299, 194.92128, 194.97459]}, "params-norm vs samples": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [180.02467, 180.43758, 180.43758, 180.43758, 180.43777, 180.43793, 180.43831, 180.43793, 180.43802, 180.43829, 180.42958, 180.42429, 180.42331, 180.41566, 180.40419, 180.39563, 180.38898, 180.38272, 180.37767, 180.36685, 180.3624, 180.35443, 180.34731, 180.34276, 180.34059, 180.33824, 180.33578, 180.33348, 180.33122, 180.32733, 180.32587, 180.33919, 180.33737, 180.34363, 180.342, 180.33069, 180.32681, 180.32585, 180.32092, 180.32108, 180.32138, 180.32286, 180.32475, 180.31996, 180.32315, 180.32661, 180.32353, 180.32631, 180.3324, 180.33495, 180.34297, 180.3517, 180.3622, 180.37233, 180.38312, 180.39899, 180.40675, 180.41966, 180.43352, 180.4433, 180.45685, 180.47159, 180.48726, 180.50627, 180.52652, 180.54735, 180.56612, 180.58878, 180.60876, 180.63284, 180.65726, 180.68228, 180.70798, 180.73802, 180.77544, 180.79449, 180.82513, 180.85388, 180.88318, 180.90952, 180.93996, 180.9704, 181.00171, 181.03206, 181.06531, 181.1013, 181.13477, 181.15898, 181.19191, 181.22948, 181.26605, 181.30205, 181.33176, 181.36722, 181.40222, 181.43898, 181.4686, 181.50232, 181.53323, 181.56693, 181.60017, 181.63365, 181.66275, 181.69737, 181.73155, 181.76347, 181.8042, 181.83623, 181.86909, 181.90247, 
181.93695, 181.96951, 182.00578, 182.04301, 182.07603, 182.11412, 182.15521, 182.18857, 182.22928, 182.26672, 182.3042, 182.34148, 182.37926, 182.41901, 182.45923, 182.49518, 182.53793, 182.57965, 182.61847, 182.65536, 182.6929, 182.72876, 182.76958, 182.80853, 182.85202, 182.88937, 182.92555, 182.96187, 182.99063, 183.02582, 183.05833, 183.08974, 183.12651, 183.16095, 183.19424, 183.233, 183.26149, 183.29265, 183.32909, 183.36882, 183.40269, 183.43456, 183.47014, 183.51022, 183.54683, 183.57953, 183.61252, 183.64738, 183.68155, 183.71558, 183.75716, 183.79567, 183.83615, 183.87654, 183.9173, 183.9584, 184.00073, 184.04141, 184.08711, 184.12192, 184.16089, 184.19904, 184.23912, 184.27597, 184.31317, 184.35162, 184.39233, 184.43021, 184.46562, 184.50061, 184.54076, 184.5798, 184.62137, 184.66426, 184.70601, 184.74544, 184.7812, 184.8163, 184.85382, 184.89362, 184.9332, 184.9715, 185.00937, 185.05093, 185.09132, 185.12502, 185.16487, 185.20316, 185.24188, 185.27464, 185.31422, 185.35551, 185.3972, 185.43919, 185.47906, 185.52074, 185.56161, 185.60054, 185.64554, 185.68713, 185.72649, 185.76546, 185.80576, 185.84767, 185.89198, 185.9361, 185.98022, 186.01895, 186.05711, 186.10294, 186.13905, 186.17926, 186.22005, 186.25861, 186.29631, 186.33633, 186.37819, 186.41498, 186.452, 186.48996, 186.52638, 186.56227, 186.59106, 186.62415, 186.66559, 186.70592, 186.74504, 186.78651, 186.83006, 186.87518, 186.91788, 186.96049, 187.00543, 187.05008, 187.09511, 187.13741, 187.17758, 187.21588, 187.25984, 187.30086, 187.34575, 187.39095, 187.43542, 187.4792, 187.51852, 187.56268, 187.60396, 187.64711, 187.68872, 187.73135, 187.77692, 187.81973, 187.86543, 187.91296, 187.96025, 188.00529, 188.04802, 188.0909, 188.13518, 188.18434, 188.22716, 188.27409, 188.32169, 188.36803, 188.41319, 188.45816, 188.50641, 188.54868, 188.59381, 188.6367, 188.68343, 188.72693, 188.77374, 188.8172, 188.86154, 188.90767, 188.95059, 188.99326, 189.04083, 189.08832, 189.13934, 189.1855, 189.2296, 189.27489, 189.32558, 189.36694, 189.41133, 189.45744, 189.50322, 189.54796, 189.59531, 189.6389, 189.68634, 189.73462, 189.78769, 189.83501, 189.88196, 189.92941, 189.97726, 190.02953, 190.08095, 190.13335, 190.18449, 190.23326, 190.28383, 190.33415, 190.38512, 190.43832, 190.49026, 190.5453, 190.59666, 190.65088, 190.70216, 190.75441, 190.80804, 190.85649, 190.90819, 190.957, 191.00778, 191.05713, 191.10803, 191.15628, 191.20445, 191.25539, 191.30585, 191.35631, 191.40929, 191.46144, 191.5153, 191.5732, 191.6273, 191.67821, 191.73494, 191.79005, 191.84462, 191.89845, 191.95538, 192.01093, 192.06554, 192.1189, 192.17081, 192.2244, 192.2774, 192.33224, 192.38445, 192.44177, 192.49707, 192.55254, 192.60464, 192.65576, 192.70808, 192.76437, 192.82317, 192.88344, 192.93953, 192.99843, 193.05219, 193.1062, 193.16641, 193.22375, 193.28175, 193.3349, 193.39145, 193.44878, 193.50717, 193.55751, 193.61333, 193.66898, 193.72675, 193.79041, 193.84534, 193.90236, 193.96567, 194.0249, 194.08501, 194.14468, 194.2052, 194.2684, 194.32666, 194.38776, 194.44768, 194.50999, 194.57324, 194.63622, 194.69333, 194.74876, 194.80455, 194.86299, 194.92128, 194.97459]}, "iteration-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [26.15537, 1.59225, 1.58677, 1.61174, 1.60131, 1.58979, 1.6009, 1.60255, 1.59989, 1.59397, 1.59991, 1.60879, 1.59752, 1.58326, 1.60593, 1.58196, 1.58281, 1.58285, 1.65512, 1.58951, 1.57778, 1.59099, 1.59905, 1.5964, 1.60421, 1.59987, 1.60383, 1.59456, 1.59474, 1.60292, 1.59587, 1.59615, 1.59953, 1.68491, 1.61405, 
1.61646, 1.76204, 1.6157, 1.60582, 1.60949, 1.60517, 1.60169, 1.5944, 1.59771, 1.59812, 1.61186, 1.60798, 1.59786, 1.69134, 1.607, 1.62116, 1.61495, 1.61958, 1.61282, 1.60615, 1.61947, 1.6053, 1.59812, 1.60103, 1.61637, 1.60915, 1.61703, 1.61268, 1.61077, 1.61236, 1.61876, 1.60773, 1.69396, 1.60939, 1.61301, 1.62827, 1.61429, 1.61159, 1.60859, 1.61405, 1.62895, 1.61614, 1.61446, 1.60675, 1.61067, 1.61896, 1.61461, 1.61244, 1.60436, 1.6079, 1.619, 1.61303, 1.61117, 1.61223, 1.60766, 1.62186, 1.60682, 1.60832, 1.60625, 1.60469, 1.61342, 1.60768, 1.60669, 1.59722, 1.69938, 1.61072, 1.61909, 1.61007, 1.6046, 1.60277, 1.61264, 1.61634, 1.61492, 1.61043, 1.62152, 1.61505, 1.61393, 1.61336, 1.61268, 1.61629, 1.61635, 1.62076, 1.61243, 1.61515, 1.61244, 1.61769, 1.61729, 1.60493, 1.60897, 1.61012, 1.61259, 1.6206, 1.60935, 1.61072, 1.61412, 1.62132, 1.61512, 1.61556, 1.61045, 1.6109, 1.61406, 1.61499, 1.60648, 1.62368, 1.61793, 1.62077, 1.61115, 1.607, 1.60097, 1.60715, 1.61148, 1.61713, 1.61144, 1.62249, 1.61481, 1.61115, 1.6037, 1.61119, 1.60767, 1.6172, 1.61279, 1.60574, 1.60707, 1.60482, 1.60401, 1.61113, 1.61346, 1.60704, 1.61142, 1.60677, 1.60612, 1.59885, 1.60751, 1.60394, 1.60565, 1.60074, 1.60646, 1.60139, 1.60114, 1.60502, 1.59931, 1.59106, 1.59528, 1.59562, 1.60655, 1.61019, 1.60604, 1.60255, 1.59481, 1.59218, 1.59628, 1.58975, 1.60275, 1.59914, 1.59723, 1.59728, 1.58386, 1.61425, 1.60353, 1.60061, 1.60375, 1.61192, 1.61512, 1.60494, 1.59982, 1.59392, 1.59773, 1.59899, 1.60034, 1.59034, 1.59986, 1.59404, 1.59171, 1.58924, 1.58292, 1.59951, 1.58972, 1.60076, 1.59525, 1.60354, 1.60474, 1.6007, 1.60461, 1.60303, 1.68738, 1.61462, 1.6112, 1.60314, 1.60468, 1.60954, 1.61515, 1.60446, 1.60607, 1.60574, 1.60376, 1.60767, 1.60168, 1.60809, 1.60685, 1.59979, 1.59981, 1.59996, 1.60233, 1.61191, 1.60192, 1.60578, 1.61979, 1.6159, 1.61226, 1.6128, 1.60991, 1.62187, 1.61382, 1.60853, 1.61365, 1.6207, 1.63823, 1.61317, 1.60999, 1.6096, 1.6053, 1.62098, 1.60515, 1.61012, 1.60877, 1.61097, 1.62766, 1.61189, 1.61276, 1.61683, 1.61267, 1.62231, 1.61022, 1.61488, 1.61227, 1.60799, 1.61989, 1.61118, 1.60947, 1.61635, 1.60971, 1.61707, 1.61308, 1.60535, 1.61359, 1.60892, 1.61075, 1.60793, 1.60987, 1.61295, 1.61056, 1.60924, 1.61593, 1.60828, 1.62137, 1.60777, 1.6163, 1.61976, 1.60496, 1.61232, 1.60943, 1.60387, 1.61497, 1.60986, 1.61254, 1.61053, 1.61641, 1.62112, 1.60996, 1.62043, 1.61238, 1.61482, 1.61865, 1.61289, 1.61175, 1.61784, 1.61203, 1.6132, 1.60843, 1.61847, 1.61033, 1.6185, 1.61766, 1.6264, 1.62151, 1.62048, 1.61539, 1.61807, 1.61346, 1.60979, 1.61291, 1.61433, 1.61137, 1.616, 1.60714, 1.6154, 1.61351, 1.60767, 1.60384, 1.60001, 1.59921, 1.60103, 1.60417, 1.60117, 1.59284, 1.60079, 1.59673, 1.59125, 1.59593, 1.59394, 1.59478, 1.59263, 1.59408, 1.59955, 1.66468, 1.59302, 1.59156, 1.59525, 1.62673, 1.61448, 1.60772, 1.60098, 1.6066, 1.62998, 1.62933, 1.6147, 1.61299, 1.61044, 1.62556, 1.61734, 1.61197, 1.61149, 1.61287, 1.62523, 1.61258, 1.60355, 1.6117, 1.61092, 1.60763, 1.61177, 1.61161, 1.6207, 1.61553, 1.62712, 1.62883, 1.6176, 1.62185, 1.60923, 1.61676, 1.62142, 1.62074, 1.61866, 1.61459, 1.59668, 1.61134, 1.60642, 1.60975, 1.61506, 1.60601, 1.62434, 1.61024, 1.61231, 1.61973, 1.61419, 1.61888]}, "lm loss validation": {"start_step": 0, "end_step": 4, "step_interval": 5, "values": [5.5974]}, "lm loss validation vs samples": {"start_step": 0, "end_step": 4, "step_interval": 5, "values": [5.5974]}, "lm loss validation ppl": {"start_step": 0, "end_step": 4, "step_interval": 5, "values": 
[269.72311]}, "lm loss validation ppl vs samples": {"start_step": 0, "end_step": 4, "step_interval": 5, "values": [269.72311]}} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_native_fp8_tp_pp_sp/golden_values_lts.json b/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_native_fp8_tp_pp_sp/golden_values_lts.json new file mode 100644 index 0000000000..6009b31b8c --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_native_fp8_tp_pp_sp/golden_values_lts.json @@ -0,0 +1 @@ +{"forward-backward-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [21.99065, 1.57716, 1.57152, 1.59321, 1.5863, 1.57478, 1.58558, 1.58758, 1.58479, 1.57895, 1.58493, 1.59324, 1.58233, 1.56795, 1.58055, 1.56688, 1.5678, 1.56791, 1.63783, 1.57453, 1.56285, 1.57599, 1.58307, 1.58147, 1.58884, 1.58487, 1.58844, 1.57964, 1.57929, 1.58802, 1.58073, 1.58122, 1.58449, 1.66952, 1.599, 1.60139, 1.74695, 1.60068, 1.58989, 1.59432, 1.59007, 1.58621, 1.5794, 1.58271, 1.58306, 1.59677, 1.59288, 1.58275, 1.67584, 1.59164, 1.60367, 1.5998, 1.60414, 1.59758, 1.58882, 1.60425, 1.59006, 1.58295, 1.58588, 1.60115, 1.59394, 1.6001, 1.59693, 1.5944, 1.59722, 1.60347, 1.59248, 1.67877, 1.59416, 1.59784, 1.61277, 1.59908, 1.59639, 1.5935, 1.59862, 1.61381, 1.60093, 1.59916, 1.59139, 1.59544, 1.60373, 1.59931, 1.59729, 1.58924, 1.59278, 1.60393, 1.59751, 1.59588, 1.597, 1.5921, 1.60557, 1.5915, 1.59296, 1.59099, 1.58952, 1.59785, 1.59236, 1.59138, 1.58196, 1.68409, 1.59552, 1.60388, 1.59454, 1.58942, 1.58688, 1.59613, 1.60092, 1.59976, 1.59462, 1.60601, 1.59966, 1.59879, 1.59803, 1.59743, 1.60087, 1.60123, 1.60561, 1.59721, 1.60002, 1.59717, 1.60267, 1.60202, 1.58969, 1.5937, 1.59501, 1.59729, 1.6055, 1.59373, 1.59552, 1.59903, 1.60628, 1.59959, 1.60033, 1.59523, 1.59534, 1.59886, 1.59989, 1.59127, 1.60846, 1.60265, 1.6054, 1.59487, 1.59192, 1.58491, 1.59173, 1.59624, 1.60184, 1.59635, 1.60701, 1.59973, 1.59592, 1.58783, 1.59596, 1.59257, 1.60207, 1.59766, 1.59014, 1.59147, 1.58958, 1.58849, 1.59599, 1.59796, 1.59187, 1.59629, 1.59167, 1.59103, 1.58381, 1.59206, 1.58888, 1.5904, 1.58555, 1.59114, 1.58539, 1.58566, 1.5894, 1.58315, 1.57556, 1.5798, 1.57936, 1.59144, 1.59188, 1.58985, 1.58744, 1.57959, 1.57707, 1.58114, 1.57447, 1.58757, 1.58393, 1.5814, 1.58214, 1.56869, 1.59904, 1.58832, 1.58446, 1.5886, 1.5964, 1.59995, 1.58984, 1.58458, 1.57848, 1.58262, 1.58372, 1.58511, 1.57472, 1.58482, 1.57884, 1.57655, 1.57371, 1.56768, 1.58436, 1.57434, 1.58546, 1.57895, 1.58824, 1.58943, 1.58534, 1.58931, 1.58768, 1.67183, 1.5994, 1.59551, 1.58731, 1.58941, 1.59427, 1.59768, 1.58889, 1.5907, 1.58959, 1.58719, 1.59215, 1.5863, 1.59281, 1.59155, 1.58447, 1.58437, 1.5847, 1.58696, 1.59622, 1.58517, 1.59019, 1.60434, 1.59968, 1.5969, 1.59751, 1.59456, 1.6066, 1.59805, 1.59315, 1.59835, 1.60342, 1.62288, 1.59735, 1.59455, 1.59386, 1.5899, 1.60537, 1.58935, 1.59479, 1.5931, 1.59564, 1.61221, 1.59658, 1.59741, 1.60139, 1.59726, 1.60686, 1.59462, 1.59958, 1.59653, 1.59254, 1.60457, 1.59551, 1.59428, 1.60093, 1.5944, 1.60142, 1.59772, 1.58999, 1.59811, 1.59342, 1.59459, 1.59229, 1.59446, 1.59758, 1.59514, 1.59376, 1.60015, 1.59289, 1.60569, 1.59243, 1.59995, 1.60277, 1.58962, 1.59704, 1.59408, 1.58742, 1.59956, 1.5946, 1.59711, 1.59521, 1.60094, 1.60537, 1.59472, 1.60512, 1.59709, 1.59942, 1.60326, 1.59747, 1.59643, 1.60252, 1.59668, 1.5978, 1.59291, 1.60286, 1.59494, 1.60307, 
1.6023, 1.61125, 1.60608, 1.60499, 1.60013, 1.60294, 1.59839, 1.59445, 1.59771, 1.59912, 1.59625, 1.60071, 1.592, 1.59986, 1.59715, 1.59092, 1.5888, 1.58483, 1.58369, 1.58578, 1.58892, 1.58607, 1.57772, 1.58567, 1.58058, 1.57579, 1.58081, 1.57885, 1.57944, 1.5775, 1.57886, 1.58441, 1.64955, 1.57793, 1.57628, 1.57996, 1.60901, 1.5979, 1.59148, 1.58504, 1.58873, 1.61471, 1.61412, 1.59947, 1.59781, 1.59535, 1.61042, 1.60213, 1.59684, 1.59637, 1.59781, 1.60971, 1.59714, 1.58835, 1.59658, 1.5958, 1.5924, 1.59655, 1.59597, 1.60519, 1.60003, 1.61195, 1.61366, 1.6023, 1.60659, 1.59405, 1.60115, 1.6049, 1.6052, 1.60253, 1.59948, 1.5816, 1.59621, 1.58755, 1.59445, 1.59719, 1.59069, 1.60911, 1.59481, 1.59684, 1.60214, 1.59905, 1.60381]}, "forward-compute-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [6.16126, 0.78048, 0.77638, 0.78285, 0.77945, 0.7768, 0.78398, 0.78215, 0.7833, 0.77542, 0.78468, 0.78711, 0.78251, 0.76662, 0.76894, 0.76826, 0.77171, 0.76847, 0.83221, 0.7706, 0.76442, 0.77548, 0.77966, 0.76518, 0.7854, 0.7799, 0.77136, 0.76634, 0.78834, 0.77019, 0.78986, 0.77045, 0.78652, 0.87018, 0.80011, 0.7944, 0.94182, 0.79666, 0.78564, 0.78708, 0.78355, 0.78735, 0.78535, 0.79227, 0.79173, 0.79116, 0.79578, 0.78576, 0.88058, 0.78541, 0.7905, 0.80177, 0.80159, 0.79536, 0.78436, 0.80424, 0.79113, 0.78133, 0.79513, 0.79725, 0.78505, 0.80445, 0.7974, 0.80505, 0.80566, 0.79011, 0.78303, 0.8828, 0.7992, 0.80046, 0.79496, 0.80104, 0.80208, 0.78598, 0.79918, 0.79817, 0.80692, 0.79948, 0.79832, 0.80065, 0.79953, 0.80613, 0.80349, 0.79995, 0.80406, 0.8022, 0.80453, 0.80228, 0.8056, 0.79734, 0.80242, 0.78707, 0.79319, 0.80876, 0.78925, 0.79762, 0.79177, 0.81095, 0.78559, 0.87702, 0.80826, 0.80874, 0.79998, 0.78873, 0.79623, 0.80044, 0.7965, 0.80088, 0.80451, 0.80617, 0.80803, 0.80736, 0.80357, 0.80072, 0.80574, 0.80861, 0.80081, 0.80256, 0.8016, 0.80416, 0.80062, 0.79705, 0.79613, 0.7934, 0.79423, 0.79439, 0.79639, 0.79437, 0.80375, 0.79641, 0.8075, 0.79693, 0.80388, 0.79802, 0.79685, 0.80158, 0.79875, 0.79886, 0.80926, 0.81104, 0.80752, 0.80381, 0.79608, 0.7893, 0.78982, 0.79582, 0.79985, 0.79486, 0.8058, 0.79802, 0.79424, 0.79685, 0.79506, 0.79473, 0.79858, 0.79203, 0.79193, 0.79375, 0.79263, 0.78662, 0.78983, 0.79242, 0.78834, 0.78866, 0.78847, 0.79475, 0.78474, 0.78928, 0.78727, 0.7942, 0.78678, 0.78404, 0.7855, 0.78669, 0.7807, 0.79077, 0.78107, 0.78201, 0.78183, 0.80216, 0.79952, 0.79773, 0.7904, 0.78485, 0.7784, 0.78943, 0.78644, 0.78928, 0.79161, 0.79481, 0.79068, 0.78383, 0.79727, 0.78767, 0.79378, 0.79855, 0.79573, 0.79906, 0.79796, 0.78811, 0.77833, 0.78832, 0.79352, 0.78682, 0.78545, 0.78929, 0.78422, 0.78978, 0.78901, 0.78354, 0.78883, 0.78807, 0.79656, 0.79382, 0.79009, 0.79261, 0.79204, 0.79399, 0.79138, 0.87044, 0.79415, 0.78856, 0.7904, 0.7891, 0.78842, 0.79047, 0.78866, 0.78816, 0.78669, 0.78557, 0.78863, 0.79242, 0.79337, 0.78575, 0.78866, 0.78509, 0.78346, 0.78462, 0.78704, 0.78025, 0.78234, 0.78547, 0.78832, 0.78406, 0.79176, 0.78752, 0.79148, 0.7926, 0.78905, 0.79623, 0.79876, 0.80189, 0.79329, 0.78938, 0.78571, 0.79206, 0.79022, 0.78916, 0.79198, 0.78965, 0.78841, 0.79706, 0.79681, 0.79422, 0.79582, 0.7978, 0.7929, 0.79692, 0.79951, 0.79613, 0.78441, 0.78081, 0.78582, 0.78913, 0.79294, 0.7902, 0.78677, 0.79445, 0.79001, 0.79247, 0.78884, 0.78757, 0.79082, 0.79372, 0.79339, 0.79117, 0.79464, 0.79238, 0.78456, 0.80253, 0.7832, 0.79582, 0.78585, 0.78817, 0.7996, 0.80334, 0.80038, 0.78266, 0.79835, 0.80583, 0.7884, 0.803, 0.7964, 0.7803, 0.80771, 0.78154, 0.78737, 
0.78425, 0.79511, 0.79935, 0.79899, 0.80031, 0.79737, 0.7882, 0.78726, 0.80196, 0.78826, 0.79069, 0.79987, 0.80053, 0.79658, 0.80868, 0.78979, 0.79176, 0.80466, 0.79718, 0.80577, 0.78989, 0.78977, 0.79845, 0.80176, 0.79513, 0.79765, 0.78377, 0.78605, 0.7817, 0.78486, 0.78251, 0.782, 0.77773, 0.78515, 0.78532, 0.7826, 0.78594, 0.7847, 0.78814, 0.78399, 0.78924, 0.78495, 0.85297, 0.78501, 0.78455, 0.78521, 0.79499, 0.78326, 0.78572, 0.78491, 0.78588, 0.79342, 0.79911, 0.79939, 0.79997, 0.78403, 0.79216, 0.80483, 0.79356, 0.79564, 0.79104, 0.79195, 0.79461, 0.79321, 0.78786, 0.79505, 0.78766, 0.78873, 0.7989, 0.79328, 0.79827, 0.79828, 0.79999, 0.80446, 0.80505, 0.79428, 0.80603, 0.80135, 0.79708, 0.78828, 0.78401, 0.78511, 0.79061, 0.7807, 0.78293, 0.7859, 0.78918, 0.79204, 0.7906, 0.79616, 0.79381, 0.7949, 0.79715]}, "backward-compute-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [4.59311, 0.76076, 0.76217, 0.75984, 0.7615, 0.75659, 0.76053, 0.7532, 0.76274, 0.76117, 0.76101, 0.76233, 0.76144, 0.75668, 0.76922, 0.75609, 0.75913, 0.76116, 0.76025, 0.76541, 0.75884, 0.75825, 0.75703, 0.766, 0.76226, 0.76154, 0.76489, 0.76817, 0.75764, 0.76666, 0.76075, 0.75889, 0.75671, 0.76413, 0.76441, 0.76109, 0.75862, 0.76306, 0.74826, 0.75641, 0.74619, 0.74555, 0.74425, 0.74896, 0.74343, 0.75132, 0.74633, 0.74611, 0.74624, 0.74486, 0.75681, 0.756, 0.75967, 0.7522, 0.74699, 0.75759, 0.75126, 0.74675, 0.75177, 0.75405, 0.7585, 0.75155, 0.75405, 0.75102, 0.75148, 0.75893, 0.74911, 0.74587, 0.75218, 0.74921, 0.76638, 0.74462, 0.7501, 0.7496, 0.74661, 0.7608, 0.75236, 0.74756, 0.74835, 0.74741, 0.75597, 0.74513, 0.75335, 0.74569, 0.74992, 0.75987, 0.73959, 0.74426, 0.7594, 0.74595, 0.75601, 0.74294, 0.74297, 0.75107, 0.74798, 0.75807, 0.74348, 0.75472, 0.74211, 0.7499, 0.7459, 0.75376, 0.74383, 0.74411, 0.74537, 0.74321, 0.75045, 0.74449, 0.75823, 0.74876, 0.74922, 0.75592, 0.75588, 0.75204, 0.74904, 0.74934, 0.76179, 0.74708, 0.74898, 0.7495, 0.749, 0.75109, 0.75134, 0.74604, 0.74742, 0.74319, 0.75078, 0.74752, 0.75245, 0.74673, 0.75517, 0.75235, 0.74881, 0.74945, 0.75053, 0.74903, 0.75641, 0.74336, 0.76521, 0.75829, 0.75724, 0.75492, 0.7561, 0.75292, 0.74603, 0.75381, 0.74787, 0.75257, 0.76831, 0.74923, 0.75133, 0.74595, 0.75539, 0.74856, 0.75247, 0.75168, 0.74839, 0.75531, 0.74901, 0.75107, 0.75151, 0.75163, 0.75496, 0.75207, 0.75274, 0.75371, 0.75218, 0.75324, 0.75429, 0.74775, 0.75082, 0.74975, 0.75003, 0.74514, 0.74798, 0.7422, 0.74955, 0.74687, 0.74432, 0.76318, 0.76862, 0.75695, 0.75138, 0.74947, 0.74824, 0.74949, 0.74673, 0.76097, 0.75456, 0.75612, 0.74619, 0.74667, 0.75557, 0.75602, 0.74867, 0.74532, 0.75908, 0.75984, 0.75566, 0.75544, 0.74912, 0.74344, 0.74466, 0.743, 0.74211, 0.75391, 0.74844, 0.74322, 0.7419, 0.7391, 0.75107, 0.74688, 0.74472, 0.74867, 0.74188, 0.75312, 0.75735, 0.75298, 0.75011, 0.83767, 0.75688, 0.7468, 0.75125, 0.75873, 0.75439, 0.76222, 0.74909, 0.75114, 0.74996, 0.74891, 0.75631, 0.75529, 0.75222, 0.74576, 0.74916, 0.74348, 0.7422, 0.74917, 0.74763, 0.74945, 0.74253, 0.75781, 0.74585, 0.75081, 0.75209, 0.75165, 0.7532, 0.75146, 0.75199, 0.75085, 0.75606, 0.76797, 0.74123, 0.75583, 0.7498, 0.74976, 0.76018, 0.74891, 0.74315, 0.74567, 0.74733, 0.76326, 0.74371, 0.74843, 0.74397, 0.74563, 0.76375, 0.74742, 0.7484, 0.75035, 0.74757, 0.75381, 0.7431, 0.74767, 0.74383, 0.74076, 0.75278, 0.75322, 0.74717, 0.74642, 0.74435, 0.74553, 0.75415, 0.75172, 0.74406, 0.74946, 0.74845, 0.7471, 0.74058, 0.74992, 0.74948, 0.74994, 0.75938, 0.75195, 0.75199, 0.75277, 
0.74398, 0.75468, 0.74625, 0.74009, 0.75462, 0.74436, 0.75709, 0.75842, 0.75583, 0.75652, 0.75955, 0.75822, 0.74976, 0.74693, 0.7489, 0.7484, 0.74876, 0.75623, 0.75485, 0.75131, 0.75086, 0.75519, 0.7563, 0.75201, 0.74461, 0.75083, 0.75104, 0.7491, 0.74353, 0.74963, 0.74824, 0.75106, 0.75407, 0.74618, 0.7523, 0.75149, 0.74913, 0.74663, 0.74746, 0.7482, 0.74592, 0.74512, 0.75269, 0.74881, 0.75383, 0.74575, 0.74092, 0.74646, 0.74972, 0.75151, 0.74727, 0.74596, 0.75029, 0.74634, 0.74441, 0.75077, 0.76193, 0.7811, 0.76201, 0.76484, 0.77016, 0.76471, 0.76985, 0.76565, 0.75567, 0.76091, 0.76601, 0.7782, 0.76131, 0.75676, 0.76458, 0.76377, 0.77738, 0.75801, 0.75902, 0.762, 0.75749, 0.75518, 0.75814, 0.7671, 0.76157, 0.76399, 0.77689, 0.76899, 0.76062, 0.76435, 0.76315, 0.75948, 0.77408, 0.75612, 0.76269, 0.75559, 0.76227, 0.77122, 0.76094, 0.76349, 0.7582, 0.75871, 0.77745, 0.76055, 0.76243, 0.76016, 0.76322, 0.76742]}, "batch-generator-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [2.19292, 0.01741, 0.01488, 0.01641, 0.01712, 0.01701, 0.01724, 0.01612, 0.01735, 0.01689, 0.01449, 0.01795, 0.01495, 0.01541, 0.01502, 0.01516, 0.01428, 0.01451, 0.01769, 0.01847, 0.0169, 0.01788, 0.01813, 0.01751, 0.01774, 0.01679, 0.01619, 0.01655, 0.01654, 0.01696, 0.0174, 0.0185, 0.01671, 0.01581, 0.01697, 0.01627, 0.02111, 0.01585, 0.0176, 0.01783, 0.01799, 0.01548, 0.01578, 0.01602, 0.01539, 0.01659, 0.01748, 0.01708, 0.01454, 0.01909, 0.01622, 0.01722, 0.01943, 0.01822, 0.01639, 0.01887, 0.0157, 0.01802, 0.01601, 0.01682, 0.01679, 0.01666, 0.01696, 0.01447, 0.01725, 0.01735, 0.01643, 0.01884, 0.01609, 0.0185, 0.0184, 0.01703, 0.01561, 0.01899, 0.01693, 0.01673, 0.01557, 0.02037, 0.01648, 0.02182, 0.01581, 0.01883, 0.01486, 0.01422, 0.01602, 0.0206, 0.01692, 0.01644, 0.01443, 0.0164, 0.01772, 0.01699, 0.01792, 0.01841, 0.01616, 0.01914, 0.01786, 0.01399, 0.01385, 0.01298, 0.01984, 0.01393, 0.01641, 0.01237, 0.01672, 0.01523, 0.01481, 0.01312, 0.01514, 0.0141, 0.01688, 0.01659, 0.01531, 0.01306, 0.01415, 0.01307, 0.01504, 0.01566, 0.01521, 0.01304, 0.0151, 0.01337, 0.01578, 0.01428, 0.01733, 0.01324, 0.01568, 0.01651, 0.01314, 0.01407, 0.01374, 0.01429, 0.01421, 0.01802, 0.01439, 0.01347, 0.01541, 0.01301, 0.01489, 0.01769, 0.01406, 0.01394, 0.01544, 0.01425, 0.01399, 0.01414, 0.01541, 0.01538, 0.01478, 0.01476, 0.01498, 0.01626, 0.01614, 0.01516, 0.0146, 0.02163, 0.01496, 0.01399, 0.0156, 0.01517, 0.01657, 0.01525, 0.02091, 0.01583, 0.01574, 0.01726, 0.01555, 0.01523, 0.01459, 0.01318, 0.01563, 0.01531, 0.01592, 0.01602, 0.01375, 0.01616, 0.01854, 0.0199, 0.01523, 0.01384, 0.01396, 0.01413, 0.01587, 0.01384, 0.01554, 0.01277, 0.0125, 0.01321, 0.01511, 0.01439, 0.01651, 0.01382, 0.01689, 0.01614, 0.01571, 0.01361, 0.01704, 0.01534, 0.01385, 0.01423, 0.20705, 0.01218, 0.01233, 0.01727, 0.01275, 0.01244, 0.01327, 0.01272, 0.01371, 0.01665, 0.01392, 0.01222, 0.01222, 0.01188, 0.01265, 0.01482, 0.01632, 0.01649, 0.01702, 0.10117, 0.01844, 0.01611, 0.01574, 0.01967, 0.01779, 0.0181, 0.01873, 0.01598, 0.01615, 0.0136, 0.01405, 0.0131, 0.01348, 0.01358, 0.01592, 0.01254, 0.01772, 0.01503, 0.01408, 0.01322, 0.01435, 0.0158, 0.01713, 0.01512, 0.01582, 0.01578, 0.01584, 0.01532, 0.01652, 0.01516, 0.01295, 0.01398, 0.01359, 0.01339, 0.01358, 0.01304, 0.01422, 0.01314, 0.01282, 0.01422, 0.01411, 0.01529, 0.01575, 0.01454, 0.01377, 0.01423, 0.0158, 0.0128, 0.01659, 0.0174, 0.01592, 0.01617, 0.01462, 0.01415, 0.01495, 0.01263, 0.01928, 0.01701, 0.01799, 0.01302, 0.01537, 0.01683, 0.01358, 0.01378, 
0.01553, 0.01478, 0.01516, 0.01864, 0.01487, 0.0145, 0.01315, 0.0163, 0.01453, 0.01978, 0.01808, 0.01337, 0.01516, 0.01483, 0.0141, 0.01325, 0.01391, 0.01431, 0.01452, 0.01452, 0.01284, 0.01318, 0.01339, 0.01336, 0.01442, 0.01234, 0.01424, 0.01284, 0.01762, 0.01661, 0.01281, 0.01962, 0.01329, 0.01356, 0.01369, 0.01291, 0.01345, 0.01577, 0.01307, 0.01371, 0.01245, 0.0144, 0.01266, 0.01493, 0.01942, 0.01384, 0.01403, 0.01338, 0.01325, 0.01563, 0.0138, 0.01307, 0.01453, 0.0157, 0.01517, 0.01449, 0.01345, 0.01482, 0.01389, 0.01533, 0.01504, 0.01529, 0.01484, 0.01361, 0.01578, 0.01436, 0.01584, 0.01282, 0.01395, 0.01777, 0.01465, 0.01446, 0.01422, 0.01426, 0.01624, 0.01786, 0.01661, 0.01321, 0.01562, 0.016, 0.0161, 0.01445, 0.01562, 0.01697, 0.01694, 0.01328, 0.01308, 0.01623, 0.01535, 0.01156, 0.01359, 0.01294, 0.01787, 0.01354, 0.01547, 0.01746, 0.01479, 0.01512, 0.0137, 0.01697, 0.01836, 0.0165, 0.01597, 0.01426, 0.01481, 0.01758, 0.01613, 0.01995, 0.01744, 0.01619, 0.02014, 0.01917, 0.01834, 0.02092, 0.0156, 0.01825]}, "forward-recv-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [5.93081, 0.02344, 0.02331, 0.02309, 0.02318, 0.02288, 0.02295, 0.02315, 0.02278, 0.02311, 0.02303, 0.02319, 0.02297, 0.02355, 0.0232, 0.02307, 0.02294, 0.02279, 0.02348, 0.02322, 0.02312, 0.02338, 0.02754, 0.02903, 0.02328, 0.02314, 0.02339, 0.02314, 0.02316, 0.02611, 0.02298, 0.02317, 0.02368, 0.02303, 0.02318, 0.0236, 0.02624, 0.02329, 0.02423, 0.02403, 0.02326, 0.02356, 0.02358, 0.02322, 0.02307, 0.02339, 0.02352, 0.02314, 0.02321, 0.02319, 0.02427, 0.02732, 0.02447, 0.02413, 0.02414, 0.02384, 0.02448, 0.02435, 0.0243, 0.02437, 0.02392, 0.02395, 0.02424, 0.0244, 0.02386, 0.02399, 0.02583, 0.02402, 0.02381, 0.02363, 0.02384, 0.02415, 0.02408, 0.02332, 0.02351, 0.02417, 0.02341, 0.02374, 0.0239, 0.02359, 0.02348, 0.02367, 0.02309, 0.02341, 0.02304, 0.02341, 0.02349, 0.02339, 0.02324, 0.02343, 0.02447, 0.02397, 0.02425, 0.02336, 0.02357, 0.02378, 0.02358, 0.02333, 0.02324, 0.02381, 0.02363, 0.02361, 0.02379, 0.023, 0.02331, 0.02406, 0.02303, 0.02381, 0.02338, 0.0233, 0.02375, 0.02361, 0.02338, 0.0254, 0.02366, 0.02346, 0.02319, 0.0231, 0.02322, 0.02336, 0.02359, 0.02301, 0.0232, 0.0231, 0.02325, 0.02535, 0.02543, 0.0249, 0.0258, 0.02421, 0.02631, 0.02569, 0.02546, 0.02523, 0.02374, 0.02369, 0.02287, 0.02328, 0.02335, 0.02342, 0.02348, 0.02584, 0.02846, 0.02333, 0.02325, 0.02317, 0.02344, 0.02362, 0.02449, 0.02398, 0.02331, 0.02313, 0.02338, 0.02374, 0.02377, 0.02343, 0.02294, 0.02316, 0.02278, 0.02313, 0.02341, 0.02344, 0.02325, 0.02347, 0.02341, 0.02425, 0.0234, 0.0236, 0.02348, 0.02328, 0.02322, 0.02797, 0.02349, 0.02368, 0.02483, 0.02541, 0.02365, 0.02349, 0.02286, 0.02337, 0.02361, 0.02351, 0.02501, 0.02329, 0.02303, 0.02332, 0.02369, 0.02402, 0.02326, 0.02743, 0.02371, 0.02333, 0.02452, 0.02852, 0.02423, 0.02431, 0.02363, 0.02347, 0.0234, 0.02355, 0.0171, 0.02364, 0.02374, 0.02365, 0.02307, 0.02279, 0.02328, 0.02362, 0.0233, 0.02395, 0.02325, 0.02349, 0.0286, 0.02347, 0.02365, 0.02351, 0.02314, 0.02283, 0.02321, 0.02365, 0.02339, 0.02363, 0.02445, 0.0234, 0.023, 0.02306, 0.02312, 0.0258, 0.02371, 0.02351, 0.02414, 0.02516, 0.02398, 0.02387, 0.02789, 0.02332, 0.02291, 0.02319, 0.02382, 0.02362, 0.02352, 0.0236, 0.02482, 0.02336, 0.02343, 0.02386, 0.02373, 0.02332, 0.02345, 0.02366, 0.02371, 0.02383, 0.02391, 0.02309, 0.02396, 0.0237, 0.02358, 0.02332, 0.02354, 0.0237, 0.02431, 0.02339, 0.02333, 0.02358, 0.02566, 0.02353, 0.02329, 0.02355, 0.02334, 0.02388, 0.02322, 0.02748, 0.02759, 
0.02327, 0.02777, 0.02798, 0.0238, 0.02318, 0.02324, 0.02335, 0.02358, 0.02398, 0.02384, 0.02417, 0.02338, 0.02373, 0.02324, 0.02322, 0.02308, 0.02335, 0.02824, 0.02882, 0.02297, 0.02325, 0.02282, 0.02322, 0.02355, 0.02322, 0.02216, 0.02334, 0.02367, 0.02317, 0.0235, 0.02347, 0.02352, 0.02303, 0.02358, 0.02344, 0.02281, 0.02283, 0.02317, 0.02298, 0.02317, 0.02316, 0.02391, 0.02343, 0.02303, 0.02332, 0.02335, 0.02338, 0.02344, 0.0231, 0.02322, 0.02326, 0.02319, 0.02352, 0.02355, 0.02458, 0.02323, 0.02296, 0.02379, 0.02609, 0.02363, 0.02342, 0.02402, 0.02329, 0.02315, 0.02333, 0.02366, 0.02341, 0.02336, 0.02367, 0.02372, 0.02313, 0.02316, 0.02322, 0.0229, 0.02346, 0.02318, 0.02345, 0.0231, 0.02329, 0.0234, 0.02416, 0.02352, 0.0233, 0.02333, 0.02358, 0.02304, 0.0234, 0.02373, 0.02367, 0.02364, 0.02394, 0.02331, 0.02361, 0.02549, 0.02611, 0.02307, 0.02307, 0.02339, 0.02305, 0.02337, 0.02343, 0.02331, 0.02306, 0.02371, 0.02326, 0.02401, 0.02338, 0.02329, 0.02355, 0.02339, 0.02318, 0.02379, 0.02372, 0.02332, 0.02367, 0.02321, 0.02384, 0.0232, 0.02419, 0.02337, 0.02355, 0.0235, 0.02303, 0.02314, 0.02384, 0.02385, 0.02327]}, "forward-send-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [2.86591, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00011, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00014, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00013, 0.00012, 0.00013, 0.00013, 0.00013, 0.00013, 0.00012, 0.00012, 0.00015, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00013, 0.00012, 0.00012, 0.00013, 0.00013, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00013, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00013, 0.00012, 0.00011, 0.00013, 0.00012, 0.00013, 0.00013, 0.00013, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00013, 0.00012, 0.00012, 0.00013, 0.00012, 0.00013, 0.00013, 0.00013, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00013, 0.00014, 0.00012, 0.00013, 0.00013, 0.00013, 0.00012, 0.00013, 0.00012, 0.00012, 0.00013, 0.00013, 0.00012, 0.00012, 0.00014, 0.00012, 0.00013, 0.00013, 0.00012, 0.00013, 0.00012, 0.00014, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00013, 0.00012, 0.00012, 0.00012, 0.00013, 0.00013, 0.00012, 0.00013, 0.00012, 0.00012, 0.00013, 0.00012, 0.00013, 0.00013, 0.00012, 0.00012, 0.00016, 0.00013, 0.00013, 0.00012, 0.00012, 0.00013, 0.00013, 0.0001, 0.00013, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00013, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00013, 0.00013, 0.00013, 0.00012, 0.00012, 0.00019, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00013, 0.00014, 0.00012, 0.00013, 0.00012, 0.00013, 0.00013, 0.00012, 0.00013, 0.00012, 
0.00013, 0.00013, 0.00013, 0.00013, 0.00013, 0.00012, 0.00012, 0.00014, 0.00012, 0.00012, 0.00013, 0.00013, 0.00021, 0.00017, 0.00013, 0.00016, 0.00019, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00013, 0.00013, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00015, 0.00017, 0.00012, 0.00012, 0.00013, 0.00012, 0.00011, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00011, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00013, 0.00014, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00014, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00016, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00013, 0.00013, 0.00012]}, "backward-recv-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.02324, 0.02473, 0.02485, 0.0257, 0.02421, 0.02511, 0.02424, 0.02512, 0.02482, 0.02484, 0.02503, 0.02501, 0.02497, 0.02408, 0.02453, 0.02476, 0.02472, 0.0245, 0.02469, 0.0238, 0.02472, 0.02383, 0.02443, 0.02414, 0.02458, 0.02427, 0.02418, 0.02518, 0.02515, 0.02471, 0.02487, 0.02507, 0.0252, 0.04234, 0.02563, 0.02482, 0.02527, 0.0252, 0.02511, 0.02616, 0.02552, 0.02553, 0.02507, 0.0247, 0.02488, 0.02838, 0.02802, 0.0284, 0.02834, 0.02994, 0.02821, 0.02845, 0.02966, 0.02456, 0.02638, 0.02786, 0.02477, 0.02529, 0.02816, 0.0278, 0.024, 0.02485, 0.02472, 0.02443, 0.02679, 0.02889, 0.02923, 0.02446, 0.02467, 0.02491, 0.02448, 0.02524, 0.0247, 0.02381, 0.02482, 0.02267, 0.02554, 0.02506, 0.02479, 0.02511, 0.02493, 0.02473, 0.02445, 0.02465, 0.02466, 0.02435, 0.02438, 0.02454, 0.02703, 0.02859, 0.02838, 0.02463, 0.02457, 0.02449, 0.02484, 0.02427, 0.02489, 0.02919, 0.02783, 0.02446, 0.02864, 0.02839, 0.02885, 0.02916, 0.02535, 0.02922, 0.02859, 0.02867, 0.02674, 0.02913, 0.02404, 0.02357, 0.02473, 0.02426, 0.0237, 0.02368, 0.02461, 0.02449, 0.02432, 0.02416, 0.02668, 0.0259, 0.02394, 0.02449, 0.0245, 0.02639, 0.02567, 0.02428, 0.02416, 0.0239, 0.0246, 0.0245, 0.02396, 0.02903, 0.02872, 0.02891, 0.0242, 0.0248, 0.02619, 0.02586, 0.02476, 0.02646, 0.02366, 0.02382, 0.02621, 0.02353, 0.02399, 0.02459, 0.02528, 0.02408, 0.0246, 0.02424, 0.028, 0.02928, 0.02952, 0.02881, 0.02431, 0.02457, 0.02417, 0.02444, 0.02498, 0.02401, 0.02303, 0.02437, 0.02609, 0.02618, 0.0244, 0.02636, 0.02449, 0.02888, 0.0291, 0.02963, 0.02433, 0.02789, 0.03263, 0.03258, 0.02856, 0.02595, 0.02508, 0.02561, 0.02568, 0.02893, 0.02364, 0.02454, 0.02431, 0.02431, 0.02435, 0.02361, 0.02447, 0.02415, 0.02557, 0.02442, 0.02388, 0.02473, 0.02836, 0.02932, 0.02902, 0.02464, 0.02588, 0.02525, 0.02855, 0.02485, 0.03232, 0.02798, 0.02376, 0.02448, 0.02369, 0.02397, 0.02417, 0.02554, 0.02412, 0.02385, 0.02386, 0.02939, 0.02461, 0.02396, 0.02522, 0.02468, 0.02408, 0.02344, 0.02381, 0.02444, 0.02442, 0.02457, 0.02446, 0.02491, 0.02474, 0.02468, 0.02463, 0.02469, 0.02618, 0.02458, 0.0243, 0.02465, 0.02436, 0.0246, 0.02381, 0.02431, 0.02492, 0.02438, 0.0239, 0.02778, 0.03263, 0.03015, 0.02489, 0.02497, 
0.02827, 0.02851, 0.02831, 0.02923, 0.02893, 0.02474, 0.02501, 0.02434, 0.02523, 0.02437, 0.02557, 0.02446, 0.02462, 0.02479, 0.02496, 0.02454, 0.02469, 0.02509, 0.02486, 0.02485, 0.02426, 0.02434, 0.025, 0.02506, 0.02464, 0.02457, 0.02548, 0.0244, 0.025, 0.02478, 0.0246, 0.025, 0.02481, 0.02465, 0.02469, 0.02502, 0.02443, 0.02451, 0.025, 0.02468, 0.02437, 0.02501, 0.02475, 0.02536, 0.02455, 0.02462, 0.02512, 0.02448, 0.0247, 0.02447, 0.02432, 0.02473, 0.02472, 0.02439, 0.02441, 0.02485, 0.02461, 0.02454, 0.02434, 0.02462, 0.02469, 0.02464, 0.02438, 0.02452, 0.02463, 0.02444, 0.02442, 0.02471, 0.02629, 0.02488, 0.02491, 0.02465, 0.02437, 0.02469, 0.02484, 0.02511, 0.02481, 0.02578, 0.02498, 0.02521, 0.02506, 0.02571, 0.02539, 0.02521, 0.02412, 0.0257, 0.02473, 0.02452, 0.02527, 0.0256, 0.02517, 0.02489, 0.0251, 0.02453, 0.02495, 0.02483, 0.02495, 0.02445, 0.02472, 0.02508, 0.02487, 0.02471, 0.02495, 0.02544, 0.02447, 0.025, 0.02531, 0.02509, 0.02923, 0.02837, 0.02804, 0.02863, 0.03514, 0.02454, 0.02525, 0.02518, 0.02502, 0.02481, 0.02521, 0.02523, 0.02482, 0.02487, 0.02487, 0.02585, 0.02467, 0.02474, 0.02498, 0.02461, 0.02536, 0.02543, 0.02452, 0.02512, 0.02501, 0.02421, 0.02508, 0.02507, 0.02588, 0.02699, 0.02457, 0.02568, 0.0256, 0.02542, 0.02475, 0.02461, 0.02444, 0.0296, 0.02899, 0.02863, 0.02732, 0.02767, 0.02899, 0.02482, 0.02467, 0.02404]}, "backward-send-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.00015, 0.00019, 0.00016, 0.00019, 0.00017, 0.00018, 0.00018, 0.00018, 0.00018, 0.00018, 0.00018, 0.00018, 0.00018, 0.00017, 0.00023, 0.00016, 0.00017, 0.00017, 0.00018, 0.00019, 0.00018, 0.00018, 0.00017, 0.00018, 0.00016, 0.00018, 0.00016, 0.00018, 0.00016, 0.00016, 0.00018, 0.00016, 0.00017, 0.00016, 0.00016, 0.00018, 0.00016, 0.00016, 0.00016, 0.00016, 0.00016, 0.00016, 0.00016, 0.00017, 0.00018, 0.0002, 0.00016, 0.00019, 0.00017, 0.00021, 0.00016, 0.00018, 0.00019, 0.00016, 0.00017, 0.00017, 0.00018, 0.0002, 0.00016, 0.00016, 0.00016, 0.00016, 0.00019, 0.00017, 0.00017, 0.00018, 0.00019, 0.00017, 0.00019, 0.00016, 0.00017, 0.00018, 0.00017, 0.00019, 0.00016, 0.00018, 0.00018, 0.00018, 0.00016, 0.00018, 0.00018, 0.00017, 0.00016, 0.00018, 0.00016, 0.00018, 0.00017, 0.00018, 0.00018, 0.00018, 0.00019, 0.00018, 0.00017, 0.00018, 0.00018, 0.00016, 0.00018, 0.00016, 0.00024, 0.00017, 0.00018, 0.00016, 0.00016, 0.00019, 0.00019, 0.00018, 0.00026, 0.00017, 0.00018, 0.00018, 0.00019, 0.00018, 0.00018, 0.00018, 0.00016, 0.00018, 0.00018, 0.00017, 0.00018, 0.00017, 0.00016, 0.00016, 0.00018, 0.00019, 0.00017, 0.00018, 0.00018, 0.00018, 0.00018, 0.00017, 0.00018, 0.00019, 0.00018, 0.00018, 0.00018, 0.00021, 0.00016, 0.00018, 0.00019, 0.00022, 0.00017, 0.00016, 0.00017, 0.00017, 0.00018, 0.00018, 0.00018, 0.00018, 0.00016, 0.00018, 0.00018, 0.00018, 0.00018, 0.00018, 0.00018, 0.00018, 0.00018, 0.00018, 0.00016, 0.00019, 0.00018, 0.00018, 0.00017, 0.00019, 0.00017, 0.00019, 0.00018, 0.00018, 0.00016, 0.00017, 0.00016, 0.00016, 0.00018, 0.00017, 0.00016, 0.00029, 0.00017, 0.00019, 0.0002, 0.00016, 0.00019, 0.00032, 0.00019, 0.00016, 0.00018, 0.00018, 0.00016, 0.00018, 0.00023, 0.00018, 0.00018, 0.00018, 0.00017, 0.00019, 0.00018, 0.00016, 0.00018, 0.00017, 0.00018, 0.00018, 0.00018, 0.00019, 0.00018, 0.0002, 0.00016, 0.0002, 0.00018, 0.00018, 0.00018, 0.00016, 0.00018, 0.00016, 0.00016, 0.00018, 0.00016, 0.00017, 0.00019, 0.00018, 0.00016, 0.00019, 0.00022, 0.00016, 0.00018, 0.00018, 0.00018, 0.00018, 0.00018, 0.00018, 0.00018, 0.00017, 0.00016, 0.00019, 0.00018, 
0.00018, 0.00016, 0.00018, 0.00019, 0.00016, 0.00018, 0.00016, 0.00017, 0.00017, 0.00026, 0.00016, 0.00016, 0.00019, 0.00018, 0.00016, 0.00018, 0.00018, 0.00016, 0.00018, 0.00016, 0.00016, 0.00018, 0.00018, 0.00018, 0.00016, 0.00017, 0.00017, 0.00016, 0.00019, 0.00018, 0.00017, 0.00016, 0.00018, 0.00016, 0.00016, 0.00016, 0.00018, 0.00016, 0.00019, 0.00019, 0.00017, 0.00018, 0.00018, 0.00018, 0.00018, 0.00018, 0.00018, 0.00019, 0.00016, 0.00018, 0.00018, 0.00022, 0.00016, 0.00018, 0.00018, 0.00017, 0.00016, 0.00018, 0.00018, 0.00016, 0.00018, 0.00018, 0.00016, 0.00018, 0.00018, 0.00016, 0.00016, 0.00018, 0.00018, 0.00018, 0.00018, 0.00016, 0.00019, 0.00016, 0.00018, 0.00017, 0.00017, 0.00016, 0.00018, 0.00018, 0.00018, 0.00019, 0.00018, 0.00016, 0.00016, 0.00017, 0.00016, 0.00018, 0.00016, 0.00017, 0.00019, 0.00017, 0.00018, 0.00019, 0.00019, 0.00018, 0.00016, 0.00016, 0.00017, 0.00018, 0.00016, 0.00019, 0.00016, 0.00016, 0.00016, 0.00016, 0.00016, 0.00019, 0.00016, 0.00018, 0.00018, 0.00018, 0.00018, 0.00018, 0.00018, 0.00016, 0.00016, 0.0002, 0.00018, 0.00018, 0.00019, 0.00019, 0.00018, 0.00018, 0.0003, 0.00016, 0.00018, 0.00018, 0.00016, 0.00019, 0.00018, 0.00019, 0.00016, 0.00016, 0.00016, 0.00018, 0.00019, 0.00018, 0.00018, 0.00017, 0.00018, 0.00019, 0.00017, 0.00016, 0.00018, 0.00017, 0.00018, 0.00018, 0.00017, 0.00018, 0.00016, 0.00016, 0.00018, 0.00019, 0.00017, 0.00018, 0.00018, 0.00017, 0.00016, 0.00035, 0.00022, 0.00019, 0.00018, 0.00018, 0.00017, 0.00016, 0.00017]}, "forward-send-backward-recv-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [7.52895, 0.10767, 0.10288, 0.12221, 0.10839, 0.10916, 0.11683, 0.11949, 0.11244, 0.10662, 0.11634, 0.12145, 0.11448, 0.10239, 0.10115, 0.10144, 0.10622, 0.1006, 0.1586, 0.10078, 0.09436, 0.10994, 0.11246, 0.10473, 0.11165, 0.11062, 0.10864, 0.10698, 0.11094, 0.1123, 0.11651, 0.11274, 0.11336, 0.17984, 0.1238, 0.12939, 0.27709, 0.1391, 0.13093, 0.12511, 0.13066, 0.1225, 0.11928, 0.11852, 0.12105, 0.1235, 0.12183, 0.11095, 0.20461, 0.11574, 0.12325, 0.12774, 0.1342, 0.12396, 0.11854, 0.1264, 0.11539, 0.11273, 0.1179, 0.13162, 0.11525, 0.13348, 0.13, 0.12472, 0.13424, 0.1156, 0.11969, 0.21123, 0.12519, 0.12897, 0.136, 0.13444, 0.12965, 0.12283, 0.13807, 0.13035, 0.12784, 0.13095, 0.12328, 0.12278, 0.1242, 0.13846, 0.1251, 0.11622, 0.12258, 0.12174, 0.12831, 0.12841, 0.12632, 0.11745, 0.12732, 0.12029, 0.13155, 0.12567, 0.11834, 0.12549, 0.12416, 0.12349, 0.11452, 0.20614, 0.12415, 0.11944, 0.12148, 0.11366, 0.12373, 0.12834, 0.11722, 0.11892, 0.11557, 0.12715, 0.12886, 0.12057, 0.12682, 0.12601, 0.13364, 0.12815, 0.12626, 0.1317, 0.12917, 0.12301, 0.12818, 0.12239, 0.12231, 0.12391, 0.12264, 0.1209, 0.12986, 0.12429, 0.11971, 0.12228, 0.12907, 0.12399, 0.12889, 0.11751, 0.11734, 0.11985, 0.12419, 0.11939, 0.12896, 0.13183, 0.13356, 0.12001, 0.12131, 0.11604, 0.11794, 0.12429, 0.1355, 0.12631, 0.13817, 0.12757, 0.12565, 0.12479, 0.12459, 0.11863, 0.12603, 0.11965, 0.11957, 0.11941, 0.12277, 0.12152, 0.13238, 0.12899, 0.12039, 0.12936, 0.12185, 0.12027, 0.11834, 0.12565, 0.12003, 0.12064, 0.11734, 0.11796, 0.11982, 0.11829, 0.11018, 0.11427, 0.10291, 0.11078, 0.11775, 0.12251, 0.11736, 0.12288, 0.11757, 0.10965, 0.1101, 0.1111, 0.10524, 0.11035, 0.1194, 0.10687, 0.1104, 0.1029, 0.11414, 0.11835, 0.11073, 0.10671, 0.11471, 0.11713, 0.11142, 0.11427, 0.10551, 0.11576, 0.10811, 0.12352, 0.11089, 0.10827, 0.11418, 0.11243, 0.11291, 0.10774, 0.10575, 0.10895, 0.11133, 0.10168, 0.11589, 0.11188, 0.11403, 0.12083, 0.12527, 
0.20209, 0.12301, 0.12835, 0.1167, 0.12035, 0.12158, 0.11749, 0.11785, 0.11663, 0.11859, 0.11189, 0.11229, 0.11518, 0.1205, 0.11283, 0.11679, 0.11705, 0.11627, 0.12181, 0.12372, 0.12191, 0.12006, 0.1168, 0.12252, 0.11718, 0.12814, 0.12688, 0.12696, 0.12607, 0.12079, 0.13508, 0.13166, 0.13101, 0.12769, 0.12321, 0.12875, 0.12726, 0.12271, 0.12496, 0.13106, 0.12712, 0.12831, 0.11758, 0.13314, 0.13148, 0.13269, 0.13383, 0.1235, 0.1316, 0.14168, 0.13684, 0.12388, 0.11908, 0.12703, 0.12329, 0.12975, 0.12484, 0.11743, 0.13142, 0.12276, 0.12584, 0.12278, 0.12351, 0.12006, 0.1275, 0.12997, 0.12275, 0.12374, 0.1258, 0.12674, 0.1382, 0.11985, 0.12902, 0.11699, 0.12694, 0.12671, 0.12528, 0.12577, 0.12335, 0.12793, 0.12913, 0.12309, 0.13132, 0.12457, 0.12253, 0.11803, 0.11645, 0.12181, 0.12507, 0.12528, 0.12214, 0.12812, 0.12471, 0.11918, 0.12456, 0.12769, 0.12304, 0.12153, 0.11907, 0.13148, 0.13103, 0.13068, 0.13318, 0.12552, 0.12933, 0.13261, 0.12839, 0.13023, 0.12205, 0.12863, 0.12765, 0.12548, 0.12592, 0.12495, 0.12574, 0.12193, 0.12065, 0.12433, 0.12257, 0.11243, 0.11188, 0.11552, 0.11773, 0.11637, 0.1131, 0.11535, 0.11323, 0.11728, 0.11383, 0.11656, 0.18458, 0.11533, 0.1158, 0.11306, 0.12884, 0.12649, 0.12032, 0.11208, 0.11803, 0.13436, 0.14069, 0.12596, 0.12808, 0.12036, 0.127, 0.12774, 0.12746, 0.13166, 0.1288, 0.11946, 0.12914, 0.12045, 0.1215, 0.117, 0.11498, 0.11583, 0.11774, 0.12264, 0.12134, 0.12257, 0.12649, 0.1233, 0.12733, 0.11514, 0.12185, 0.12051, 0.13736, 0.13171, 0.13031, 0.11491, 0.11951, 0.10565, 0.11503, 0.1165, 0.11394, 0.11312, 0.11865, 0.11953, 0.12351, 0.12231, 0.12042]}, "backward-send-forward-recv-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [3.33774, 0.00722, 0.00727, 0.01025, 0.00728, 0.00714, 0.00814, 0.00897, 0.00966, 0.00746, 0.00801, 0.00911, 0.00716, 0.01132, 0.00906, 0.00969, 0.00832, 0.01171, 0.00765, 0.00889, 0.00886, 0.01056, 0.00822, 0.01186, 0.00789, 0.00921, 0.01483, 0.01149, 0.00732, 0.00899, 0.00802, 0.00967, 0.01211, 0.00836, 0.00778, 0.0097, 0.00744, 0.00738, 0.00799, 0.00783, 0.00895, 0.00733, 0.00808, 0.00821, 0.00953, 0.00947, 0.00803, 0.00716, 0.0083, 0.01092, 0.01169, 0.01197, 0.01099, 0.0139, 0.01319, 0.01223, 0.00743, 0.01124, 0.01269, 0.01365, 0.01106, 0.01186, 0.01247, 0.01377, 0.01372, 0.00895, 0.00817, 0.0122, 0.00886, 0.01409, 0.01218, 0.0116, 0.01184, 0.01054, 0.0083, 0.01112, 0.01398, 0.01443, 0.01304, 0.01159, 0.01508, 0.01227, 0.01243, 0.00996, 0.01336, 0.0103, 0.0121, 0.00939, 0.01351, 0.0109, 0.0119, 0.00743, 0.01152, 0.01082, 0.0077, 0.013, 0.00863, 0.01128, 0.00747, 0.10318, 0.00737, 0.01277, 0.0074, 0.00766, 0.00929, 0.00731, 0.00777, 0.00773, 0.01305, 0.01203, 0.01277, 0.01218, 0.01038, 0.01189, 0.01149, 0.01182, 0.01209, 0.0087, 0.01115, 0.0143, 0.01389, 0.01471, 0.01226, 0.01046, 0.01269, 0.01445, 0.0131, 0.01159, 0.01285, 0.01374, 0.01248, 0.01373, 0.01412, 0.01487, 0.01463, 0.0142, 0.01491, 0.01425, 0.01332, 0.01294, 0.01394, 0.01396, 0.01223, 0.01179, 0.01522, 0.01396, 0.01383, 0.01262, 0.0137, 0.01453, 0.01605, 0.01203, 0.01365, 0.01102, 0.01296, 0.01149, 0.01352, 0.0141, 0.01337, 0.01015, 0.01142, 0.01244, 0.01056, 0.01302, 0.0136, 0.01251, 0.014, 0.01398, 0.01294, 0.01334, 0.01177, 0.01235, 0.01091, 0.01036, 0.01476, 0.01084, 0.01117, 0.01139, 0.01169, 0.01222, 0.01155, 0.0115, 0.01538, 0.01662, 0.01196, 0.01265, 0.01353, 0.0155, 0.01451, 0.01302, 0.01135, 0.01115, 0.01301, 0.01401, 0.01239, 0.01337, 0.0134, 0.01449, 0.01454, 0.01499, 0.02199, 0.01511, 0.01449, 0.01437, 0.01499, 0.01473, 0.01696, 
0.01373, 0.01165, 0.01224, 0.01255, 0.01026, 0.01816, 0.01732, 0.01392, 0.01205, 0.01326, 0.012, 0.0125, 0.09407, 0.01373, 0.01234, 0.01352, 0.01298, 0.01393, 0.01293, 0.01272, 0.01269, 0.00988, 0.01398, 0.01371, 0.01512, 0.00926, 0.01203, 0.00886, 0.01072, 0.01094, 0.01129, 0.01236, 0.01167, 0.01127, 0.0134, 0.01164, 0.01227, 0.01086, 0.01128, 0.01424, 0.01338, 0.01286, 0.01139, 0.0124, 0.01253, 0.01306, 0.0104, 0.01044, 0.00925, 0.01349, 0.0106, 0.01304, 0.013, 0.01652, 0.01247, 0.01259, 0.01119, 0.01241, 0.01609, 0.01301, 0.01673, 0.01245, 0.01358, 0.01293, 0.01395, 0.01222, 0.01281, 0.01194, 0.01332, 0.01097, 0.01369, 0.01398, 0.0117, 0.01357, 0.0128, 0.01277, 0.01159, 0.01226, 0.01271, 0.0131, 0.01357, 0.0123, 0.01025, 0.01114, 0.01335, 0.01274, 0.00948, 0.01342, 0.01348, 0.01171, 0.01274, 0.01313, 0.01262, 0.01167, 0.00993, 0.01158, 0.0107, 0.01309, 0.01347, 0.015, 0.01426, 0.01127, 0.01224, 0.0128, 0.01251, 0.01492, 0.01369, 0.01553, 0.01256, 0.01398, 0.01419, 0.01663, 0.01442, 0.01314, 0.01126, 0.01132, 0.01161, 0.01215, 0.01208, 0.01721, 0.01103, 0.01311, 0.00802, 0.01029, 0.01351, 0.00888, 0.01039, 0.00882, 0.00933, 0.00881, 0.00926, 0.01082, 0.01021, 0.00961, 0.01001, 0.00836, 0.00918, 0.01044, 0.01016, 0.00966, 0.00991, 0.01218, 0.07892, 0.00899, 0.01009, 0.01201, 0.00867, 0.01068, 0.01049, 0.01158, 0.01334, 0.0109, 0.01304, 0.00961, 0.01538, 0.01469, 0.01646, 0.00905, 0.01059, 0.01386, 0.01332, 0.01461, 0.01223, 0.01253, 0.0166, 0.01015, 0.01471, 0.01602, 0.01097, 0.01225, 0.01068, 0.01085, 0.01135, 0.00802, 0.00878, 0.01148, 0.01009, 0.00941, 0.00919, 0.01177, 0.00968, 0.01046, 0.00955, 0.01107, 0.00923, 0.00916, 0.00864, 0.01069, 0.01075, 0.00939, 0.01202, 0.00876, 0.01073]}, "layernorm-grads-all-reduce-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.0012, 0.00075, 0.00074, 0.00352, 0.00166, 0.00076, 0.00077, 0.00076, 0.00319, 0.00077, 0.00076, 0.00445, 0.00077, 0.00075, 0.00153, 0.00077, 0.00076, 0.00076, 0.00076, 0.00077, 0.00076, 0.00075, 0.00076, 0.00075, 0.00077, 0.00075, 0.00077, 0.00075, 0.00077, 0.00077, 0.00075, 0.00076, 0.00076, 0.00076, 0.00076, 0.00076, 0.00077, 0.00076, 0.00076, 0.00077, 0.00078, 0.00076, 0.00077, 0.00076, 0.00076, 0.00429, 0.00076, 0.00076, 0.00076, 0.00078, 0.00077, 0.00077, 0.00078, 0.00077, 0.0008, 0.00079, 0.00079, 0.00077, 0.00078, 0.00078, 0.00079, 0.00519, 0.00079, 0.00078, 0.00077, 0.00078, 0.00079, 0.00079, 0.00079, 0.00077, 0.00079, 0.00079, 0.00079, 0.00078, 0.00078, 0.00078, 0.00077, 0.00079, 0.00079, 0.00079, 0.00078, 0.00078, 0.00077, 0.00078, 0.00077, 0.00078, 0.00078, 0.00078, 0.00078, 0.00078, 0.00083, 0.00306, 0.00078, 0.00076, 0.00078, 0.00078, 0.00078, 0.00077, 0.00078, 0.0008, 0.00079, 0.00079, 0.00077, 0.00079, 0.00078, 0.00078, 0.00081, 0.00335, 0.00078, 0.00079, 0.0008, 0.00078, 0.00079, 0.00079, 0.00078, 0.00077, 0.00079, 0.00078, 0.00079, 0.0008, 0.00078, 0.00078, 0.00078, 0.00078, 0.00077, 0.00078, 0.00078, 0.00078, 0.00078, 0.00077, 0.00077, 0.00078, 0.00078, 0.00077, 0.00077, 0.00078, 0.00077, 0.00078, 0.00077, 0.00078, 0.00077, 0.00077, 0.00077, 0.00079, 0.00078, 0.00078, 0.00077, 0.00078, 0.00078, 0.00078, 0.00078, 0.00077, 0.00077, 0.00078, 0.00078, 0.00079, 0.00086, 0.00079, 0.00078, 0.00079, 0.00078, 0.00078, 0.00078, 0.00078, 0.00078, 0.00078, 0.00078, 0.00077, 0.0008, 0.0008, 0.00079, 0.00078, 0.00079, 0.00078, 0.00078, 0.00082, 0.00081, 0.00083, 0.00078, 0.00077, 0.00079, 0.00082, 0.0008, 0.00077, 0.00076, 0.00077, 0.00078, 0.00077, 0.00078, 0.00077, 0.00077, 0.00077, 0.00078, 
0.00077, 0.00077, 0.00077, 0.00077, 0.00078, 0.00078, 0.00077, 0.00077, 0.00078, 0.00082, 0.00083, 0.00078, 0.00078, 0.00078, 0.00077, 0.00078, 0.00078, 0.00077, 0.00078, 0.00079, 0.00078, 0.00452, 0.00077, 0.00078, 0.00077, 0.00077, 0.0008, 0.00078, 0.00079, 0.00079, 0.00078, 0.00223, 0.00078, 0.00077, 0.00077, 0.00079, 0.00078, 0.00078, 0.00078, 0.00295, 0.00077, 0.00077, 0.00077, 0.00077, 0.00077, 0.00076, 0.00077, 0.0042, 0.00081, 0.00079, 0.00087, 0.00078, 0.00078, 0.00078, 0.00078, 0.00076, 0.00078, 0.0008, 0.00076, 0.00079, 0.00077, 0.00078, 0.00077, 0.00077, 0.00077, 0.00078, 0.00078, 0.00077, 0.00078, 0.00077, 0.00077, 0.00078, 0.00077, 0.00078, 0.00078, 0.00078, 0.00078, 0.00076, 0.00076, 0.00077, 0.00077, 0.00077, 0.00077, 0.00078, 0.00079, 0.00085, 0.00078, 0.00078, 0.00077, 0.00079, 0.00079, 0.00079, 0.00077, 0.00078, 0.00078, 0.00078, 0.00078, 0.00078, 0.00077, 0.00078, 0.00078, 0.00077, 0.00078, 0.00077, 0.00077, 0.00077, 0.00079, 0.00079, 0.00078, 0.00078, 0.00077, 0.00078, 0.00078, 0.00078, 0.00079, 0.00077, 0.00078, 0.00078, 0.00077, 0.00077, 0.00078, 0.00077, 0.00077, 0.00079, 0.00079, 0.00077, 0.00077, 0.00078, 0.00077, 0.00078, 0.00078, 0.00078, 0.00079, 0.00078, 0.00078, 0.00077, 0.00078, 0.00078, 0.00077, 0.00079, 0.00078, 0.00077, 0.00079, 0.00078, 0.00078, 0.00077, 0.00077, 0.0008, 0.00078, 0.00078, 0.00079, 0.00077, 0.00079, 0.00077, 0.00077, 0.00077, 0.00079, 0.00078, 0.00078, 0.00078, 0.00083, 0.0009, 0.00079, 0.00082, 0.0008, 0.0008, 0.00078, 0.00077, 0.00077, 0.00078, 0.00078, 0.00079, 0.00078, 0.00077, 0.00078, 0.00078, 0.00078, 0.00078, 0.00078, 0.00077, 0.0008, 0.00079, 0.00078, 0.00078, 0.00078, 0.00078, 0.00077, 0.00078, 0.00078, 0.00078, 0.00078, 0.00078, 0.00078, 0.00077, 0.00084, 0.00077, 0.00077, 0.00077, 0.0008, 0.00078, 0.00078, 0.00077, 0.00078, 0.00153, 0.00078, 0.00078, 0.00076]}, "embedding-grads-all-reduce-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.00036, 0.00032, 0.00032, 0.00032, 0.00031, 0.00032, 0.00033, 0.00032, 0.00032, 0.00031, 0.00032, 0.00034, 0.00032, 0.00031, 0.00037, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00031, 0.00032, 0.00031, 0.00032, 0.00032, 0.00032, 0.00031, 0.00032, 0.00034, 0.00032, 0.00033, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00031, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00031, 0.00032, 0.00032, 0.00032, 0.00032, 0.00034, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00031, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00031, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00033, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00033, 0.00032, 0.00032, 0.00032, 0.00033, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00033, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00031, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00031, 0.00032, 0.00031, 0.00033, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00033, 0.00032, 0.00032, 0.00032, 0.00031, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00031, 0.00032, 0.00032, 0.00031, 0.00033, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00031, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00031, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00031, 0.00032, 0.00032, 0.00032, 0.00032, 
0.00032, 0.00032, 0.00032, 0.00031, 0.00032, 0.00031, 0.00032, 0.00032, 0.00031, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00033, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00031, 0.00032, 0.00031, 0.00032, 0.00032, 0.00032, 0.00032, 0.00031, 0.00032, 0.00032, 0.00031, 0.00031, 0.00032, 0.00032, 0.00032, 0.00031, 0.00034, 0.00032, 0.00032, 0.00032, 0.00031, 0.00032, 0.00032, 0.00031, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00031, 0.00031, 0.00032, 0.00031, 0.00031, 0.00031, 0.00032, 0.00032, 0.00032, 0.00032, 0.00031, 0.00034, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00031, 0.00034, 0.00032, 0.00032, 0.00031, 0.00032, 0.00032, 0.00031, 0.00032, 0.00032, 0.00032, 0.00031, 0.00032, 0.00032, 0.00032, 0.00031, 0.00032, 0.00031, 0.00032, 0.00031, 0.00032, 0.00032, 0.00031, 0.00032, 0.00032, 0.00032, 0.00032, 0.00031, 0.00032, 0.00032, 0.00031, 0.00032, 0.00032, 0.00031, 0.00032, 0.00031, 0.00032, 0.00031, 0.00032, 0.00032, 0.00031, 0.00032, 0.00032, 0.00031, 0.00032, 0.00031, 0.00031, 0.00032, 0.00031, 0.00031, 0.00032, 0.00032, 0.00032, 0.00032, 0.00031, 0.00032, 0.00031, 0.00032, 0.00032, 0.00031, 0.00031, 0.00032, 0.00032, 0.00032, 0.00032, 0.00033, 0.00031, 0.00032, 0.00031, 0.00031, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00031, 0.00032, 0.00032, 0.00031, 0.00032, 0.00032, 0.00032, 0.00032, 0.00031, 0.00032, 0.00032, 0.00031, 0.00032, 0.00032, 0.00032, 0.00031, 0.00031, 0.00032, 0.00032, 0.00032, 0.00031, 0.00032, 0.00032, 0.00032, 0.00031, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00033, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00033, 0.00032, 0.00032, 0.00031, 0.00032, 0.00031, 0.00032, 0.00032, 0.00031, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00031, 0.00032, 0.00032, 0.00032, 0.00031, 0.00032, 0.00031, 0.00031, 0.00031, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00032, 0.00031]}, "all-grads-sync-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [2.22391, 0.00071, 0.00073, 0.0009, 0.00073, 0.00075, 0.00074, 0.00093, 0.00097, 0.00072, 0.00071, 0.00084, 0.00088, 0.00075, 0.00086, 0.00072, 0.00072, 0.00071, 0.00072, 0.00073, 0.00072, 0.00072, 0.00073, 0.00073, 0.00072, 0.00072, 0.00072, 0.00072, 0.00071, 0.0007, 0.00072, 0.00071, 0.00072, 0.00072, 0.00071, 0.00071, 0.00074, 0.00072, 0.00074, 0.00073, 0.00073, 0.00075, 0.00074, 0.00072, 0.00072, 0.00073, 0.0009, 0.00081, 0.00071, 0.00073, 0.00073, 0.00071, 0.00074, 0.00084, 0.00072, 0.00072, 0.00083, 0.00072, 0.00073, 0.00072, 0.0009, 0.00072, 0.00072, 0.00072, 0.00074, 0.00072, 0.00073, 0.00073, 0.00073, 0.00072, 0.00074, 0.00075, 0.00072, 0.00073, 0.00073, 0.00072, 0.00073, 0.00074, 0.00073, 0.00072, 0.00073, 0.00074, 0.00073, 0.00074, 0.00073, 0.00073, 0.00073, 0.00072, 0.00072, 0.00071, 0.00074, 0.00093, 0.00074, 0.00072, 0.00072, 0.00072, 0.00072, 0.00069, 0.00084, 0.00071, 0.00073, 0.00073, 0.0008, 0.00086, 0.00098, 0.00092, 0.00099, 0.00087, 0.00096, 0.00093, 0.00073, 0.00074, 0.00072, 0.00072, 0.00072, 0.00074, 0.00072, 0.00072, 0.00072, 0.00073, 0.00073, 0.00073, 0.00072, 0.00073, 0.00072, 0.00073, 0.00073, 0.00072, 0.00073, 0.00077, 0.00075, 0.00074, 0.00087, 0.00072, 0.00073, 0.00072, 0.00073, 0.00082, 0.00081, 0.00074, 0.00074, 0.00073, 0.00072, 0.00072, 0.00074, 0.00073, 0.00071, 0.00075, 0.00076, 0.00072, 0.00085, 0.00072, 0.00073, 0.00072, 0.00074, 0.00082, 0.00097, 0.00073, 0.00072, 0.00072, 0.00073, 0.00073, 
0.00073, 0.00072, 0.00072, 0.00073, 0.00073, 0.00073, 0.00077, 0.00072, 0.00073, 0.00086, 0.00087, 0.00073, 0.00093, 0.00084, 0.00097, 0.00089, 0.00074, 0.00074, 0.00087, 0.00093, 0.00087, 0.00073, 0.00072, 0.00074, 0.00072, 0.00074, 0.00074, 0.00074, 0.00073, 0.00072, 0.00093, 0.00074, 0.00073, 0.00075, 0.00085, 0.00073, 0.00072, 0.00072, 0.00073, 0.00092, 0.00074, 0.00088, 0.00073, 0.00074, 0.00073, 0.00073, 0.00072, 0.00072, 0.00075, 0.00073, 0.00072, 0.00081, 0.00073, 0.00073, 0.00071, 0.00072, 0.00071, 0.00071, 0.00072, 0.00074, 0.00072, 0.00073, 0.00093, 0.00072, 0.00074, 0.00072, 0.00073, 0.00071, 0.00074, 0.00074, 0.00087, 0.00086, 0.00072, 0.00072, 0.00074, 0.00072, 0.00074, 0.00072, 0.00079, 0.00095, 0.00083, 0.00071, 0.00093, 0.00088, 0.00072, 0.00072, 0.00073, 0.00071, 0.00075, 0.00091, 0.00072, 0.00071, 0.00072, 0.00073, 0.0007, 0.00072, 0.00074, 0.00072, 0.00074, 0.00073, 0.00075, 0.00073, 0.00073, 0.00072, 0.00073, 0.00073, 0.00071, 0.00074, 0.00072, 0.00071, 0.00071, 0.00073, 0.00072, 0.00073, 0.00073, 0.00071, 0.00074, 0.00072, 0.00073, 0.00073, 0.0007, 0.00072, 0.00072, 0.00072, 0.00073, 0.00074, 0.00072, 0.00074, 0.00073, 0.00073, 0.00074, 0.0007, 0.00072, 0.00072, 0.00073, 0.00074, 0.00071, 0.00073, 0.00072, 0.00071, 0.00073, 0.00071, 0.00073, 0.00072, 0.00074, 0.00071, 0.00073, 0.00071, 0.00073, 0.00073, 0.00071, 0.0007, 0.00072, 0.00072, 0.00073, 0.00072, 0.00071, 0.00072, 0.00073, 0.00074, 0.00071, 0.00074, 0.00071, 0.00073, 0.00072, 0.00073, 0.00073, 0.00071, 0.00073, 0.00072, 0.00073, 0.00074, 0.00074, 0.00071, 0.00072, 0.00072, 0.00074, 0.00072, 0.00073, 0.00072, 0.00074, 0.00072, 0.00073, 0.00073, 0.00073, 0.00073, 0.00074, 0.00074, 0.00075, 0.00072, 0.00073, 0.00097, 0.00103, 0.00091, 0.00097, 0.00092, 0.00088, 0.00072, 0.00071, 0.00073, 0.00074, 0.00073, 0.00075, 0.0007, 0.00072, 0.00072, 0.00072, 0.00071, 0.00073, 0.00072, 0.00074, 0.00072, 0.00073, 0.00074, 0.00073, 0.00074, 0.00073, 0.00072, 0.00073, 0.00074, 0.00074, 0.00072, 0.00075, 0.0007, 0.00072, 0.00076, 0.00073, 0.00072, 0.00072, 0.00094, 0.00082, 0.00087, 0.00071, 0.00071, 0.00096, 0.00083, 0.00089, 0.00089]}, "params-all-gather-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.00024, 0.00025, 0.00024, 0.00043, 0.00027, 0.00024, 0.00024, 0.00024, 0.00035, 0.00024, 0.00024, 0.0004, 0.00025, 0.00024, 0.0003, 0.00025, 0.00024, 0.00024, 0.00024, 0.00025, 0.00024, 0.00025, 0.00024, 0.00024, 0.00024, 0.00024, 0.00024, 0.00024, 0.00024, 0.00024, 0.00024, 0.00024, 0.00025, 0.00025, 0.00026, 0.00024, 0.00024, 0.00025, 0.00024, 0.00025, 0.00025, 0.00024, 0.00024, 0.00024, 0.00024, 0.0003, 0.00025, 0.00025, 0.00025, 0.00025, 0.00042, 0.00025, 0.00027, 0.00025, 0.00048, 0.00025, 0.00026, 0.00025, 0.00025, 0.00026, 0.00026, 0.00056, 0.00026, 0.00043, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00024, 0.00025, 0.00026, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00024, 0.00025, 0.00024, 0.00025, 0.00025, 0.00024, 0.00025, 0.00025, 0.00033, 0.00025, 0.00026, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00026, 0.00025, 0.00025, 0.00025, 0.00025, 0.00028, 0.00043, 0.00026, 0.00034, 0.0003, 0.00025, 0.0003, 0.00024, 0.00025, 0.00026, 0.00026, 0.00024, 0.00025, 0.00024, 0.00025, 0.00025, 0.00024, 0.00025, 0.00025, 0.00024, 0.00026, 0.00024, 0.00024, 0.00025, 0.00025, 0.00025, 0.00024, 0.00025, 0.00025, 0.00024, 0.00024, 0.00025, 0.00025, 0.00025, 0.00024, 0.00025, 0.00024, 0.00025, 0.00026, 0.00024, 0.00024, 0.00025, 0.00028, 
0.00025, 0.00025, 0.00025, 0.00025, 0.00028, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00026, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00027, 0.00025, 0.00025, 0.00026, 0.00026, 0.00027, 0.00025, 0.00026, 0.00025, 0.00026, 0.00046, 0.00025, 0.00025, 0.00025, 0.00025, 0.00045, 0.00025, 0.00025, 0.00025, 0.00024, 0.00025, 0.00025, 0.00024, 0.00027, 0.00025, 0.00025, 0.00025, 0.00025, 0.00024, 0.00025, 0.00025, 0.00025, 0.00024, 0.00024, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00026, 0.00024, 0.00025, 0.00025, 0.00025, 0.00025, 0.00024, 0.00024, 0.00043, 0.00024, 0.00025, 0.00025, 0.00025, 0.00024, 0.00025, 0.00025, 0.00025, 0.00025, 0.00032, 0.0005, 0.00025, 0.00024, 0.0005, 0.00038, 0.00024, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00042, 0.00025, 0.0004, 0.00026, 0.00025, 0.00025, 0.00025, 0.00025, 0.00026, 0.00025, 0.00027, 0.00025, 0.00026, 0.00025, 0.00026, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00026, 0.00025, 0.00026, 0.00026, 0.00025, 0.00025, 0.00025, 0.00025, 0.00024, 0.00024, 0.00025, 0.00025, 0.00025, 0.00024, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00024, 0.00025, 0.00025, 0.00025, 0.00025, 0.00024, 0.00025, 0.00039, 0.00029, 0.00026, 0.00025, 0.00025, 0.00033, 0.00025, 0.00025, 0.00026, 0.00026, 0.00027, 0.00033, 0.00025, 0.00024, 0.00025, 0.00025, 0.00025, 0.00024, 0.00025, 0.00025, 0.00025, 0.00024, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00024, 0.00025, 0.00025, 0.00025, 0.00024, 0.00025, 0.00026, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00024, 0.00024, 0.00025, 0.00025, 0.00024, 0.00024, 0.00024, 0.00026, 0.00024, 0.00025, 0.00025, 0.00025, 0.00025, 0.00024, 0.00024, 0.00024, 0.00024, 0.00025, 0.00025, 0.00044, 0.00044, 0.00046, 0.00041, 0.00047, 0.00026, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00024, 0.00025, 0.00025, 0.00026, 0.00024, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00025, 0.00024, 0.00025, 0.00025, 0.00026, 0.00025, 0.00026, 0.00025, 0.00025, 0.00026, 0.00025, 0.00025, 0.00024, 0.00043, 0.00026, 0.00053, 0.00025, 0.00026, 0.00025, 0.00028, 0.00042, 0.00025, 0.00025]}, "optimizer-copy-to-main-grad-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.00041, 0.00039, 0.00039, 0.00041, 0.00042, 0.0004, 0.00041, 0.0004, 0.0004, 0.0004, 0.0004, 0.00054, 0.0004, 0.0004, 0.00056, 0.00042, 0.0004, 0.00041, 0.00041, 0.00041, 0.00041, 0.00041, 0.00042, 0.0004, 0.0004, 0.00041, 0.00041, 0.00041, 0.0004, 0.00041, 0.0004, 0.00041, 0.00041, 0.00041, 0.0004, 0.00041, 0.00042, 0.00041, 0.00042, 0.00041, 0.00042, 0.00042, 0.0004, 0.00041, 0.00042, 0.00042, 0.0004, 0.00041, 0.00043, 0.00041, 0.00042, 0.00041, 0.00042, 0.00042, 0.00043, 0.00042, 0.00042, 0.00042, 0.00043, 0.00042, 0.00041, 0.00041, 0.00042, 0.00042, 0.00043, 0.00042, 0.00043, 0.00042, 0.00042, 0.00041, 0.00042, 0.00042, 0.00042, 0.00041, 0.00042, 0.00042, 0.00042, 0.00043, 0.00043, 0.00043, 0.0004, 0.00041, 0.00043, 0.00042, 0.00042, 0.00043, 0.00042, 0.00043, 0.00042, 0.00042, 0.00048, 0.00042, 0.00041, 0.00042, 0.00042, 0.00042, 0.00043, 0.00044, 0.00042, 0.00042, 0.00041, 0.0004, 0.00041, 0.00041, 0.00041, 0.00041, 0.00041, 0.00042, 0.00042, 0.00038, 0.0004, 0.00043, 0.00041, 0.00043, 0.00041, 0.0004, 0.0004, 0.0004, 0.00041, 0.00042, 0.00041, 0.00042, 0.00041, 0.00041, 0.0004, 0.00041, 0.00041, 0.00041, 0.00041, 0.00041, 0.00041, 0.00041, 0.00041, 
0.00043, 0.00043, 0.00041, 0.00041, 0.00041, 0.00041, 0.00042, 0.00038, 0.0004, 0.00039, 0.00041, 0.00042, 0.00043, 0.00038, 0.00038, 0.0004, 0.00042, 0.0004, 0.0004, 0.0004, 0.00041, 0.00041, 0.0004, 0.00045, 0.00041, 0.00041, 0.0004, 0.00043, 0.00042, 0.00042, 0.00042, 0.00041, 0.00042, 0.00042, 0.00042, 0.00041, 0.00041, 0.00041, 0.00041, 0.0004, 0.00041, 0.00041, 0.0004, 0.00041, 0.00041, 0.0004, 0.00041, 0.00041, 0.0004, 0.00041, 0.00041, 0.0004, 0.00041, 0.00042, 0.00041, 0.0004, 0.00041, 0.00042, 0.00041, 0.00041, 0.0004, 0.00041, 0.0004, 0.00041, 0.00043, 0.0004, 0.00042, 0.00042, 0.00043, 0.00041, 0.00041, 0.00041, 0.00041, 0.00041, 0.00043, 0.00042, 0.00041, 0.00038, 0.00042, 0.00041, 0.00041, 0.00041, 0.00042, 0.00041, 0.00041, 0.00042, 0.00041, 0.0004, 0.00041, 0.00041, 0.00041, 0.00041, 0.00041, 0.00041, 0.00042, 0.00043, 0.00042, 0.00041, 0.00041, 0.00041, 0.00041, 0.00041, 0.00043, 0.00041, 0.0004, 0.00042, 0.00041, 0.00041, 0.00042, 0.00041, 0.00041, 0.00042, 0.00042, 0.0004, 0.00041, 0.00041, 0.00041, 0.00046, 0.00043, 0.00043, 0.00042, 0.00042, 0.00042, 0.00042, 0.00043, 0.00042, 0.00041, 0.00043, 0.00043, 0.00039, 0.00043, 0.00042, 0.00042, 0.00043, 0.00042, 0.00042, 0.00042, 0.00043, 0.0004, 0.00042, 0.0004, 0.00043, 0.00041, 0.00042, 0.00042, 0.00043, 0.00041, 0.00041, 0.00041, 0.00042, 0.00042, 0.00042, 0.00041, 0.00043, 0.00042, 0.0004, 0.00043, 0.00041, 0.00042, 0.00041, 0.00041, 0.00043, 0.00042, 0.00042, 0.00043, 0.00042, 0.00042, 0.00041, 0.00041, 0.00041, 0.00041, 0.00041, 0.00042, 0.00041, 0.00042, 0.00042, 0.00042, 0.00042, 0.00042, 0.00042, 0.00043, 0.00041, 0.00042, 0.00042, 0.00043, 0.00044, 0.00043, 0.00041, 0.00041, 0.00042, 0.00042, 0.00041, 0.00043, 0.00041, 0.00042, 0.00041, 0.00042, 0.00041, 0.00039, 0.00041, 0.00042, 0.00042, 0.00041, 0.00042, 0.00042, 0.00042, 0.00042, 0.00041, 0.00041, 0.00042, 0.00041, 0.00041, 0.00043, 0.00042, 0.00042, 0.00042, 0.00041, 0.00041, 0.00042, 0.00043, 0.00041, 0.00041, 0.00041, 0.00042, 0.00043, 0.00042, 0.00042, 0.00044, 0.00043, 0.00042, 0.00041, 0.00042, 0.00041, 0.00043, 0.00041, 0.00044, 0.0004, 0.00042, 0.00042, 0.00041, 0.00042, 0.00042, 0.00043, 0.00042, 0.00041, 0.00041, 0.00041, 0.00042, 0.00041, 0.00041, 0.00042, 0.00041, 0.0004, 0.00041, 0.00041, 0.00041, 0.00042, 0.00041, 0.0004, 0.00052, 0.00042, 0.00042, 0.00042, 0.0004, 0.00042, 0.00041, 0.00041]}, "optimizer-clip-main-grad-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.02442, 0.00047, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00046, 0.00045, 0.00045, 0.00044, 0.00046, 0.00045, 0.00046, 0.00069, 0.00045, 0.00045, 0.00046, 0.00046, 0.00045, 0.00045, 0.00046, 0.00044, 0.00045, 0.00045, 0.00045, 0.00045, 0.00046, 0.00045, 0.00044, 0.00044, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00046, 0.00045, 0.00045, 0.00045, 0.00044, 0.00045, 0.00044, 0.00045, 0.00045, 0.00045, 0.00046, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.0005, 0.00046, 0.00045, 0.00044, 0.00047, 0.00046, 0.00045, 0.00053, 0.00046, 0.00045, 0.00045, 0.00045, 0.00046, 0.00044, 0.00045, 0.00046, 0.00045, 0.00047, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00044, 0.00045, 0.00045, 0.00044, 0.00045, 0.00046, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00052, 0.00045, 0.00047, 0.00046, 0.00039, 0.00045, 0.00045, 0.00046, 0.00046, 0.00046, 0.00045, 0.00045, 0.00045, 0.00044, 0.0004, 0.00046, 0.00044, 0.0004, 0.00046, 0.00044, 0.0004, 0.0004, 0.0004, 0.00041, 0.00047, 0.00046, 0.0004, 0.00046, 0.00045, 0.00045, 
0.00039, 0.00045, 0.00047, 0.00045, 0.0004, 0.00045, 0.00044, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00046, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00047, 0.00044, 0.00045, 0.00044, 0.00045, 0.00045, 0.00046, 0.00044, 0.00044, 0.00045, 0.00045, 0.00045, 0.00047, 0.00045, 0.00046, 0.00045, 0.00045, 0.00049, 0.00046, 0.00046, 0.00045, 0.00045, 0.00045, 0.00044, 0.00045, 0.00045, 0.00045, 0.00045, 0.00046, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00046, 0.00045, 0.00045, 0.00046, 0.00046, 0.00047, 0.00045, 0.00045, 0.00045, 0.00045, 0.00046, 0.00047, 0.00048, 0.00047, 0.00046, 0.00045, 0.00045, 0.00046, 0.00045, 0.00045, 0.00047, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00047, 0.00045, 0.00046, 0.00046, 0.00045, 0.00045, 0.00046, 0.00046, 0.00058, 0.00047, 0.00044, 0.00046, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00044, 0.00045, 0.00045, 0.00046, 0.00045, 0.00054, 0.00045, 0.00045, 0.00045, 0.00046, 0.00044, 0.00045, 0.00047, 0.00045, 0.00045, 0.00045, 0.00045, 0.00046, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00046, 0.00045, 0.00045, 0.00045, 0.00045, 0.00046, 0.00045, 0.00051, 0.00046, 0.00045, 0.00045, 0.00045, 0.00046, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00046, 0.00044, 0.00046, 0.00046, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00044, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00044, 0.00045, 0.00048, 0.00045, 0.00045, 0.00044, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00046, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00046, 0.00045, 0.00046, 0.00048, 0.00044, 0.00044, 0.00045, 0.00045, 0.00044, 0.00046, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00046, 0.00045, 0.00045, 0.00046, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00046, 0.00045, 0.00045, 0.00045, 0.00045, 0.00046, 0.00045, 0.00045, 0.00047, 0.00045, 0.00045, 0.00045, 0.00045, 0.00046, 0.00045, 0.00045, 0.00046, 0.00046, 0.00046, 0.00046, 0.00045, 0.00046, 0.00047, 0.00045, 0.00057, 0.00046, 0.00045, 0.00045, 0.00045, 0.00046, 0.00047, 0.00047, 0.00045, 0.00046, 0.00045, 0.00045, 0.00044, 0.00046, 0.00046, 0.00045, 0.00045, 0.00047, 0.00047, 0.00045, 0.00045, 0.00045, 0.00046, 0.00045, 0.00046, 0.00045, 0.00046, 0.00045, 0.00045, 0.00046, 0.00045, 0.00046, 0.00045, 0.00045, 0.00046, 0.00059, 0.00045, 0.00047, 0.00045, 0.00046, 0.00045, 0.00045, 0.00045]}, "optimizer-count-zeros-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.00264, 0.00186, 0.00189, 0.00186, 0.00191, 0.00186, 0.00187, 0.00189, 0.0019, 0.00189, 0.00189, 0.002, 0.00187, 0.00201, 0.0019, 0.00186, 0.00187, 0.00185, 0.00187, 0.00187, 0.00186, 0.00186, 0.00187, 0.00186, 0.00187, 0.00189, 0.00189, 0.00185, 0.00188, 0.00186, 0.00187, 0.00188, 0.00188, 0.00186, 0.00188, 0.00187, 0.00189, 0.00185, 0.00189, 0.00189, 0.00187, 0.00186, 0.00186, 0.00189, 0.00188, 0.00186, 0.00186, 0.0019, 0.00186, 0.00187, 0.00188, 0.00186, 0.00213, 0.00189, 0.00185, 0.00186, 0.00188, 0.00189, 0.00186, 0.00185, 0.00187, 0.00186, 0.00186, 0.00186, 0.00186, 0.00186, 0.00185, 0.00186, 0.00187, 0.00186, 0.00186, 0.00189, 0.00188, 0.0019, 0.00189, 0.00187, 0.00187, 0.00188, 0.00186, 0.00187, 0.00187, 0.00188, 0.00186, 0.00186, 0.00186, 0.00185, 0.00186, 0.00186, 0.00187, 0.00186, 0.00217, 0.0019, 0.00195, 0.00188, 0.00187, 0.00188, 0.00188, 0.00186, 0.00188, 0.00186, 0.00188, 0.00188, 0.00186, 0.00187, 
0.00188, 0.00185, 0.00208, 0.00187, 0.00187, 0.00186, 0.00185, 0.00185, 0.00188, 0.00185, 0.00186, 0.00186, 0.00186, 0.00186, 0.00186, 0.00186, 0.00187, 0.00185, 0.00185, 0.00188, 0.00186, 0.00185, 0.00188, 0.00186, 0.00186, 0.00184, 0.00187, 0.00186, 0.00189, 0.00186, 0.00185, 0.0019, 0.00187, 0.00186, 0.00186, 0.00186, 0.00186, 0.00186, 0.00189, 0.00187, 0.0019, 0.00186, 0.00186, 0.00187, 0.00188, 0.00185, 0.00186, 0.00186, 0.00189, 0.00186, 0.00187, 0.00187, 0.00203, 0.00186, 0.00186, 0.00188, 0.00187, 0.00186, 0.00188, 0.00184, 0.00185, 0.00186, 0.00187, 0.00185, 0.00186, 0.00187, 0.00188, 0.00198, 0.00198, 0.00186, 0.00185, 0.00187, 0.00188, 0.00186, 0.00188, 0.00185, 0.00185, 0.00187, 0.00187, 0.00186, 0.00185, 0.00185, 0.00187, 0.00186, 0.00186, 0.00187, 0.00187, 0.00185, 0.00187, 0.00187, 0.00186, 0.00185, 0.00186, 0.00187, 0.00188, 0.00191, 0.00186, 0.00188, 0.00188, 0.00187, 0.00188, 0.00187, 0.00188, 0.00186, 0.00187, 0.0019, 0.00187, 0.00187, 0.00186, 0.00187, 0.00187, 0.00186, 0.0019, 0.00188, 0.00187, 0.0019, 0.0019, 0.00191, 0.00191, 0.00186, 0.00187, 0.00188, 0.00187, 0.00186, 0.00188, 0.00188, 0.00189, 0.00189, 0.00188, 0.00188, 0.00189, 0.00189, 0.00189, 0.00186, 0.00191, 0.00189, 0.00187, 0.00186, 0.0019, 0.00188, 0.00188, 0.00187, 0.00188, 0.0019, 0.00189, 0.0019, 0.00219, 0.00189, 0.0019, 0.00187, 0.00188, 0.00187, 0.00187, 0.00188, 0.00188, 0.00187, 0.00186, 0.00189, 0.00188, 0.00188, 0.00188, 0.00188, 0.00188, 0.00189, 0.00188, 0.00216, 0.00188, 0.00189, 0.00188, 0.00189, 0.00189, 0.00189, 0.00187, 0.00187, 0.00188, 0.00188, 0.00199, 0.00187, 0.00201, 0.00189, 0.00187, 0.00191, 0.00189, 0.00187, 0.00188, 0.00188, 0.00189, 0.00246, 0.00272, 0.00189, 0.00189, 0.00189, 0.00288, 0.00189, 0.00187, 0.00189, 0.00189, 0.0019, 0.0019, 0.00188, 0.0019, 0.0019, 0.00191, 0.0019, 0.0019, 0.0019, 0.00191, 0.00191, 0.00189, 0.00189, 0.0019, 0.0019, 0.00189, 0.00188, 0.00188, 0.0019, 0.00197, 0.00187, 0.00189, 0.00188, 0.00189, 0.00187, 0.0019, 0.00187, 0.00189, 0.00188, 0.00189, 0.00188, 0.00187, 0.00187, 0.00188, 0.0019, 0.00187, 0.00188, 0.00188, 0.00188, 0.00191, 0.00216, 0.00186, 0.00188, 0.00189, 0.00189, 0.00187, 0.00189, 0.0019, 0.00187, 0.00189, 0.00187, 0.00199, 0.00189, 0.00188, 0.00187, 0.00187, 0.00188, 0.00189, 0.00188, 0.00188, 0.00188, 0.00188, 0.00187, 0.00188, 0.00188, 0.00188, 0.00189, 0.00188, 0.00188, 0.0019, 0.00187, 0.00189, 0.00189, 0.00188, 0.00189, 0.00188, 0.00188, 0.00188, 0.00189, 0.00186, 0.00189, 0.00187, 0.00189, 0.0019, 0.0019, 0.00194, 0.00189, 0.00187, 0.00187, 0.00189, 0.00189, 0.002, 0.00187, 0.00187, 0.00189, 0.00187, 0.00188, 0.00189, 0.00195]}, "optimizer-inner-step-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.00219, 0.00036, 0.00035, 0.00037, 0.00037, 0.00039, 0.00038, 0.00037, 0.00037, 0.00038, 0.00037, 0.0004, 0.00038, 0.00038, 0.00047, 0.00037, 0.00038, 0.00038, 0.00037, 0.00037, 0.00037, 0.00039, 0.00038, 0.00037, 0.00039, 0.00037, 0.00038, 0.00038, 0.00037, 0.00037, 0.00037, 0.00038, 0.00038, 0.00038, 0.00037, 0.00037, 0.00038, 0.00039, 0.00038, 0.00038, 0.00038, 0.00039, 0.00038, 0.00038, 0.00038, 0.00038, 0.00038, 0.00038, 0.00037, 0.00038, 0.00037, 0.00038, 0.00037, 0.00039, 0.00039, 0.00038, 0.00038, 0.00038, 0.00038, 0.00038, 0.00038, 0.00037, 0.00038, 0.00038, 0.00038, 0.00039, 0.00038, 0.00038, 0.00037, 0.00038, 0.0004, 0.00039, 0.0004, 0.00038, 0.00039, 0.00039, 0.00039, 0.00039, 0.00038, 0.00038, 0.00037, 0.00038, 0.00039, 0.00039, 0.00038, 0.00038, 0.00039, 0.00039, 0.00038, 0.00038, 0.00044, 
0.00039, 0.0004, 0.00039, 0.00038, 0.00038, 0.00038, 0.00038, 0.00038, 0.00039, 0.00039, 0.00038, 0.00038, 0.00039, 0.00038, 0.00039, 0.00038, 0.00038, 0.00039, 0.00037, 0.00039, 0.00038, 0.00038, 0.00038, 0.00038, 0.00038, 0.00038, 0.00038, 0.00039, 0.0004, 0.00038, 0.00038, 0.00039, 0.00039, 0.0004, 0.00039, 0.00038, 0.00038, 0.00039, 0.00039, 0.00038, 0.00038, 0.00038, 0.00039, 0.00038, 0.00039, 0.00039, 0.00038, 0.00039, 0.00039, 0.00037, 0.00039, 0.00037, 0.00038, 0.00041, 0.00037, 0.00037, 0.00038, 0.00038, 0.00038, 0.00038, 0.00039, 0.00038, 0.00038, 0.00039, 0.00038, 0.0004, 0.00038, 0.00038, 0.00038, 0.00039, 0.00039, 0.00038, 0.00038, 0.0004, 0.00038, 0.0004, 0.00038, 0.00038, 0.0004, 0.00038, 0.00038, 0.00038, 0.00038, 0.00039, 0.00038, 0.00039, 0.00038, 0.00038, 0.00038, 0.00038, 0.00039, 0.00038, 0.00038, 0.00038, 0.00038, 0.00038, 0.00038, 0.00039, 0.00038, 0.00038, 0.0004, 0.00038, 0.00038, 0.00038, 0.00038, 0.00037, 0.00038, 0.00039, 0.00038, 0.00038, 0.00039, 0.00039, 0.00038, 0.00038, 0.00038, 0.00039, 0.00038, 0.00039, 0.00038, 0.00038, 0.00039, 0.00039, 0.00038, 0.00039, 0.00038, 0.00038, 0.00038, 0.00039, 0.00038, 0.00037, 0.00038, 0.00038, 0.00037, 0.00038, 0.00038, 0.00037, 0.00037, 0.00039, 0.00038, 0.00038, 0.00037, 0.00038, 0.00038, 0.00038, 0.00038, 0.00038, 0.00038, 0.00038, 0.00039, 0.00038, 0.00038, 0.00038, 0.00039, 0.00039, 0.00038, 0.00037, 0.00037, 0.00038, 0.00038, 0.00043, 0.00037, 0.00038, 0.00038, 0.00037, 0.00038, 0.00038, 0.00038, 0.00038, 0.00037, 0.00038, 0.00037, 0.00037, 0.00038, 0.00037, 0.00039, 0.00037, 0.00037, 0.00039, 0.00038, 0.00038, 0.00038, 0.00038, 0.00039, 0.00038, 0.00038, 0.0004, 0.0004, 0.00038, 0.00039, 0.00038, 0.00038, 0.00039, 0.0004, 0.00038, 0.00038, 0.00038, 0.00039, 0.00038, 0.00038, 0.00038, 0.00039, 0.00039, 0.00038, 0.00038, 0.00038, 0.00038, 0.00038, 0.00039, 0.00038, 0.00038, 0.00039, 0.00038, 0.00037, 0.00038, 0.00039, 0.00039, 0.00038, 0.00037, 0.00038, 0.00038, 0.00038, 0.00038, 0.00038, 0.00038, 0.00038, 0.00038, 0.00039, 0.00039, 0.00038, 0.00038, 0.00038, 0.00038, 0.00038, 0.00038, 0.00039, 0.00038, 0.00038, 0.00038, 0.00038, 0.00038, 0.00037, 0.00039, 0.00038, 0.00038, 0.00038, 0.00039, 0.0004, 0.00039, 0.00038, 0.00038, 0.00041, 0.0004, 0.00039, 0.00038, 0.0004, 0.00038, 0.00038, 0.00038, 0.00038, 0.00038, 0.00039, 0.00038, 0.00039, 0.00039, 0.00039, 0.00038, 0.00038, 0.00039, 0.00038, 0.00038, 0.0004, 0.00038, 0.00038, 0.00038, 0.00039, 0.00038, 0.00038, 0.00038, 0.00039, 0.00038, 0.00038, 0.00038, 0.00039, 0.00038, 0.00038, 0.00038, 0.00039, 0.00039, 0.00038, 0.00039, 0.00038, 0.00039, 0.00038, 0.00039, 0.00038, 0.00041, 0.00039, 0.00039, 0.00041, 0.00038, 0.00038, 0.00052, 0.00038, 0.00039, 0.00038, 0.00038, 0.00038, 0.00038, 0.00038]}, "optimizer-copy-main-to-model-params-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.00097, 0.00085, 0.00083, 0.00104, 0.00084, 0.00083, 0.00084, 0.00085, 0.00085, 0.00084, 0.00083, 0.00085, 0.00083, 0.00085, 0.00178, 0.00084, 0.00084, 0.00084, 0.00084, 0.00084, 0.00085, 0.00085, 0.00085, 0.00083, 0.00082, 0.00083, 0.00084, 0.00083, 0.00084, 0.00084, 0.00083, 0.00084, 0.00083, 0.00086, 0.00085, 0.00085, 0.00084, 0.00084, 0.00085, 0.00085, 0.00084, 0.00084, 0.00084, 0.00085, 0.00085, 0.00084, 0.00085, 0.00085, 0.00084, 0.00085, 0.00118, 0.00086, 0.00087, 0.00086, 0.00108, 0.00085, 0.00085, 0.00084, 0.00084, 0.00083, 0.00085, 0.00109, 0.00084, 0.00083, 0.00084, 0.00086, 0.00085, 0.00086, 0.00085, 0.00085, 0.00085, 0.00086, 0.00085, 0.00084, 0.00087, 
0.00085, 0.00087, 0.00084, 0.00086, 0.00085, 0.00085, 0.00084, 0.00085, 0.00084, 0.00085, 0.00084, 0.00085, 0.00087, 0.00085, 0.00087, 0.00096, 0.00085, 0.00085, 0.00086, 0.00084, 0.00085, 0.00086, 0.00083, 0.00085, 0.00084, 0.00083, 0.00084, 0.00084, 0.00083, 0.00084, 0.00084, 0.00085, 0.00083, 0.00083, 0.00083, 0.00083, 0.00084, 0.00083, 0.00084, 0.00083, 0.00083, 0.00085, 0.00084, 0.00083, 0.00084, 0.00083, 0.00084, 0.00083, 0.00084, 0.00084, 0.00083, 0.00084, 0.00086, 0.00084, 0.00083, 0.00084, 0.00084, 0.00083, 0.00085, 0.00085, 0.00084, 0.00083, 0.00086, 0.00086, 0.00084, 0.00085, 0.00083, 0.00084, 0.00084, 0.00083, 0.00084, 0.00083, 0.00083, 0.00083, 0.00084, 0.00085, 0.00085, 0.00083, 0.00084, 0.00083, 0.00083, 0.00094, 0.00084, 0.00084, 0.00083, 0.00084, 0.00084, 0.00083, 0.00085, 0.00084, 0.00084, 0.00083, 0.00085, 0.00084, 0.00084, 0.00083, 0.00085, 0.00084, 0.00083, 0.00085, 0.00083, 0.00083, 0.00085, 0.00083, 0.00084, 0.00098, 0.00085, 0.00084, 0.00085, 0.00083, 0.00083, 0.00084, 0.00085, 0.00085, 0.00085, 0.00084, 0.00084, 0.00084, 0.00084, 0.00083, 0.00085, 0.00085, 0.00084, 0.00087, 0.00084, 0.00083, 0.00084, 0.00085, 0.00084, 0.00084, 0.00084, 0.00085, 0.00086, 0.00086, 0.00083, 0.00083, 0.00083, 0.00085, 0.00084, 0.00085, 0.00084, 0.00084, 0.00085, 0.00084, 0.00084, 0.00084, 0.00084, 0.00083, 0.00082, 0.00084, 0.00109, 0.00084, 0.00084, 0.00084, 0.00084, 0.00084, 0.00083, 0.00083, 0.00085, 0.00085, 0.00084, 0.00084, 0.00085, 0.00084, 0.00085, 0.00083, 0.00085, 0.00084, 0.00084, 0.00084, 0.00085, 0.00085, 0.00085, 0.00084, 0.00083, 0.00093, 0.00084, 0.00083, 0.00085, 0.00084, 0.00084, 0.00084, 0.00084, 0.00084, 0.00085, 0.00085, 0.00083, 0.00084, 0.00084, 0.00083, 0.00085, 0.00086, 0.00085, 0.00083, 0.00085, 0.00085, 0.00084, 0.00085, 0.00084, 0.00084, 0.00085, 0.00085, 0.00085, 0.00084, 0.00085, 0.00083, 0.00084, 0.00083, 0.00084, 0.00085, 0.00083, 0.00084, 0.00086, 0.00086, 0.00085, 0.00084, 0.00102, 0.00089, 0.00085, 0.00084, 0.00084, 0.00084, 0.00084, 0.00085, 0.00085, 0.00084, 0.00086, 0.00096, 0.00083, 0.00085, 0.00084, 0.00084, 0.00085, 0.00084, 0.00084, 0.00083, 0.00085, 0.00084, 0.00085, 0.00085, 0.00083, 0.00084, 0.00085, 0.00085, 0.00084, 0.00086, 0.00084, 0.00084, 0.00083, 0.00095, 0.00084, 0.00084, 0.00086, 0.00085, 0.00084, 0.00085, 0.00084, 0.00084, 0.00086, 0.00085, 0.00085, 0.00085, 0.00084, 0.00083, 0.00087, 0.00084, 0.00093, 0.00085, 0.00084, 0.00084, 0.00085, 0.00083, 0.00083, 0.00084, 0.00083, 0.00085, 0.00086, 0.00084, 0.00113, 0.00084, 0.00083, 0.00084, 0.00103, 0.00085, 0.00084, 0.00087, 0.00084, 0.00084, 0.00084, 0.00083, 0.00084, 0.00086, 0.00084, 0.00084, 0.00082, 0.00085, 0.00085, 0.00083, 0.00084, 0.00084, 0.00084, 0.00084, 0.00085, 0.00084, 0.00084, 0.00082, 0.00085, 0.00084, 0.00083, 0.00084, 0.00085, 0.00094, 0.00085, 0.00085, 0.00086, 0.00116, 0.00084, 0.00137, 0.00084, 0.00083, 0.00084, 0.00084, 0.00104, 0.00085, 0.00083]}, "optimizer-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.03257, 0.00561, 0.00555, 0.00673, 0.00567, 0.00562, 0.00561, 0.00563, 0.00577, 0.00565, 0.00561, 0.00611, 0.00562, 0.00577, 0.00929, 0.00564, 0.00561, 0.00562, 0.0056, 0.00562, 0.0056, 0.00563, 0.00563, 0.00561, 0.00559, 0.00561, 0.00563, 0.00561, 0.00562, 0.00557, 0.0056, 0.00562, 0.00562, 0.00563, 0.00562, 0.00562, 0.00568, 0.00562, 0.00565, 0.00566, 0.00566, 0.00565, 0.0056, 0.00567, 0.00567, 0.00569, 0.00566, 0.00568, 0.00565, 0.00563, 0.00698, 0.00565, 0.00598, 0.0057, 0.00701, 0.00568, 0.00567, 0.00565, 0.00567, 0.00568, 
0.00563, 0.00767, 0.00563, 0.00608, 0.00566, 0.00565, 0.00568, 0.00565, 0.00565, 0.00567, 0.00566, 0.00571, 0.00568, 0.00567, 0.00567, 0.00565, 0.00569, 0.00575, 0.00565, 0.00565, 0.00562, 0.00577, 0.00568, 0.00567, 0.00563, 0.00564, 0.00565, 0.0057, 0.00565, 0.00567, 0.00638, 0.00578, 0.00578, 0.00572, 0.0056, 0.00567, 0.00571, 0.00565, 0.00565, 0.00567, 0.00563, 0.00563, 0.00563, 0.00563, 0.00562, 0.00635, 0.00583, 0.00568, 0.00584, 0.00555, 0.00577, 0.00559, 0.0056, 0.00558, 0.00584, 0.00561, 0.00557, 0.00564, 0.00562, 0.00566, 0.00555, 0.00562, 0.00565, 0.00566, 0.00559, 0.0056, 0.00561, 0.00566, 0.00564, 0.00561, 0.00563, 0.00564, 0.00564, 0.00565, 0.00564, 0.00568, 0.00564, 0.00565, 0.00566, 0.00568, 0.00554, 0.00562, 0.00556, 0.00562, 0.0057, 0.00565, 0.00583, 0.00554, 0.00562, 0.00561, 0.00564, 0.00571, 0.00563, 0.00563, 0.00565, 0.0056, 0.00607, 0.00565, 0.00564, 0.00564, 0.00565, 0.00565, 0.00563, 0.00564, 0.00563, 0.00566, 0.00564, 0.00565, 0.00565, 0.00567, 0.00565, 0.00576, 0.00575, 0.00563, 0.00566, 0.00658, 0.00565, 0.00564, 0.00568, 0.00562, 0.00663, 0.00565, 0.00564, 0.00564, 0.00562, 0.00563, 0.00568, 0.00566, 0.00565, 0.00564, 0.00565, 0.00563, 0.00565, 0.00561, 0.00564, 0.00563, 0.00562, 0.00564, 0.00568, 0.00568, 0.00567, 0.00567, 0.00569, 0.00566, 0.0056, 0.00564, 0.00567, 0.00567, 0.00586, 0.00568, 0.00555, 0.00567, 0.00562, 0.00558, 0.00585, 0.00563, 0.00566, 0.00565, 0.00565, 0.00566, 0.00559, 0.00566, 0.00566, 0.00561, 0.00573, 0.00721, 0.00562, 0.00564, 0.00593, 0.00595, 0.00563, 0.00564, 0.00566, 0.00567, 0.00565, 0.00569, 0.00564, 0.00566, 0.00568, 0.00566, 0.00578, 0.00588, 0.0064, 0.00571, 0.00566, 0.00564, 0.00565, 0.00567, 0.00566, 0.00564, 0.00643, 0.00566, 0.00567, 0.00564, 0.00601, 0.00563, 0.00566, 0.00566, 0.00566, 0.00563, 0.00566, 0.00565, 0.00557, 0.00567, 0.00564, 0.00566, 0.00565, 0.00566, 0.00564, 0.00596, 0.00567, 0.00562, 0.00565, 0.00566, 0.00564, 0.00564, 0.00569, 0.00568, 0.00569, 0.00569, 0.00575, 0.00567, 0.00583, 0.00568, 0.00566, 0.00566, 0.00567, 0.00566, 0.00567, 0.00566, 0.00564, 0.00689, 0.00665, 0.00563, 0.00566, 0.00566, 0.00685, 0.00566, 0.00565, 0.00567, 0.00567, 0.00574, 0.00611, 0.00563, 0.00565, 0.00569, 0.00568, 0.00568, 0.00568, 0.0057, 0.00566, 0.00569, 0.00567, 0.0057, 0.00566, 0.00569, 0.00564, 0.00565, 0.00568, 0.00569, 0.00571, 0.00564, 0.00566, 0.00565, 0.0058, 0.00566, 0.00565, 0.00564, 0.00566, 0.00566, 0.00567, 0.00556, 0.00565, 0.00568, 0.00564, 0.00567, 0.00566, 0.00566, 0.00566, 0.00566, 0.00565, 0.00622, 0.00564, 0.00563, 0.00565, 0.0058, 0.00565, 0.00563, 0.00567, 0.00564, 0.00566, 0.00569, 0.00579, 0.0071, 0.00625, 0.00661, 0.00596, 0.00708, 0.00571, 0.00566, 0.00572, 0.0057, 0.00565, 0.00566, 0.00568, 0.00566, 0.00569, 0.00565, 0.00568, 0.00558, 0.00572, 0.00566, 0.00564, 0.00571, 0.00569, 0.00569, 0.00567, 0.00567, 0.00564, 0.00569, 0.00563, 0.0057, 0.00565, 0.00567, 0.00569, 0.00565, 0.00602, 0.00567, 0.00566, 0.00568, 0.00691, 0.00568, 0.00824, 0.00567, 0.00569, 0.00565, 0.00566, 0.00689, 0.00567, 0.00569]}, "learning-rate": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 2e-05, 
2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05]}, "learning-rate vs samples": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 
4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05]}, "batch-size": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 
128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0]}, "batch-size vs samples": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 
128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0]}, "lm loss": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [10.84303, 10.86032, 10.84988, 10.84755, 10.76639, 10.77411, 10.67857, 10.53004, 10.38397, 10.29666, 9.92036, 10.03609, 10.04286, 9.75368, 9.87024, 9.57458, 9.50956, 9.70645, 9.43156, 9.37511, 9.284, 9.18283, 9.20684, 9.02346, 9.21677, 9.08417, 9.17277, 9.18323, 9.31569, 9.00474, 8.94547, 9.06044, 9.05792, 8.66708, 8.73014, 8.76017, 8.69512, 8.74237, 8.66438, 8.77103, 8.66577, 8.85394, 8.83642, 8.49824, 8.38764, 8.42876, 8.48638, 8.38112, 8.42721, 8.57916, 8.36213, 8.18555, 8.21868, 8.21376, 8.25912, 7.90597, 8.08558, 7.88018, 8.23297, 8.21565, 7.99013, 7.95413, 7.90374, 7.72213, 7.72557, 7.62784, 7.49843, 7.88783, 7.68211, 7.43256, 7.72606, 7.75519, 7.5254, 7.28466, 7.43748, 7.32478, 7.44941, 7.21198, 7.61949, 7.26498, 7.33394, 7.19595, 7.19608, 7.40347, 7.15606, 7.26585, 6.98127, 6.98967, 7.02701, 7.12404, 6.81114, 6.9732, 7.07844, 6.98715, 6.86379, 6.74535, 6.97969, 7.04992, 6.69473, 6.57332, 6.71755, 6.73627, 6.72482, 6.72951, 6.64965, 6.39869, 6.62934, 6.6128, 6.44062, 6.62092, 6.73782, 6.60642, 6.72099, 6.69098, 6.62325, 6.50501, 6.59411, 6.40344, 6.66286, 6.24475, 6.24827, 6.29959, 6.38833, 6.34649, 6.44604, 6.28662, 6.33306, 6.23143, 6.1945, 6.39075, 6.31833, 6.31606, 6.15661, 6.15059, 6.23078, 6.37677, 6.19418, 6.14556, 6.174, 6.10964, 6.05825, 6.06794, 6.25281, 6.40554, 6.25551, 6.29757, 6.09544, 6.1725, 6.00218, 6.02712, 5.95524, 6.25067, 6.1861, 5.96596, 5.78395, 6.12333, 5.84793, 6.10088, 5.78605, 6.16305, 6.14324, 6.08193, 5.9272, 6.11128, 5.94147, 6.19288, 5.88909, 5.78652, 5.77759, 5.68182, 6.00901, 5.99171, 6.064, 5.887, 6.03556, 5.96156, 5.98678, 5.98309, 5.94332, 5.83241, 5.94309, 5.60951, 5.69435, 5.88169, 5.83567, 5.85447, 5.75902, 5.83004, 5.71739, 5.55081, 5.71567, 5.61507, 5.82158, 5.59427, 5.70169, 5.70024, 5.89399, 5.63586, 5.84189, 5.73395, 5.86128, 5.31906, 5.89065, 5.8668, 5.84568, 5.40705, 5.40162, 5.61805, 5.58944, 5.47887, 5.57169, 5.66894, 5.46961, 5.737, 5.50292, 5.58399, 5.61697, 5.61602, 5.50714, 5.6077, 5.6651, 5.67541, 5.58049, 5.65548, 5.36443, 5.67256, 5.62445, 5.41886, 5.57712, 5.62171, 5.55213, 5.34421, 5.53498, 5.48095, 5.4778, 5.37859, 5.55337, 5.60077, 5.38946, 5.5161, 5.4845, 5.3308, 5.503, 5.40661, 5.44202, 5.3156, 5.06608, 5.47488, 5.56633, 5.71203, 5.41237, 5.602, 5.6336, 5.23514, 5.26957, 5.38908, 5.39646, 5.32832, 5.49536, 5.18302, 5.2973, 5.24699, 5.3738, 5.2533, 5.4419, 5.53407, 5.31248, 5.43315, 5.33688, 5.07446, 5.3117, 5.25312, 5.30184, 5.11129, 5.27552, 5.26324, 5.47224, 5.15822, 5.26777, 5.21213, 5.35617, 4.98409, 4.9122, 5.32204, 5.39135, 5.22909, 5.3223, 5.10207, 5.16342, 5.26324, 5.06816, 5.26642, 5.06638, 5.34472, 5.24739, 5.15433, 5.24748, 5.04399, 5.32024, 5.05488, 5.02871, 5.1457, 5.11299, 5.27264, 5.15675, 5.28106, 5.09695, 5.09458, 5.25141, 5.32789, 5.25804, 5.19731, 5.14154, 5.29133, 4.95279, 5.2099, 5.09154, 5.30528, 5.17547, 5.19246, 5.11436, 4.986, 4.99619, 5.22741, 5.31255, 5.10417, 5.06172, 4.91443, 5.12691, 5.1217, 4.93205, 5.34318, 5.02802, 5.10574, 5.17142, 5.00778, 5.07028, 5.0728, 4.99912, 5.08403, 5.16803, 4.98253, 5.18553, 4.93609, 4.93034, 5.06451, 5.00328, 4.9143, 4.78254, 4.9515, 5.1248, 5.02128, 5.01937, 5.34246, 4.96515, 
4.99654, 5.05289, 4.816, 4.74072, 4.99878, 5.04752, 4.87941, 4.96151, 5.05319, 5.02704, 4.8254, 4.8992, 4.91046, 4.83957, 4.74493, 5.01861, 4.76013, 5.21014, 4.79858, 5.00113, 4.74548, 4.79219, 4.82659, 4.65777, 4.66208, 4.84897, 4.81474, 4.80913, 4.92799, 4.89236, 4.93339, 4.77993, 4.89168, 4.7432, 4.92229, 4.96619, 4.88011, 4.71273, 4.7931, 4.91139, 4.72229, 4.87421, 4.70468, 4.69956, 4.65227]}, "lm loss vs samples": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [10.84303, 10.86032, 10.84988, 10.84755, 10.76639, 10.77411, 10.67857, 10.53004, 10.38397, 10.29666, 9.92036, 10.03609, 10.04286, 9.75368, 9.87024, 9.57458, 9.50956, 9.70645, 9.43156, 9.37511, 9.284, 9.18283, 9.20684, 9.02346, 9.21677, 9.08417, 9.17277, 9.18323, 9.31569, 9.00474, 8.94547, 9.06044, 9.05792, 8.66708, 8.73014, 8.76017, 8.69512, 8.74237, 8.66438, 8.77103, 8.66577, 8.85394, 8.83642, 8.49824, 8.38764, 8.42876, 8.48638, 8.38112, 8.42721, 8.57916, 8.36213, 8.18555, 8.21868, 8.21376, 8.25912, 7.90597, 8.08558, 7.88018, 8.23297, 8.21565, 7.99013, 7.95413, 7.90374, 7.72213, 7.72557, 7.62784, 7.49843, 7.88783, 7.68211, 7.43256, 7.72606, 7.75519, 7.5254, 7.28466, 7.43748, 7.32478, 7.44941, 7.21198, 7.61949, 7.26498, 7.33394, 7.19595, 7.19608, 7.40347, 7.15606, 7.26585, 6.98127, 6.98967, 7.02701, 7.12404, 6.81114, 6.9732, 7.07844, 6.98715, 6.86379, 6.74535, 6.97969, 7.04992, 6.69473, 6.57332, 6.71755, 6.73627, 6.72482, 6.72951, 6.64965, 6.39869, 6.62934, 6.6128, 6.44062, 6.62092, 6.73782, 6.60642, 6.72099, 6.69098, 6.62325, 6.50501, 6.59411, 6.40344, 6.66286, 6.24475, 6.24827, 6.29959, 6.38833, 6.34649, 6.44604, 6.28662, 6.33306, 6.23143, 6.1945, 6.39075, 6.31833, 6.31606, 6.15661, 6.15059, 6.23078, 6.37677, 6.19418, 6.14556, 6.174, 6.10964, 6.05825, 6.06794, 6.25281, 6.40554, 6.25551, 6.29757, 6.09544, 6.1725, 6.00218, 6.02712, 5.95524, 6.25067, 6.1861, 5.96596, 5.78395, 6.12333, 5.84793, 6.10088, 5.78605, 6.16305, 6.14324, 6.08193, 5.9272, 6.11128, 5.94147, 6.19288, 5.88909, 5.78652, 5.77759, 5.68182, 6.00901, 5.99171, 6.064, 5.887, 6.03556, 5.96156, 5.98678, 5.98309, 5.94332, 5.83241, 5.94309, 5.60951, 5.69435, 5.88169, 5.83567, 5.85447, 5.75902, 5.83004, 5.71739, 5.55081, 5.71567, 5.61507, 5.82158, 5.59427, 5.70169, 5.70024, 5.89399, 5.63586, 5.84189, 5.73395, 5.86128, 5.31906, 5.89065, 5.8668, 5.84568, 5.40705, 5.40162, 5.61805, 5.58944, 5.47887, 5.57169, 5.66894, 5.46961, 5.737, 5.50292, 5.58399, 5.61697, 5.61602, 5.50714, 5.6077, 5.6651, 5.67541, 5.58049, 5.65548, 5.36443, 5.67256, 5.62445, 5.41886, 5.57712, 5.62171, 5.55213, 5.34421, 5.53498, 5.48095, 5.4778, 5.37859, 5.55337, 5.60077, 5.38946, 5.5161, 5.4845, 5.3308, 5.503, 5.40661, 5.44202, 5.3156, 5.06608, 5.47488, 5.56633, 5.71203, 5.41237, 5.602, 5.6336, 5.23514, 5.26957, 5.38908, 5.39646, 5.32832, 5.49536, 5.18302, 5.2973, 5.24699, 5.3738, 5.2533, 5.4419, 5.53407, 5.31248, 5.43315, 5.33688, 5.07446, 5.3117, 5.25312, 5.30184, 5.11129, 5.27552, 5.26324, 5.47224, 5.15822, 5.26777, 5.21213, 5.35617, 4.98409, 4.9122, 5.32204, 5.39135, 5.22909, 5.3223, 5.10207, 5.16342, 5.26324, 5.06816, 5.26642, 5.06638, 5.34472, 5.24739, 5.15433, 5.24748, 5.04399, 5.32024, 5.05488, 5.02871, 5.1457, 5.11299, 5.27264, 5.15675, 5.28106, 5.09695, 5.09458, 5.25141, 5.32789, 5.25804, 5.19731, 5.14154, 5.29133, 4.95279, 5.2099, 5.09154, 5.30528, 5.17547, 5.19246, 5.11436, 4.986, 4.99619, 5.22741, 5.31255, 5.10417, 5.06172, 4.91443, 5.12691, 5.1217, 4.93205, 5.34318, 5.02802, 5.10574, 5.17142, 5.00778, 5.07028, 5.0728, 4.99912, 5.08403, 5.16803, 4.98253, 5.18553, 4.93609, 
4.93034, 5.06451, 5.00328, 4.9143, 4.78254, 4.9515, 5.1248, 5.02128, 5.01937, 5.34246, 4.96515, 4.99654, 5.05289, 4.816, 4.74072, 4.99878, 5.04752, 4.87941, 4.96151, 5.05319, 5.02704, 4.8254, 4.8992, 4.91046, 4.83957, 4.74493, 5.01861, 4.76013, 5.21014, 4.79858, 5.00113, 4.74548, 4.79219, 4.82659, 4.65777, 4.66208, 4.84897, 4.81474, 4.80913, 4.92799, 4.89236, 4.93339, 4.77993, 4.89168, 4.7432, 4.92229, 4.96619, 4.88011, 4.71273, 4.7931, 4.91139, 4.72229, 4.87421, 4.70468, 4.69956, 4.65227]}, "loss-scale": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]}, "loss-scale vs samples": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 
1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]}, "grad-norm": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [13.92196, 13.64105, 14.19575, 13.10329, 13.56093, 11.06924, 10.32704, 12.58903, 11.89406, 9.6749, 7.04626, 4.0336, 3.15187, 2.82418, 2.35804, 2.43442, 2.16004, 1.97461, 2.14035, 2.12249, 2.20138, 2.2657, 2.05671, 2.22896, 1.95829, 2.02503, 1.88632, 1.84693, 1.87101, 2.18322, 2.10962, 1.97689, 1.94956, 2.15482, 2.33059, 2.0713, 2.06596, 1.83468, 1.98146, 1.78906, 2.08095, 1.74031, 1.73584, 1.83223, 1.93635, 1.78517, 1.74533, 1.74989, 1.72773, 1.51419, 1.74951, 1.76214, 1.76755, 1.83739, 1.54724, 1.80208, 1.67454, 1.80868, 1.51645, 1.42949, 1.65422, 1.43167, 1.74384, 1.82674, 1.56795, 1.61973, 1.62231, 1.51322, 1.4269, 1.55439, 1.3649, 1.40671, 1.47679, 1.40979, 1.35488, 1.43798, 1.41114, 1.34745, 1.32431, 1.23395, 1.36576, 1.22914, 1.25372, 1.35028, 1.23455, 1.29297, 1.37717, 1.26373, 1.37004, 1.08995, 1.10379, 1.10875, 1.15108, 1.26523, 0.89985, 1.39001, 1.10735, 1.30884, 1.00577, 1.31705, 1.15922, 1.16049, 1.08293, 1.30514, 0.98385, 1.11074, 1.1592, 0.9745, 1.26156, 1.13226, 0.98984, 0.97441, 0.96023, 0.94898, 1.04337, 1.04095, 0.96044, 1.19634, 1.26146, 1.4137, 0.97849, 1.01274, 1.06643, 1.01496, 0.94459, 1.13752, 1.02579, 1.05074, 1.22247, 1.26548, 1.04774, 1.44863, 1.15549, 1.15597, 1.19734, 1.2287, 1.25743, 1.88802, 1.76897, 1.48112, 1.4651, 1.39709, 1.38654, 1.09404, 1.62425, 1.69258, 1.31425, 1.11912, 1.16099, 1.18343, 1.29282, 1.58176, 1.59702, 1.35711, 1.25116, 1.93028, 1.26411, 1.16234, 1.73045, 1.37516, 1.21056, 1.1698, 1.36362, 1.31019, 1.41174, 1.1141, 1.35444, 1.27655, 1.56101, 1.26438, 1.09582, 1.27416, 1.41508, 1.54422, 1.36323, 1.24407, 1.29014, 1.18935, 1.13176, 1.03122, 1.33001, 1.37077, 1.14753, 1.11258, 1.66325, 1.11887, 1.76805, 1.40233, 1.37783, 1.50291, 1.27142, 1.30216, 1.29887, 1.46138, 1.55382, 1.23876, 1.8076, 1.40113, 1.63396, 1.55057, 1.08699, 1.24471, 1.22211, 1.14251, 1.26485, 1.45246, 1.55789, 1.71804, 1.37054, 1.61527, 1.57346, 1.43675, 1.26103, 1.17063, 1.56904, 1.17977, 1.4408, 1.72049, 1.50941, 1.30391, 1.34373, 1.32377, 1.27909, 1.56247, 1.31671, 1.38601, 1.61151, 1.49478, 1.75857, 1.27914, 1.31454, 2.08285, 1.65152, 1.54337, 1.46369, 1.68505, 1.74708, 1.34813, 1.53151, 1.36655, 1.5068, 1.33926, 1.42092, 1.39573, 1.3088, 1.90711, 1.46652, 1.29613, 1.44842, 1.30354, 1.28453, 1.49548, 1.47812, 1.39914, 1.32083, 1.19715, 1.79989, 1.43253, 1.35222, 1.42532, 1.23793, 1.41904, 
1.21814, 1.25683, 1.2335, 1.46238, 1.48727, 1.4808, 1.33354, 1.33662, 1.26457, 1.31807, 1.46217, 1.35853, 1.55295, 1.20988, 1.50233, 1.51611, 1.48328, 1.32591, 1.35903, 1.25739, 1.45462, 1.40772, 1.52784, 1.49325, 1.48176, 1.41498, 1.37099, 1.4565, 1.35995, 1.85538, 1.22436, 1.50223, 1.62834, 2.02006, 1.60123, 1.72187, 1.44841, 1.22003, 1.2907, 1.31733, 1.13053, 1.33575, 1.57284, 1.47894, 1.41277, 1.40064, 1.30099, 1.35607, 1.52515, 1.48522, 1.31187, 1.24496, 1.36995, 1.60389, 1.24009, 1.55027, 1.2329, 1.34795, 1.32343, 1.38946, 1.27338, 1.46297, 1.50613, 1.56272, 1.67908, 1.41893, 1.40655, 1.34016, 1.79612, 1.52344, 1.31538, 1.82889, 1.5317, 1.18989, 1.44241, 1.33335, 1.49631, 1.45109, 1.41567, 1.28181, 1.28831, 1.39113, 1.42151, 1.1475, 1.49249, 1.42727, 1.4635, 1.13088, 1.41, 1.30719, 1.30003, 1.92172, 1.44667, 1.42061, 1.31137, 1.5365, 1.46596, 1.30019, 1.53226, 1.21709, 1.36071, 1.47588, 1.10067, 1.46261, 1.69979, 1.33386, 1.3067, 1.50275, 1.48945, 1.4021, 1.56615, 1.59437, 1.41693, 1.52987, 1.27517, 1.55287, 1.38137, 1.28009, 1.33198, 1.29291, 1.40497, 1.25603, 1.18811, 1.37138, 1.43758, 1.46419, 1.4718, 1.35085, 1.22463, 1.2576, 1.44724, 1.32087, 1.61352, 1.4648, 1.47154, 1.80709, 1.41366, 1.12723]}, "grad-norm vs samples": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [13.92196, 13.64105, 14.19575, 13.10329, 13.56093, 11.06924, 10.32704, 12.58903, 11.89406, 9.6749, 7.04626, 4.0336, 3.15187, 2.82418, 2.35804, 2.43442, 2.16004, 1.97461, 2.14035, 2.12249, 2.20138, 2.2657, 2.05671, 2.22896, 1.95829, 2.02503, 1.88632, 1.84693, 1.87101, 2.18322, 2.10962, 1.97689, 1.94956, 2.15482, 2.33059, 2.0713, 2.06596, 1.83468, 1.98146, 1.78906, 2.08095, 1.74031, 1.73584, 1.83223, 1.93635, 1.78517, 1.74533, 1.74989, 1.72773, 1.51419, 1.74951, 1.76214, 1.76755, 1.83739, 1.54724, 1.80208, 1.67454, 1.80868, 1.51645, 1.42949, 1.65422, 1.43167, 1.74384, 1.82674, 1.56795, 1.61973, 1.62231, 1.51322, 1.4269, 1.55439, 1.3649, 1.40671, 1.47679, 1.40979, 1.35488, 1.43798, 1.41114, 1.34745, 1.32431, 1.23395, 1.36576, 1.22914, 1.25372, 1.35028, 1.23455, 1.29297, 1.37717, 1.26373, 1.37004, 1.08995, 1.10379, 1.10875, 1.15108, 1.26523, 0.89985, 1.39001, 1.10735, 1.30884, 1.00577, 1.31705, 1.15922, 1.16049, 1.08293, 1.30514, 0.98385, 1.11074, 1.1592, 0.9745, 1.26156, 1.13226, 0.98984, 0.97441, 0.96023, 0.94898, 1.04337, 1.04095, 0.96044, 1.19634, 1.26146, 1.4137, 0.97849, 1.01274, 1.06643, 1.01496, 0.94459, 1.13752, 1.02579, 1.05074, 1.22247, 1.26548, 1.04774, 1.44863, 1.15549, 1.15597, 1.19734, 1.2287, 1.25743, 1.88802, 1.76897, 1.48112, 1.4651, 1.39709, 1.38654, 1.09404, 1.62425, 1.69258, 1.31425, 1.11912, 1.16099, 1.18343, 1.29282, 1.58176, 1.59702, 1.35711, 1.25116, 1.93028, 1.26411, 1.16234, 1.73045, 1.37516, 1.21056, 1.1698, 1.36362, 1.31019, 1.41174, 1.1141, 1.35444, 1.27655, 1.56101, 1.26438, 1.09582, 1.27416, 1.41508, 1.54422, 1.36323, 1.24407, 1.29014, 1.18935, 1.13176, 1.03122, 1.33001, 1.37077, 1.14753, 1.11258, 1.66325, 1.11887, 1.76805, 1.40233, 1.37783, 1.50291, 1.27142, 1.30216, 1.29887, 1.46138, 1.55382, 1.23876, 1.8076, 1.40113, 1.63396, 1.55057, 1.08699, 1.24471, 1.22211, 1.14251, 1.26485, 1.45246, 1.55789, 1.71804, 1.37054, 1.61527, 1.57346, 1.43675, 1.26103, 1.17063, 1.56904, 1.17977, 1.4408, 1.72049, 1.50941, 1.30391, 1.34373, 1.32377, 1.27909, 1.56247, 1.31671, 1.38601, 1.61151, 1.49478, 1.75857, 1.27914, 1.31454, 2.08285, 1.65152, 1.54337, 1.46369, 1.68505, 1.74708, 1.34813, 1.53151, 1.36655, 1.5068, 1.33926, 1.42092, 1.39573, 1.3088, 1.90711, 1.46652, 1.29613, 1.44842, 
1.30354, 1.28453, 1.49548, 1.47812, 1.39914, 1.32083, 1.19715, 1.79989, 1.43253, 1.35222, 1.42532, 1.23793, 1.41904, 1.21814, 1.25683, 1.2335, 1.46238, 1.48727, 1.4808, 1.33354, 1.33662, 1.26457, 1.31807, 1.46217, 1.35853, 1.55295, 1.20988, 1.50233, 1.51611, 1.48328, 1.32591, 1.35903, 1.25739, 1.45462, 1.40772, 1.52784, 1.49325, 1.48176, 1.41498, 1.37099, 1.4565, 1.35995, 1.85538, 1.22436, 1.50223, 1.62834, 2.02006, 1.60123, 1.72187, 1.44841, 1.22003, 1.2907, 1.31733, 1.13053, 1.33575, 1.57284, 1.47894, 1.41277, 1.40064, 1.30099, 1.35607, 1.52515, 1.48522, 1.31187, 1.24496, 1.36995, 1.60389, 1.24009, 1.55027, 1.2329, 1.34795, 1.32343, 1.38946, 1.27338, 1.46297, 1.50613, 1.56272, 1.67908, 1.41893, 1.40655, 1.34016, 1.79612, 1.52344, 1.31538, 1.82889, 1.5317, 1.18989, 1.44241, 1.33335, 1.49631, 1.45109, 1.41567, 1.28181, 1.28831, 1.39113, 1.42151, 1.1475, 1.49249, 1.42727, 1.4635, 1.13088, 1.41, 1.30719, 1.30003, 1.92172, 1.44667, 1.42061, 1.31137, 1.5365, 1.46596, 1.30019, 1.53226, 1.21709, 1.36071, 1.47588, 1.10067, 1.46261, 1.69979, 1.33386, 1.3067, 1.50275, 1.48945, 1.4021, 1.56615, 1.59437, 1.41693, 1.52987, 1.27517, 1.55287, 1.38137, 1.28009, 1.33198, 1.29291, 1.40497, 1.25603, 1.18811, 1.37138, 1.43758, 1.46419, 1.4718, 1.35085, 1.22463, 1.2576, 1.44724, 1.32087, 1.61352, 1.4648, 1.47154, 1.80709, 1.41366, 1.12723]}, "num-zeros": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [65.0, 71.0, 74.0, 78.0, 68.0, 65.0, 79.0, 104.0, 95.0, 118.0, 116.0, 161.0, 141.0, 148.0, 182.0, 146.0, 164.0, 199.0, 174.0, 205.0, 166.0, 167.0, 186.0, 158.0, 195.0, 179.0, 188.0, 208.0, 187.0, 145.0, 145.0, 146.0, 156.0, 175.0, 132.0, 180.0, 177.0, 205.0, 172.0, 159.0, 158.0, 175.0, 153.0, 203.0, 196.0, 170.0, 185.0, 179.0, 140.0, 227.0, 198.0, 165.0, 172.0, 149.0, 199.0, 213.0, 179.0, 157.0, 255.0, 240.0, 186.0, 191.0, 164.0, 186.0, 208.0, 229.0, 213.0, 198.0, 198.0, 178.0, 246.0, 222.0, 177.0, 236.0, 193.0, 215.0, 226.0, 205.0, 251.0, 226.0, 224.0, 245.0, 219.0, 205.0, 198.0, 190.0, 171.0, 191.0, 171.0, 187.0, 182.0, 207.0, 233.0, 201.0, 220.0, 152.0, 216.0, 194.0, 175.0, 157.0, 165.0, 188.0, 163.0, 163.0, 160.0, 155.0, 160.0, 167.0, 144.0, 190.0, 194.0, 143.0, 153.0, 175.0, 158.0, 147.0, 166.0, 115.0, 142.0, 141.0, 117.0, 131.0, 132.0, 130.0, 164.0, 131.0, 136.0, 129.0, 150.0, 146.0, 133.0, 96.0, 139.0, 119.0, 108.0, 124.0, 109.0, 114.0, 113.0, 123.0, 125.0, 129.0, 99.0, 159.0, 109.0, 115.0, 127.0, 128.0, 101.0, 122.0, 118.0, 113.0, 110.0, 107.0, 112.0, 89.0, 107.0, 118.0, 89.0, 101.0, 127.0, 125.0, 111.0, 110.0, 121.0, 125.0, 111.0, 123.0, 109.0, 116.0, 118.0, 107.0, 87.0, 105.0, 121.0, 111.0, 127.0, 128.0, 116.0, 128.0, 116.0, 112.0, 135.0, 122.0, 106.0, 97.0, 100.0, 121.0, 94.0, 117.0, 124.0, 93.0, 116.0, 99.0, 114.0, 107.0, 96.0, 105.0, 102.0, 84.0, 138.0, 100.0, 100.0, 115.0, 133.0, 101.0, 99.0, 105.0, 116.0, 109.0, 100.0, 109.0, 120.0, 131.0, 107.0, 110.0, 111.0, 98.0, 118.0, 97.0, 122.0, 115.0, 121.0, 114.0, 91.0, 86.0, 116.0, 85.0, 79.0, 99.0, 97.0, 89.0, 103.0, 78.0, 108.0, 107.0, 78.0, 101.0, 99.0, 96.0, 119.0, 87.0, 98.0, 113.0, 112.0, 101.0, 78.0, 125.0, 101.0, 102.0, 137.0, 85.0, 97.0, 96.0, 119.0, 119.0, 93.0, 84.0, 94.0, 91.0, 132.0, 108.0, 113.0, 98.0, 127.0, 102.0, 88.0, 93.0, 124.0, 102.0, 99.0, 97.0, 99.0, 85.0, 103.0, 94.0, 108.0, 116.0, 103.0, 114.0, 105.0, 123.0, 122.0, 94.0, 104.0, 101.0, 103.0, 109.0, 115.0, 117.0, 125.0, 81.0, 115.0, 112.0, 116.0, 100.0, 108.0, 105.0, 97.0, 101.0, 105.0, 98.0, 124.0, 98.0, 101.0, 103.0, 123.0, 124.0, 122.0, 115.0, 102.0, 115.0, 
116.0, 122.0, 111.0, 88.0, 99.0, 95.0, 112.0, 122.0, 131.0, 110.0, 112.0, 96.0, 108.0, 100.0, 103.0, 106.0, 119.0, 104.0, 102.0, 97.0, 125.0, 93.0, 117.0, 133.0, 112.0, 137.0, 110.0, 104.0, 120.0, 115.0, 111.0, 118.0, 113.0, 100.0, 125.0, 108.0, 109.0, 122.0, 99.0, 128.0, 105.0, 112.0, 122.0, 112.0, 114.0, 109.0, 108.0, 111.0, 113.0, 114.0, 105.0, 101.0, 110.0, 104.0, 112.0, 114.0, 109.0, 92.0, 111.0, 102.0, 91.0, 119.0, 111.0, 95.0, 107.0, 97.0, 115.0, 108.0, 124.0, 118.0, 123.0, 119.0, 122.0, 112.0, 106.0, 101.0, 93.0, 116.0, 123.0, 112.0, 120.0, 87.0, 102.0, 116.0, 113.0, 118.0, 135.0, 110.0]}, "num-zeros vs samples": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [65.0, 71.0, 74.0, 78.0, 68.0, 65.0, 79.0, 104.0, 95.0, 118.0, 116.0, 161.0, 141.0, 148.0, 182.0, 146.0, 164.0, 199.0, 174.0, 205.0, 166.0, 167.0, 186.0, 158.0, 195.0, 179.0, 188.0, 208.0, 187.0, 145.0, 145.0, 146.0, 156.0, 175.0, 132.0, 180.0, 177.0, 205.0, 172.0, 159.0, 158.0, 175.0, 153.0, 203.0, 196.0, 170.0, 185.0, 179.0, 140.0, 227.0, 198.0, 165.0, 172.0, 149.0, 199.0, 213.0, 179.0, 157.0, 255.0, 240.0, 186.0, 191.0, 164.0, 186.0, 208.0, 229.0, 213.0, 198.0, 198.0, 178.0, 246.0, 222.0, 177.0, 236.0, 193.0, 215.0, 226.0, 205.0, 251.0, 226.0, 224.0, 245.0, 219.0, 205.0, 198.0, 190.0, 171.0, 191.0, 171.0, 187.0, 182.0, 207.0, 233.0, 201.0, 220.0, 152.0, 216.0, 194.0, 175.0, 157.0, 165.0, 188.0, 163.0, 163.0, 160.0, 155.0, 160.0, 167.0, 144.0, 190.0, 194.0, 143.0, 153.0, 175.0, 158.0, 147.0, 166.0, 115.0, 142.0, 141.0, 117.0, 131.0, 132.0, 130.0, 164.0, 131.0, 136.0, 129.0, 150.0, 146.0, 133.0, 96.0, 139.0, 119.0, 108.0, 124.0, 109.0, 114.0, 113.0, 123.0, 125.0, 129.0, 99.0, 159.0, 109.0, 115.0, 127.0, 128.0, 101.0, 122.0, 118.0, 113.0, 110.0, 107.0, 112.0, 89.0, 107.0, 118.0, 89.0, 101.0, 127.0, 125.0, 111.0, 110.0, 121.0, 125.0, 111.0, 123.0, 109.0, 116.0, 118.0, 107.0, 87.0, 105.0, 121.0, 111.0, 127.0, 128.0, 116.0, 128.0, 116.0, 112.0, 135.0, 122.0, 106.0, 97.0, 100.0, 121.0, 94.0, 117.0, 124.0, 93.0, 116.0, 99.0, 114.0, 107.0, 96.0, 105.0, 102.0, 84.0, 138.0, 100.0, 100.0, 115.0, 133.0, 101.0, 99.0, 105.0, 116.0, 109.0, 100.0, 109.0, 120.0, 131.0, 107.0, 110.0, 111.0, 98.0, 118.0, 97.0, 122.0, 115.0, 121.0, 114.0, 91.0, 86.0, 116.0, 85.0, 79.0, 99.0, 97.0, 89.0, 103.0, 78.0, 108.0, 107.0, 78.0, 101.0, 99.0, 96.0, 119.0, 87.0, 98.0, 113.0, 112.0, 101.0, 78.0, 125.0, 101.0, 102.0, 137.0, 85.0, 97.0, 96.0, 119.0, 119.0, 93.0, 84.0, 94.0, 91.0, 132.0, 108.0, 113.0, 98.0, 127.0, 102.0, 88.0, 93.0, 124.0, 102.0, 99.0, 97.0, 99.0, 85.0, 103.0, 94.0, 108.0, 116.0, 103.0, 114.0, 105.0, 123.0, 122.0, 94.0, 104.0, 101.0, 103.0, 109.0, 115.0, 117.0, 125.0, 81.0, 115.0, 112.0, 116.0, 100.0, 108.0, 105.0, 97.0, 101.0, 105.0, 98.0, 124.0, 98.0, 101.0, 103.0, 123.0, 124.0, 122.0, 115.0, 102.0, 115.0, 116.0, 122.0, 111.0, 88.0, 99.0, 95.0, 112.0, 122.0, 131.0, 110.0, 112.0, 96.0, 108.0, 100.0, 103.0, 106.0, 119.0, 104.0, 102.0, 97.0, 125.0, 93.0, 117.0, 133.0, 112.0, 137.0, 110.0, 104.0, 120.0, 115.0, 111.0, 118.0, 113.0, 100.0, 125.0, 108.0, 109.0, 122.0, 99.0, 128.0, 105.0, 112.0, 122.0, 112.0, 114.0, 109.0, 108.0, 111.0, 113.0, 114.0, 105.0, 101.0, 110.0, 104.0, 112.0, 114.0, 109.0, 92.0, 111.0, 102.0, 91.0, 119.0, 111.0, 95.0, 107.0, 97.0, 115.0, 108.0, 124.0, 118.0, 123.0, 119.0, 122.0, 112.0, 106.0, 101.0, 93.0, 116.0, 123.0, 112.0, 120.0, 87.0, 102.0, 116.0, 113.0, 118.0, 135.0, 110.0]}, "params-norm": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [180.02467, 180.43758, 180.43758, 180.43758, 
180.43777, 180.43793, 180.43831, 180.43793, 180.43802, 180.43829, 180.42958, 180.42429, 180.42331, 180.41566, 180.40419, 180.39563, 180.38898, 180.38272, 180.37767, 180.36685, 180.3624, 180.35443, 180.34731, 180.34276, 180.34059, 180.33824, 180.33578, 180.33348, 180.33122, 180.32733, 180.32587, 180.33919, 180.33737, 180.34363, 180.342, 180.33069, 180.32681, 180.32585, 180.32092, 180.32108, 180.32138, 180.32286, 180.32475, 180.31996, 180.32315, 180.32661, 180.32353, 180.32631, 180.3324, 180.33495, 180.34297, 180.3517, 180.3622, 180.37233, 180.38312, 180.39899, 180.40675, 180.41966, 180.43352, 180.4433, 180.45685, 180.47159, 180.48726, 180.50627, 180.52652, 180.54735, 180.56612, 180.58878, 180.60876, 180.63284, 180.65726, 180.68228, 180.70798, 180.73802, 180.77544, 180.79449, 180.82513, 180.85388, 180.88318, 180.90952, 180.93996, 180.9704, 181.00171, 181.03206, 181.06531, 181.1013, 181.13477, 181.15898, 181.19191, 181.22948, 181.26605, 181.30205, 181.33176, 181.36722, 181.40222, 181.43898, 181.4686, 181.50232, 181.53323, 181.56693, 181.60017, 181.63365, 181.66275, 181.69737, 181.73155, 181.76347, 181.8042, 181.83623, 181.86909, 181.90247, 181.93695, 181.96951, 182.00578, 182.04301, 182.07603, 182.11412, 182.15521, 182.18857, 182.22928, 182.26672, 182.3042, 182.34148, 182.37926, 182.41901, 182.45923, 182.49518, 182.53793, 182.57965, 182.61847, 182.65536, 182.6929, 182.72876, 182.76958, 182.80853, 182.85202, 182.88937, 182.92555, 182.96187, 182.99063, 183.02582, 183.05833, 183.08974, 183.12651, 183.16095, 183.19424, 183.233, 183.26149, 183.29265, 183.32909, 183.36882, 183.40269, 183.43456, 183.47014, 183.51022, 183.54683, 183.57953, 183.61252, 183.64738, 183.68155, 183.71558, 183.75716, 183.79567, 183.83615, 183.87654, 183.9173, 183.9584, 184.00073, 184.04141, 184.08711, 184.12192, 184.16089, 184.19904, 184.23912, 184.27597, 184.31317, 184.35162, 184.39233, 184.43021, 184.46562, 184.50061, 184.54076, 184.5798, 184.62137, 184.66426, 184.70601, 184.74544, 184.7812, 184.8163, 184.85382, 184.89362, 184.9332, 184.9715, 185.00937, 185.05093, 185.09132, 185.12502, 185.16487, 185.20316, 185.24188, 185.27464, 185.31422, 185.35551, 185.3972, 185.43919, 185.47906, 185.52074, 185.56161, 185.60054, 185.64554, 185.68713, 185.72649, 185.76546, 185.80576, 185.84767, 185.89198, 185.9361, 185.98022, 186.01895, 186.05711, 186.10294, 186.13905, 186.17926, 186.22005, 186.25861, 186.29631, 186.33633, 186.37819, 186.41498, 186.452, 186.48996, 186.52638, 186.56227, 186.59106, 186.62415, 186.66559, 186.70592, 186.74504, 186.78651, 186.83006, 186.87518, 186.91788, 186.96049, 187.00543, 187.05008, 187.09511, 187.13741, 187.17758, 187.21588, 187.25984, 187.30086, 187.34575, 187.39095, 187.43542, 187.4792, 187.51852, 187.56268, 187.60396, 187.64711, 187.68872, 187.73135, 187.77692, 187.81973, 187.86543, 187.91296, 187.96025, 188.00529, 188.04802, 188.0909, 188.13518, 188.18434, 188.22716, 188.27409, 188.32169, 188.36803, 188.41319, 188.45816, 188.50641, 188.54868, 188.59381, 188.6367, 188.68343, 188.72693, 188.77374, 188.8172, 188.86154, 188.90767, 188.95059, 188.99326, 189.04083, 189.08832, 189.13934, 189.1855, 189.2296, 189.27489, 189.32558, 189.36694, 189.41133, 189.45744, 189.50322, 189.54796, 189.59531, 189.6389, 189.68634, 189.73462, 189.78769, 189.83501, 189.88196, 189.92941, 189.97726, 190.02953, 190.08095, 190.13335, 190.18449, 190.23326, 190.28383, 190.33415, 190.38512, 190.43832, 190.49026, 190.5453, 190.59666, 190.65088, 190.70216, 190.75441, 190.80804, 190.85649, 190.90819, 190.957, 191.00778, 191.05713, 
191.10803, 191.15628, 191.20445, 191.25539, 191.30585, 191.35631, 191.40929, 191.46144, 191.5153, 191.5732, 191.6273, 191.67821, 191.73494, 191.79005, 191.84462, 191.89845, 191.95538, 192.01093, 192.06554, 192.1189, 192.17081, 192.2244, 192.2774, 192.33224, 192.38445, 192.44177, 192.49707, 192.55254, 192.60464, 192.65576, 192.70808, 192.76437, 192.82317, 192.88344, 192.93953, 192.99843, 193.05219, 193.1062, 193.16641, 193.22375, 193.28175, 193.3349, 193.39145, 193.44878, 193.50717, 193.55751, 193.61333, 193.66898, 193.72675, 193.79041, 193.84534, 193.90236, 193.96567, 194.0249, 194.08501, 194.14468, 194.2052, 194.2684, 194.32666, 194.38776, 194.44768, 194.50999, 194.57324, 194.63622, 194.69333, 194.74876, 194.80455, 194.86299, 194.92128, 194.97459]}, "params-norm vs samples": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [180.02467, 180.43758, 180.43758, 180.43758, 180.43777, 180.43793, 180.43831, 180.43793, 180.43802, 180.43829, 180.42958, 180.42429, 180.42331, 180.41566, 180.40419, 180.39563, 180.38898, 180.38272, 180.37767, 180.36685, 180.3624, 180.35443, 180.34731, 180.34276, 180.34059, 180.33824, 180.33578, 180.33348, 180.33122, 180.32733, 180.32587, 180.33919, 180.33737, 180.34363, 180.342, 180.33069, 180.32681, 180.32585, 180.32092, 180.32108, 180.32138, 180.32286, 180.32475, 180.31996, 180.32315, 180.32661, 180.32353, 180.32631, 180.3324, 180.33495, 180.34297, 180.3517, 180.3622, 180.37233, 180.38312, 180.39899, 180.40675, 180.41966, 180.43352, 180.4433, 180.45685, 180.47159, 180.48726, 180.50627, 180.52652, 180.54735, 180.56612, 180.58878, 180.60876, 180.63284, 180.65726, 180.68228, 180.70798, 180.73802, 180.77544, 180.79449, 180.82513, 180.85388, 180.88318, 180.90952, 180.93996, 180.9704, 181.00171, 181.03206, 181.06531, 181.1013, 181.13477, 181.15898, 181.19191, 181.22948, 181.26605, 181.30205, 181.33176, 181.36722, 181.40222, 181.43898, 181.4686, 181.50232, 181.53323, 181.56693, 181.60017, 181.63365, 181.66275, 181.69737, 181.73155, 181.76347, 181.8042, 181.83623, 181.86909, 181.90247, 181.93695, 181.96951, 182.00578, 182.04301, 182.07603, 182.11412, 182.15521, 182.18857, 182.22928, 182.26672, 182.3042, 182.34148, 182.37926, 182.41901, 182.45923, 182.49518, 182.53793, 182.57965, 182.61847, 182.65536, 182.6929, 182.72876, 182.76958, 182.80853, 182.85202, 182.88937, 182.92555, 182.96187, 182.99063, 183.02582, 183.05833, 183.08974, 183.12651, 183.16095, 183.19424, 183.233, 183.26149, 183.29265, 183.32909, 183.36882, 183.40269, 183.43456, 183.47014, 183.51022, 183.54683, 183.57953, 183.61252, 183.64738, 183.68155, 183.71558, 183.75716, 183.79567, 183.83615, 183.87654, 183.9173, 183.9584, 184.00073, 184.04141, 184.08711, 184.12192, 184.16089, 184.19904, 184.23912, 184.27597, 184.31317, 184.35162, 184.39233, 184.43021, 184.46562, 184.50061, 184.54076, 184.5798, 184.62137, 184.66426, 184.70601, 184.74544, 184.7812, 184.8163, 184.85382, 184.89362, 184.9332, 184.9715, 185.00937, 185.05093, 185.09132, 185.12502, 185.16487, 185.20316, 185.24188, 185.27464, 185.31422, 185.35551, 185.3972, 185.43919, 185.47906, 185.52074, 185.56161, 185.60054, 185.64554, 185.68713, 185.72649, 185.76546, 185.80576, 185.84767, 185.89198, 185.9361, 185.98022, 186.01895, 186.05711, 186.10294, 186.13905, 186.17926, 186.22005, 186.25861, 186.29631, 186.33633, 186.37819, 186.41498, 186.452, 186.48996, 186.52638, 186.56227, 186.59106, 186.62415, 186.66559, 186.70592, 186.74504, 186.78651, 186.83006, 186.87518, 186.91788, 186.96049, 187.00543, 187.05008, 187.09511, 187.13741, 187.17758, 
187.21588, 187.25984, 187.30086, 187.34575, 187.39095, 187.43542, 187.4792, 187.51852, 187.56268, 187.60396, 187.64711, 187.68872, 187.73135, 187.77692, 187.81973, 187.86543, 187.91296, 187.96025, 188.00529, 188.04802, 188.0909, 188.13518, 188.18434, 188.22716, 188.27409, 188.32169, 188.36803, 188.41319, 188.45816, 188.50641, 188.54868, 188.59381, 188.6367, 188.68343, 188.72693, 188.77374, 188.8172, 188.86154, 188.90767, 188.95059, 188.99326, 189.04083, 189.08832, 189.13934, 189.1855, 189.2296, 189.27489, 189.32558, 189.36694, 189.41133, 189.45744, 189.50322, 189.54796, 189.59531, 189.6389, 189.68634, 189.73462, 189.78769, 189.83501, 189.88196, 189.92941, 189.97726, 190.02953, 190.08095, 190.13335, 190.18449, 190.23326, 190.28383, 190.33415, 190.38512, 190.43832, 190.49026, 190.5453, 190.59666, 190.65088, 190.70216, 190.75441, 190.80804, 190.85649, 190.90819, 190.957, 191.00778, 191.05713, 191.10803, 191.15628, 191.20445, 191.25539, 191.30585, 191.35631, 191.40929, 191.46144, 191.5153, 191.5732, 191.6273, 191.67821, 191.73494, 191.79005, 191.84462, 191.89845, 191.95538, 192.01093, 192.06554, 192.1189, 192.17081, 192.2244, 192.2774, 192.33224, 192.38445, 192.44177, 192.49707, 192.55254, 192.60464, 192.65576, 192.70808, 192.76437, 192.82317, 192.88344, 192.93953, 192.99843, 193.05219, 193.1062, 193.16641, 193.22375, 193.28175, 193.3349, 193.39145, 193.44878, 193.50717, 193.55751, 193.61333, 193.66898, 193.72675, 193.79041, 193.84534, 193.90236, 193.96567, 194.0249, 194.08501, 194.14468, 194.2052, 194.2684, 194.32666, 194.38776, 194.44768, 194.50999, 194.57324, 194.63622, 194.69333, 194.74876, 194.80455, 194.86299, 194.92128, 194.97459]}, "iteration-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [26.15537, 1.59225, 1.58677, 1.61174, 1.60131, 1.58979, 1.6009, 1.60255, 1.59989, 1.59397, 1.59991, 1.60879, 1.59752, 1.58326, 1.60593, 1.58196, 1.58281, 1.58285, 1.65512, 1.58951, 1.57778, 1.59099, 1.59905, 1.5964, 1.60421, 1.59987, 1.60383, 1.59456, 1.59474, 1.60292, 1.59587, 1.59615, 1.59953, 1.68491, 1.61405, 1.61646, 1.76204, 1.6157, 1.60582, 1.60949, 1.60517, 1.60169, 1.5944, 1.59771, 1.59812, 1.61186, 1.60798, 1.59786, 1.69134, 1.607, 1.62116, 1.61495, 1.61958, 1.61282, 1.60615, 1.61947, 1.6053, 1.59812, 1.60103, 1.61637, 1.60915, 1.61703, 1.61268, 1.61077, 1.61236, 1.61876, 1.60773, 1.69396, 1.60939, 1.61301, 1.62827, 1.61429, 1.61159, 1.60859, 1.61405, 1.62895, 1.61614, 1.61446, 1.60675, 1.61067, 1.61896, 1.61461, 1.61244, 1.60436, 1.6079, 1.619, 1.61303, 1.61117, 1.61223, 1.60766, 1.62186, 1.60682, 1.60832, 1.60625, 1.60469, 1.61342, 1.60768, 1.60669, 1.59722, 1.69938, 1.61072, 1.61909, 1.61007, 1.6046, 1.60277, 1.61264, 1.61634, 1.61492, 1.61043, 1.62152, 1.61505, 1.61393, 1.61336, 1.61268, 1.61629, 1.61635, 1.62076, 1.61243, 1.61515, 1.61244, 1.61769, 1.61729, 1.60493, 1.60897, 1.61012, 1.61259, 1.6206, 1.60935, 1.61072, 1.61412, 1.62132, 1.61512, 1.61556, 1.61045, 1.6109, 1.61406, 1.61499, 1.60648, 1.62368, 1.61793, 1.62077, 1.61115, 1.607, 1.60097, 1.60715, 1.61148, 1.61713, 1.61144, 1.62249, 1.61481, 1.61115, 1.6037, 1.61119, 1.60767, 1.6172, 1.61279, 1.60574, 1.60707, 1.60482, 1.60401, 1.61113, 1.61346, 1.60704, 1.61142, 1.60677, 1.60612, 1.59885, 1.60751, 1.60394, 1.60565, 1.60074, 1.60646, 1.60139, 1.60114, 1.60502, 1.59931, 1.59106, 1.59528, 1.59562, 1.60655, 1.61019, 1.60604, 1.60255, 1.59481, 1.59218, 1.59628, 1.58975, 1.60275, 1.59914, 1.59723, 1.59728, 1.58386, 1.61425, 1.60353, 1.60061, 1.60375, 1.61192, 1.61512, 1.60494, 1.59982, 1.59392, 1.59773, 
1.59899, 1.60034, 1.59034, 1.59986, 1.59404, 1.59171, 1.58924, 1.58292, 1.59951, 1.58972, 1.60076, 1.59525, 1.60354, 1.60474, 1.6007, 1.60461, 1.60303, 1.68738, 1.61462, 1.6112, 1.60314, 1.60468, 1.60954, 1.61515, 1.60446, 1.60607, 1.60574, 1.60376, 1.60767, 1.60168, 1.60809, 1.60685, 1.59979, 1.59981, 1.59996, 1.60233, 1.61191, 1.60192, 1.60578, 1.61979, 1.6159, 1.61226, 1.6128, 1.60991, 1.62187, 1.61382, 1.60853, 1.61365, 1.6207, 1.63823, 1.61317, 1.60999, 1.6096, 1.6053, 1.62098, 1.60515, 1.61012, 1.60877, 1.61097, 1.62766, 1.61189, 1.61276, 1.61683, 1.61267, 1.62231, 1.61022, 1.61488, 1.61227, 1.60799, 1.61989, 1.61118, 1.60947, 1.61635, 1.60971, 1.61707, 1.61308, 1.60535, 1.61359, 1.60892, 1.61075, 1.60793, 1.60987, 1.61295, 1.61056, 1.60924, 1.61593, 1.60828, 1.62137, 1.60777, 1.6163, 1.61976, 1.60496, 1.61232, 1.60943, 1.60387, 1.61497, 1.60986, 1.61254, 1.61053, 1.61641, 1.62112, 1.60996, 1.62043, 1.61238, 1.61482, 1.61865, 1.61289, 1.61175, 1.61784, 1.61203, 1.6132, 1.60843, 1.61847, 1.61033, 1.6185, 1.61766, 1.6264, 1.62151, 1.62048, 1.61539, 1.61807, 1.61346, 1.60979, 1.61291, 1.61433, 1.61137, 1.616, 1.60714, 1.6154, 1.61351, 1.60767, 1.60384, 1.60001, 1.59921, 1.60103, 1.60417, 1.60117, 1.59284, 1.60079, 1.59673, 1.59125, 1.59593, 1.59394, 1.59478, 1.59263, 1.59408, 1.59955, 1.66468, 1.59302, 1.59156, 1.59525, 1.62673, 1.61448, 1.60772, 1.60098, 1.6066, 1.62998, 1.62933, 1.6147, 1.61299, 1.61044, 1.62556, 1.61734, 1.61197, 1.61149, 1.61287, 1.62523, 1.61258, 1.60355, 1.6117, 1.61092, 1.60763, 1.61177, 1.61161, 1.6207, 1.61553, 1.62712, 1.62883, 1.6176, 1.62185, 1.60923, 1.61676, 1.62142, 1.62074, 1.61866, 1.61459, 1.59668, 1.61134, 1.60642, 1.60975, 1.61506, 1.60601, 1.62434, 1.61024, 1.61231, 1.61973, 1.61419, 1.61888]}, "lm loss validation": {"start_step": 0, "end_step": 4, "step_interval": 5, "values": [5.5974]}, "lm loss validation vs samples": {"start_step": 0, "end_step": 4, "step_interval": 5, "values": [5.5974]}, "lm loss validation ppl": {"start_step": 0, "end_step": 4, "step_interval": 5, "values": [269.72311]}, "lm loss validation ppl vs samples": {"start_step": 0, "end_step": 4, "step_interval": 5, "values": [269.72311]}} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_native_fp8_tp_pp_sp/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_native_fp8_tp_pp_sp/model_config.yaml new file mode 100644 index 0000000000..de27041eba --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_native_fp8_tp_pp_sp/model_config.yaml @@ -0,0 +1,55 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NVTE_FUSED_ATTN: 0 + NCCL_ALGO: Tree + CUBLAS_WORKSPACE_CONFIG: :4096:8 +MODEL_ARGS: + --num-layers: 12 + --hidden-size: 512 + --num-attention-heads: 8 + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --tensorboard-dir: ${TENSORBOARD_PATH} + --micro-batch-size: 2 + --global-batch-size: 128 + --seq-length: 1024 + --max-position-embeddings: 1024 + --train-iters: 2000 + --timing-log-level: 2 + --lr-decay-iters: 320000 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --vocab-file: ${DATA_PATH}/bpe/vocab.json + --merge-file: ${DATA_PATH}/bpe/merges.txt + --split: 949,50,1 + --distributed-backend: nccl + --lr: 0.00015 + 
--lr-decay-style: cosine + --min-lr: 1.0e-5 + --weight-decay: 1e-2 + --clip-grad: 1.0 + --lr-warmup-fraction: .01 + --log-interval: 1 + --save-interval: 10000 + --eval-interval: 1000 + --eval-iters: 10 + --transformer-impl: transformer_engine + --tensor-model-parallel-size: 2 + --pipeline-model-parallel-size: 2 + --sequence-parallel: true + --deterministic-mode: true + --no-gradient-accumulation-fusion: true + --fp8-format: hybrid + --fp8-amax-history-len: 1024 + --fp8-amax-compute-algo: max + --fp8-param-gather: true + --use-distributed-optimizer: true + --attention-softmax-in-fp32: true + --ckpt-format: torch_dist + --data-cache-path: ${DATA_CACHE_PATH} + --bf16: true +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp4_pp2_fp8_tp_pp/golden_values_dev.json b/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp4_pp2_fp8_tp_pp/golden_values_dev.json new file mode 100644 index 0000000000..3d10208bdb --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp4_pp2_fp8_tp_pp/golden_values_dev.json @@ -0,0 +1 @@ +{"forward-backward-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [23.87084, 2.7908, 2.78539, 2.7894, 2.7852, 2.79146, 2.78472, 2.78272, 2.79513, 2.79226, 2.78492, 2.79008, 2.7883, 2.79109, 2.79145, 2.79405, 2.79452, 2.79382, 2.79611, 2.79622, 2.79284, 2.79072, 2.79713, 2.79936, 2.79764, 2.78902, 2.79179, 2.79398, 2.79758, 2.78776, 2.79263, 2.79691, 2.80152, 2.80908, 2.80472, 2.79568, 2.80506, 2.80202, 2.80799, 2.80521, 2.80461, 2.8094, 2.80343, 2.80761, 2.81112, 2.81918, 2.80453, 2.80312, 2.80829, 2.80344, 2.80562, 2.80427, 2.79734, 2.81406, 2.90515, 2.82407, 2.81478, 2.81303, 2.81592, 2.81601, 2.82191, 2.81825, 2.82313, 2.81813, 2.8193, 2.81849, 2.80988, 2.81403, 2.81327, 2.80905, 2.80847, 2.80536, 2.80854, 2.8101, 2.81145, 2.80684, 2.81147, 2.81242, 2.80609, 2.80189, 2.79515, 2.7996, 2.80311, 2.8045, 2.80721, 2.80272, 2.81517, 2.80665, 2.81404, 2.81132, 2.80918, 2.80977, 2.80802, 2.80672, 2.80661, 2.80353, 2.81098, 2.80324, 2.80589, 2.80502, 2.80911, 2.80853, 2.80753, 2.80189, 2.80083, 2.8104, 2.80739, 2.80143, 2.8113, 2.80321, 2.80139, 2.79801, 2.80488, 2.80348, 2.80222, 2.80147, 2.80475, 2.79774, 2.79626, 2.80141, 2.80405, 2.80603, 2.80138, 2.80245, 2.79478, 2.80184, 2.80852, 2.8046, 2.81228, 2.80607, 2.80189, 2.80761, 2.80561, 2.8108, 2.79699, 2.80217, 2.82211, 2.79924, 2.81403, 2.80853, 2.8231, 2.81577, 2.8231, 2.82156, 2.81887, 2.82238, 2.81839, 2.82501, 2.81996, 2.82429, 2.82644, 2.82806, 2.82682, 2.8177, 2.81557, 2.82321, 2.80343, 2.83308, 2.81556, 2.80394, 2.8065, 2.80837, 2.80217, 2.81017, 2.80941, 2.80836, 2.80137, 2.80618, 2.8106, 2.81859, 2.81372, 2.80415, 2.81048, 2.80289, 2.8074, 2.80851, 2.80327, 2.80386, 2.80501, 2.80423, 2.80829, 2.80479, 2.80551, 2.80503, 2.80867, 2.80686, 2.80919, 2.80825, 2.80825, 2.80524, 2.8104, 2.81017, 2.8092, 2.80887, 2.80127, 2.80865, 2.81409, 2.81338, 2.81622, 2.81551, 2.78402, 2.78667, 2.77607, 2.78149, 2.79485, 2.77794, 2.77679, 2.77522, 2.77183, 2.76873, 2.76746, 2.78341, 2.77337, 2.77333, 2.77216, 2.76418, 2.77521, 2.77572, 2.77007, 2.77107, 2.77433, 2.7767, 2.77171, 2.78519, 2.77337, 2.77435, 2.77481, 2.77069, 2.77522, 2.77587, 2.78393, 2.7743, 2.78225, 2.77729, 2.7811, 2.77531, 2.77781, 2.77542, 2.76967, 2.77202, 2.77351, 2.78458, 2.77568, 2.78594, 2.7783, 2.78007, 2.78444, 2.77342, 2.77788, 2.8174, 2.80994, 2.81175, 2.8116, 2.80961, 2.81294, 2.80664, 2.82069, 2.80473, 2.80257, 2.80502, 
2.79658, 2.80824, 2.80374, 2.80925, 2.80871, 2.80288, 2.82051, 2.81324, 2.81301, 2.81015, 2.81433, 2.81771, 2.82163, 2.82047, 2.84243, 2.82391, 2.82193, 2.82874, 2.82499, 2.82329, 2.82269, 2.78491, 2.78347, 2.78283, 2.77915, 2.78184, 2.78745, 2.77885, 2.78616, 2.78454, 2.79387, 2.78599, 2.78264, 2.78415, 2.77954, 2.78012, 2.77574, 2.77417, 2.77157, 2.77598, 2.78523, 2.78094, 2.77956, 2.78155, 2.76974, 2.76609, 2.77059, 2.7715, 2.77799, 2.78545, 2.79125, 2.78957, 2.7735, 2.77351, 2.77438, 2.77082, 2.76702, 2.76913, 2.77001, 2.77136, 2.77805, 2.77172, 2.77423, 2.77469, 2.76739, 2.76274, 2.76413, 2.769, 2.7747, 2.77447, 2.77236, 2.77322, 2.77126, 2.76432, 2.77139, 2.75782, 2.76437, 2.77311, 2.77485, 2.77226, 2.7716, 2.77527, 2.76108, 2.76967, 2.76835, 2.76738, 2.77531, 2.77528, 2.76726, 2.77204, 2.76615, 2.76217, 2.76346, 2.76358, 2.86867, 2.76052, 2.76931, 2.77037, 2.76368, 2.76923, 2.76194, 2.77432, 2.77035, 2.76442, 2.77453, 2.76955, 2.75944, 2.76101, 2.76318, 2.76891, 2.7675, 2.77756, 2.77522, 2.76826, 2.76436, 2.77785, 2.77783, 2.76832, 2.76347, 2.76291, 2.77118, 2.76677, 2.76612, 2.76582, 2.76273, 2.75857, 2.75873, 2.7722, 2.76177, 2.77171, 2.77644, 2.7639, 2.7721, 2.76437, 2.76496, 2.78781, 2.7708, 2.77914, 2.7677, 2.77621]}, "forward-compute-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [9.51205, 1.43678, 1.43791, 1.4403, 1.43427, 1.43756, 1.43758, 1.43562, 1.44189, 1.44431, 1.43685, 1.43669, 1.43665, 1.43656, 1.44116, 1.44015, 1.44001, 1.44016, 1.4435, 1.44113, 1.44161, 1.44108, 1.44253, 1.44731, 1.44571, 1.43765, 1.44091, 1.44413, 1.44785, 1.43882, 1.44323, 1.43963, 1.44096, 1.44584, 1.4433, 1.43872, 1.44424, 1.44585, 1.4456, 1.44851, 1.44579, 1.4472, 1.44488, 1.44427, 1.44702, 1.44843, 1.44696, 1.44174, 1.44868, 1.44573, 1.44263, 1.44873, 1.44368, 1.45098, 1.50386, 1.46222, 1.45889, 1.46823, 1.45958, 1.46199, 1.45939, 1.46248, 1.46055, 1.46617, 1.46663, 1.46838, 1.45647, 1.45342, 1.45158, 1.44745, 1.45071, 1.44757, 1.45057, 1.45354, 1.45015, 1.45365, 1.45031, 1.45396, 1.44855, 1.44723, 1.44555, 1.44612, 1.44775, 1.44969, 1.45014, 1.4487, 1.447, 1.44896, 1.4498, 1.45306, 1.45037, 1.4495, 1.44838, 1.44482, 1.45215, 1.448, 1.45159, 1.44448, 1.44896, 1.44752, 1.44756, 1.45023, 1.45026, 1.44675, 1.44444, 1.45064, 1.44643, 1.44631, 1.45024, 1.44933, 1.44526, 1.44522, 1.44467, 1.4481, 1.44864, 1.45043, 1.45185, 1.44907, 1.44793, 1.45106, 1.44909, 1.44946, 1.44262, 1.43975, 1.44103, 1.44743, 1.45025, 1.4482, 1.45283, 1.44737, 1.44579, 1.44509, 1.44631, 1.44428, 1.44535, 1.45213, 1.45201, 1.44741, 1.45012, 1.45313, 1.47204, 1.46712, 1.47171, 1.47404, 1.47244, 1.46786, 1.46879, 1.46914, 1.47064, 1.46718, 1.47001, 1.47261, 1.47278, 1.46528, 1.46833, 1.46966, 1.44696, 1.45977, 1.44861, 1.44782, 1.44378, 1.44407, 1.44816, 1.45245, 1.449, 1.44784, 1.4449, 1.44523, 1.44905, 1.45312, 1.44739, 1.44742, 1.45369, 1.44478, 1.44662, 1.44949, 1.4459, 1.4448, 1.44385, 1.44392, 1.45267, 1.44333, 1.44892, 1.44724, 1.4485, 1.44583, 1.44996, 1.4476, 1.4446, 1.44975, 1.451, 1.45004, 1.44925, 1.45149, 1.44617, 1.44967, 1.44957, 1.45131, 1.45283, 1.4513, 1.42552, 1.41683, 1.41289, 1.41323, 1.41749, 1.41143, 1.41101, 1.4112, 1.4135, 1.41006, 1.4137, 1.41016, 1.41535, 1.41173, 1.41324, 1.40716, 1.40976, 1.40928, 1.41, 1.40851, 1.40949, 1.41481, 1.40726, 1.41247, 1.40893, 1.40726, 1.41201, 1.41338, 1.41944, 1.41452, 1.41165, 1.41022, 1.41318, 1.41802, 1.41449, 1.41063, 1.41492, 1.41265, 1.41132, 1.41365, 1.41475, 1.41847, 1.41122, 1.41128, 1.41301, 1.41405, 1.41415, 1.41581, 1.41619, 
1.42827, 1.42088, 1.42041, 1.42456, 1.42192, 1.42307, 1.42073, 1.42805, 1.42078, 1.42396, 1.42359, 1.42048, 1.42105, 1.41976, 1.4247, 1.42503, 1.42186, 1.42845, 1.42785, 1.42791, 1.4201, 1.42849, 1.42307, 1.43185, 1.43491, 1.44341, 1.43591, 1.44767, 1.44319, 1.43803, 1.4396, 1.43766, 1.41441, 1.41492, 1.41502, 1.41802, 1.41644, 1.41395, 1.4088, 1.41436, 1.41116, 1.41904, 1.41497, 1.4117, 1.41375, 1.41211, 1.41098, 1.41349, 1.40846, 1.41118, 1.41363, 1.41608, 1.41063, 1.40863, 1.40931, 1.40576, 1.40253, 1.40633, 1.4031, 1.40517, 1.40582, 1.40973, 1.41428, 1.41255, 1.41129, 1.4127, 1.41154, 1.40611, 1.40611, 1.40794, 1.41156, 1.40745, 1.41035, 1.4097, 1.40988, 1.40878, 1.40716, 1.40765, 1.41137, 1.4109, 1.40902, 1.41507, 1.40796, 1.41525, 1.40249, 1.40831, 1.39916, 1.40546, 1.40999, 1.41032, 1.41283, 1.41312, 1.40738, 1.40936, 1.40757, 1.41053, 1.40694, 1.40948, 1.41066, 1.40854, 1.40655, 1.41367, 1.41378, 1.40999, 1.41174, 1.51942, 1.40444, 1.4119, 1.41683, 1.40936, 1.41487, 1.40883, 1.41143, 1.41268, 1.40887, 1.41527, 1.41408, 1.41281, 1.41183, 1.4134, 1.4109, 1.41349, 1.41109, 1.41503, 1.4111, 1.40948, 1.41361, 1.41212, 1.40741, 1.40997, 1.41405, 1.41032, 1.40943, 1.40908, 1.40969, 1.40965, 1.40759, 1.41424, 1.41408, 1.41111, 1.41223, 1.4114, 1.41026, 1.41191, 1.40822, 1.40981, 1.41905, 1.4096, 1.41551, 1.40808, 1.41685]}, "backward-compute-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [5.76315, 1.31571, 1.31593, 1.31502, 1.31389, 1.32096, 1.31535, 1.31393, 1.31645, 1.31983, 1.31373, 1.31879, 1.31981, 1.31802, 1.31437, 1.31804, 1.3168, 1.3164, 1.31781, 1.31891, 1.31627, 1.31955, 1.31518, 1.32254, 1.32375, 1.31999, 1.31794, 1.32051, 1.32225, 1.32201, 1.32279, 1.32113, 1.32401, 1.32399, 1.32517, 1.32129, 1.32334, 1.32013, 1.32408, 1.32339, 1.32077, 1.32325, 1.32393, 1.32691, 1.3248, 1.32346, 1.32319, 1.32546, 1.32574, 1.32432, 1.32506, 1.32316, 1.32102, 1.32498, 1.31925, 1.32089, 1.31762, 1.32259, 1.32419, 1.3238, 1.3311, 1.31611, 1.31766, 1.31858, 1.31753, 1.31906, 1.32287, 1.32538, 1.32481, 1.32145, 1.32464, 1.32198, 1.3244, 1.32137, 1.31992, 1.31987, 1.32194, 1.31437, 1.3176, 1.31699, 1.31617, 1.31875, 1.32414, 1.32452, 1.31883, 1.32118, 1.32409, 1.32097, 1.32779, 1.31828, 1.31626, 1.32197, 1.32549, 1.32434, 1.32206, 1.31897, 1.31696, 1.32081, 1.31817, 1.32008, 1.32093, 1.32034, 1.32057, 1.3194, 1.31784, 1.32222, 1.31761, 1.31937, 1.32438, 1.32014, 1.31951, 1.31748, 1.31751, 1.31806, 1.31789, 1.32196, 1.32358, 1.31991, 1.31901, 1.32185, 1.32603, 1.32323, 1.32207, 1.31786, 1.31601, 1.32365, 1.32045, 1.31939, 1.32039, 1.31927, 1.31562, 1.32046, 1.31813, 1.32192, 1.31787, 1.31521, 1.33243, 1.31979, 1.3209, 1.32524, 1.32073, 1.31982, 1.31934, 1.32334, 1.31999, 1.32008, 1.32149, 1.32088, 1.31917, 1.3216, 1.3281, 1.32441, 1.33089, 1.32051, 1.31858, 1.32678, 1.32537, 1.3342, 1.32893, 1.32448, 1.32645, 1.32391, 1.3234, 1.32535, 1.32031, 1.32412, 1.3238, 1.32447, 1.32647, 1.32957, 1.32786, 1.3237, 1.32721, 1.32175, 1.32877, 1.32685, 1.32128, 1.32422, 1.32282, 1.32689, 1.33079, 1.33206, 1.32599, 1.32533, 1.32086, 1.32573, 1.32664, 1.31836, 1.32782, 1.32904, 1.32799, 1.32601, 1.32546, 1.32741, 1.32429, 1.32809, 1.32601, 1.32401, 1.32374, 1.32751, 1.32317, 1.32231, 1.32071, 1.32437, 1.32903, 1.3223, 1.32056, 1.32302, 1.32275, 1.32175, 1.31913, 1.32111, 1.3226, 1.32065, 1.32224, 1.31853, 1.32253, 1.32127, 1.3209, 1.31926, 1.31964, 1.3227, 1.32157, 1.32205, 1.3223, 1.31767, 1.31875, 1.31811, 1.3211, 1.3162, 1.32259, 1.3172, 1.31878, 1.31747, 1.32111, 1.31966, 1.31682, 
1.32112, 1.31521, 1.31669, 1.31901, 1.32814, 1.32216, 1.32442, 1.32313, 1.32151, 1.3243, 1.3203, 1.31897, 1.32073, 1.32493, 1.3246, 1.31844, 1.3284, 1.32684, 1.31608, 1.32499, 1.31768, 1.31464, 1.31825, 1.31743, 1.32077, 1.31974, 1.32195, 1.32195, 1.32016, 1.32093, 1.32005, 1.32407, 1.31906, 1.32446, 1.32365, 1.32141, 1.32093, 1.33319, 1.32834, 1.32237, 1.32312, 1.31793, 1.32722, 1.31541, 1.322, 1.3218, 1.31794, 1.31628, 1.31547, 1.32499, 1.31709, 1.317, 1.32129, 1.32324, 1.3231, 1.32155, 1.32292, 1.32269, 1.32156, 1.31852, 1.31872, 1.31758, 1.32143, 1.32104, 1.32353, 1.32012, 1.32147, 1.32263, 1.32328, 1.32548, 1.32214, 1.32307, 1.32574, 1.32903, 1.3278, 1.32381, 1.32116, 1.32264, 1.32367, 1.31807, 1.32574, 1.32105, 1.32208, 1.32432, 1.32324, 1.32004, 1.32242, 1.32161, 1.32001, 1.32057, 1.31875, 1.32152, 1.32786, 1.32575, 1.32357, 1.3226, 1.31921, 1.32595, 1.31832, 1.31725, 1.32287, 1.32418, 1.32617, 1.32128, 1.32384, 1.31932, 1.32117, 1.3209, 1.32292, 1.32281, 1.33147, 1.32181, 1.32357, 1.32241, 1.32062, 1.32002, 1.32089, 1.32929, 1.3178, 1.31998, 1.32166, 1.32279, 1.32038, 1.31604, 1.321, 1.31845, 1.31976, 1.32049, 1.32671, 1.30205, 1.30334, 1.30428, 1.30688, 1.30105, 1.306, 1.30598, 1.30505, 1.30135, 1.30452, 1.30666, 1.30463, 1.30387, 1.30213, 1.30721, 1.30426, 1.30532, 1.30358, 1.30289, 1.30331, 1.30072, 1.30374, 1.30623, 1.30837, 1.30441, 1.30441, 1.30428, 1.30182, 1.29924, 1.31777, 1.31621, 1.32106, 1.31759, 1.32273]}, "batch-generator-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [4.17805, 0.02532, 0.02443, 0.0259, 0.02446, 0.02433, 0.02525, 0.02434, 0.02571, 0.02834, 0.02652, 0.02646, 0.02518, 0.02481, 0.0279, 0.02807, 0.0266, 0.02845, 0.0313, 0.02866, 0.02895, 0.02709, 0.02883, 0.02971, 0.03025, 0.02951, 0.02896, 0.03006, 0.03215, 0.0295, 0.03352, 0.02739, 0.02956, 0.02814, 0.02868, 0.02699, 0.02842, 0.03193, 0.02797, 0.02967, 0.0318, 0.02963, 0.02835, 0.02797, 0.02797, 0.03173, 0.02956, 0.02665, 0.02908, 0.02921, 0.02665, 0.02893, 0.02866, 0.02772, 0.02944, 0.03233, 0.02893, 0.03067, 0.03096, 0.02981, 0.02909, 0.02673, 0.02735, 0.03183, 0.03003, 0.02892, 0.02792, 0.03046, 0.02823, 0.03032, 0.03123, 0.02966, 0.03045, 0.03048, 0.03141, 0.03097, 0.02999, 0.03135, 0.0285, 0.02735, 0.02803, 0.02831, 0.02764, 0.03034, 0.02971, 0.02926, 0.02972, 0.02952, 0.03075, 0.03009, 0.02964, 0.02882, 0.03045, 0.02898, 0.02803, 0.02824, 0.02708, 0.02867, 0.0342, 0.03142, 0.03184, 0.03236, 0.03305, 0.03116, 0.02898, 0.03026, 0.02775, 0.02983, 0.03023, 0.02832, 0.03086, 0.02777, 0.03086, 0.0307, 0.02887, 0.03065, 0.03095, 0.02937, 0.02703, 0.02981, 0.02895, 0.03324, 0.02658, 0.02662, 0.02448, 0.02629, 0.02739, 0.0271, 0.02673, 0.0253, 0.02683, 0.02718, 0.02671, 0.0276, 0.02593, 0.02704, 0.0285, 0.02845, 0.02811, 0.02883, 0.03435, 0.03167, 0.03261, 0.03235, 0.03414, 0.03091, 0.03163, 0.02955, 0.03106, 0.03182, 0.03113, 0.03157, 0.03216, 0.03397, 0.03111, 0.02941, 0.02991, 0.02875, 0.03204, 0.02798, 0.02854, 0.03038, 0.02648, 0.02916, 0.02799, 0.02855, 0.02792, 0.0274, 0.02603, 0.02879, 0.0292, 0.02864, 0.02841, 0.02759, 0.02946, 0.02947, 0.02937, 0.02887, 0.0288, 0.02812, 0.02927, 0.02796, 0.02893, 0.02755, 0.0266, 0.02892, 0.02827, 0.02802, 0.02761, 0.0284, 0.03055, 0.02773, 0.02955, 0.02851, 0.02789, 0.02748, 0.0272, 0.02827, 0.02809, 0.02816, 0.40686, 0.0267, 0.02546, 0.02555, 0.02624, 0.02523, 0.02567, 0.0279, 0.02868, 0.02572, 0.02653, 0.02383, 0.02613, 0.02506, 0.0243, 0.02629, 0.02418, 0.02447, 0.02537, 0.02552, 0.02379, 0.02344, 0.02378, 0.02314, 0.02354, 0.02382, 
0.02379, 0.02659, 0.02476, 0.02631, 0.02468, 0.02598, 0.02324, 0.02455, 0.0251, 0.02405, 0.02442, 0.02377, 0.02361, 0.02478, 0.02379, 0.02477, 0.02439, 0.02295, 0.02552, 0.02359, 0.02286, 0.02462, 0.02531, 0.03164, 0.0315, 0.03143, 0.03142, 0.03168, 0.03139, 0.03399, 0.03158, 0.03159, 0.03346, 0.03175, 0.03166, 0.03151, 0.03142, 0.03168, 0.0317, 0.03164, 0.03167, 0.03175, 0.03163, 0.03326, 0.03172, 0.03141, 0.03173, 0.0333, 0.03168, 0.03167, 0.03183, 0.03165, 0.03174, 0.03408, 0.03301, 0.0256, 0.02643, 0.03, 0.02476, 0.02404, 0.02678, 0.02289, 0.02528, 0.02495, 0.02516, 0.02679, 0.02413, 0.0253, 0.02382, 0.02499, 0.02624, 0.02366, 0.02553, 0.02515, 0.02467, 0.02526, 0.02422, 0.02599, 0.02234, 0.02467, 0.02456, 0.02225, 0.02224, 0.02432, 0.02273, 0.02327, 0.02338, 0.02313, 0.02296, 0.02582, 0.02257, 0.02356, 0.02376, 0.02243, 0.02388, 0.02445, 0.02411, 0.02604, 0.02457, 0.02385, 0.02605, 0.02638, 0.02472, 0.02454, 0.02557, 0.02531, 0.02518, 0.02578, 0.02479, 0.02654, 0.02415, 0.02363, 0.02446, 0.02512, 0.02364, 0.02344, 0.0248, 0.02395, 0.02369, 0.02275, 0.0266, 0.02372, 0.02937, 0.02788, 0.02818, 0.02749, 0.0294, 0.02843, 0.02616, 0.02729, 0.02853, 0.02827, 0.02973, 0.02869, 0.02904, 0.02745, 0.02987, 0.02735, 0.02842, 0.02783, 0.02939, 0.02873, 0.02953, 0.02571, 0.02937, 0.02728, 0.03078, 0.02725, 0.02698, 0.02961, 0.02757, 0.02692, 0.02716, 0.02762, 0.02805, 0.02617, 0.02782, 0.02921, 0.02637, 0.02679, 0.02731, 0.02744, 0.02767, 0.02735, 0.02706, 0.02798, 0.02659, 0.02462, 0.02353, 0.02612, 0.02398, 0.02999, 0.02748, 0.02836]}, "forward-recv-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [6.80244, 0.02327, 0.02357, 0.02418, 0.02403, 0.02416, 0.02299, 0.02437, 0.02654, 0.02645, 0.02351, 0.02322, 0.02321, 0.02333, 0.02356, 0.02407, 0.02284, 0.02336, 0.02305, 0.02309, 0.02437, 0.02382, 0.02371, 0.02295, 0.0237, 0.02304, 0.02301, 0.02347, 0.02339, 0.02268, 0.02304, 0.02357, 0.02381, 0.02335, 0.02274, 0.02277, 0.02379, 0.02387, 0.02489, 0.023, 0.02356, 0.02397, 0.02382, 0.0233, 0.02371, 0.02556, 0.02297, 0.02329, 0.02457, 0.02391, 0.02309, 0.02372, 0.02319, 0.02317, 0.02516, 0.02376, 0.02587, 0.02328, 0.02429, 0.02353, 0.02342, 0.02529, 0.02337, 0.02294, 0.02608, 0.0263, 0.02427, 0.02258, 0.02358, 0.02315, 0.02427, 0.02338, 0.02373, 0.02348, 0.02312, 0.02582, 0.02644, 0.02485, 0.02527, 0.02355, 0.02335, 0.0233, 0.02482, 0.02366, 0.02378, 0.02279, 0.02307, 0.02344, 0.02368, 0.02351, 0.02442, 0.023, 0.02371, 0.02324, 0.02397, 0.02339, 0.02331, 0.02303, 0.02316, 0.02451, 0.02588, 0.02323, 0.02313, 0.02372, 0.02372, 0.02396, 0.02313, 0.02377, 0.02325, 0.02357, 0.0239, 0.02373, 0.02305, 0.02327, 0.02337, 0.02558, 0.02412, 0.024, 0.02298, 0.02346, 0.02341, 0.02499, 0.02595, 0.02356, 0.02359, 0.02334, 0.02429, 0.02386, 0.02382, 0.02371, 0.02386, 0.02339, 0.02348, 0.02376, 0.02405, 0.0237, 0.02364, 0.02322, 0.02388, 0.02466, 0.02377, 0.02381, 0.02312, 0.02337, 0.02587, 0.0234, 0.02326, 0.02514, 0.02305, 0.02396, 0.02437, 0.02598, 0.02368, 0.02533, 0.02665, 0.0236, 0.02411, 0.02378, 0.02367, 0.02564, 0.02335, 0.02437, 0.02359, 0.02359, 0.02322, 0.02273, 0.02363, 0.02409, 0.02377, 0.02329, 0.02348, 0.02525, 0.02415, 0.02404, 0.02377, 0.02324, 0.02347, 0.02488, 0.02554, 0.02377, 0.02292, 0.02356, 0.02386, 0.0231, 0.024, 0.02405, 0.02445, 0.02374, 0.0233, 0.02593, 0.02463, 0.02393, 0.02351, 0.02352, 0.02404, 0.02313, 0.02358, 0.023, 0.02347, 0.02311, 0.0184, 0.02425, 0.02279, 0.02306, 0.02344, 0.02342, 0.0236, 0.02302, 0.02314, 0.02343, 0.02401, 0.02356, 0.02333, 0.02337, 
0.0239, 0.0232, 0.02319, 0.02315, 0.02311, 0.02332, 0.02322, 0.02374, 0.0239, 0.02339, 0.02406, 0.02358, 0.02348, 0.02325, 0.02315, 0.02296, 0.02357, 0.02349, 0.02309, 0.02301, 0.02331, 0.02297, 0.0231, 0.02275, 0.0228, 0.02389, 0.02406, 0.02363, 0.02344, 0.02354, 0.02484, 0.02357, 0.02352, 0.02299, 0.02319, 0.02863, 0.02719, 0.02688, 0.0269, 0.02723, 0.02735, 0.02746, 0.02726, 0.02718, 0.02716, 0.02769, 0.02662, 0.02726, 0.0267, 0.02696, 0.02791, 0.0283, 0.03114, 0.02684, 0.02732, 0.02729, 0.02733, 0.02819, 0.02627, 0.02696, 0.02662, 0.02733, 0.02779, 0.02734, 0.02763, 0.02837, 0.02759, 0.0243, 0.02432, 0.02438, 0.02516, 0.02609, 0.02417, 0.02421, 0.02474, 0.02395, 0.02467, 0.02473, 0.02401, 0.02443, 0.02436, 0.02298, 0.02466, 0.02296, 0.02367, 0.02539, 0.02323, 0.02331, 0.02342, 0.02489, 0.02322, 0.02363, 0.02342, 0.02351, 0.02406, 0.02499, 0.02419, 0.02319, 0.02365, 0.02437, 0.02332, 0.02567, 0.02334, 0.02317, 0.02303, 0.02331, 0.02511, 0.02368, 0.02344, 0.02325, 0.0228, 0.02289, 0.02343, 0.02335, 0.0232, 0.02328, 0.02284, 0.0232, 0.02311, 0.02333, 0.02283, 0.02447, 0.02426, 0.02348, 0.02331, 0.02357, 0.02346, 0.02327, 0.02297, 0.0251, 0.02286, 0.0231, 0.02375, 0.02341, 0.0236, 0.0242, 0.02362, 0.02329, 0.02326, 0.02314, 0.02334, 0.02339, 0.02303, 0.02333, 0.02388, 0.02393, 0.02465, 0.02337, 0.02531, 0.02298, 0.02289, 0.02335, 0.02349, 0.02508, 0.02386, 0.02407, 0.0236, 0.02345, 0.02369, 0.02324, 0.02345, 0.02571, 0.02352, 0.02371, 0.02373, 0.02446, 0.02392, 0.02353, 0.02392, 0.02388, 0.02532, 0.02461, 0.02311, 0.02351, 0.02348, 0.02325, 0.02355, 0.02471, 0.02432, 0.0244, 0.02494, 0.02414, 0.02399, 0.02358, 0.02344, 0.02423]}, "forward-send-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [2.84466, 0.00012, 0.00011, 0.00012, 0.00012, 0.00012, 0.00011, 0.00011, 0.00012, 0.00012, 0.00012, 0.00012, 0.00011, 0.00011, 0.00012, 0.00013, 0.00012, 0.00012, 0.00011, 0.00011, 0.00012, 0.00012, 0.00012, 0.00011, 0.00012, 0.00011, 0.00012, 0.00012, 0.00012, 0.00011, 0.00011, 0.00012, 0.00012, 0.00012, 0.00011, 0.00011, 0.00012, 0.00012, 0.00013, 0.00011, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00013, 0.00014, 0.00012, 0.00012, 0.00011, 0.00012, 0.00013, 0.00012, 0.00012, 0.00013, 0.00013, 0.00012, 0.00011, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00011, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00011, 0.00011, 0.00012, 0.00012, 0.00011, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00011, 0.00011, 0.00012, 0.00011, 0.00012, 0.00012, 0.00012, 0.00011, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00013, 0.00011, 0.00011, 0.00021, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00011, 0.00012, 0.00011, 0.00012, 0.00011, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00011, 0.00013, 0.00012, 0.00012, 0.00011, 0.00011, 0.00011, 0.00012, 0.00012, 0.00012, 0.00011, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00011, 0.00012, 0.00012, 0.00012, 0.00012, 0.00011, 0.00011, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00011, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00011, 0.00012, 0.00011, 0.00012, 0.00011, 0.00012, 0.00011, 0.00011, 
0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00016, 0.00015, 0.00015, 0.00015, 0.00016, 0.00015, 0.00015, 0.00014, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00014, 0.00015, 0.00015, 0.00015, 0.00016, 0.00015, 0.00015, 0.00014, 0.00014, 0.00016, 0.00015, 0.0002, 0.00014, 0.00015, 0.00014, 0.00015, 0.00014, 0.00015, 0.00015, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00013, 0.00013, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00011, 0.00013, 0.00014, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00013, 0.00012, 0.00013, 0.00012, 0.00011, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00014, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00013, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00013, 0.00012, 0.00012, 0.00013, 0.00013, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00013, 0.00014, 0.00012, 0.00013, 0.00012]}, "backward-recv-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.02202, 0.02306, 0.02274, 0.02305, 0.02218, 0.02282, 0.02254, 0.02256, 0.02256, 0.02201, 0.02227, 0.02236, 0.02184, 0.02219, 0.02311, 0.02279, 0.0224, 0.02326, 0.0223, 0.0226, 0.02262, 0.02192, 0.02207, 0.02234, 0.0225, 0.02331, 0.02364, 0.02244, 0.02259, 0.02244, 0.02307, 0.0232, 0.02442, 0.02498, 0.02229, 0.0228, 0.02468, 0.02377, 0.02241, 0.02261, 0.02253, 0.02261, 0.02234, 0.02253, 0.02252, 0.02275, 0.02272, 0.02219, 0.02235, 0.02245, 0.02519, 0.02285, 0.02297, 0.02413, 0.02237, 0.02293, 0.0228, 0.02258, 0.02227, 0.02742, 0.02319, 0.02305, 0.02286, 0.02291, 0.02288, 0.02328, 0.02324, 0.02362, 0.02461, 0.02229, 0.02295, 0.02276, 0.0234, 0.02322, 0.02241, 0.02264, 0.02302, 0.0234, 0.02233, 0.02257, 0.02316, 0.02277, 0.02753, 0.02283, 0.02254, 0.02283, 0.0218, 0.02217, 0.02286, 0.02257, 0.0228, 0.0227, 0.02081, 0.0228, 0.02621, 0.02311, 0.02273, 0.0228, 0.02247, 0.0229, 0.02301, 0.02246, 0.02269, 0.02282, 0.02255, 0.02285, 0.02311, 0.0227, 0.02235, 0.02252, 0.02338, 0.02261, 0.02365, 0.02278, 0.02199, 0.0226, 0.02251, 0.02252, 0.0226, 0.02281, 0.02411, 0.02301, 0.02114, 0.02254, 0.0225, 0.02292, 0.02388, 0.02719, 0.02225, 0.02241, 0.02306, 0.02278, 0.02254, 0.02221, 0.02262, 0.02523, 0.02237, 0.0224, 0.0224, 0.02234, 0.02308, 0.02372, 0.02327, 0.02279, 0.02316, 0.02344, 0.02202, 0.02286, 0.02663, 0.02281, 0.0234, 0.02273, 0.02221, 0.02282, 0.02274, 0.02532, 0.02225, 0.02195, 0.02261, 0.02257, 0.02265, 0.02262, 0.02232, 0.023, 0.02283, 0.02245, 0.02247, 0.0238, 0.02512, 0.02216, 0.0226, 0.02248, 0.02442, 0.02357, 0.02268, 0.02197, 0.02269, 0.02234, 0.02252, 0.02254, 0.02296, 0.02323, 0.02487, 0.02507, 0.02281, 0.02321, 0.01969, 0.02212, 
0.02259, 0.02247, 0.02216, 0.02227, 0.02334, 0.02365, 0.02317, 0.02332, 0.02536, 0.02524, 0.02256, 0.02014, 0.02168, 0.02553, 0.02195, 0.02188, 0.02265, 0.02181, 0.02201, 0.02208, 0.02185, 0.02258, 0.02179, 0.02208, 0.02184, 0.02172, 0.02131, 0.02178, 0.02181, 0.02153, 0.02161, 0.02189, 0.02179, 0.02189, 0.02152, 0.02237, 0.01986, 0.02159, 0.02198, 0.02172, 0.02198, 0.02071, 0.0218, 0.02168, 0.02163, 0.02171, 0.02187, 0.02247, 0.0254, 0.02003, 0.02151, 0.02205, 0.02189, 0.02196, 0.02212, 0.02259, 0.02231, 0.02186, 0.0214, 0.02189, 0.02217, 0.02191, 0.02194, 0.02196, 0.02437, 0.0235, 0.02355, 0.02243, 0.02206, 0.02142, 0.02199, 0.02213, 0.02157, 0.02436, 0.02121, 0.02302, 0.0223, 0.02427, 0.02238, 0.02253, 0.01864, 0.02424, 0.02409, 0.0246, 0.02317, 0.02239, 0.02214, 0.02205, 0.022, 0.02349, 0.02219, 0.02161, 0.022, 0.02154, 0.02174, 0.0218, 0.02159, 0.02209, 0.022, 0.02163, 0.02288, 0.02366, 0.0234, 0.02153, 0.02198, 0.0241, 0.02181, 0.02185, 0.02225, 0.0216, 0.02178, 0.02096, 0.02214, 0.02076, 0.0219, 0.02303, 0.02184, 0.02342, 0.01921, 0.02176, 0.02172, 0.02189, 0.0219, 0.02192, 0.02085, 0.02133, 0.02429, 0.02384, 0.0242, 0.0195, 0.02178, 0.02175, 0.02146, 0.02171, 0.02168, 0.02164, 0.02417, 0.02331, 0.02162, 0.02199, 0.02187, 0.02172, 0.02155, 0.02173, 0.02177, 0.02367, 0.02387, 0.02186, 0.02165, 0.0215, 0.02171, 0.02193, 0.02169, 0.02399, 0.02207, 0.02179, 0.02207, 0.02217, 0.02226, 0.02196, 0.02201, 0.02182, 0.02159, 0.02152, 0.02173, 0.02179, 0.02146, 0.02161, 0.02161, 0.02191, 0.02365, 0.02194, 0.02182, 0.02252, 0.0217, 0.02184, 0.02214, 0.0207, 0.02212, 0.02196, 0.02227, 0.0219, 0.02213, 0.02179, 0.02192, 0.02063, 0.02245, 0.02495, 0.02207, 0.02234, 0.0219, 0.02176, 0.02221, 0.02198, 0.02398, 0.02453, 0.02261, 0.02208, 0.02163, 0.02214, 0.02159, 0.02483, 0.02236, 0.0221, 0.02206, 0.02218, 0.02227, 0.02233, 0.02258, 0.02182, 0.02191, 0.02178]}, "backward-send-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.00016, 0.00018, 0.00018, 0.00018, 0.00018, 0.00018, 0.00018, 0.00018, 0.00017, 0.00018, 0.00016, 0.00019, 0.00019, 0.00018, 0.00017, 0.00019, 0.00018, 0.00019, 0.00018, 0.00018, 0.00018, 0.00018, 0.00017, 0.00018, 0.00018, 0.00019, 0.00019, 0.00017, 0.00018, 0.00017, 0.00019, 0.00018, 0.00022, 0.0002, 0.00018, 0.00019, 0.00016, 0.00017, 0.00018, 0.00019, 0.00018, 0.00017, 0.00018, 0.00018, 0.00017, 0.00018, 0.00017, 0.00018, 0.00018, 0.00019, 0.00018, 0.0002, 0.00017, 0.0002, 0.00018, 0.00019, 0.00018, 0.00019, 0.00019, 0.00018, 0.00018, 0.00019, 0.00018, 0.00019, 0.00018, 0.00018, 0.00019, 0.00018, 0.00016, 0.00017, 0.00018, 0.00018, 0.00018, 0.00018, 0.00019, 0.00019, 0.00019, 0.00018, 0.00017, 0.00018, 0.00018, 0.00018, 0.00021, 0.00019, 0.00019, 0.00018, 0.00016, 0.00018, 0.00019, 0.00019, 0.00019, 0.00018, 0.00019, 0.00018, 0.00018, 0.0002, 0.00018, 0.00018, 0.00016, 0.00018, 0.00018, 0.00018, 0.00018, 0.00019, 0.00018, 0.00019, 0.00018, 0.00018, 0.00018, 0.00018, 0.00018, 0.00022, 0.00018, 0.00018, 0.0002, 0.00018, 0.00019, 0.00019, 0.00018, 0.00019, 0.00019, 0.00018, 0.00016, 0.00019, 0.00018, 0.00018, 0.00032, 0.00019, 0.00018, 0.00018, 0.00019, 0.00019, 0.00019, 0.00018, 0.00017, 0.00019, 0.00016, 0.00016, 0.00017, 0.00019, 0.00019, 0.00018, 0.00018, 0.00019, 0.00018, 0.00017, 0.00018, 0.00019, 0.00016, 0.00018, 0.00018, 0.00018, 0.00019, 0.00019, 0.00018, 0.00026, 0.00019, 0.00018, 0.00018, 0.00017, 0.00018, 0.00019, 0.00019, 0.00018, 0.00018, 0.00018, 0.00018, 0.00018, 0.00022, 0.00018, 0.00019, 0.00019, 0.00016, 0.00019, 0.00019, 0.00019, 
0.00018, 0.00019, 0.00018, 0.00019, 0.00018, 0.00019, 0.00018, 0.00017, 0.00018, 0.00018, 0.00027, 0.00018, 0.00019, 0.00018, 0.00019, 0.00018, 0.00018, 0.00019, 0.00021, 0.00018, 0.00019, 0.00016, 0.00019, 0.00016, 0.00019, 0.00023, 0.00017, 0.00016, 0.00018, 0.00019, 0.00019, 0.00019, 0.00021, 0.00016, 0.00018, 0.00017, 0.00018, 0.00017, 0.00018, 0.00017, 0.00019, 0.00018, 0.00018, 0.00018, 0.00017, 0.00018, 0.00018, 0.00016, 0.00018, 0.00018, 0.00018, 0.00018, 0.00019, 0.00018, 0.00018, 0.00019, 0.00021, 0.00018, 0.00018, 0.00018, 0.00018, 0.00018, 0.00019, 0.00023, 0.00018, 0.00016, 0.00019, 0.00018, 0.00016, 0.00018, 0.00019, 0.00017, 0.00019, 0.00018, 0.00016, 0.00017, 0.00018, 0.00018, 0.00016, 0.00018, 0.00017, 0.00016, 0.00019, 0.00017, 0.00018, 0.00018, 0.00017, 0.00018, 0.00017, 0.00017, 0.00018, 0.00017, 0.00018, 0.00017, 0.00016, 0.00016, 0.00018, 0.00018, 0.00018, 0.00016, 0.00018, 0.00025, 0.00018, 0.00017, 0.00018, 0.00017, 0.00018, 0.00017, 0.00019, 0.00018, 0.00018, 0.00018, 0.00018, 0.00018, 0.00017, 0.00018, 0.00016, 0.00016, 0.00017, 0.00018, 0.00018, 0.00018, 0.00018, 0.00019, 0.00017, 0.00018, 0.00017, 0.00018, 0.00018, 0.00018, 0.00018, 0.00019, 0.00019, 0.00018, 0.00018, 0.00016, 0.00018, 0.00018, 0.00016, 0.00016, 0.00016, 0.00019, 0.00017, 0.00017, 0.00018, 0.00017, 0.00018, 0.00017, 0.00018, 0.00017, 0.00018, 0.00017, 0.00016, 0.00018, 0.00018, 0.00018, 0.00021, 0.00016, 0.00016, 0.00018, 0.00018, 0.00018, 0.00016, 0.00018, 0.00018, 0.00016, 0.00016, 0.00018, 0.00017, 0.00019, 0.00018, 0.00018, 0.00018, 0.00018, 0.00017, 0.00019, 0.00018, 0.00017, 0.00017, 0.00018, 0.00017, 0.00018, 0.00018, 0.00017, 0.00018, 0.00019, 0.00018, 0.00016, 0.00019, 0.00018, 0.00018, 0.00018, 0.00016, 0.00018, 0.00018, 0.00018, 0.00018, 0.00017, 0.00018, 0.00016, 0.00018, 0.00019, 0.00018, 0.00018, 0.00016, 0.00016, 0.00017, 0.00021, 0.00016, 0.00018, 0.00018, 0.00017, 0.00018, 0.00018, 0.00018, 0.00018, 0.00018, 0.00019, 0.00018, 0.00017, 0.00017, 0.00018, 0.00017, 0.00018]}, "forward-send-backward-recv-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [7.26791, 0.08664, 0.09388, 0.09112, 0.08445, 0.09357, 0.09373, 0.09614, 0.09989, 0.10112, 0.08956, 0.08704, 0.09001, 0.09155, 0.09857, 0.09953, 0.0961, 0.10113, 0.10125, 0.11004, 0.10313, 0.09862, 0.10585, 0.10919, 0.10583, 0.10172, 0.10458, 0.10404, 0.1052, 0.09641, 0.10412, 0.09781, 0.09972, 0.10136, 0.10163, 0.09609, 0.09969, 0.10085, 0.10306, 0.10325, 0.10455, 0.10533, 0.1025, 0.09569, 0.09963, 0.11379, 0.10728, 0.10291, 0.10638, 0.1012, 0.09514, 0.10381, 0.10024, 0.10547, 0.10487, 0.11789, 0.11734, 0.11997, 0.113, 0.10597, 0.11163, 0.11506, 0.12069, 0.12521, 0.12131, 0.11375, 0.10345, 0.10129, 0.10181, 0.10088, 0.0947, 0.09723, 0.09642, 0.10255, 0.10466, 0.09713, 0.10564, 0.10312, 0.10025, 0.09561, 0.09512, 0.09519, 0.08816, 0.09549, 0.09265, 0.09294, 0.10255, 0.09939, 0.10544, 0.10344, 0.10858, 0.1088, 0.10697, 0.09761, 0.09215, 0.09749, 0.10389, 0.09421, 0.09597, 0.09688, 0.10356, 0.10031, 0.10358, 0.10022, 0.09494, 0.09521, 0.08777, 0.09024, 0.09559, 0.08704, 0.09044, 0.08853, 0.09387, 0.09487, 0.09496, 0.0917, 0.09224, 0.08543, 0.08296, 0.0931, 0.08686, 0.09041, 0.08634, 0.0838, 0.07721, 0.08382, 0.08905, 0.07994, 0.08964, 0.09067, 0.08724, 0.09031, 0.09142, 0.08955, 0.08642, 0.08734, 0.09313, 0.0892, 0.08811, 0.08748, 0.10918, 0.10445, 0.10103, 0.10406, 0.10336, 0.10399, 0.11053, 0.10502, 0.1058, 0.10377, 0.10177, 0.10263, 0.10865, 0.10227, 0.1032, 0.10523, 0.08465, 0.08812, 0.09221, 0.0869, 0.09106, 
0.09518, 0.08366, 0.09187, 0.09167, 0.09065, 0.08392, 0.08171, 0.08992, 0.09232, 0.08837, 0.08382, 0.08792, 0.08609, 0.08649, 0.09183, 0.09528, 0.08861, 0.08269, 0.07853, 0.08798, 0.08353, 0.08436, 0.09088, 0.08495, 0.08552, 0.08561, 0.08913, 0.08612, 0.08093, 0.08731, 0.08686, 0.08376, 0.09109, 0.08222, 0.08599, 0.08546, 0.09351, 0.09605, 0.09994, 0.05805, 0.06314, 0.06773, 0.06769, 0.07278, 0.07311, 0.07124, 0.07502, 0.06435, 0.06762, 0.06901, 0.0791, 0.0778, 0.07332, 0.07358, 0.07456, 0.08054, 0.08433, 0.07505, 0.07588, 0.08407, 0.0787, 0.08207, 0.0796, 0.07151, 0.06957, 0.07132, 0.06499, 0.06604, 0.07296, 0.07397, 0.067, 0.07615, 0.07913, 0.07517, 0.07077, 0.07248, 0.07492, 0.07227, 0.07335, 0.0763, 0.07019, 0.07546, 0.07774, 0.07407, 0.0729, 0.07638, 0.07126, 0.07892, 0.09584, 0.09387, 0.09457, 0.09277, 0.0883, 0.08843, 0.09465, 0.09754, 0.09491, 0.09011, 0.08659, 0.08508, 0.08604, 0.09074, 0.08671, 0.08822, 0.08652, 0.10003, 0.09872, 0.09528, 0.09138, 0.09197, 0.09145, 0.09609, 0.09717, 0.09187, 0.08329, 0.07444, 0.08501, 0.09292, 0.07912, 0.09086, 0.06371, 0.06325, 0.06657, 0.06269, 0.0684, 0.06721, 0.07116, 0.07046, 0.0677, 0.06735, 0.06869, 0.06628, 0.06387, 0.06598, 0.06628, 0.06315, 0.07014, 0.06138, 0.06023, 0.06541, 0.06746, 0.07002, 0.07338, 0.06917, 0.06109, 0.06706, 0.07059, 0.07159, 0.07375, 0.08229, 0.07701, 0.07396, 0.07568, 0.07085, 0.07045, 0.06836, 0.06539, 0.0665, 0.07089, 0.0709, 0.06602, 0.0697, 0.07478, 0.0684, 0.0647, 0.0626, 0.06703, 0.06836, 0.06571, 0.07061, 0.07022, 0.0716, 0.06385, 0.06344, 0.05399, 0.06182, 0.0629, 0.06795, 0.07021, 0.06979, 0.06991, 0.07026, 0.06139, 0.06342, 0.06547, 0.06176, 0.06228, 0.07216, 0.07562, 0.07274, 0.07226, 0.08023, 0.07444, 0.04375, 0.0697, 0.07621, 0.07857, 0.07477, 0.07791, 0.08106, 0.08001, 0.07886, 0.07928, 0.08279, 0.07305, 0.08365, 0.08546, 0.08515, 0.08206, 0.08649, 0.09308, 0.09213, 0.08788, 0.08419, 0.0881, 0.09226, 0.08474, 0.08747, 0.08269, 0.08805, 0.08503, 0.08089, 0.08025, 0.07691, 0.07938, 0.07913, 0.08725, 0.08008, 0.08335, 0.0882, 0.08124, 0.08869, 0.08118, 0.08321, 0.08276, 0.07892, 0.08691, 0.07849, 0.08318]}, "backward-send-forward-recv-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [3.02438, 0.02964, 0.02158, 0.02612, 0.02742, 0.02646, 0.02144, 0.01953, 0.02104, 0.01973, 0.0221, 0.02679, 0.02821, 0.0292, 0.02641, 0.02434, 0.02851, 0.02189, 0.02401, 0.02493, 0.02324, 0.02474, 0.02466, 0.01958, 0.02074, 0.02324, 0.02406, 0.02422, 0.02172, 0.02415, 0.02078, 0.02874, 0.02875, 0.02888, 0.03126, 0.03155, 0.0297, 0.0288, 0.03235, 0.02835, 0.02837, 0.02808, 0.02869, 0.03298, 0.03478, 0.02725, 0.02531, 0.02971, 0.0248, 0.02835, 0.03171, 0.02666, 0.02768, 0.0316, 0.11725, 0.02233, 0.01927, 0.01846, 0.02324, 0.0208, 0.02765, 0.02234, 0.02152, 0.02055, 0.0218, 0.02092, 0.02617, 0.02621, 0.02575, 0.02487, 0.02854, 0.02512, 0.02754, 0.02441, 0.02799, 0.02601, 0.02443, 0.02664, 0.02842, 0.02747, 0.02197, 0.02705, 0.0286, 0.02828, 0.03081, 0.02999, 0.03156, 0.02772, 0.02622, 0.02462, 0.02412, 0.02594, 0.02264, 0.03102, 0.02956, 0.02597, 0.02756, 0.03008, 0.02803, 0.02913, 0.02661, 0.02374, 0.02365, 0.02578, 0.02542, 0.03028, 0.03098, 0.02753, 0.02526, 0.02933, 0.02658, 0.02632, 0.02526, 0.02436, 0.02205, 0.02173, 0.02147, 0.02635, 0.02715, 0.01835, 0.02341, 0.02286, 0.02713, 0.03176, 0.03552, 0.02684, 0.02459, 0.03111, 0.02691, 0.02888, 0.02912, 0.02835, 0.02868, 0.0319, 0.02488, 0.02699, 0.02738, 0.02288, 0.03107, 0.03026, 0.02374, 0.02063, 0.02531, 0.02048, 0.02199, 0.02504, 0.01991, 0.03009, 
0.02384, 0.02452, 0.02777, 0.02276, 0.02322, 0.02545, 0.02596, 0.02803, 0.03054, 0.03445, 0.02978, 0.02853, 0.02578, 0.02477, 0.03074, 0.02951, 0.03089, 0.03187, 0.02945, 0.03462, 0.02761, 0.03327, 0.03222, 0.03039, 0.03257, 0.02712, 0.02729, 0.02863, 0.02412, 0.02627, 0.03209, 0.03064, 0.02986, 0.02923, 0.03127, 0.02881, 0.03666, 0.03233, 0.03454, 0.03286, 0.03299, 0.03171, 0.03363, 0.03637, 0.03532, 0.02997, 0.03427, 0.03447, 0.03788, 0.03045, 0.02935, 0.02785, 0.06375, 0.04913, 0.04593, 0.04639, 0.04315, 0.04609, 0.04022, 0.04069, 0.0458, 0.04145, 0.04193, 0.03809, 0.03122, 0.0379, 0.04024, 0.03151, 0.03065, 0.03028, 0.03812, 0.03701, 0.03342, 0.03675, 0.03239, 0.0438, 0.03695, 0.0419, 0.04267, 0.04585, 0.04997, 0.04424, 0.04745, 0.04667, 0.04464, 0.03917, 0.03907, 0.03699, 0.04231, 0.03898, 0.04045, 0.03812, 0.0373, 0.04307, 0.03851, 0.03799, 0.04077, 0.0409, 0.04045, 0.04407, 0.0328, 0.02602, 0.03043, 0.0238, 0.02775, 0.03236, 0.02827, 0.02216, 0.02607, 0.02209, 0.02438, 0.02661, 0.02817, 0.0302, 0.02384, 0.02743, 0.03022, 0.02263, 0.02281, 0.02357, 0.02756, 0.02656, 0.02806, 0.02726, 0.02917, 0.02779, 0.04648, 0.03625, 0.03939, 0.03798, 0.03027, 0.03365, 0.03112, 0.0507, 0.05041, 0.0488, 0.0478, 0.04287, 0.04273, 0.03793, 0.04099, 0.0473, 0.04686, 0.04606, 0.04653, 0.04791, 0.0434, 0.04395, 0.04672, 0.03952, 0.04338, 0.05238, 0.05084, 0.0447, 0.04529, 0.04014, 0.04009, 0.04618, 0.03869, 0.04044, 0.04097, 0.04238, 0.03044, 0.04364, 0.04057, 0.03549, 0.03892, 0.03761, 0.03631, 0.04319, 0.04214, 0.04271, 0.04566, 0.04209, 0.0419, 0.03476, 0.04175, 0.03736, 0.04126, 0.04073, 0.04268, 0.04088, 0.03755, 0.04007, 0.0375, 0.03951, 0.04011, 0.04621, 0.04174, 0.04428, 0.03833, 0.03393, 0.03343, 0.03715, 0.03224, 0.0391, 0.03809, 0.0352, 0.04357, 0.04052, 0.02489, 0.02136, 0.02147, 0.01936, 0.01974, 0.01753, 0.1141, 0.01901, 0.02217, 0.02537, 0.01881, 0.01782, 0.01594, 0.01966, 0.01818, 0.02087, 0.02147, 0.02626, 0.01794, 0.01552, 0.01646, 0.01963, 0.01985, 0.02306, 0.02056, 0.01929, 0.0188, 0.02041, 0.01882, 0.01934, 0.01928, 0.01858, 0.01964, 0.01987, 0.02011, 0.01922, 0.01909, 0.02055, 0.01875, 0.02072, 0.02181, 0.02052, 0.01786, 0.01986, 0.01947, 0.02245, 0.01734, 0.01752, 0.01965, 0.02295, 0.02233, 0.01907]}, "layernorm-grads-all-reduce-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [3e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 1e-05, 2e-05, 1e-05, 2e-05, 2e-05, 2e-05, 2e-05, 1e-05, 2e-05, 2e-05, 2e-05, 1e-05, 2e-05, 1e-05, 2e-05, 2e-05, 1e-05, 2e-05, 1e-05, 1e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 1e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 1e-05, 1e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 1e-05, 2e-05, 1e-05, 2e-05, 2e-05, 1e-05, 1e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 1e-05, 2e-05, 2e-05, 2e-05, 2e-05, 1e-05, 2e-05, 2e-05, 2e-05, 2e-05, 1e-05, 2e-05, 2e-05, 1e-05, 1e-05, 2e-05, 2e-05, 1e-05, 2e-05, 1e-05, 2e-05, 2e-05, 2e-05, 2e-05, 1e-05, 2e-05, 2e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 2e-05, 2e-05, 2e-05, 2e-05, 1e-05, 2e-05, 1e-05, 2e-05, 2e-05, 2e-05, 2e-05, 1e-05, 2e-05, 1e-05, 1e-05, 2e-05, 2e-05, 1e-05, 2e-05, 2e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 2e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 1e-05, 2e-05, 2e-05, 2e-05, 2e-05, 1e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 1e-05, 
2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 1e-05, 1e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 1e-05, 2e-05, 2e-05, 2e-05, 2e-05, 1e-05, 2e-05, 2e-05, 1e-05, 2e-05, 2e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05]}, "embedding-grads-all-reduce-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.00057, 0.00025, 0.00022, 0.00022, 0.00022, 0.00022, 0.00025, 0.00022, 0.00022, 0.00034, 0.00022, 0.00024, 0.00022, 0.00026, 0.00022, 0.00022, 0.00022, 0.00022, 0.00022, 0.00026, 0.00026, 0.00025, 0.00022, 0.00025, 0.00022, 0.00022, 0.00024, 0.00022, 0.00022, 0.00022, 0.00022, 0.00022, 0.00022, 0.00026, 0.00022, 0.00025, 0.00022, 0.00023, 0.00025, 0.00022, 0.00022, 0.00022, 0.00022, 0.00025, 0.00022, 0.00022, 0.00021, 0.00022, 0.00022, 0.00022, 0.00022, 0.00022, 0.00022, 0.00025, 0.00025, 0.00022, 0.00022, 0.00022, 0.00022, 0.00022, 0.00022, 0.00022, 0.00026, 0.00025, 0.00025, 0.00021, 0.00022, 0.00022, 0.00022, 0.00025, 0.00025, 0.00025, 0.00022, 0.00023, 0.00022, 0.00022, 0.00023, 0.00022, 0.00025, 0.00022, 0.00022, 0.00022, 0.00025, 0.00021, 0.00022, 0.00022, 0.00025, 0.00022, 0.00022, 0.00025, 0.00025, 0.00025, 0.00022, 0.00033, 0.00022, 0.00022, 0.00023, 0.00025, 0.00025, 0.00022, 0.00022, 0.00022, 0.00022, 0.00022, 0.00022, 0.00022, 0.00022, 0.00022, 0.00022, 0.00022, 0.00022, 0.00022, 0.00022, 0.00022, 0.00022, 0.00022, 0.00025, 0.00022, 0.00022, 0.00026, 0.00022, 0.00022, 0.00023, 0.00022, 0.00022, 0.00022, 0.00022, 0.00022, 0.00026, 0.00022, 0.00026, 0.00025, 0.00024, 0.00025, 0.00022, 0.00025, 0.00022, 0.00022, 0.00026, 0.00025, 0.00022, 0.00022, 0.00022, 0.00025, 0.00025, 0.00025, 0.00025, 0.00022, 0.00022, 0.00023, 0.00022, 0.00022, 0.00022, 0.00022, 0.00023, 0.00022, 0.00022, 0.00024, 0.00023, 0.00022, 0.00023, 0.00022, 0.00021, 0.00022, 0.00025, 0.00022, 0.00022, 0.00022, 0.00022, 0.00022, 0.00022, 0.00025, 0.00025, 0.00024, 0.00022, 0.00024, 0.00022, 0.00025, 0.00022, 0.00022, 0.00026, 0.00025, 0.00024, 0.00022, 0.00022, 0.00025, 0.00022, 0.00022, 0.00022, 0.00025, 0.00022, 0.00026, 0.00022, 0.00022, 0.00022, 0.00022, 0.00027, 0.00022, 0.00025, 0.00022, 0.00026, 0.00025, 0.00021, 0.00022, 0.00022, 0.00022, 
0.00023, 0.00022, 0.00026, 0.00022, 0.00021, 0.00026, 0.00025, 0.00022, 0.00025, 0.00022, 0.00022, 0.00022, 0.00022, 0.00022, 0.00022, 0.00022, 0.00022, 0.00026, 0.00025, 0.00021, 0.00022, 0.00026, 0.00025, 0.00022, 0.00022, 0.00025, 0.00022, 0.00022, 0.00022, 0.00022, 0.00025, 0.00021, 0.00021, 0.00022, 0.00022, 0.00025, 0.00022, 0.00022, 0.00025, 0.00025, 0.00025, 0.00022, 0.00022, 0.00023, 0.00023, 0.00023, 0.00023, 0.00023, 0.00023, 0.00023, 0.00023, 0.00024, 0.00024, 0.00024, 0.00023, 0.00023, 0.00023, 0.00023, 0.00023, 0.00023, 0.00023, 0.00023, 0.00023, 0.00023, 0.00023, 0.00023, 0.00023, 0.00023, 0.00023, 0.00023, 0.00023, 0.00023, 0.00023, 0.00023, 0.00023, 0.00025, 0.00025, 0.00022, 0.00021, 0.00021, 0.00023, 0.00021, 0.00021, 0.00025, 0.00021, 0.00021, 0.00025, 0.00022, 0.00021, 0.00025, 0.00022, 0.00021, 0.00021, 0.00025, 0.00021, 0.00021, 0.00021, 0.00025, 0.00025, 0.00022, 0.00022, 0.00021, 0.00025, 0.00021, 0.00021, 0.00021, 0.00021, 0.00021, 0.00021, 0.00022, 0.00022, 0.00021, 0.00021, 0.00021, 0.00022, 0.00021, 0.00022, 0.00022, 0.00022, 0.00024, 0.00021, 0.00022, 0.00022, 0.00024, 0.00021, 0.00025, 0.00021, 0.00025, 0.00021, 0.00025, 0.00022, 0.00021, 0.00021, 0.00021, 0.00025, 0.00023, 0.00021, 0.00021, 0.00025, 0.00021, 0.00021, 0.00022, 0.00025, 0.00021, 0.00021, 0.00022, 0.00022, 0.00021, 0.00021, 0.00022, 0.00025, 0.00022, 0.00022, 0.00025, 0.00022, 0.00022, 0.00022, 0.00022, 0.00022, 0.00022, 0.00022, 0.00023, 0.00022, 0.00022, 0.00022, 0.00022, 0.00021, 0.00022, 0.00022, 0.00022, 0.00022, 0.00022, 0.00025, 0.00022, 0.00022, 0.00025, 0.00022, 0.00021, 0.00022, 0.00022, 0.00025, 0.00022, 0.00022, 0.00022, 0.00022, 0.00022, 0.00033, 0.00022, 0.00021, 0.00022, 0.00022, 0.00022, 0.00021, 0.00024]}, "all-grads-sync-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.66214, 0.00023, 0.00022, 0.00023, 0.00028, 0.00028, 0.00027, 0.00028, 0.00025, 0.00023, 0.00024, 0.00023, 0.00023, 0.00023, 0.00024, 0.00023, 0.00023, 0.00024, 0.00023, 0.00023, 0.00023, 0.0003, 0.00028, 0.00028, 0.00034, 0.00028, 0.00028, 0.00028, 0.00028, 0.00022, 0.00026, 0.00023, 0.00022, 0.00028, 0.00032, 0.00023, 0.00028, 0.00023, 0.00028, 0.00022, 0.00022, 0.00028, 0.00023, 0.00037, 0.00023, 0.00023, 0.00028, 0.00028, 0.00023, 0.00022, 0.00024, 0.00024, 0.00022, 0.00022, 0.00029, 0.00023, 0.00023, 0.00029, 0.00023, 0.00023, 0.00028, 0.00023, 0.00029, 0.00023, 0.00027, 0.00023, 0.00023, 0.00023, 0.00023, 0.00023, 0.00028, 0.00022, 0.00024, 0.00024, 0.00034, 0.00036, 0.00026, 0.00027, 0.00028, 0.00023, 0.00024, 0.00024, 0.00028, 0.00028, 0.00028, 0.00025, 0.00023, 0.00028, 0.00027, 0.00022, 0.00023, 0.00029, 0.00022, 0.00024, 0.00027, 0.00023, 0.00029, 0.00024, 0.00028, 0.00028, 0.00028, 0.00028, 0.00023, 0.00028, 0.00023, 0.00023, 0.00028, 0.00028, 0.0003, 0.00023, 0.00027, 0.00025, 0.00023, 0.00023, 0.00028, 0.00024, 0.00023, 0.00023, 0.00023, 0.00023, 0.00023, 0.00023, 0.00028, 0.00027, 0.00023, 0.00023, 0.00029, 0.00023, 0.00023, 0.00029, 0.00028, 0.00028, 0.00028, 0.00024, 0.00028, 0.00024, 0.00023, 0.00025, 0.00026, 0.00029, 0.00028, 0.00028, 0.00028, 0.00028, 0.00028, 0.00023, 0.00023, 0.00023, 0.00024, 0.00023, 0.0003, 0.00024, 0.00028, 0.00028, 0.00023, 0.00023, 0.00022, 0.00027, 0.00023, 0.00028, 0.00024, 0.00024, 0.00023, 0.00023, 0.00023, 0.00028, 0.00022, 0.00029, 0.00029, 0.00028, 0.00022, 0.00024, 0.0003, 0.00025, 0.00028, 0.00023, 0.00022, 0.00028, 0.00024, 0.00029, 0.00029, 0.00028, 0.00025, 0.00028, 0.00029, 0.00028, 0.00029, 0.00029, 0.00023, 0.00028, 0.00028, 
0.00028, 0.00024, 0.0003, 0.00028, 0.00025, 0.00028, 0.00025, 0.00023, 0.00023, 0.00023, 0.00023, 0.00028, 0.00023, 0.00028, 0.00028, 0.00022, 0.00028, 0.00022, 0.00029, 0.00022, 0.00023, 0.00022, 0.00022, 0.00022, 0.00028, 0.00022, 0.00023, 0.00022, 0.00028, 0.00022, 0.00023, 0.00027, 0.00022, 0.00024, 0.00022, 0.00028, 0.00022, 0.00022, 0.00022, 0.00027, 0.00022, 0.00022, 0.00028, 0.00028, 0.00022, 0.00023, 0.00022, 0.00022, 0.00028, 0.00024, 0.00028, 0.00022, 0.00022, 0.00022, 0.00027, 0.00022, 0.00024, 0.00024, 0.00023, 0.00028, 0.00022, 0.00028, 0.00022, 0.00028, 0.00028, 0.00023, 0.00025, 0.00025, 0.00035, 0.00023, 0.00023, 0.00028, 0.00024, 0.00025, 0.00028, 0.00023, 0.00023, 0.00023, 0.00028, 0.00025, 0.00022, 0.00029, 0.00023, 0.00023, 0.00022, 0.00022, 0.00024, 0.00027, 0.00027, 0.00028, 0.00022, 0.00022, 0.00025, 0.00022, 0.00022, 0.00028, 0.00021, 0.00027, 0.00021, 0.00023, 0.00023, 0.00021, 0.00022, 0.00021, 0.00028, 0.00027, 0.00027, 0.00028, 0.00022, 0.00027, 0.00023, 0.00022, 0.00022, 0.00024, 0.00027, 0.00028, 0.00027, 0.00022, 0.00022, 0.00027, 0.00022, 0.00027, 0.00022, 0.00023, 0.00022, 0.00021, 0.00021, 0.00022, 0.00022, 0.00027, 0.00024, 0.00027, 0.00023, 0.00022, 0.00021, 0.00021, 0.00021, 0.00028, 0.00022, 0.00023, 0.00022, 0.00028, 0.00023, 0.00027, 0.00022, 0.00028, 0.00023, 0.00028, 0.00021, 0.00023, 0.00022, 0.00022, 0.00027, 0.00022, 0.00027, 0.00034, 0.00021, 0.00023, 0.00021, 0.00023, 0.00022, 0.00022, 0.00028, 0.00025, 0.00023, 0.00023, 0.00022, 0.00022, 0.00022, 0.00023, 0.00022, 0.00022, 0.00022, 0.00023, 0.00022, 0.00028, 0.00022, 0.00022, 0.00022, 0.00028, 0.00022, 0.00022, 0.00022, 0.00028, 0.00021, 0.00029, 0.00022, 0.00022, 0.00022, 0.00022, 0.00022, 0.00023, 0.00022, 0.00023, 0.0003, 0.00022, 0.00023, 0.00022, 0.00022, 0.00022, 0.00022, 0.00024, 0.00022, 0.00022, 0.00028, 0.00022, 0.00022, 0.00024, 0.00022]}, "optimizer-copy-to-main-grad-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.00016, 0.00015, 0.00015, 0.00015, 0.00016, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00016, 0.00015, 0.00015, 0.00015, 0.00015, 0.00016, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00016, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00016, 0.00015, 0.00015, 0.00015, 0.00015, 0.00014, 0.00016, 0.00014, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00016, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00016, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00016, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00016, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 
0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00014, 0.00015, 0.00015, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00015, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00015, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00015, 0.00014, 0.00015, 0.00014, 0.00014, 0.00014, 0.00014, 0.00015, 0.00014, 0.00014, 0.0002, 0.00019, 0.00019, 0.00019, 0.00019, 0.00019, 0.00018, 0.0002, 0.0002, 0.00019, 0.00019, 0.00019, 0.0002, 0.00019, 0.0002, 0.00019, 0.00019, 0.00019, 0.00019, 0.0002, 0.00019, 0.00019, 0.00019, 0.00018, 0.0002, 0.00019, 0.0002, 0.00019, 0.00019, 0.00019, 0.00019, 0.00019, 0.00015, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00013, 0.00013, 0.00013, 0.00014, 0.00013, 0.00013, 0.00014, 0.00013, 0.00015, 0.00014, 0.00014, 0.00014, 0.00013, 0.00014, 0.00013, 0.00013, 0.00014, 0.00013, 0.00014, 0.00013, 0.00013, 0.00013, 0.00013, 0.00013, 0.00014, 0.00013, 0.00014, 0.00013, 0.00014, 0.00013, 0.00013, 0.00014, 0.00013, 0.00013, 0.00013, 0.00013, 0.00014, 0.00014, 0.00013, 0.00013, 0.00015, 0.00014, 0.00013, 0.00013, 0.00014, 0.00013, 0.00014, 0.00013, 0.00013, 0.00014, 0.00014, 0.00013, 0.00013, 0.00013, 0.00014, 0.00013, 0.00013, 0.00013, 0.00013, 0.00014, 0.00013, 0.00013, 0.00013, 0.00013, 0.00015, 0.00013, 0.00013, 0.00014, 0.00014, 0.00014, 0.00013, 0.00013, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00013, 0.00014, 0.00013, 0.00014, 0.00014, 0.00014, 0.00014, 0.00013, 0.00014, 0.00014, 0.00013, 0.00014, 0.00014, 0.00014, 0.00014, 0.00013, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00015, 0.00013, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014]}, "optimizer-clip-main-grad-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [6.52041, 0.00055, 0.00054, 0.00054, 0.00054, 0.00054, 0.00054, 0.00054, 0.00057, 0.00059, 0.00059, 0.00055, 0.00058, 0.00055, 0.00059, 0.00056, 0.00055, 0.00054, 0.00055, 0.00054, 0.00054, 0.00054, 0.00055, 0.00055, 0.00055, 0.00054, 0.00053, 0.00054, 0.00069, 0.00054, 0.00071, 0.00057, 0.00073, 0.00055, 0.00054, 0.00054, 0.00054, 0.00056, 0.00055, 0.00054, 0.00054, 0.00054, 0.00054, 0.00057, 0.00059, 0.00054, 0.00054, 0.00054, 0.00055, 0.00055, 0.00055, 0.00056, 0.00054, 0.00056, 0.00055, 0.00054, 0.00054, 0.00054, 0.00054, 0.00058, 0.00054, 0.00054, 0.00054, 0.00054, 0.00054, 0.00054, 0.0007, 0.00055, 0.00055, 0.00055, 0.00056, 0.00056, 0.00056, 0.00054, 0.00054, 0.00056, 0.00057, 0.00054, 0.00054, 0.00056, 0.00054, 0.0006, 0.00055, 0.00054, 0.00054, 0.00054, 0.00054, 0.00055, 0.00054, 0.00054, 0.00054, 0.00053, 0.00053, 0.00058, 0.00049, 0.00054, 0.00048, 0.00055, 0.00054, 0.00055, 0.00054, 0.00057, 0.00054, 0.00057, 0.00069, 0.00054, 0.00055, 0.00048, 0.00054, 0.00048, 0.00048, 0.0005, 0.00056, 0.00055, 0.00054, 0.00055, 0.00054, 0.00054, 0.00048, 0.00055, 0.00054, 0.00055, 0.00058, 0.00054, 0.00054, 0.00054, 0.00054, 0.00055, 0.00054, 0.00054, 0.00054, 0.00055, 0.00054, 0.00054, 0.00054, 0.00054, 0.00054, 0.00058, 0.00055, 0.00054, 0.00054, 0.00055, 0.00053, 0.00054, 0.00055, 0.00054, 0.00054, 0.00054, 0.00055, 0.00048, 0.00054, 0.00054, 0.00055, 0.00054, 0.00056, 0.00056, 0.00054, 0.00054, 
0.00054, 0.00057, 0.00054, 0.00054, 0.00055, 0.00054, 0.00056, 0.00056, 0.00054, 0.00055, 0.00055, 0.00054, 0.00054, 0.00048, 0.00054, 0.00056, 0.00055, 0.00054, 0.00058, 0.00054, 0.00054, 0.00054, 0.00054, 0.00057, 0.00066, 0.00058, 0.00056, 0.00055, 0.00055, 0.00055, 0.00055, 0.00058, 0.00055, 0.00055, 0.00054, 0.00054, 0.00054, 0.00054, 0.00071, 0.00055, 0.00054, 0.00054, 0.0006, 0.00054, 0.00053, 0.00056, 0.00054, 0.00053, 0.00054, 0.00054, 0.00054, 0.00054, 0.00054, 0.00054, 0.00056, 0.00053, 0.00053, 0.00053, 0.00054, 0.00056, 0.00054, 0.00054, 0.00053, 0.00053, 0.00053, 0.00054, 0.00055, 0.00053, 0.00054, 0.00053, 0.00054, 0.00057, 0.00054, 0.00054, 0.00054, 0.00054, 0.00053, 0.00056, 0.00054, 0.00056, 0.00053, 0.00054, 0.00065, 0.00054, 0.00053, 0.00054, 0.00054, 0.00055, 0.00054, 0.00054, 0.00055, 0.00072, 0.00073, 0.00073, 0.00074, 0.00073, 0.00072, 0.00071, 0.00072, 0.0008, 0.00072, 0.00072, 0.00072, 0.00072, 0.00072, 0.00073, 0.00116, 0.00072, 0.00072, 0.00073, 0.00073, 0.00074, 0.00072, 0.00072, 0.00072, 0.00073, 0.00075, 0.00077, 0.00072, 0.00072, 0.00072, 0.00072, 0.00072, 0.00054, 0.00053, 0.00059, 0.00053, 0.00053, 0.00052, 0.00053, 0.00053, 0.00055, 0.00053, 0.00052, 0.00053, 0.00054, 0.00053, 0.00055, 0.00053, 0.00052, 0.00052, 0.00053, 0.00055, 0.00053, 0.00057, 0.00053, 0.00053, 0.00055, 0.00052, 0.00054, 0.00052, 0.00053, 0.00053, 0.00052, 0.00052, 0.00053, 0.00052, 0.00053, 0.00053, 0.00052, 0.00054, 0.00056, 0.00052, 0.00052, 0.00052, 0.00053, 0.00054, 0.00054, 0.00053, 0.00052, 0.00055, 0.00052, 0.00057, 0.00052, 0.00053, 0.00053, 0.00053, 0.00055, 0.00053, 0.00052, 0.00052, 0.00053, 0.00052, 0.00055, 0.00052, 0.00053, 0.00053, 0.00052, 0.00054, 0.00054, 0.00058, 0.00051, 0.00054, 0.00053, 0.00053, 0.00053, 0.00056, 0.00056, 0.00054, 0.00053, 0.00054, 0.00055, 0.00053, 0.00054, 0.00057, 0.00054, 0.00056, 0.00054, 0.00055, 0.00054, 0.00053, 0.00053, 0.00053, 0.00054, 0.00055, 0.00053, 0.00054, 0.00055, 0.00055, 0.00068, 0.00053, 0.00053, 0.00054, 0.00053, 0.00059, 0.00054, 0.00057, 0.00053, 0.00054, 0.00056, 0.00054, 0.00056, 0.00059, 0.00054, 0.00066, 0.00053, 0.00053, 0.00053, 0.00053, 0.00056, 0.0007, 0.00055]}, "optimizer-count-zeros-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.00377, 0.00267, 0.00263, 0.00264, 0.00263, 0.00264, 0.00267, 0.00265, 0.00264, 0.00265, 0.00266, 0.00266, 0.00264, 0.00267, 0.00266, 0.00265, 0.00263, 0.00266, 0.00263, 0.00264, 0.00264, 0.00264, 0.00264, 0.00262, 0.00264, 0.00265, 0.00265, 0.00264, 0.00279, 0.00265, 0.0029, 0.00265, 0.00467, 0.00274, 0.00266, 0.00265, 0.00264, 0.00264, 0.00264, 0.00267, 0.00265, 0.00263, 0.00264, 0.00264, 0.00264, 0.00265, 0.00264, 0.00264, 0.00266, 0.00265, 0.00272, 0.00265, 0.00266, 0.00265, 0.00264, 0.00266, 0.00266, 0.00265, 0.00266, 0.00277, 0.00266, 0.00267, 0.00266, 0.00266, 0.00266, 0.00265, 0.00264, 0.00266, 0.00269, 0.00259, 0.00261, 0.00261, 0.0026, 0.00263, 0.00275, 0.00259, 0.00263, 0.00262, 0.0026, 0.00262, 0.00262, 0.0026, 0.00273, 0.00262, 0.00261, 0.00261, 0.0026, 0.0026, 0.00262, 0.00262, 0.00259, 0.0026, 0.0026, 0.00292, 0.00276, 0.00261, 0.00262, 0.00262, 0.00262, 0.00261, 0.00261, 0.0026, 0.0026, 0.00261, 0.00292, 0.00264, 0.00266, 0.0026, 0.00263, 0.00261, 0.00259, 0.00261, 0.0026, 0.00261, 0.00259, 0.0026, 0.00261, 0.00262, 0.00261, 0.0026, 0.00264, 0.00262, 0.00288, 0.00263, 0.00258, 0.00261, 0.00266, 0.00274, 0.00261, 0.0026, 0.00263, 0.00261, 0.0026, 0.00262, 0.00262, 0.00261, 0.00262, 0.00262, 0.00261, 0.0026, 0.00268, 0.00264, 0.00265, 0.00266, 
0.00266, 0.00265, 0.00272, 0.00264, 0.00278, 0.00265, 0.00266, 0.00266, 0.00267, 0.00264, 0.00264, 0.00272, 0.0026, 0.00261, 0.00261, 0.00261, 0.00262, 0.00262, 0.00263, 0.00261, 0.00262, 0.00259, 0.00261, 0.00262, 0.00269, 0.0026, 0.00262, 0.00262, 0.00261, 0.00262, 0.00261, 0.00261, 0.00263, 0.0026, 0.00262, 0.0026, 0.00263, 0.00262, 0.0034, 0.00265, 0.00259, 0.00259, 0.0026, 0.00261, 0.00261, 0.0026, 0.00277, 0.0026, 0.00262, 0.00261, 0.00264, 0.00261, 0.00263, 0.00268, 0.00261, 0.0026, 0.00239, 0.00238, 0.0024, 0.00237, 0.00238, 0.00237, 0.00239, 0.00237, 0.0024, 0.0024, 0.00243, 0.00239, 0.0024, 0.0024, 0.00238, 0.00241, 0.00242, 0.00239, 0.00246, 0.00242, 0.0024, 0.00238, 0.00238, 0.00239, 0.00239, 0.00239, 0.00239, 0.0024, 0.0024, 0.00239, 0.00239, 0.00244, 0.00238, 0.00237, 0.00238, 0.0024, 0.00242, 0.00238, 0.00238, 0.00241, 0.00268, 0.00241, 0.00241, 0.00239, 0.00242, 0.00238, 0.00241, 0.00243, 0.00467, 0.00362, 0.00363, 0.0036, 0.00366, 0.00361, 0.00362, 0.00363, 0.00361, 0.00375, 0.00372, 0.00364, 0.0036, 0.00364, 0.00361, 0.00361, 0.00363, 0.00364, 0.00364, 0.00363, 0.00364, 0.00363, 0.00387, 0.00363, 0.00364, 0.00363, 0.00362, 0.00364, 0.00362, 0.00361, 0.00361, 0.00362, 0.00365, 0.00238, 0.00239, 0.00237, 0.0024, 0.0024, 0.00237, 0.00239, 0.00239, 0.00236, 0.00239, 0.00239, 0.00239, 0.00237, 0.00241, 0.00242, 0.00243, 0.00239, 0.0024, 0.00238, 0.00239, 0.00239, 0.00237, 0.00239, 0.00243, 0.00239, 0.00243, 0.00238, 0.00238, 0.00238, 0.00239, 0.00236, 0.0024, 0.00241, 0.00237, 0.00241, 0.0024, 0.00241, 0.00239, 0.00237, 0.0024, 0.00239, 0.0024, 0.00239, 0.00237, 0.00241, 0.00239, 0.00237, 0.00237, 0.0024, 0.00239, 0.00238, 0.00238, 0.0024, 0.00254, 0.00238, 0.00239, 0.00238, 0.00238, 0.00239, 0.00238, 0.00243, 0.00239, 0.00239, 0.00245, 0.00239, 0.00238, 0.00238, 0.00263, 0.00238, 0.00243, 0.00236, 0.00238, 0.00238, 0.00237, 0.00238, 0.00239, 0.0026, 0.00242, 0.0024, 0.0024, 0.0024, 0.0024, 0.00238, 0.00238, 0.00243, 0.00242, 0.0024, 0.00239, 0.0024, 0.0024, 0.00239, 0.00243, 0.00238, 0.0024, 0.00237, 0.00237, 0.00297, 0.0024, 0.0024, 0.00238, 0.00239, 0.00241, 0.00238, 0.00239, 0.00237, 0.00239, 0.00239, 0.00273, 0.00252, 0.00238, 0.00239, 0.00239, 0.00238, 0.00236, 0.0024, 0.0024, 0.00241, 0.00253, 0.00238]}, "optimizer-inner-step-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.0039, 0.00046, 0.00046, 0.00046, 0.00046, 0.00046, 0.00045, 0.00044, 0.00045, 0.00046, 0.00046, 0.00045, 0.00045, 0.00045, 0.00046, 0.00046, 0.00045, 0.00046, 0.00047, 0.00045, 0.00046, 0.00045, 0.00046, 0.00059, 0.00046, 0.00046, 0.00045, 0.00046, 0.00062, 0.00046, 0.00061, 0.00045, 0.00047, 0.00046, 0.00045, 0.00046, 0.00045, 0.00045, 0.00047, 0.00046, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00046, 0.00052, 0.00045, 0.00045, 0.00046, 0.00046, 0.00044, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00046, 0.00046, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00046, 0.00046, 0.00046, 0.00046, 0.00047, 0.00045, 0.00045, 0.00047, 0.00046, 0.00046, 0.00045, 0.00046, 0.00046, 0.00046, 0.00046, 0.00046, 0.00045, 0.00046, 0.00046, 0.00045, 0.00053, 0.00046, 0.00045, 0.00046, 0.00046, 0.00047, 0.00045, 0.00045, 0.00054, 0.00045, 0.00046, 0.00047, 0.00047, 0.00046, 0.00046, 0.00047, 0.00046, 0.00047, 0.00046, 0.00064, 0.00046, 0.00046, 0.00046, 0.00046, 0.00046, 0.00046, 0.00046, 0.00046, 0.00046, 0.00046, 0.00046, 0.00047, 0.00047, 0.00046, 0.00046, 0.00046, 0.00046, 0.00049, 0.00047, 0.00047, 0.00046, 0.00048, 0.00046, 0.00046, 0.00046, 0.00046, 0.00047, 
0.00046, 0.00047, 0.00046, 0.00047, 0.00047, 0.00046, 0.00046, 0.00046, 0.00047, 0.00048, 0.00046, 0.00047, 0.00046, 0.00047, 0.00059, 0.00048, 0.00046, 0.00046, 0.00046, 0.00046, 0.00047, 0.00047, 0.00046, 0.00047, 0.00047, 0.00046, 0.00047, 0.00047, 0.00046, 0.00047, 0.00055, 0.00046, 0.00046, 0.00046, 0.00047, 0.00047, 0.00047, 0.00046, 0.00047, 0.00047, 0.00047, 0.00047, 0.00047, 0.00048, 0.00046, 0.00047, 0.00047, 0.00047, 0.00046, 0.00046, 0.00046, 0.00047, 0.00047, 0.00047, 0.00048, 0.00046, 0.00046, 0.00048, 0.00047, 0.00047, 0.00047, 0.00047, 0.00048, 0.00046, 0.00049, 0.00047, 0.00046, 0.00047, 0.00046, 0.00048, 0.00045, 0.00045, 0.00046, 0.00046, 0.00047, 0.00046, 0.00045, 0.00045, 0.00045, 0.00047, 0.00046, 0.00047, 0.00063, 0.00046, 0.00045, 0.00045, 0.00045, 0.00045, 0.00046, 0.00046, 0.00047, 0.00045, 0.00048, 0.00046, 0.00046, 0.00045, 0.00045, 0.00045, 0.00045, 0.00046, 0.00046, 0.00046, 0.00045, 0.00049, 0.00046, 0.00048, 0.00045, 0.00047, 0.00057, 0.00045, 0.00047, 0.00045, 0.00046, 0.00047, 0.00045, 0.00046, 0.00051, 0.00059, 0.00058, 0.0006, 0.00059, 0.00058, 0.00058, 0.00058, 0.00059, 0.00058, 0.00058, 0.00058, 0.00058, 0.00059, 0.00058, 0.00059, 0.00059, 0.00058, 0.00059, 0.00059, 0.00059, 0.00061, 0.00059, 0.00058, 0.00058, 0.0006, 0.00059, 0.00058, 0.00058, 0.00059, 0.0006, 0.0006, 0.0006, 0.00045, 0.00045, 0.00045, 0.00043, 0.00044, 0.00045, 0.00043, 0.00045, 0.00043, 0.00045, 0.00043, 0.00044, 0.00045, 0.00044, 0.00044, 0.00044, 0.00044, 0.00044, 0.00044, 0.00045, 0.00043, 0.00043, 0.00044, 0.00061, 0.00046, 0.00045, 0.00043, 0.00045, 0.00043, 0.00044, 0.00044, 0.00045, 0.00044, 0.00044, 0.0006, 0.00044, 0.00044, 0.00044, 0.00044, 0.00045, 0.00042, 0.00043, 0.00043, 0.00043, 0.00045, 0.00045, 0.00044, 0.00046, 0.00044, 0.00044, 0.00043, 0.00043, 0.00047, 0.00043, 0.00043, 0.00044, 0.00043, 0.00044, 0.00044, 0.00043, 0.00045, 0.00044, 0.00044, 0.00044, 0.00043, 0.00044, 0.00044, 0.00045, 0.00045, 0.00044, 0.00045, 0.00045, 0.00044, 0.00046, 0.00044, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00046, 0.00045, 0.00044, 0.00046, 0.00044, 0.00045, 0.00059, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00044, 0.00045, 0.00046, 0.00046, 0.00052, 0.00046, 0.00045, 0.00044, 0.00044, 0.00045, 0.00043, 0.00046, 0.00045, 0.00045, 0.00046, 0.00049, 0.00046, 0.00045, 0.00046, 0.00049, 0.00045, 0.00043, 0.00044, 0.00044, 0.00046, 0.00056, 0.00044]}, "optimizer-copy-main-to-model-params-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.00074, 0.00046, 0.00046, 0.00047, 0.00046, 0.00046, 0.00046, 0.00046, 0.00047, 0.00047, 0.00046, 0.00046, 0.00046, 0.00046, 0.00047, 0.00046, 0.00046, 0.00046, 0.00046, 0.00046, 0.00046, 0.00046, 0.00046, 0.00047, 0.00046, 0.00047, 0.00046, 0.00047, 0.00057, 0.00047, 0.00067, 0.00046, 0.0005, 0.00046, 0.00046, 0.00046, 0.00049, 0.00047, 0.00046, 0.00046, 0.00047, 0.00046, 0.00047, 0.00046, 0.00046, 0.00046, 0.00046, 0.00064, 0.00046, 0.00049, 0.00047, 0.00047, 0.00053, 0.00046, 0.00046, 0.00046, 0.00046, 0.00046, 0.00047, 0.00046, 0.00047, 0.00046, 0.00046, 0.00047, 0.00046, 0.00047, 0.00047, 0.0005, 0.00046, 0.00047, 0.00047, 0.00046, 0.00046, 0.00046, 0.00047, 0.00046, 0.00046, 0.00046, 0.00046, 0.00047, 0.00046, 0.00046, 0.00046, 0.00047, 0.00046, 0.00047, 0.00047, 0.00046, 0.00047, 0.00047, 0.00046, 0.00047, 0.00046, 0.00072, 0.00048, 0.00046, 0.00046, 0.00047, 0.00047, 0.00047, 0.00046, 0.00047, 0.00047, 0.00046, 0.00053, 0.00046, 0.00046, 0.00046, 0.00047, 0.00047, 0.00046, 0.00047, 0.00047, 0.00046, 0.00046, 
0.00047, 0.00047, 0.00047, 0.00047, 0.00047, 0.00047, 0.00049, 0.00047, 0.00047, 0.00046, 0.00047, 0.0005, 0.00047, 0.00047, 0.00047, 0.00046, 0.00047, 0.00046, 0.00047, 0.00048, 0.00048, 0.00047, 0.00047, 0.00047, 0.00048, 0.00046, 0.0005, 0.00046, 0.00046, 0.00047, 0.00046, 0.00066, 0.00046, 0.00046, 0.00047, 0.00046, 0.00048, 0.00048, 0.00046, 0.00046, 0.00047, 0.00047, 0.00047, 0.00046, 0.00046, 0.00047, 0.00047, 0.0007, 0.00046, 0.00047, 0.00046, 0.00047, 0.0005, 0.00047, 0.00047, 0.00047, 0.00047, 0.00047, 0.00047, 0.00048, 0.00046, 0.00046, 0.00047, 0.00047, 0.00047, 0.00047, 0.00047, 0.00048, 0.00046, 0.00047, 0.00046, 0.00047, 0.00046, 0.00046, 0.00047, 0.00047, 0.00048, 0.00047, 0.00047, 0.00048, 0.00047, 0.00049, 0.00046, 0.00047, 0.00046, 0.00047, 0.00049, 0.00046, 0.00046, 0.00047, 0.00047, 0.00046, 0.00047, 0.00046, 0.00046, 0.00057, 0.00046, 0.00046, 0.00046, 0.00072, 0.00046, 0.00046, 0.00046, 0.00046, 0.00047, 0.00046, 0.00046, 0.00046, 0.00047, 0.00051, 0.00047, 0.00047, 0.00047, 0.00046, 0.00046, 0.00046, 0.00048, 0.00047, 0.00047, 0.00047, 0.00047, 0.00047, 0.0005, 0.00047, 0.00047, 0.00048, 0.00046, 0.00046, 0.00047, 0.00047, 0.00047, 0.00046, 0.00047, 0.00069, 0.00061, 0.00061, 0.00062, 0.00063, 0.00063, 0.00061, 0.00062, 0.00062, 0.00062, 0.00061, 0.00062, 0.00062, 0.00063, 0.00062, 0.00062, 0.00074, 0.00062, 0.00061, 0.00062, 0.00062, 0.00064, 0.00062, 0.00061, 0.00062, 0.00062, 0.00061, 0.00062, 0.00063, 0.00062, 0.00062, 0.00062, 0.00062, 0.00046, 0.00046, 0.00046, 0.00046, 0.00046, 0.00047, 0.00046, 0.00046, 0.00046, 0.00049, 0.00047, 0.00049, 0.00046, 0.00049, 0.00046, 0.00046, 0.00046, 0.00047, 0.00046, 0.00047, 0.00046, 0.00046, 0.00046, 0.00072, 0.00049, 0.00047, 0.00046, 0.00046, 0.00046, 0.00046, 0.00046, 0.00047, 0.00046, 0.00046, 0.00064, 0.00048, 0.00046, 0.00046, 0.00046, 0.00047, 0.00046, 0.00046, 0.00046, 0.00047, 0.00046, 0.00046, 0.00046, 0.00046, 0.00047, 0.00046, 0.00046, 0.00046, 0.00051, 0.00046, 0.00046, 0.00046, 0.00046, 0.00046, 0.00046, 0.00046, 0.0005, 0.00046, 0.00047, 0.00047, 0.00046, 0.00046, 0.00046, 0.00047, 0.00047, 0.00047, 0.00047, 0.00048, 0.00046, 0.00046, 0.00046, 0.00046, 0.00046, 0.00048, 0.00047, 0.00047, 0.00047, 0.00047, 0.00046, 0.00046, 0.00046, 0.00047, 0.0007, 0.00046, 0.00046, 0.00046, 0.00047, 0.00046, 0.00046, 0.00046, 0.00047, 0.00046, 0.00051, 0.00048, 0.00047, 0.00046, 0.00047, 0.00046, 0.00047, 0.00047, 0.00046, 0.00046, 0.00047, 0.00047, 0.00048, 0.00046, 0.00047, 0.0005, 0.00046, 0.00047, 0.00046, 0.00046, 0.00046, 0.00065, 0.00047]}, "optimizer-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [6.53084, 0.00464, 0.00458, 0.0046, 0.00463, 0.00462, 0.00461, 0.0046, 0.00462, 0.00466, 0.00468, 0.00464, 0.00464, 0.00464, 0.00466, 0.00465, 0.00461, 0.00462, 0.0046, 0.00459, 0.00462, 0.00459, 0.0046, 0.00474, 0.0046, 0.0046, 0.00459, 0.00461, 0.00533, 0.00461, 0.00562, 0.00464, 0.00716, 0.00471, 0.00463, 0.00461, 0.00461, 0.00462, 0.00462, 0.00465, 0.00464, 0.00461, 0.00459, 0.00463, 0.00464, 0.0046, 0.00459, 0.00494, 0.00461, 0.00464, 0.00472, 0.00463, 0.00467, 0.00463, 0.00461, 0.00461, 0.00461, 0.00459, 0.00465, 0.00478, 0.00462, 0.00464, 0.0046, 0.00464, 0.00461, 0.00462, 0.00484, 0.00467, 0.00469, 0.00458, 0.00458, 0.00458, 0.00459, 0.00459, 0.00474, 0.00455, 0.00464, 0.00458, 0.00457, 0.0046, 0.00458, 0.0046, 0.0047, 0.00458, 0.00459, 0.00468, 0.00458, 0.00456, 0.00459, 0.00458, 0.00454, 0.00457, 0.00454, 0.00535, 0.00469, 0.00459, 0.00457, 0.0046, 0.00459, 0.00459, 0.00458, 0.0046, 
0.00456, 0.00459, 0.00551, 0.00461, 0.00463, 0.00451, 0.00459, 0.00451, 0.00449, 0.00453, 0.00459, 0.00458, 0.00454, 0.00456, 0.00458, 0.00462, 0.00451, 0.00457, 0.00461, 0.0046, 0.00497, 0.00461, 0.00455, 0.00458, 0.00469, 0.00472, 0.0046, 0.00459, 0.00459, 0.0046, 0.00457, 0.0046, 0.00462, 0.00461, 0.00458, 0.00464, 0.00459, 0.0046, 0.00465, 0.00469, 0.00462, 0.00463, 0.00463, 0.00463, 0.00518, 0.00462, 0.00478, 0.00458, 0.00463, 0.00462, 0.00466, 0.00465, 0.00463, 0.0048, 0.00458, 0.00458, 0.00458, 0.00461, 0.00458, 0.00461, 0.00505, 0.00457, 0.00461, 0.00456, 0.00461, 0.00463, 0.00467, 0.00457, 0.0046, 0.00454, 0.00459, 0.00462, 0.00461, 0.00459, 0.00465, 0.00457, 0.0046, 0.00457, 0.00459, 0.00461, 0.00563, 0.00466, 0.00459, 0.00456, 0.00458, 0.00457, 0.00457, 0.00462, 0.00476, 0.00461, 0.00459, 0.00458, 0.00478, 0.00458, 0.00498, 0.00465, 0.00458, 0.00462, 0.00441, 0.00438, 0.00432, 0.00434, 0.00433, 0.00431, 0.00434, 0.00431, 0.00433, 0.00433, 0.00454, 0.00435, 0.00437, 0.00435, 0.00489, 0.00436, 0.00436, 0.00435, 0.00438, 0.00436, 0.00432, 0.00433, 0.00433, 0.00437, 0.00441, 0.00434, 0.00434, 0.00432, 0.00434, 0.0044, 0.00432, 0.0044, 0.00432, 0.00431, 0.00433, 0.00442, 0.00438, 0.00454, 0.00434, 0.00437, 0.00523, 0.00436, 0.00437, 0.00435, 0.00437, 0.00436, 0.00435, 0.00441, 0.00694, 0.00622, 0.00624, 0.00622, 0.00629, 0.00622, 0.0062, 0.0062, 0.00622, 0.00645, 0.00629, 0.00622, 0.00619, 0.00626, 0.0062, 0.00622, 0.00688, 0.00622, 0.00622, 0.00623, 0.00625, 0.00629, 0.00647, 0.00622, 0.00622, 0.00625, 0.00625, 0.00629, 0.00622, 0.0062, 0.00624, 0.00622, 0.00626, 0.00434, 0.00431, 0.00435, 0.0043, 0.00431, 0.00428, 0.00427, 0.00431, 0.00429, 0.00435, 0.00428, 0.00431, 0.00431, 0.00433, 0.00435, 0.00433, 0.00428, 0.00432, 0.00428, 0.00432, 0.00427, 0.00434, 0.0043, 0.00485, 0.00439, 0.00433, 0.00428, 0.0043, 0.00428, 0.00429, 0.00428, 0.0043, 0.00432, 0.00427, 0.00475, 0.00433, 0.0043, 0.00434, 0.00432, 0.00436, 0.00428, 0.00429, 0.00429, 0.00429, 0.00433, 0.0043, 0.00428, 0.00433, 0.0043, 0.00433, 0.00427, 0.00427, 0.00439, 0.00443, 0.00428, 0.00431, 0.00426, 0.00429, 0.0043, 0.00426, 0.00441, 0.00428, 0.0043, 0.00436, 0.00429, 0.00431, 0.00428, 0.00462, 0.00436, 0.00436, 0.00431, 0.00439, 0.00429, 0.00433, 0.00433, 0.00433, 0.00453, 0.00436, 0.00436, 0.00432, 0.00435, 0.00441, 0.00431, 0.00437, 0.00436, 0.00437, 0.00495, 0.00431, 0.00434, 0.00433, 0.00433, 0.00438, 0.00429, 0.00433, 0.00433, 0.00431, 0.0054, 0.00436, 0.00437, 0.00433, 0.0043, 0.0044, 0.0043, 0.00436, 0.00431, 0.00431, 0.00435, 0.00472, 0.00451, 0.00436, 0.00433, 0.0047, 0.00432, 0.00427, 0.00432, 0.00431, 0.0044, 0.00518, 0.00433]}, "learning-rate": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 
3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05]}, "learning-rate vs samples": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 
5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05]}, "batch-size": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 
128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0]}, "batch-size vs samples": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0]}, "lm loss": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": 
[10.89904, 10.90777, 10.89232, 10.83544, 10.6834, 10.65974, 10.44873, 10.16308, 9.95831, 9.85932, 9.60254, 9.85446, 9.88893, 9.63287, 9.79405, 9.51078, 9.46463, 9.65471, 9.39306, 9.33895, 9.24972, 9.15413, 9.17988, 9.0065, 9.19899, 9.06474, 9.16249, 9.16631, 9.30043, 8.98957, 8.93842, 9.05744, 9.05222, 8.66356, 8.72626, 8.7667, 8.70006, 8.74817, 8.67179, 8.78274, 8.67795, 8.86767, 8.84929, 8.51536, 8.40624, 8.45093, 8.51004, 8.40653, 8.45216, 8.6026, 8.38502, 8.21394, 8.24297, 8.23879, 8.28518, 7.93123, 8.10705, 7.90575, 8.25948, 8.24016, 8.01415, 7.97894, 7.93174, 7.74864, 7.74918, 7.65293, 7.52384, 7.91349, 7.70509, 7.46214, 7.74596, 7.77384, 7.5447, 7.30561, 7.45871, 7.34545, 7.46856, 7.23017, 7.64088, 7.27983, 7.34981, 7.21134, 7.21081, 7.42102, 7.17384, 7.28052, 6.99786, 7.00152, 7.03624, 7.13136, 6.82298, 6.98762, 7.08699, 6.99714, 6.87231, 6.75444, 6.98392, 7.05773, 6.69999, 6.57801, 6.72248, 6.73865, 6.73005, 6.73698, 6.65374, 6.40729, 6.6365, 6.61972, 6.44423, 6.62637, 6.74067, 6.60551, 6.72345, 6.68935, 6.62052, 6.50773, 6.59703, 6.40181, 6.66219, 6.24576, 6.24815, 6.29992, 6.38652, 6.34284, 6.44395, 6.2868, 6.33137, 6.23064, 6.19419, 6.38932, 6.31955, 6.31115, 6.15595, 6.14904, 6.23012, 6.37609, 6.19108, 6.14016, 6.17443, 6.108, 6.05677, 6.07051, 6.2515, 6.40359, 6.25653, 6.30179, 6.09464, 6.1786, 6.00393, 6.03024, 5.95456, 6.25097, 6.18949, 5.96652, 5.78509, 6.12471, 5.85239, 6.09954, 5.78907, 6.1634, 6.14662, 6.08899, 5.93324, 6.11629, 5.94863, 6.19744, 5.89699, 5.79464, 5.78508, 5.6887, 6.01484, 5.99513, 6.06793, 5.88964, 6.04218, 5.96664, 5.9946, 5.98873, 5.94909, 5.83777, 5.94965, 5.62073, 5.70203, 5.88937, 5.84442, 5.86415, 5.75977, 5.83426, 5.72464, 5.56351, 5.71986, 5.62642, 5.83426, 5.60742, 5.71258, 5.70976, 5.8987, 5.64295, 5.85277, 5.73889, 5.87053, 5.32966, 5.89533, 5.87205, 5.85426, 5.41037, 5.40663, 5.62114, 5.59572, 5.48482, 5.57586, 5.67197, 5.4726, 5.74298, 5.50672, 5.5935, 5.61776, 5.6179, 5.51203, 5.61413, 5.67291, 5.68327, 5.58724, 5.66009, 5.37678, 5.68099, 5.62359, 5.42053, 5.57867, 5.62946, 5.54954, 5.33822, 5.53445, 5.48149, 5.47842, 5.37511, 5.5464, 5.60351, 5.38706, 5.51715, 5.48729, 5.33094, 5.50178, 5.40732, 5.44712, 5.31548, 5.06617, 5.47969, 5.56831, 5.7133, 5.41401, 5.59841, 5.63558, 5.2322, 5.27319, 5.38792, 5.39306, 5.32904, 5.49509, 5.17834, 5.29764, 5.24393, 5.37614, 5.25456, 5.44258, 5.54017, 5.31017, 5.43225, 5.33341, 5.07298, 5.31187, 5.2557, 5.30514, 5.10844, 5.27459, 5.26496, 5.47616, 5.16669, 5.26555, 5.21176, 5.355, 4.98377, 4.91178, 5.33096, 5.38935, 5.23414, 5.31329, 5.10388, 5.16417, 5.26356, 5.06801, 5.27045, 5.07377, 5.34602, 5.24563, 5.15001, 5.24094, 5.04069, 5.31488, 5.04958, 5.02979, 5.13788, 5.11434, 5.26734, 5.14852, 5.27369, 5.08851, 5.09324, 5.24624, 5.32324, 5.25443, 5.19052, 5.14435, 5.29055, 4.94885, 5.20441, 5.0907, 5.29874, 5.17267, 5.18858, 5.11677, 4.98159, 4.99122, 5.22123, 5.30764, 5.10222, 5.0544, 4.91358, 5.12177, 5.11614, 4.92915, 5.33612, 5.01913, 5.10051, 5.16573, 4.99929, 5.06049, 5.06814, 4.99437, 5.07642, 5.16464, 4.98109, 5.1825, 4.92945, 4.92916, 5.06868, 4.99902, 4.90979, 4.77687, 4.94499, 5.11671, 5.01541, 5.02126, 5.32954, 4.95713, 4.99895, 5.05055, 4.81011, 4.73872, 5.00091, 5.04398, 4.87805, 4.95233, 5.04347, 5.02539, 4.82104, 4.90025, 4.90912, 4.83747, 4.75039, 5.01482, 4.74829, 5.21037, 4.79047, 5.00245, 4.74175, 4.79189, 4.82107, 4.65381, 4.66051, 4.84616, 4.81073, 4.8078, 4.92405, 4.88723, 4.93597, 4.77468, 4.88361, 4.74125, 4.92209, 4.96252, 4.87874, 4.71289, 4.79114, 4.90017, 4.7175, 4.87202, 
4.69846, 4.70626, 4.65256]}, "lm loss vs samples": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [10.89904, 10.90777, 10.89232, 10.83544, 10.6834, 10.65974, 10.44873, 10.16308, 9.95831, 9.85932, 9.60254, 9.85446, 9.88893, 9.63287, 9.79405, 9.51078, 9.46463, 9.65471, 9.39306, 9.33895, 9.24972, 9.15413, 9.17988, 9.0065, 9.19899, 9.06474, 9.16249, 9.16631, 9.30043, 8.98957, 8.93842, 9.05744, 9.05222, 8.66356, 8.72626, 8.7667, 8.70006, 8.74817, 8.67179, 8.78274, 8.67795, 8.86767, 8.84929, 8.51536, 8.40624, 8.45093, 8.51004, 8.40653, 8.45216, 8.6026, 8.38502, 8.21394, 8.24297, 8.23879, 8.28518, 7.93123, 8.10705, 7.90575, 8.25948, 8.24016, 8.01415, 7.97894, 7.93174, 7.74864, 7.74918, 7.65293, 7.52384, 7.91349, 7.70509, 7.46214, 7.74596, 7.77384, 7.5447, 7.30561, 7.45871, 7.34545, 7.46856, 7.23017, 7.64088, 7.27983, 7.34981, 7.21134, 7.21081, 7.42102, 7.17384, 7.28052, 6.99786, 7.00152, 7.03624, 7.13136, 6.82298, 6.98762, 7.08699, 6.99714, 6.87231, 6.75444, 6.98392, 7.05773, 6.69999, 6.57801, 6.72248, 6.73865, 6.73005, 6.73698, 6.65374, 6.40729, 6.6365, 6.61972, 6.44423, 6.62637, 6.74067, 6.60551, 6.72345, 6.68935, 6.62052, 6.50773, 6.59703, 6.40181, 6.66219, 6.24576, 6.24815, 6.29992, 6.38652, 6.34284, 6.44395, 6.2868, 6.33137, 6.23064, 6.19419, 6.38932, 6.31955, 6.31115, 6.15595, 6.14904, 6.23012, 6.37609, 6.19108, 6.14016, 6.17443, 6.108, 6.05677, 6.07051, 6.2515, 6.40359, 6.25653, 6.30179, 6.09464, 6.1786, 6.00393, 6.03024, 5.95456, 6.25097, 6.18949, 5.96652, 5.78509, 6.12471, 5.85239, 6.09954, 5.78907, 6.1634, 6.14662, 6.08899, 5.93324, 6.11629, 5.94863, 6.19744, 5.89699, 5.79464, 5.78508, 5.6887, 6.01484, 5.99513, 6.06793, 5.88964, 6.04218, 5.96664, 5.9946, 5.98873, 5.94909, 5.83777, 5.94965, 5.62073, 5.70203, 5.88937, 5.84442, 5.86415, 5.75977, 5.83426, 5.72464, 5.56351, 5.71986, 5.62642, 5.83426, 5.60742, 5.71258, 5.70976, 5.8987, 5.64295, 5.85277, 5.73889, 5.87053, 5.32966, 5.89533, 5.87205, 5.85426, 5.41037, 5.40663, 5.62114, 5.59572, 5.48482, 5.57586, 5.67197, 5.4726, 5.74298, 5.50672, 5.5935, 5.61776, 5.6179, 5.51203, 5.61413, 5.67291, 5.68327, 5.58724, 5.66009, 5.37678, 5.68099, 5.62359, 5.42053, 5.57867, 5.62946, 5.54954, 5.33822, 5.53445, 5.48149, 5.47842, 5.37511, 5.5464, 5.60351, 5.38706, 5.51715, 5.48729, 5.33094, 5.50178, 5.40732, 5.44712, 5.31548, 5.06617, 5.47969, 5.56831, 5.7133, 5.41401, 5.59841, 5.63558, 5.2322, 5.27319, 5.38792, 5.39306, 5.32904, 5.49509, 5.17834, 5.29764, 5.24393, 5.37614, 5.25456, 5.44258, 5.54017, 5.31017, 5.43225, 5.33341, 5.07298, 5.31187, 5.2557, 5.30514, 5.10844, 5.27459, 5.26496, 5.47616, 5.16669, 5.26555, 5.21176, 5.355, 4.98377, 4.91178, 5.33096, 5.38935, 5.23414, 5.31329, 5.10388, 5.16417, 5.26356, 5.06801, 5.27045, 5.07377, 5.34602, 5.24563, 5.15001, 5.24094, 5.04069, 5.31488, 5.04958, 5.02979, 5.13788, 5.11434, 5.26734, 5.14852, 5.27369, 5.08851, 5.09324, 5.24624, 5.32324, 5.25443, 5.19052, 5.14435, 5.29055, 4.94885, 5.20441, 5.0907, 5.29874, 5.17267, 5.18858, 5.11677, 4.98159, 4.99122, 5.22123, 5.30764, 5.10222, 5.0544, 4.91358, 5.12177, 5.11614, 4.92915, 5.33612, 5.01913, 5.10051, 5.16573, 4.99929, 5.06049, 5.06814, 4.99437, 5.07642, 5.16464, 4.98109, 5.1825, 4.92945, 4.92916, 5.06868, 4.99902, 4.90979, 4.77687, 4.94499, 5.11671, 5.01541, 5.02126, 5.32954, 4.95713, 4.99895, 5.05055, 4.81011, 4.73872, 5.00091, 5.04398, 4.87805, 4.95233, 5.04347, 5.02539, 4.82104, 4.90025, 4.90912, 4.83747, 4.75039, 5.01482, 4.74829, 5.21037, 4.79047, 5.00245, 4.74175, 4.79189, 4.82107, 4.65381, 4.66051, 4.84616, 4.81073, 4.8078, 4.92405, 
4.88723, 4.93597, 4.77468, 4.88361, 4.74125, 4.92209, 4.96252, 4.87874, 4.71289, 4.79114, 4.90017, 4.7175, 4.87202, 4.69846, 4.70626, 4.65256]}, "loss-scale": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]}, "loss-scale vs samples": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 
1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]}, "grad-norm": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [13.85752, 13.16701, 13.66167, 12.68371, 12.08638, 9.51321, 6.94209, 7.08694, 6.10814, 4.68821, 4.2751, 2.87984, 2.44435, 2.3806, 2.05602, 2.21803, 2.17031, 1.89335, 2.22351, 2.07816, 2.13217, 2.16577, 2.02595, 2.23917, 2.00742, 2.14445, 1.91002, 1.89231, 1.93089, 2.06379, 2.16765, 2.23679, 1.89668, 2.34753, 2.35194, 2.16267, 2.15162, 1.83098, 2.05276, 1.74395, 2.36831, 1.97031, 1.80751, 1.87923, 1.94701, 1.80892, 1.71885, 1.77109, 1.75698, 1.55174, 1.76422, 1.75578, 1.7467, 1.926, 1.6754, 1.89063, 1.76173, 1.82379, 1.52589, 1.48723, 1.63648, 1.49118, 1.79292, 1.82033, 1.59591, 1.62383, 1.63898, 1.62368, 1.43237, 1.62305, 1.35226, 1.37441, 1.77832, 1.4053, 1.36387, 1.43489, 1.33927, 1.41507, 1.32726, 1.26584, 1.3881, 1.23171, 1.40194, 1.20354, 1.1842, 1.32033, 1.50387, 1.25756, 1.20187, 1.05786, 1.15737, 1.22128, 1.02487, 1.08879, 0.98695, 1.28999, 0.98417, 1.58629, 1.03703, 1.06213, 1.55961, 1.47669, 0.90784, 1.45527, 1.29065, 1.13286, 1.14779, 0.95484, 1.09964, 0.89588, 0.84205, 0.91582, 1.04481, 1.01608, 1.02993, 1.12143, 1.08948, 1.31986, 0.92092, 1.1799, 1.09173, 1.10393, 1.19122, 1.03752, 1.03062, 1.19126, 1.02231, 1.0955, 1.05064, 1.06655, 1.1517, 1.11568, 1.37446, 1.21005, 1.53165, 1.24599, 1.03436, 1.56617, 1.39613, 1.20613, 1.59751, 1.76157, 1.17134, 1.06152, 1.22514, 1.97917, 1.11879, 1.62597, 1.18846, 0.95412, 1.17247, 1.50913, 1.42049, 1.32267, 1.02991, 1.60853, 1.51052, 1.23861, 1.4438, 1.81637, 1.43133, 1.52934, 1.66869, 1.18507, 1.38099, 1.44638, 1.56369, 1.1851, 1.63779, 1.22939, 1.13585, 0.93198, 1.58024, 1.61619, 1.48199, 1.39642, 1.72479, 1.20982, 1.33257, 1.14605, 1.14908, 1.46659, 1.41611, 1.64334, 1.40953, 1.89405, 1.62101, 1.55, 1.25036, 1.73578, 1.20849, 1.16164, 2.00175, 1.79359, 1.54068, 1.27095, 1.51292, 1.45211, 1.55181, 1.38317, 1.19552, 1.41924, 1.0843, 1.11099, 1.49128, 1.31175, 1.31568, 1.31643, 1.38944, 1.83714, 1.51633, 1.66291, 1.32027, 1.40224, 1.23381, 1.24726, 1.17329, 1.41173, 1.41298, 1.21975, 1.40395, 1.29766, 1.647, 1.77185, 1.70549, 1.66243, 1.35144, 1.53811, 1.34558, 1.49398, 1.11503, 1.29778, 1.74207, 1.44213, 1.53886, 1.63632, 1.20482, 1.57111, 1.4054, 1.21748, 1.63569, 1.23136, 1.58159, 1.59579, 1.48012, 1.5323, 1.55081, 1.4194, 1.57228, 1.48387, 1.38849, 1.27392, 1.46178, 1.25824, 1.36062, 1.39751, 1.30771, 1.33147, 1.56583, 1.32709, 1.3646, 1.55907, 1.61002, 1.45173, 1.42035, 2.16284, 1.75737, 1.67782, 1.31786, 1.45228, 1.59778, 1.56015, 1.4983, 1.23696, 1.35268, 1.40317, 1.37404, 1.67666, 1.49364, 1.47162, 1.50218, 1.40879, 1.26151, 1.53009, 1.2357, 1.52653, 1.16029, 1.37287, 1.45359, 1.43811, 1.48164, 1.84101, 1.47755, 1.57834, 1.61834, 1.37842, 1.4784, 1.5761, 1.25832, 1.22282, 1.47102, 1.22564, 1.24267, 
1.4204, 1.52394, 1.4913, 1.42263, 1.42192, 1.14735, 1.34499, 1.41439, 1.29824, 1.69085, 1.44146, 1.55667, 1.25423, 1.36428, 1.18219, 1.19336, 1.33449, 1.6401, 1.40383, 1.31292, 1.52789, 1.3215, 1.5794, 1.52614, 1.22037, 1.55665, 1.33214, 1.42978, 1.54699, 1.14418, 1.6388, 1.34807, 1.3749, 1.28337, 1.39417, 1.59994, 1.36359, 1.36119, 1.19917, 1.33658, 1.27596, 1.44996, 1.61368, 1.41282, 1.45175, 1.23245, 1.34616, 1.42121, 1.22977, 1.59453, 1.46628, 1.2612, 1.66869, 1.34891, 1.38326, 1.54549, 1.62587, 1.50361, 1.33282, 1.30675, 1.24628, 1.22264, 1.39221, 1.62236, 1.59048, 1.51538, 1.71681, 1.34251, 1.22656, 1.61992, 1.40775, 1.39241, 1.37966, 1.26457, 1.31626, 1.23459, 1.33073, 1.25512, 1.32646, 1.32216, 1.2607, 1.26972, 1.41721, 1.4656, 1.22975, 1.33206, 1.36899, 1.3651, 1.49566, 1.54131, 1.24469, 1.32355, 1.39775, 1.35713, 1.23875, 1.37455, 1.14642]}, "grad-norm vs samples": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [13.85752, 13.16701, 13.66167, 12.68371, 12.08638, 9.51321, 6.94209, 7.08694, 6.10814, 4.68821, 4.2751, 2.87984, 2.44435, 2.3806, 2.05602, 2.21803, 2.17031, 1.89335, 2.22351, 2.07816, 2.13217, 2.16577, 2.02595, 2.23917, 2.00742, 2.14445, 1.91002, 1.89231, 1.93089, 2.06379, 2.16765, 2.23679, 1.89668, 2.34753, 2.35194, 2.16267, 2.15162, 1.83098, 2.05276, 1.74395, 2.36831, 1.97031, 1.80751, 1.87923, 1.94701, 1.80892, 1.71885, 1.77109, 1.75698, 1.55174, 1.76422, 1.75578, 1.7467, 1.926, 1.6754, 1.89063, 1.76173, 1.82379, 1.52589, 1.48723, 1.63648, 1.49118, 1.79292, 1.82033, 1.59591, 1.62383, 1.63898, 1.62368, 1.43237, 1.62305, 1.35226, 1.37441, 1.77832, 1.4053, 1.36387, 1.43489, 1.33927, 1.41507, 1.32726, 1.26584, 1.3881, 1.23171, 1.40194, 1.20354, 1.1842, 1.32033, 1.50387, 1.25756, 1.20187, 1.05786, 1.15737, 1.22128, 1.02487, 1.08879, 0.98695, 1.28999, 0.98417, 1.58629, 1.03703, 1.06213, 1.55961, 1.47669, 0.90784, 1.45527, 1.29065, 1.13286, 1.14779, 0.95484, 1.09964, 0.89588, 0.84205, 0.91582, 1.04481, 1.01608, 1.02993, 1.12143, 1.08948, 1.31986, 0.92092, 1.1799, 1.09173, 1.10393, 1.19122, 1.03752, 1.03062, 1.19126, 1.02231, 1.0955, 1.05064, 1.06655, 1.1517, 1.11568, 1.37446, 1.21005, 1.53165, 1.24599, 1.03436, 1.56617, 1.39613, 1.20613, 1.59751, 1.76157, 1.17134, 1.06152, 1.22514, 1.97917, 1.11879, 1.62597, 1.18846, 0.95412, 1.17247, 1.50913, 1.42049, 1.32267, 1.02991, 1.60853, 1.51052, 1.23861, 1.4438, 1.81637, 1.43133, 1.52934, 1.66869, 1.18507, 1.38099, 1.44638, 1.56369, 1.1851, 1.63779, 1.22939, 1.13585, 0.93198, 1.58024, 1.61619, 1.48199, 1.39642, 1.72479, 1.20982, 1.33257, 1.14605, 1.14908, 1.46659, 1.41611, 1.64334, 1.40953, 1.89405, 1.62101, 1.55, 1.25036, 1.73578, 1.20849, 1.16164, 2.00175, 1.79359, 1.54068, 1.27095, 1.51292, 1.45211, 1.55181, 1.38317, 1.19552, 1.41924, 1.0843, 1.11099, 1.49128, 1.31175, 1.31568, 1.31643, 1.38944, 1.83714, 1.51633, 1.66291, 1.32027, 1.40224, 1.23381, 1.24726, 1.17329, 1.41173, 1.41298, 1.21975, 1.40395, 1.29766, 1.647, 1.77185, 1.70549, 1.66243, 1.35144, 1.53811, 1.34558, 1.49398, 1.11503, 1.29778, 1.74207, 1.44213, 1.53886, 1.63632, 1.20482, 1.57111, 1.4054, 1.21748, 1.63569, 1.23136, 1.58159, 1.59579, 1.48012, 1.5323, 1.55081, 1.4194, 1.57228, 1.48387, 1.38849, 1.27392, 1.46178, 1.25824, 1.36062, 1.39751, 1.30771, 1.33147, 1.56583, 1.32709, 1.3646, 1.55907, 1.61002, 1.45173, 1.42035, 2.16284, 1.75737, 1.67782, 1.31786, 1.45228, 1.59778, 1.56015, 1.4983, 1.23696, 1.35268, 1.40317, 1.37404, 1.67666, 1.49364, 1.47162, 1.50218, 1.40879, 1.26151, 1.53009, 1.2357, 1.52653, 1.16029, 1.37287, 1.45359, 1.43811, 1.48164, 
1.84101, 1.47755, 1.57834, 1.61834, 1.37842, 1.4784, 1.5761, 1.25832, 1.22282, 1.47102, 1.22564, 1.24267, 1.4204, 1.52394, 1.4913, 1.42263, 1.42192, 1.14735, 1.34499, 1.41439, 1.29824, 1.69085, 1.44146, 1.55667, 1.25423, 1.36428, 1.18219, 1.19336, 1.33449, 1.6401, 1.40383, 1.31292, 1.52789, 1.3215, 1.5794, 1.52614, 1.22037, 1.55665, 1.33214, 1.42978, 1.54699, 1.14418, 1.6388, 1.34807, 1.3749, 1.28337, 1.39417, 1.59994, 1.36359, 1.36119, 1.19917, 1.33658, 1.27596, 1.44996, 1.61368, 1.41282, 1.45175, 1.23245, 1.34616, 1.42121, 1.22977, 1.59453, 1.46628, 1.2612, 1.66869, 1.34891, 1.38326, 1.54549, 1.62587, 1.50361, 1.33282, 1.30675, 1.24628, 1.22264, 1.39221, 1.62236, 1.59048, 1.51538, 1.71681, 1.34251, 1.22656, 1.61992, 1.40775, 1.39241, 1.37966, 1.26457, 1.31626, 1.23459, 1.33073, 1.25512, 1.32646, 1.32216, 1.2607, 1.26972, 1.41721, 1.4656, 1.22975, 1.33206, 1.36899, 1.3651, 1.49566, 1.54131, 1.24469, 1.32355, 1.39775, 1.35713, 1.23875, 1.37455, 1.14642]}, "num-zeros": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [58.0, 87.0, 81.0, 84.0, 84.0, 90.0, 104.0, 124.0, 102.0, 132.0, 129.0, 152.0, 143.0, 181.0, 202.0, 161.0, 161.0, 177.0, 184.0, 189.0, 151.0, 167.0, 183.0, 182.0, 186.0, 154.0, 178.0, 163.0, 167.0, 148.0, 145.0, 138.0, 187.0, 168.0, 140.0, 142.0, 167.0, 204.0, 169.0, 203.0, 148.0, 155.0, 141.0, 200.0, 190.0, 169.0, 187.0, 196.0, 175.0, 229.0, 207.0, 188.0, 199.0, 157.0, 186.0, 178.0, 154.0, 138.0, 248.0, 232.0, 174.0, 186.0, 188.0, 193.0, 201.0, 239.0, 207.0, 166.0, 208.0, 203.0, 208.0, 254.0, 168.0, 251.0, 210.0, 201.0, 239.0, 211.0, 241.0, 211.0, 204.0, 215.0, 193.0, 225.0, 213.0, 184.0, 182.0, 191.0, 206.0, 206.0, 188.0, 218.0, 214.0, 205.0, 203.0, 166.0, 206.0, 174.0, 195.0, 174.0, 140.0, 154.0, 176.0, 165.0, 129.0, 148.0, 168.0, 157.0, 137.0, 180.0, 175.0, 163.0, 175.0, 145.0, 138.0, 134.0, 159.0, 128.0, 173.0, 161.0, 151.0, 113.0, 133.0, 129.0, 177.0, 125.0, 153.0, 137.0, 120.0, 142.0, 148.0, 143.0, 100.0, 113.0, 106.0, 124.0, 129.0, 93.0, 119.0, 125.0, 107.0, 107.0, 141.0, 141.0, 122.0, 91.0, 142.0, 120.0, 101.0, 141.0, 130.0, 112.0, 107.0, 110.0, 132.0, 105.0, 102.0, 116.0, 115.0, 122.0, 96.0, 122.0, 87.0, 104.0, 112.0, 91.0, 110.0, 107.0, 101.0, 103.0, 107.0, 117.0, 83.0, 102.0, 105.0, 133.0, 96.0, 115.0, 93.0, 128.0, 129.0, 113.0, 112.0, 104.0, 104.0, 90.0, 85.0, 92.0, 96.0, 79.0, 140.0, 112.0, 103.0, 85.0, 96.0, 103.0, 104.0, 90.0, 109.0, 115.0, 113.0, 82.0, 123.0, 128.0, 86.0, 113.0, 103.0, 100.0, 129.0, 90.0, 96.0, 92.0, 106.0, 106.0, 113.0, 127.0, 112.0, 118.0, 96.0, 106.0, 114.0, 93.0, 85.0, 74.0, 105.0, 113.0, 97.0, 113.0, 107.0, 97.0, 109.0, 87.0, 89.0, 108.0, 106.0, 87.0, 120.0, 115.0, 109.0, 111.0, 100.0, 114.0, 102.0, 106.0, 94.0, 106.0, 77.0, 124.0, 112.0, 102.0, 104.0, 111.0, 109.0, 125.0, 114.0, 109.0, 120.0, 120.0, 103.0, 107.0, 86.0, 111.0, 95.0, 102.0, 108.0, 78.0, 100.0, 90.0, 107.0, 101.0, 104.0, 119.0, 100.0, 113.0, 110.0, 113.0, 90.0, 101.0, 107.0, 106.0, 111.0, 88.0, 125.0, 93.0, 106.0, 103.0, 116.0, 127.0, 100.0, 84.0, 102.0, 97.0, 97.0, 94.0, 120.0, 109.0, 110.0, 98.0, 97.0, 113.0, 108.0, 106.0, 143.0, 104.0, 111.0, 106.0, 103.0, 99.0, 110.0, 106.0, 130.0, 121.0, 112.0, 103.0, 101.0, 97.0, 115.0, 127.0, 117.0, 116.0, 109.0, 101.0, 129.0, 101.0, 99.0, 112.0, 91.0, 113.0, 104.0, 122.0, 91.0, 120.0, 124.0, 89.0, 106.0, 106.0, 119.0, 101.0, 98.0, 102.0, 129.0, 107.0, 116.0, 126.0, 127.0, 112.0, 86.0, 106.0, 136.0, 135.0, 107.0, 93.0, 102.0, 118.0, 117.0, 104.0, 123.0, 99.0, 114.0, 92.0, 128.0, 92.0, 107.0, 92.0, 124.0, 106.0, 101.0, 
112.0, 106.0, 99.0, 107.0, 110.0, 97.0, 108.0, 117.0, 119.0, 102.0, 116.0, 116.0, 118.0, 108.0, 130.0, 116.0, 118.0, 122.0, 105.0, 104.0, 126.0, 123.0, 118.0, 124.0, 126.0, 97.0, 123.0, 133.0, 101.0, 117.0, 114.0, 120.0, 139.0]}, "num-zeros vs samples": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [58.0, 87.0, 81.0, 84.0, 84.0, 90.0, 104.0, 124.0, 102.0, 132.0, 129.0, 152.0, 143.0, 181.0, 202.0, 161.0, 161.0, 177.0, 184.0, 189.0, 151.0, 167.0, 183.0, 182.0, 186.0, 154.0, 178.0, 163.0, 167.0, 148.0, 145.0, 138.0, 187.0, 168.0, 140.0, 142.0, 167.0, 204.0, 169.0, 203.0, 148.0, 155.0, 141.0, 200.0, 190.0, 169.0, 187.0, 196.0, 175.0, 229.0, 207.0, 188.0, 199.0, 157.0, 186.0, 178.0, 154.0, 138.0, 248.0, 232.0, 174.0, 186.0, 188.0, 193.0, 201.0, 239.0, 207.0, 166.0, 208.0, 203.0, 208.0, 254.0, 168.0, 251.0, 210.0, 201.0, 239.0, 211.0, 241.0, 211.0, 204.0, 215.0, 193.0, 225.0, 213.0, 184.0, 182.0, 191.0, 206.0, 206.0, 188.0, 218.0, 214.0, 205.0, 203.0, 166.0, 206.0, 174.0, 195.0, 174.0, 140.0, 154.0, 176.0, 165.0, 129.0, 148.0, 168.0, 157.0, 137.0, 180.0, 175.0, 163.0, 175.0, 145.0, 138.0, 134.0, 159.0, 128.0, 173.0, 161.0, 151.0, 113.0, 133.0, 129.0, 177.0, 125.0, 153.0, 137.0, 120.0, 142.0, 148.0, 143.0, 100.0, 113.0, 106.0, 124.0, 129.0, 93.0, 119.0, 125.0, 107.0, 107.0, 141.0, 141.0, 122.0, 91.0, 142.0, 120.0, 101.0, 141.0, 130.0, 112.0, 107.0, 110.0, 132.0, 105.0, 102.0, 116.0, 115.0, 122.0, 96.0, 122.0, 87.0, 104.0, 112.0, 91.0, 110.0, 107.0, 101.0, 103.0, 107.0, 117.0, 83.0, 102.0, 105.0, 133.0, 96.0, 115.0, 93.0, 128.0, 129.0, 113.0, 112.0, 104.0, 104.0, 90.0, 85.0, 92.0, 96.0, 79.0, 140.0, 112.0, 103.0, 85.0, 96.0, 103.0, 104.0, 90.0, 109.0, 115.0, 113.0, 82.0, 123.0, 128.0, 86.0, 113.0, 103.0, 100.0, 129.0, 90.0, 96.0, 92.0, 106.0, 106.0, 113.0, 127.0, 112.0, 118.0, 96.0, 106.0, 114.0, 93.0, 85.0, 74.0, 105.0, 113.0, 97.0, 113.0, 107.0, 97.0, 109.0, 87.0, 89.0, 108.0, 106.0, 87.0, 120.0, 115.0, 109.0, 111.0, 100.0, 114.0, 102.0, 106.0, 94.0, 106.0, 77.0, 124.0, 112.0, 102.0, 104.0, 111.0, 109.0, 125.0, 114.0, 109.0, 120.0, 120.0, 103.0, 107.0, 86.0, 111.0, 95.0, 102.0, 108.0, 78.0, 100.0, 90.0, 107.0, 101.0, 104.0, 119.0, 100.0, 113.0, 110.0, 113.0, 90.0, 101.0, 107.0, 106.0, 111.0, 88.0, 125.0, 93.0, 106.0, 103.0, 116.0, 127.0, 100.0, 84.0, 102.0, 97.0, 97.0, 94.0, 120.0, 109.0, 110.0, 98.0, 97.0, 113.0, 108.0, 106.0, 143.0, 104.0, 111.0, 106.0, 103.0, 99.0, 110.0, 106.0, 130.0, 121.0, 112.0, 103.0, 101.0, 97.0, 115.0, 127.0, 117.0, 116.0, 109.0, 101.0, 129.0, 101.0, 99.0, 112.0, 91.0, 113.0, 104.0, 122.0, 91.0, 120.0, 124.0, 89.0, 106.0, 106.0, 119.0, 101.0, 98.0, 102.0, 129.0, 107.0, 116.0, 126.0, 127.0, 112.0, 86.0, 106.0, 136.0, 135.0, 107.0, 93.0, 102.0, 118.0, 117.0, 104.0, 123.0, 99.0, 114.0, 92.0, 128.0, 92.0, 107.0, 92.0, 124.0, 106.0, 101.0, 112.0, 106.0, 99.0, 107.0, 110.0, 97.0, 108.0, 117.0, 119.0, 102.0, 116.0, 116.0, 118.0, 108.0, 130.0, 116.0, 118.0, 122.0, 105.0, 104.0, 126.0, 123.0, 118.0, 124.0, 126.0, 97.0, 123.0, 133.0, 101.0, 117.0, 114.0, 120.0, 139.0]}, "params-norm": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [180.15739, 180.15739, 180.15739, 180.15739, 180.15739, 180.15738, 180.15736, 180.15726, 180.15707, 180.15691, 180.15549, 180.15459, 180.15424, 180.15187, 180.15096, 180.15027, 180.14986, 180.14993, 180.15019, 180.15031, 180.15027, 180.14986, 180.14978, 180.15002, 180.15096, 180.15236, 180.15356, 180.15433, 180.15535, 180.15683, 180.15872, 180.16106, 180.16333, 180.16548, 180.16803, 180.17111, 180.17455, 180.1783, 
180.18213, 180.18637, 180.19121, 180.19637, 180.20183, 180.20786, 180.21451, 180.22182, 180.22966, 180.23802, 180.24725, 180.25742, 180.2684, 180.28008, 180.29228, 180.30507, 180.31865, 180.33281, 180.34721, 180.36223, 180.37819, 180.39531, 180.41338, 180.43228, 180.45262, 180.47394, 180.49564, 180.51866, 180.54247, 180.56686, 180.59306, 180.6189, 180.64566, 180.6731, 180.70131, 180.72955, 180.75832, 180.78758, 180.81717, 180.84805, 180.8793, 180.91136, 180.94365, 180.97591, 181.00896, 181.04247, 181.07669, 181.11148, 181.14615, 181.18118, 181.2169, 181.25371, 181.29126, 181.32945, 181.36674, 181.40437, 181.4427, 181.4816, 181.51944, 181.5558, 181.59123, 181.62697, 181.66261, 181.69635, 181.73094, 181.76637, 181.8006, 181.83632, 181.87393, 181.91217, 181.95012, 181.9888, 182.0287, 182.06952, 182.11082, 182.15179, 182.19136, 182.23178, 182.27216, 182.31206, 182.35109, 182.39093, 182.43059, 182.47116, 182.51115, 182.55157, 182.59242, 182.63356, 182.67308, 182.71248, 182.75157, 182.79005, 182.8289, 182.86778, 182.90854, 182.9481, 182.98575, 183.02332, 183.0623, 183.0995, 183.13556, 183.17046, 183.20383, 183.23506, 183.26553, 183.2989, 183.33479, 183.37086, 183.40509, 183.44055, 183.47644, 183.51241, 183.54857, 183.58354, 183.61832, 183.65422, 183.69316, 183.73344, 183.77179, 183.80856, 183.84579, 183.88249, 183.91859, 183.95512, 183.99037, 184.02548, 184.063, 184.10135, 184.13824, 184.17474, 184.21408, 184.25304, 184.29404, 184.33496, 184.37621, 184.41531, 184.4537, 184.4928, 184.53014, 184.56731, 184.60611, 184.64619, 184.68703, 184.72823, 184.77042, 184.81314, 184.85387, 184.89021, 184.92393, 184.95621, 184.99136, 185.02664, 185.06209, 185.10019, 185.14125, 185.18129, 185.22131, 185.26175, 185.30276, 185.34607, 185.38876, 185.43182, 185.47507, 185.51636, 185.55836, 185.60168, 185.64523, 185.68893, 185.73134, 185.77113, 185.80952, 185.84686, 185.88496, 185.92491, 185.96541, 186.00458, 186.04584, 186.08769, 186.13078, 186.17444, 186.2169, 186.25897, 186.30052, 186.34146, 186.38252, 186.42355, 186.46315, 186.50108, 186.53908, 186.57777, 186.61641, 186.65698, 186.69749, 186.73779, 186.776, 186.81406, 186.85432, 186.89455, 186.93593, 186.97723, 187.02032, 187.06329, 187.10561, 187.14796, 187.19154, 187.23483, 187.27914, 187.32254, 187.36426, 187.40421, 187.44449, 187.48557, 187.52713, 187.5705, 187.61469, 187.65993, 187.70628, 187.75299, 187.79915, 187.84256, 187.8851, 187.92828, 187.97391, 188.02026, 188.06656, 188.11136, 188.15483, 188.19771, 188.23875, 188.28041, 188.32339, 188.36717, 188.41173, 188.4559, 188.49995, 188.54559, 188.59273, 188.64139, 188.68826, 188.73679, 188.7838, 188.82909, 188.87553, 188.92162, 188.96811, 189.01474, 189.06255, 189.10872, 189.15393, 189.19994, 189.24557, 189.29164, 189.3381, 189.38397, 189.42863, 189.47279, 189.51843, 189.5647, 189.61183, 189.66019, 189.7094, 189.7603, 189.81245, 189.86432, 189.91537, 189.96579, 190.01378, 190.06058, 190.10844, 190.15665, 190.20692, 190.2585, 190.31071, 190.36349, 190.41649, 190.46754, 190.51726, 190.56802, 190.62105, 190.67397, 190.72807, 190.78218, 190.8349, 190.88562, 190.93848, 190.99274, 191.04617, 191.0997, 191.15161, 191.20273, 191.25496, 191.30672, 191.35922, 191.41141, 191.46227, 191.51437, 191.56682, 191.6205, 191.67529, 191.73068, 191.78505, 191.8385, 191.89308, 191.94789, 192.0024, 192.05864, 192.11432, 192.1684, 192.22186, 192.27574, 192.33052, 192.38582, 192.44121, 192.49785, 192.55418, 192.60825, 192.66292, 192.71729, 192.77345, 192.82953, 192.88582, 192.94179, 192.99664, 193.05156, 193.1075, 193.16364, 
193.22198, 193.27934, 193.33693, 193.3927, 193.44841, 193.50385, 193.55917, 193.61432, 193.67184, 193.72919, 193.78648, 193.8439, 193.90105, 193.95886, 194.0177, 194.07675, 194.13638, 194.19586, 194.25424, 194.31471, 194.37587, 194.43796, 194.50008, 194.56322, 194.62543, 194.68716, 194.74808, 194.80829, 194.8662, 194.92447, 194.9838, 195.04256, 195.10059, 195.16046, 195.22166, 195.2832]}, "params-norm vs samples": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [180.15739, 180.15739, 180.15739, 180.15739, 180.15739, 180.15738, 180.15736, 180.15726, 180.15707, 180.15691, 180.15549, 180.15459, 180.15424, 180.15187, 180.15096, 180.15027, 180.14986, 180.14993, 180.15019, 180.15031, 180.15027, 180.14986, 180.14978, 180.15002, 180.15096, 180.15236, 180.15356, 180.15433, 180.15535, 180.15683, 180.15872, 180.16106, 180.16333, 180.16548, 180.16803, 180.17111, 180.17455, 180.1783, 180.18213, 180.18637, 180.19121, 180.19637, 180.20183, 180.20786, 180.21451, 180.22182, 180.22966, 180.23802, 180.24725, 180.25742, 180.2684, 180.28008, 180.29228, 180.30507, 180.31865, 180.33281, 180.34721, 180.36223, 180.37819, 180.39531, 180.41338, 180.43228, 180.45262, 180.47394, 180.49564, 180.51866, 180.54247, 180.56686, 180.59306, 180.6189, 180.64566, 180.6731, 180.70131, 180.72955, 180.75832, 180.78758, 180.81717, 180.84805, 180.8793, 180.91136, 180.94365, 180.97591, 181.00896, 181.04247, 181.07669, 181.11148, 181.14615, 181.18118, 181.2169, 181.25371, 181.29126, 181.32945, 181.36674, 181.40437, 181.4427, 181.4816, 181.51944, 181.5558, 181.59123, 181.62697, 181.66261, 181.69635, 181.73094, 181.76637, 181.8006, 181.83632, 181.87393, 181.91217, 181.95012, 181.9888, 182.0287, 182.06952, 182.11082, 182.15179, 182.19136, 182.23178, 182.27216, 182.31206, 182.35109, 182.39093, 182.43059, 182.47116, 182.51115, 182.55157, 182.59242, 182.63356, 182.67308, 182.71248, 182.75157, 182.79005, 182.8289, 182.86778, 182.90854, 182.9481, 182.98575, 183.02332, 183.0623, 183.0995, 183.13556, 183.17046, 183.20383, 183.23506, 183.26553, 183.2989, 183.33479, 183.37086, 183.40509, 183.44055, 183.47644, 183.51241, 183.54857, 183.58354, 183.61832, 183.65422, 183.69316, 183.73344, 183.77179, 183.80856, 183.84579, 183.88249, 183.91859, 183.95512, 183.99037, 184.02548, 184.063, 184.10135, 184.13824, 184.17474, 184.21408, 184.25304, 184.29404, 184.33496, 184.37621, 184.41531, 184.4537, 184.4928, 184.53014, 184.56731, 184.60611, 184.64619, 184.68703, 184.72823, 184.77042, 184.81314, 184.85387, 184.89021, 184.92393, 184.95621, 184.99136, 185.02664, 185.06209, 185.10019, 185.14125, 185.18129, 185.22131, 185.26175, 185.30276, 185.34607, 185.38876, 185.43182, 185.47507, 185.51636, 185.55836, 185.60168, 185.64523, 185.68893, 185.73134, 185.77113, 185.80952, 185.84686, 185.88496, 185.92491, 185.96541, 186.00458, 186.04584, 186.08769, 186.13078, 186.17444, 186.2169, 186.25897, 186.30052, 186.34146, 186.38252, 186.42355, 186.46315, 186.50108, 186.53908, 186.57777, 186.61641, 186.65698, 186.69749, 186.73779, 186.776, 186.81406, 186.85432, 186.89455, 186.93593, 186.97723, 187.02032, 187.06329, 187.10561, 187.14796, 187.19154, 187.23483, 187.27914, 187.32254, 187.36426, 187.40421, 187.44449, 187.48557, 187.52713, 187.5705, 187.61469, 187.65993, 187.70628, 187.75299, 187.79915, 187.84256, 187.8851, 187.92828, 187.97391, 188.02026, 188.06656, 188.11136, 188.15483, 188.19771, 188.23875, 188.28041, 188.32339, 188.36717, 188.41173, 188.4559, 188.49995, 188.54559, 188.59273, 188.64139, 188.68826, 188.73679, 188.7838, 188.82909, 188.87553, 
188.92162, 188.96811, 189.01474, 189.06255, 189.10872, 189.15393, 189.19994, 189.24557, 189.29164, 189.3381, 189.38397, 189.42863, 189.47279, 189.51843, 189.5647, 189.61183, 189.66019, 189.7094, 189.7603, 189.81245, 189.86432, 189.91537, 189.96579, 190.01378, 190.06058, 190.10844, 190.15665, 190.20692, 190.2585, 190.31071, 190.36349, 190.41649, 190.46754, 190.51726, 190.56802, 190.62105, 190.67397, 190.72807, 190.78218, 190.8349, 190.88562, 190.93848, 190.99274, 191.04617, 191.0997, 191.15161, 191.20273, 191.25496, 191.30672, 191.35922, 191.41141, 191.46227, 191.51437, 191.56682, 191.6205, 191.67529, 191.73068, 191.78505, 191.8385, 191.89308, 191.94789, 192.0024, 192.05864, 192.11432, 192.1684, 192.22186, 192.27574, 192.33052, 192.38582, 192.44121, 192.49785, 192.55418, 192.60825, 192.66292, 192.71729, 192.77345, 192.82953, 192.88582, 192.94179, 192.99664, 193.05156, 193.1075, 193.16364, 193.22198, 193.27934, 193.33693, 193.3927, 193.44841, 193.50385, 193.55917, 193.61432, 193.67184, 193.72919, 193.78648, 193.8439, 193.90105, 193.95886, 194.0177, 194.07675, 194.13638, 194.19586, 194.25424, 194.31471, 194.37587, 194.43796, 194.50008, 194.56322, 194.62543, 194.68716, 194.74808, 194.80829, 194.8662, 194.92447, 194.9838, 195.04256, 195.10059, 195.16046, 195.22166, 195.2832]}, "iteration-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [30.41341, 2.8046, 2.79928, 2.80445, 2.79909, 2.80635, 2.79849, 2.79809, 2.80876, 2.80642, 2.79859, 2.80408, 2.80282, 2.80528, 2.80514, 2.80807, 2.80806, 2.80751, 2.80996, 2.80978, 2.80663, 2.80424, 2.81097, 2.81307, 2.81122, 2.80264, 2.80542, 2.80789, 2.81202, 2.80175, 2.80699, 2.81063, 2.81844, 2.82302, 2.81854, 2.8107, 2.81902, 2.8157, 2.82159, 2.81915, 2.81816, 2.82321, 2.81751, 2.82121, 2.82517, 2.83278, 2.81862, 2.81687, 2.82205, 2.8171, 2.81951, 2.81838, 2.81328, 2.82805, 2.91883, 2.83795, 2.82853, 2.82715, 2.82978, 2.83004, 2.83565, 2.83193, 2.83679, 2.83184, 2.83322, 2.83292, 2.82436, 2.82807, 2.82713, 2.82297, 2.82207, 2.81925, 2.82219, 2.82388, 2.82547, 2.82046, 2.82554, 2.82609, 2.81973, 2.81555, 2.80902, 2.81328, 2.81723, 2.81808, 2.8209, 2.81658, 2.82868, 2.82046, 2.82766, 2.82547, 2.82306, 2.82434, 2.82165, 2.82182, 2.82079, 2.8171, 2.82456, 2.81695, 2.81958, 2.81888, 2.82274, 2.82232, 2.82111, 2.81589, 2.81554, 2.82411, 2.82116, 2.81529, 2.82499, 2.81696, 2.81507, 2.81149, 2.81848, 2.81732, 2.81615, 2.81512, 2.81829, 2.8116, 2.80978, 2.81506, 2.81764, 2.8198, 2.81632, 2.81606, 2.80897, 2.81568, 2.82245, 2.81885, 2.82606, 2.81987, 2.8158, 2.82143, 2.8193, 2.82472, 2.81111, 2.81631, 2.83592, 2.81315, 2.82779, 2.82235, 2.83714, 2.8297, 2.837, 2.83586, 2.83284, 2.83636, 2.83258, 2.83915, 2.83419, 2.83824, 2.84049, 2.84197, 2.84072, 2.83281, 2.82944, 2.8375, 2.81702, 2.84669, 2.82923, 2.81781, 2.82019, 2.82199, 2.81611, 2.82377, 2.82298, 2.82195, 2.81502, 2.81982, 2.8244, 2.83221, 2.82765, 2.81874, 2.82405, 2.81662, 2.82101, 2.8221, 2.81703, 2.81771, 2.81876, 2.81927, 2.8219, 2.81857, 2.82075, 2.8191, 2.82229, 2.82063, 2.82301, 2.82242, 2.82223, 2.81908, 2.82481, 2.82407, 2.82328, 2.82304, 2.8156, 2.8223, 2.8283, 2.82746, 2.83015, 2.82908, 2.79797, 2.79998, 2.78923, 2.79503, 2.80833, 2.79099, 2.78989, 2.78911, 2.78508, 2.78213, 2.78209, 2.79677, 2.78643, 2.78646, 2.78817, 2.77762, 2.78837, 2.78968, 2.78321, 2.78471, 2.78732, 2.79108, 2.78484, 2.79823, 2.78713, 2.78768, 2.78784, 2.78488, 2.7883, 2.78899, 2.79726, 2.78764, 2.79575, 2.7903, 2.7943, 2.78923, 2.79105, 2.78913, 2.78266, 2.78538, 2.78833, 2.79805, 2.78908, 2.79905, 
2.79128, 2.79609, 2.79756, 2.78663, 2.79377, 2.83553, 2.82821, 2.82975, 2.82985, 2.8276, 2.83102, 2.82461, 2.83883, 2.82299, 2.82069, 2.82305, 2.81459, 2.82648, 2.82175, 2.82728, 2.82733, 2.82099, 2.83858, 2.83126, 2.83115, 2.82847, 2.83258, 2.83579, 2.83969, 2.83857, 2.86059, 2.84207, 2.84007, 2.84684, 2.84306, 2.84137, 2.84087, 2.79807, 2.79644, 2.79588, 2.79211, 2.79479, 2.80066, 2.79173, 2.79944, 2.79749, 2.80704, 2.79981, 2.79552, 2.79711, 2.7928, 2.79311, 2.78965, 2.78698, 2.78443, 2.78879, 2.79821, 2.79383, 2.79253, 2.79447, 2.78491, 2.77925, 2.78353, 2.78445, 2.79082, 2.79857, 2.80414, 2.80257, 2.78642, 2.78648, 2.78739, 2.78471, 2.78001, 2.78196, 2.78327, 2.78431, 2.791, 2.78454, 2.78713, 2.78803, 2.78024, 2.776, 2.77716, 2.78213, 2.78774, 2.78732, 2.78532, 2.78606, 2.78414, 2.77758, 2.78443, 2.77071, 2.77741, 2.78603, 2.78774, 2.78521, 2.78444, 2.78878, 2.774, 2.78293, 2.78129, 2.78025, 2.78828, 2.78815, 2.78075, 2.78504, 2.77911, 2.77515, 2.77671, 2.77649, 2.88175, 2.77346, 2.78223, 2.78354, 2.77649, 2.78232, 2.77496, 2.78767, 2.7835, 2.77767, 2.7876, 2.78256, 2.77263, 2.77761, 2.77618, 2.782, 2.78046, 2.7906, 2.78832, 2.78117, 2.77888, 2.79122, 2.79084, 2.78287, 2.77695, 2.77599, 2.78415, 2.77982, 2.77929, 2.77879, 2.77575, 2.77152, 2.77167, 2.78528, 2.77604, 2.785, 2.78948, 2.7772, 2.78592, 2.77735, 2.77812, 2.80061, 2.78402, 2.79223, 2.78189, 2.78928]}, "lm loss validation": {"start_step": 0, "end_step": 4, "step_interval": 5, "values": [5.60622]}, "lm loss validation vs samples": {"start_step": 0, "end_step": 4, "step_interval": 5, "values": [5.60622]}, "lm loss validation ppl": {"start_step": 0, "end_step": 4, "step_interval": 5, "values": [272.11401]}, "lm loss validation ppl vs samples": {"start_step": 0, "end_step": 4, "step_interval": 5, "values": [272.11401]}} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp4_pp2_fp8_tp_pp/golden_values_lts.json b/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp4_pp2_fp8_tp_pp/golden_values_lts.json new file mode 100644 index 0000000000..3d10208bdb --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp4_pp2_fp8_tp_pp/golden_values_lts.json @@ -0,0 +1 @@ +{"forward-backward-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [23.87084, 2.7908, 2.78539, 2.7894, 2.7852, 2.79146, 2.78472, 2.78272, 2.79513, 2.79226, 2.78492, 2.79008, 2.7883, 2.79109, 2.79145, 2.79405, 2.79452, 2.79382, 2.79611, 2.79622, 2.79284, 2.79072, 2.79713, 2.79936, 2.79764, 2.78902, 2.79179, 2.79398, 2.79758, 2.78776, 2.79263, 2.79691, 2.80152, 2.80908, 2.80472, 2.79568, 2.80506, 2.80202, 2.80799, 2.80521, 2.80461, 2.8094, 2.80343, 2.80761, 2.81112, 2.81918, 2.80453, 2.80312, 2.80829, 2.80344, 2.80562, 2.80427, 2.79734, 2.81406, 2.90515, 2.82407, 2.81478, 2.81303, 2.81592, 2.81601, 2.82191, 2.81825, 2.82313, 2.81813, 2.8193, 2.81849, 2.80988, 2.81403, 2.81327, 2.80905, 2.80847, 2.80536, 2.80854, 2.8101, 2.81145, 2.80684, 2.81147, 2.81242, 2.80609, 2.80189, 2.79515, 2.7996, 2.80311, 2.8045, 2.80721, 2.80272, 2.81517, 2.80665, 2.81404, 2.81132, 2.80918, 2.80977, 2.80802, 2.80672, 2.80661, 2.80353, 2.81098, 2.80324, 2.80589, 2.80502, 2.80911, 2.80853, 2.80753, 2.80189, 2.80083, 2.8104, 2.80739, 2.80143, 2.8113, 2.80321, 2.80139, 2.79801, 2.80488, 2.80348, 2.80222, 2.80147, 2.80475, 2.79774, 2.79626, 2.80141, 2.80405, 2.80603, 2.80138, 2.80245, 2.79478, 2.80184, 2.80852, 2.8046, 2.81228, 2.80607, 2.80189, 
2.80761, 2.80561, 2.8108, 2.79699, 2.80217, 2.82211, 2.79924, 2.81403, 2.80853, 2.8231, 2.81577, 2.8231, 2.82156, 2.81887, 2.82238, 2.81839, 2.82501, 2.81996, 2.82429, 2.82644, 2.82806, 2.82682, 2.8177, 2.81557, 2.82321, 2.80343, 2.83308, 2.81556, 2.80394, 2.8065, 2.80837, 2.80217, 2.81017, 2.80941, 2.80836, 2.80137, 2.80618, 2.8106, 2.81859, 2.81372, 2.80415, 2.81048, 2.80289, 2.8074, 2.80851, 2.80327, 2.80386, 2.80501, 2.80423, 2.80829, 2.80479, 2.80551, 2.80503, 2.80867, 2.80686, 2.80919, 2.80825, 2.80825, 2.80524, 2.8104, 2.81017, 2.8092, 2.80887, 2.80127, 2.80865, 2.81409, 2.81338, 2.81622, 2.81551, 2.78402, 2.78667, 2.77607, 2.78149, 2.79485, 2.77794, 2.77679, 2.77522, 2.77183, 2.76873, 2.76746, 2.78341, 2.77337, 2.77333, 2.77216, 2.76418, 2.77521, 2.77572, 2.77007, 2.77107, 2.77433, 2.7767, 2.77171, 2.78519, 2.77337, 2.77435, 2.77481, 2.77069, 2.77522, 2.77587, 2.78393, 2.7743, 2.78225, 2.77729, 2.7811, 2.77531, 2.77781, 2.77542, 2.76967, 2.77202, 2.77351, 2.78458, 2.77568, 2.78594, 2.7783, 2.78007, 2.78444, 2.77342, 2.77788, 2.8174, 2.80994, 2.81175, 2.8116, 2.80961, 2.81294, 2.80664, 2.82069, 2.80473, 2.80257, 2.80502, 2.79658, 2.80824, 2.80374, 2.80925, 2.80871, 2.80288, 2.82051, 2.81324, 2.81301, 2.81015, 2.81433, 2.81771, 2.82163, 2.82047, 2.84243, 2.82391, 2.82193, 2.82874, 2.82499, 2.82329, 2.82269, 2.78491, 2.78347, 2.78283, 2.77915, 2.78184, 2.78745, 2.77885, 2.78616, 2.78454, 2.79387, 2.78599, 2.78264, 2.78415, 2.77954, 2.78012, 2.77574, 2.77417, 2.77157, 2.77598, 2.78523, 2.78094, 2.77956, 2.78155, 2.76974, 2.76609, 2.77059, 2.7715, 2.77799, 2.78545, 2.79125, 2.78957, 2.7735, 2.77351, 2.77438, 2.77082, 2.76702, 2.76913, 2.77001, 2.77136, 2.77805, 2.77172, 2.77423, 2.77469, 2.76739, 2.76274, 2.76413, 2.769, 2.7747, 2.77447, 2.77236, 2.77322, 2.77126, 2.76432, 2.77139, 2.75782, 2.76437, 2.77311, 2.77485, 2.77226, 2.7716, 2.77527, 2.76108, 2.76967, 2.76835, 2.76738, 2.77531, 2.77528, 2.76726, 2.77204, 2.76615, 2.76217, 2.76346, 2.76358, 2.86867, 2.76052, 2.76931, 2.77037, 2.76368, 2.76923, 2.76194, 2.77432, 2.77035, 2.76442, 2.77453, 2.76955, 2.75944, 2.76101, 2.76318, 2.76891, 2.7675, 2.77756, 2.77522, 2.76826, 2.76436, 2.77785, 2.77783, 2.76832, 2.76347, 2.76291, 2.77118, 2.76677, 2.76612, 2.76582, 2.76273, 2.75857, 2.75873, 2.7722, 2.76177, 2.77171, 2.77644, 2.7639, 2.7721, 2.76437, 2.76496, 2.78781, 2.7708, 2.77914, 2.7677, 2.77621]}, "forward-compute-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [9.51205, 1.43678, 1.43791, 1.4403, 1.43427, 1.43756, 1.43758, 1.43562, 1.44189, 1.44431, 1.43685, 1.43669, 1.43665, 1.43656, 1.44116, 1.44015, 1.44001, 1.44016, 1.4435, 1.44113, 1.44161, 1.44108, 1.44253, 1.44731, 1.44571, 1.43765, 1.44091, 1.44413, 1.44785, 1.43882, 1.44323, 1.43963, 1.44096, 1.44584, 1.4433, 1.43872, 1.44424, 1.44585, 1.4456, 1.44851, 1.44579, 1.4472, 1.44488, 1.44427, 1.44702, 1.44843, 1.44696, 1.44174, 1.44868, 1.44573, 1.44263, 1.44873, 1.44368, 1.45098, 1.50386, 1.46222, 1.45889, 1.46823, 1.45958, 1.46199, 1.45939, 1.46248, 1.46055, 1.46617, 1.46663, 1.46838, 1.45647, 1.45342, 1.45158, 1.44745, 1.45071, 1.44757, 1.45057, 1.45354, 1.45015, 1.45365, 1.45031, 1.45396, 1.44855, 1.44723, 1.44555, 1.44612, 1.44775, 1.44969, 1.45014, 1.4487, 1.447, 1.44896, 1.4498, 1.45306, 1.45037, 1.4495, 1.44838, 1.44482, 1.45215, 1.448, 1.45159, 1.44448, 1.44896, 1.44752, 1.44756, 1.45023, 1.45026, 1.44675, 1.44444, 1.45064, 1.44643, 1.44631, 1.45024, 1.44933, 1.44526, 1.44522, 1.44467, 1.4481, 1.44864, 1.45043, 1.45185, 1.44907, 1.44793, 1.45106, 
1.44909, 1.44946, 1.44262, 1.43975, 1.44103, 1.44743, 1.45025, 1.4482, 1.45283, 1.44737, 1.44579, 1.44509, 1.44631, 1.44428, 1.44535, 1.45213, 1.45201, 1.44741, 1.45012, 1.45313, 1.47204, 1.46712, 1.47171, 1.47404, 1.47244, 1.46786, 1.46879, 1.46914, 1.47064, 1.46718, 1.47001, 1.47261, 1.47278, 1.46528, 1.46833, 1.46966, 1.44696, 1.45977, 1.44861, 1.44782, 1.44378, 1.44407, 1.44816, 1.45245, 1.449, 1.44784, 1.4449, 1.44523, 1.44905, 1.45312, 1.44739, 1.44742, 1.45369, 1.44478, 1.44662, 1.44949, 1.4459, 1.4448, 1.44385, 1.44392, 1.45267, 1.44333, 1.44892, 1.44724, 1.4485, 1.44583, 1.44996, 1.4476, 1.4446, 1.44975, 1.451, 1.45004, 1.44925, 1.45149, 1.44617, 1.44967, 1.44957, 1.45131, 1.45283, 1.4513, 1.42552, 1.41683, 1.41289, 1.41323, 1.41749, 1.41143, 1.41101, 1.4112, 1.4135, 1.41006, 1.4137, 1.41016, 1.41535, 1.41173, 1.41324, 1.40716, 1.40976, 1.40928, 1.41, 1.40851, 1.40949, 1.41481, 1.40726, 1.41247, 1.40893, 1.40726, 1.41201, 1.41338, 1.41944, 1.41452, 1.41165, 1.41022, 1.41318, 1.41802, 1.41449, 1.41063, 1.41492, 1.41265, 1.41132, 1.41365, 1.41475, 1.41847, 1.41122, 1.41128, 1.41301, 1.41405, 1.41415, 1.41581, 1.41619, 1.42827, 1.42088, 1.42041, 1.42456, 1.42192, 1.42307, 1.42073, 1.42805, 1.42078, 1.42396, 1.42359, 1.42048, 1.42105, 1.41976, 1.4247, 1.42503, 1.42186, 1.42845, 1.42785, 1.42791, 1.4201, 1.42849, 1.42307, 1.43185, 1.43491, 1.44341, 1.43591, 1.44767, 1.44319, 1.43803, 1.4396, 1.43766, 1.41441, 1.41492, 1.41502, 1.41802, 1.41644, 1.41395, 1.4088, 1.41436, 1.41116, 1.41904, 1.41497, 1.4117, 1.41375, 1.41211, 1.41098, 1.41349, 1.40846, 1.41118, 1.41363, 1.41608, 1.41063, 1.40863, 1.40931, 1.40576, 1.40253, 1.40633, 1.4031, 1.40517, 1.40582, 1.40973, 1.41428, 1.41255, 1.41129, 1.4127, 1.41154, 1.40611, 1.40611, 1.40794, 1.41156, 1.40745, 1.41035, 1.4097, 1.40988, 1.40878, 1.40716, 1.40765, 1.41137, 1.4109, 1.40902, 1.41507, 1.40796, 1.41525, 1.40249, 1.40831, 1.39916, 1.40546, 1.40999, 1.41032, 1.41283, 1.41312, 1.40738, 1.40936, 1.40757, 1.41053, 1.40694, 1.40948, 1.41066, 1.40854, 1.40655, 1.41367, 1.41378, 1.40999, 1.41174, 1.51942, 1.40444, 1.4119, 1.41683, 1.40936, 1.41487, 1.40883, 1.41143, 1.41268, 1.40887, 1.41527, 1.41408, 1.41281, 1.41183, 1.4134, 1.4109, 1.41349, 1.41109, 1.41503, 1.4111, 1.40948, 1.41361, 1.41212, 1.40741, 1.40997, 1.41405, 1.41032, 1.40943, 1.40908, 1.40969, 1.40965, 1.40759, 1.41424, 1.41408, 1.41111, 1.41223, 1.4114, 1.41026, 1.41191, 1.40822, 1.40981, 1.41905, 1.4096, 1.41551, 1.40808, 1.41685]}, "backward-compute-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [5.76315, 1.31571, 1.31593, 1.31502, 1.31389, 1.32096, 1.31535, 1.31393, 1.31645, 1.31983, 1.31373, 1.31879, 1.31981, 1.31802, 1.31437, 1.31804, 1.3168, 1.3164, 1.31781, 1.31891, 1.31627, 1.31955, 1.31518, 1.32254, 1.32375, 1.31999, 1.31794, 1.32051, 1.32225, 1.32201, 1.32279, 1.32113, 1.32401, 1.32399, 1.32517, 1.32129, 1.32334, 1.32013, 1.32408, 1.32339, 1.32077, 1.32325, 1.32393, 1.32691, 1.3248, 1.32346, 1.32319, 1.32546, 1.32574, 1.32432, 1.32506, 1.32316, 1.32102, 1.32498, 1.31925, 1.32089, 1.31762, 1.32259, 1.32419, 1.3238, 1.3311, 1.31611, 1.31766, 1.31858, 1.31753, 1.31906, 1.32287, 1.32538, 1.32481, 1.32145, 1.32464, 1.32198, 1.3244, 1.32137, 1.31992, 1.31987, 1.32194, 1.31437, 1.3176, 1.31699, 1.31617, 1.31875, 1.32414, 1.32452, 1.31883, 1.32118, 1.32409, 1.32097, 1.32779, 1.31828, 1.31626, 1.32197, 1.32549, 1.32434, 1.32206, 1.31897, 1.31696, 1.32081, 1.31817, 1.32008, 1.32093, 1.32034, 1.32057, 1.3194, 1.31784, 1.32222, 1.31761, 1.31937, 1.32438, 
1.32014, 1.31951, 1.31748, 1.31751, 1.31806, 1.31789, 1.32196, 1.32358, 1.31991, 1.31901, 1.32185, 1.32603, 1.32323, 1.32207, 1.31786, 1.31601, 1.32365, 1.32045, 1.31939, 1.32039, 1.31927, 1.31562, 1.32046, 1.31813, 1.32192, 1.31787, 1.31521, 1.33243, 1.31979, 1.3209, 1.32524, 1.32073, 1.31982, 1.31934, 1.32334, 1.31999, 1.32008, 1.32149, 1.32088, 1.31917, 1.3216, 1.3281, 1.32441, 1.33089, 1.32051, 1.31858, 1.32678, 1.32537, 1.3342, 1.32893, 1.32448, 1.32645, 1.32391, 1.3234, 1.32535, 1.32031, 1.32412, 1.3238, 1.32447, 1.32647, 1.32957, 1.32786, 1.3237, 1.32721, 1.32175, 1.32877, 1.32685, 1.32128, 1.32422, 1.32282, 1.32689, 1.33079, 1.33206, 1.32599, 1.32533, 1.32086, 1.32573, 1.32664, 1.31836, 1.32782, 1.32904, 1.32799, 1.32601, 1.32546, 1.32741, 1.32429, 1.32809, 1.32601, 1.32401, 1.32374, 1.32751, 1.32317, 1.32231, 1.32071, 1.32437, 1.32903, 1.3223, 1.32056, 1.32302, 1.32275, 1.32175, 1.31913, 1.32111, 1.3226, 1.32065, 1.32224, 1.31853, 1.32253, 1.32127, 1.3209, 1.31926, 1.31964, 1.3227, 1.32157, 1.32205, 1.3223, 1.31767, 1.31875, 1.31811, 1.3211, 1.3162, 1.32259, 1.3172, 1.31878, 1.31747, 1.32111, 1.31966, 1.31682, 1.32112, 1.31521, 1.31669, 1.31901, 1.32814, 1.32216, 1.32442, 1.32313, 1.32151, 1.3243, 1.3203, 1.31897, 1.32073, 1.32493, 1.3246, 1.31844, 1.3284, 1.32684, 1.31608, 1.32499, 1.31768, 1.31464, 1.31825, 1.31743, 1.32077, 1.31974, 1.32195, 1.32195, 1.32016, 1.32093, 1.32005, 1.32407, 1.31906, 1.32446, 1.32365, 1.32141, 1.32093, 1.33319, 1.32834, 1.32237, 1.32312, 1.31793, 1.32722, 1.31541, 1.322, 1.3218, 1.31794, 1.31628, 1.31547, 1.32499, 1.31709, 1.317, 1.32129, 1.32324, 1.3231, 1.32155, 1.32292, 1.32269, 1.32156, 1.31852, 1.31872, 1.31758, 1.32143, 1.32104, 1.32353, 1.32012, 1.32147, 1.32263, 1.32328, 1.32548, 1.32214, 1.32307, 1.32574, 1.32903, 1.3278, 1.32381, 1.32116, 1.32264, 1.32367, 1.31807, 1.32574, 1.32105, 1.32208, 1.32432, 1.32324, 1.32004, 1.32242, 1.32161, 1.32001, 1.32057, 1.31875, 1.32152, 1.32786, 1.32575, 1.32357, 1.3226, 1.31921, 1.32595, 1.31832, 1.31725, 1.32287, 1.32418, 1.32617, 1.32128, 1.32384, 1.31932, 1.32117, 1.3209, 1.32292, 1.32281, 1.33147, 1.32181, 1.32357, 1.32241, 1.32062, 1.32002, 1.32089, 1.32929, 1.3178, 1.31998, 1.32166, 1.32279, 1.32038, 1.31604, 1.321, 1.31845, 1.31976, 1.32049, 1.32671, 1.30205, 1.30334, 1.30428, 1.30688, 1.30105, 1.306, 1.30598, 1.30505, 1.30135, 1.30452, 1.30666, 1.30463, 1.30387, 1.30213, 1.30721, 1.30426, 1.30532, 1.30358, 1.30289, 1.30331, 1.30072, 1.30374, 1.30623, 1.30837, 1.30441, 1.30441, 1.30428, 1.30182, 1.29924, 1.31777, 1.31621, 1.32106, 1.31759, 1.32273]}, "batch-generator-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [4.17805, 0.02532, 0.02443, 0.0259, 0.02446, 0.02433, 0.02525, 0.02434, 0.02571, 0.02834, 0.02652, 0.02646, 0.02518, 0.02481, 0.0279, 0.02807, 0.0266, 0.02845, 0.0313, 0.02866, 0.02895, 0.02709, 0.02883, 0.02971, 0.03025, 0.02951, 0.02896, 0.03006, 0.03215, 0.0295, 0.03352, 0.02739, 0.02956, 0.02814, 0.02868, 0.02699, 0.02842, 0.03193, 0.02797, 0.02967, 0.0318, 0.02963, 0.02835, 0.02797, 0.02797, 0.03173, 0.02956, 0.02665, 0.02908, 0.02921, 0.02665, 0.02893, 0.02866, 0.02772, 0.02944, 0.03233, 0.02893, 0.03067, 0.03096, 0.02981, 0.02909, 0.02673, 0.02735, 0.03183, 0.03003, 0.02892, 0.02792, 0.03046, 0.02823, 0.03032, 0.03123, 0.02966, 0.03045, 0.03048, 0.03141, 0.03097, 0.02999, 0.03135, 0.0285, 0.02735, 0.02803, 0.02831, 0.02764, 0.03034, 0.02971, 0.02926, 0.02972, 0.02952, 0.03075, 0.03009, 0.02964, 0.02882, 0.03045, 0.02898, 0.02803, 0.02824, 0.02708, 0.02867, 
0.0342, 0.03142, 0.03184, 0.03236, 0.03305, 0.03116, 0.02898, 0.03026, 0.02775, 0.02983, 0.03023, 0.02832, 0.03086, 0.02777, 0.03086, 0.0307, 0.02887, 0.03065, 0.03095, 0.02937, 0.02703, 0.02981, 0.02895, 0.03324, 0.02658, 0.02662, 0.02448, 0.02629, 0.02739, 0.0271, 0.02673, 0.0253, 0.02683, 0.02718, 0.02671, 0.0276, 0.02593, 0.02704, 0.0285, 0.02845, 0.02811, 0.02883, 0.03435, 0.03167, 0.03261, 0.03235, 0.03414, 0.03091, 0.03163, 0.02955, 0.03106, 0.03182, 0.03113, 0.03157, 0.03216, 0.03397, 0.03111, 0.02941, 0.02991, 0.02875, 0.03204, 0.02798, 0.02854, 0.03038, 0.02648, 0.02916, 0.02799, 0.02855, 0.02792, 0.0274, 0.02603, 0.02879, 0.0292, 0.02864, 0.02841, 0.02759, 0.02946, 0.02947, 0.02937, 0.02887, 0.0288, 0.02812, 0.02927, 0.02796, 0.02893, 0.02755, 0.0266, 0.02892, 0.02827, 0.02802, 0.02761, 0.0284, 0.03055, 0.02773, 0.02955, 0.02851, 0.02789, 0.02748, 0.0272, 0.02827, 0.02809, 0.02816, 0.40686, 0.0267, 0.02546, 0.02555, 0.02624, 0.02523, 0.02567, 0.0279, 0.02868, 0.02572, 0.02653, 0.02383, 0.02613, 0.02506, 0.0243, 0.02629, 0.02418, 0.02447, 0.02537, 0.02552, 0.02379, 0.02344, 0.02378, 0.02314, 0.02354, 0.02382, 0.02379, 0.02659, 0.02476, 0.02631, 0.02468, 0.02598, 0.02324, 0.02455, 0.0251, 0.02405, 0.02442, 0.02377, 0.02361, 0.02478, 0.02379, 0.02477, 0.02439, 0.02295, 0.02552, 0.02359, 0.02286, 0.02462, 0.02531, 0.03164, 0.0315, 0.03143, 0.03142, 0.03168, 0.03139, 0.03399, 0.03158, 0.03159, 0.03346, 0.03175, 0.03166, 0.03151, 0.03142, 0.03168, 0.0317, 0.03164, 0.03167, 0.03175, 0.03163, 0.03326, 0.03172, 0.03141, 0.03173, 0.0333, 0.03168, 0.03167, 0.03183, 0.03165, 0.03174, 0.03408, 0.03301, 0.0256, 0.02643, 0.03, 0.02476, 0.02404, 0.02678, 0.02289, 0.02528, 0.02495, 0.02516, 0.02679, 0.02413, 0.0253, 0.02382, 0.02499, 0.02624, 0.02366, 0.02553, 0.02515, 0.02467, 0.02526, 0.02422, 0.02599, 0.02234, 0.02467, 0.02456, 0.02225, 0.02224, 0.02432, 0.02273, 0.02327, 0.02338, 0.02313, 0.02296, 0.02582, 0.02257, 0.02356, 0.02376, 0.02243, 0.02388, 0.02445, 0.02411, 0.02604, 0.02457, 0.02385, 0.02605, 0.02638, 0.02472, 0.02454, 0.02557, 0.02531, 0.02518, 0.02578, 0.02479, 0.02654, 0.02415, 0.02363, 0.02446, 0.02512, 0.02364, 0.02344, 0.0248, 0.02395, 0.02369, 0.02275, 0.0266, 0.02372, 0.02937, 0.02788, 0.02818, 0.02749, 0.0294, 0.02843, 0.02616, 0.02729, 0.02853, 0.02827, 0.02973, 0.02869, 0.02904, 0.02745, 0.02987, 0.02735, 0.02842, 0.02783, 0.02939, 0.02873, 0.02953, 0.02571, 0.02937, 0.02728, 0.03078, 0.02725, 0.02698, 0.02961, 0.02757, 0.02692, 0.02716, 0.02762, 0.02805, 0.02617, 0.02782, 0.02921, 0.02637, 0.02679, 0.02731, 0.02744, 0.02767, 0.02735, 0.02706, 0.02798, 0.02659, 0.02462, 0.02353, 0.02612, 0.02398, 0.02999, 0.02748, 0.02836]}, "forward-recv-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [6.80244, 0.02327, 0.02357, 0.02418, 0.02403, 0.02416, 0.02299, 0.02437, 0.02654, 0.02645, 0.02351, 0.02322, 0.02321, 0.02333, 0.02356, 0.02407, 0.02284, 0.02336, 0.02305, 0.02309, 0.02437, 0.02382, 0.02371, 0.02295, 0.0237, 0.02304, 0.02301, 0.02347, 0.02339, 0.02268, 0.02304, 0.02357, 0.02381, 0.02335, 0.02274, 0.02277, 0.02379, 0.02387, 0.02489, 0.023, 0.02356, 0.02397, 0.02382, 0.0233, 0.02371, 0.02556, 0.02297, 0.02329, 0.02457, 0.02391, 0.02309, 0.02372, 0.02319, 0.02317, 0.02516, 0.02376, 0.02587, 0.02328, 0.02429, 0.02353, 0.02342, 0.02529, 0.02337, 0.02294, 0.02608, 0.0263, 0.02427, 0.02258, 0.02358, 0.02315, 0.02427, 0.02338, 0.02373, 0.02348, 0.02312, 0.02582, 0.02644, 0.02485, 0.02527, 0.02355, 0.02335, 0.0233, 0.02482, 0.02366, 0.02378, 0.02279, 
0.02307, 0.02344, 0.02368, 0.02351, 0.02442, 0.023, 0.02371, 0.02324, 0.02397, 0.02339, 0.02331, 0.02303, 0.02316, 0.02451, 0.02588, 0.02323, 0.02313, 0.02372, 0.02372, 0.02396, 0.02313, 0.02377, 0.02325, 0.02357, 0.0239, 0.02373, 0.02305, 0.02327, 0.02337, 0.02558, 0.02412, 0.024, 0.02298, 0.02346, 0.02341, 0.02499, 0.02595, 0.02356, 0.02359, 0.02334, 0.02429, 0.02386, 0.02382, 0.02371, 0.02386, 0.02339, 0.02348, 0.02376, 0.02405, 0.0237, 0.02364, 0.02322, 0.02388, 0.02466, 0.02377, 0.02381, 0.02312, 0.02337, 0.02587, 0.0234, 0.02326, 0.02514, 0.02305, 0.02396, 0.02437, 0.02598, 0.02368, 0.02533, 0.02665, 0.0236, 0.02411, 0.02378, 0.02367, 0.02564, 0.02335, 0.02437, 0.02359, 0.02359, 0.02322, 0.02273, 0.02363, 0.02409, 0.02377, 0.02329, 0.02348, 0.02525, 0.02415, 0.02404, 0.02377, 0.02324, 0.02347, 0.02488, 0.02554, 0.02377, 0.02292, 0.02356, 0.02386, 0.0231, 0.024, 0.02405, 0.02445, 0.02374, 0.0233, 0.02593, 0.02463, 0.02393, 0.02351, 0.02352, 0.02404, 0.02313, 0.02358, 0.023, 0.02347, 0.02311, 0.0184, 0.02425, 0.02279, 0.02306, 0.02344, 0.02342, 0.0236, 0.02302, 0.02314, 0.02343, 0.02401, 0.02356, 0.02333, 0.02337, 0.0239, 0.0232, 0.02319, 0.02315, 0.02311, 0.02332, 0.02322, 0.02374, 0.0239, 0.02339, 0.02406, 0.02358, 0.02348, 0.02325, 0.02315, 0.02296, 0.02357, 0.02349, 0.02309, 0.02301, 0.02331, 0.02297, 0.0231, 0.02275, 0.0228, 0.02389, 0.02406, 0.02363, 0.02344, 0.02354, 0.02484, 0.02357, 0.02352, 0.02299, 0.02319, 0.02863, 0.02719, 0.02688, 0.0269, 0.02723, 0.02735, 0.02746, 0.02726, 0.02718, 0.02716, 0.02769, 0.02662, 0.02726, 0.0267, 0.02696, 0.02791, 0.0283, 0.03114, 0.02684, 0.02732, 0.02729, 0.02733, 0.02819, 0.02627, 0.02696, 0.02662, 0.02733, 0.02779, 0.02734, 0.02763, 0.02837, 0.02759, 0.0243, 0.02432, 0.02438, 0.02516, 0.02609, 0.02417, 0.02421, 0.02474, 0.02395, 0.02467, 0.02473, 0.02401, 0.02443, 0.02436, 0.02298, 0.02466, 0.02296, 0.02367, 0.02539, 0.02323, 0.02331, 0.02342, 0.02489, 0.02322, 0.02363, 0.02342, 0.02351, 0.02406, 0.02499, 0.02419, 0.02319, 0.02365, 0.02437, 0.02332, 0.02567, 0.02334, 0.02317, 0.02303, 0.02331, 0.02511, 0.02368, 0.02344, 0.02325, 0.0228, 0.02289, 0.02343, 0.02335, 0.0232, 0.02328, 0.02284, 0.0232, 0.02311, 0.02333, 0.02283, 0.02447, 0.02426, 0.02348, 0.02331, 0.02357, 0.02346, 0.02327, 0.02297, 0.0251, 0.02286, 0.0231, 0.02375, 0.02341, 0.0236, 0.0242, 0.02362, 0.02329, 0.02326, 0.02314, 0.02334, 0.02339, 0.02303, 0.02333, 0.02388, 0.02393, 0.02465, 0.02337, 0.02531, 0.02298, 0.02289, 0.02335, 0.02349, 0.02508, 0.02386, 0.02407, 0.0236, 0.02345, 0.02369, 0.02324, 0.02345, 0.02571, 0.02352, 0.02371, 0.02373, 0.02446, 0.02392, 0.02353, 0.02392, 0.02388, 0.02532, 0.02461, 0.02311, 0.02351, 0.02348, 0.02325, 0.02355, 0.02471, 0.02432, 0.0244, 0.02494, 0.02414, 0.02399, 0.02358, 0.02344, 0.02423]}, "forward-send-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [2.84466, 0.00012, 0.00011, 0.00012, 0.00012, 0.00012, 0.00011, 0.00011, 0.00012, 0.00012, 0.00012, 0.00012, 0.00011, 0.00011, 0.00012, 0.00013, 0.00012, 0.00012, 0.00011, 0.00011, 0.00012, 0.00012, 0.00012, 0.00011, 0.00012, 0.00011, 0.00012, 0.00012, 0.00012, 0.00011, 0.00011, 0.00012, 0.00012, 0.00012, 0.00011, 0.00011, 0.00012, 0.00012, 0.00013, 0.00011, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00013, 0.00014, 0.00012, 0.00012, 0.00011, 0.00012, 0.00013, 0.00012, 0.00012, 0.00013, 0.00013, 0.00012, 0.00011, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 
0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00011, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00011, 0.00011, 0.00012, 0.00012, 0.00011, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00011, 0.00011, 0.00012, 0.00011, 0.00012, 0.00012, 0.00012, 0.00011, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00013, 0.00011, 0.00011, 0.00021, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00011, 0.00012, 0.00011, 0.00012, 0.00011, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00011, 0.00013, 0.00012, 0.00012, 0.00011, 0.00011, 0.00011, 0.00012, 0.00012, 0.00012, 0.00011, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00011, 0.00012, 0.00012, 0.00012, 0.00012, 0.00011, 0.00011, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00011, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00011, 0.00012, 0.00011, 0.00012, 0.00011, 0.00012, 0.00011, 0.00011, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00016, 0.00015, 0.00015, 0.00015, 0.00016, 0.00015, 0.00015, 0.00014, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00014, 0.00015, 0.00015, 0.00015, 0.00016, 0.00015, 0.00015, 0.00014, 0.00014, 0.00016, 0.00015, 0.0002, 0.00014, 0.00015, 0.00014, 0.00015, 0.00014, 0.00015, 0.00015, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00013, 0.00013, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00011, 0.00013, 0.00014, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00013, 0.00012, 0.00013, 0.00012, 0.00011, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00014, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00013, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00013, 0.00012, 0.00012, 0.00013, 0.00013, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00012, 0.00013, 0.00012, 0.00013, 0.00014, 0.00012, 0.00013, 0.00012]}, "backward-recv-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.02202, 0.02306, 0.02274, 0.02305, 0.02218, 0.02282, 0.02254, 0.02256, 0.02256, 0.02201, 0.02227, 0.02236, 0.02184, 0.02219, 0.02311, 0.02279, 0.0224, 0.02326, 0.0223, 0.0226, 0.02262, 0.02192, 0.02207, 0.02234, 0.0225, 0.02331, 0.02364, 0.02244, 0.02259, 0.02244, 0.02307, 0.0232, 0.02442, 0.02498, 0.02229, 0.0228, 0.02468, 0.02377, 0.02241, 0.02261, 0.02253, 0.02261, 0.02234, 0.02253, 0.02252, 0.02275, 0.02272, 0.02219, 0.02235, 0.02245, 0.02519, 0.02285, 0.02297, 0.02413, 0.02237, 0.02293, 0.0228, 0.02258, 0.02227, 
0.02742, 0.02319, 0.02305, 0.02286, 0.02291, 0.02288, 0.02328, 0.02324, 0.02362, 0.02461, 0.02229, 0.02295, 0.02276, 0.0234, 0.02322, 0.02241, 0.02264, 0.02302, 0.0234, 0.02233, 0.02257, 0.02316, 0.02277, 0.02753, 0.02283, 0.02254, 0.02283, 0.0218, 0.02217, 0.02286, 0.02257, 0.0228, 0.0227, 0.02081, 0.0228, 0.02621, 0.02311, 0.02273, 0.0228, 0.02247, 0.0229, 0.02301, 0.02246, 0.02269, 0.02282, 0.02255, 0.02285, 0.02311, 0.0227, 0.02235, 0.02252, 0.02338, 0.02261, 0.02365, 0.02278, 0.02199, 0.0226, 0.02251, 0.02252, 0.0226, 0.02281, 0.02411, 0.02301, 0.02114, 0.02254, 0.0225, 0.02292, 0.02388, 0.02719, 0.02225, 0.02241, 0.02306, 0.02278, 0.02254, 0.02221, 0.02262, 0.02523, 0.02237, 0.0224, 0.0224, 0.02234, 0.02308, 0.02372, 0.02327, 0.02279, 0.02316, 0.02344, 0.02202, 0.02286, 0.02663, 0.02281, 0.0234, 0.02273, 0.02221, 0.02282, 0.02274, 0.02532, 0.02225, 0.02195, 0.02261, 0.02257, 0.02265, 0.02262, 0.02232, 0.023, 0.02283, 0.02245, 0.02247, 0.0238, 0.02512, 0.02216, 0.0226, 0.02248, 0.02442, 0.02357, 0.02268, 0.02197, 0.02269, 0.02234, 0.02252, 0.02254, 0.02296, 0.02323, 0.02487, 0.02507, 0.02281, 0.02321, 0.01969, 0.02212, 0.02259, 0.02247, 0.02216, 0.02227, 0.02334, 0.02365, 0.02317, 0.02332, 0.02536, 0.02524, 0.02256, 0.02014, 0.02168, 0.02553, 0.02195, 0.02188, 0.02265, 0.02181, 0.02201, 0.02208, 0.02185, 0.02258, 0.02179, 0.02208, 0.02184, 0.02172, 0.02131, 0.02178, 0.02181, 0.02153, 0.02161, 0.02189, 0.02179, 0.02189, 0.02152, 0.02237, 0.01986, 0.02159, 0.02198, 0.02172, 0.02198, 0.02071, 0.0218, 0.02168, 0.02163, 0.02171, 0.02187, 0.02247, 0.0254, 0.02003, 0.02151, 0.02205, 0.02189, 0.02196, 0.02212, 0.02259, 0.02231, 0.02186, 0.0214, 0.02189, 0.02217, 0.02191, 0.02194, 0.02196, 0.02437, 0.0235, 0.02355, 0.02243, 0.02206, 0.02142, 0.02199, 0.02213, 0.02157, 0.02436, 0.02121, 0.02302, 0.0223, 0.02427, 0.02238, 0.02253, 0.01864, 0.02424, 0.02409, 0.0246, 0.02317, 0.02239, 0.02214, 0.02205, 0.022, 0.02349, 0.02219, 0.02161, 0.022, 0.02154, 0.02174, 0.0218, 0.02159, 0.02209, 0.022, 0.02163, 0.02288, 0.02366, 0.0234, 0.02153, 0.02198, 0.0241, 0.02181, 0.02185, 0.02225, 0.0216, 0.02178, 0.02096, 0.02214, 0.02076, 0.0219, 0.02303, 0.02184, 0.02342, 0.01921, 0.02176, 0.02172, 0.02189, 0.0219, 0.02192, 0.02085, 0.02133, 0.02429, 0.02384, 0.0242, 0.0195, 0.02178, 0.02175, 0.02146, 0.02171, 0.02168, 0.02164, 0.02417, 0.02331, 0.02162, 0.02199, 0.02187, 0.02172, 0.02155, 0.02173, 0.02177, 0.02367, 0.02387, 0.02186, 0.02165, 0.0215, 0.02171, 0.02193, 0.02169, 0.02399, 0.02207, 0.02179, 0.02207, 0.02217, 0.02226, 0.02196, 0.02201, 0.02182, 0.02159, 0.02152, 0.02173, 0.02179, 0.02146, 0.02161, 0.02161, 0.02191, 0.02365, 0.02194, 0.02182, 0.02252, 0.0217, 0.02184, 0.02214, 0.0207, 0.02212, 0.02196, 0.02227, 0.0219, 0.02213, 0.02179, 0.02192, 0.02063, 0.02245, 0.02495, 0.02207, 0.02234, 0.0219, 0.02176, 0.02221, 0.02198, 0.02398, 0.02453, 0.02261, 0.02208, 0.02163, 0.02214, 0.02159, 0.02483, 0.02236, 0.0221, 0.02206, 0.02218, 0.02227, 0.02233, 0.02258, 0.02182, 0.02191, 0.02178]}, "backward-send-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.00016, 0.00018, 0.00018, 0.00018, 0.00018, 0.00018, 0.00018, 0.00018, 0.00017, 0.00018, 0.00016, 0.00019, 0.00019, 0.00018, 0.00017, 0.00019, 0.00018, 0.00019, 0.00018, 0.00018, 0.00018, 0.00018, 0.00017, 0.00018, 0.00018, 0.00019, 0.00019, 0.00017, 0.00018, 0.00017, 0.00019, 0.00018, 0.00022, 0.0002, 0.00018, 0.00019, 0.00016, 0.00017, 0.00018, 0.00019, 0.00018, 0.00017, 0.00018, 0.00018, 0.00017, 0.00018, 0.00017, 0.00018, 0.00018, 
0.00019, 0.00018, 0.0002, 0.00017, 0.0002, 0.00018, 0.00019, 0.00018, 0.00019, 0.00019, 0.00018, 0.00018, 0.00019, 0.00018, 0.00019, 0.00018, 0.00018, 0.00019, 0.00018, 0.00016, 0.00017, 0.00018, 0.00018, 0.00018, 0.00018, 0.00019, 0.00019, 0.00019, 0.00018, 0.00017, 0.00018, 0.00018, 0.00018, 0.00021, 0.00019, 0.00019, 0.00018, 0.00016, 0.00018, 0.00019, 0.00019, 0.00019, 0.00018, 0.00019, 0.00018, 0.00018, 0.0002, 0.00018, 0.00018, 0.00016, 0.00018, 0.00018, 0.00018, 0.00018, 0.00019, 0.00018, 0.00019, 0.00018, 0.00018, 0.00018, 0.00018, 0.00018, 0.00022, 0.00018, 0.00018, 0.0002, 0.00018, 0.00019, 0.00019, 0.00018, 0.00019, 0.00019, 0.00018, 0.00016, 0.00019, 0.00018, 0.00018, 0.00032, 0.00019, 0.00018, 0.00018, 0.00019, 0.00019, 0.00019, 0.00018, 0.00017, 0.00019, 0.00016, 0.00016, 0.00017, 0.00019, 0.00019, 0.00018, 0.00018, 0.00019, 0.00018, 0.00017, 0.00018, 0.00019, 0.00016, 0.00018, 0.00018, 0.00018, 0.00019, 0.00019, 0.00018, 0.00026, 0.00019, 0.00018, 0.00018, 0.00017, 0.00018, 0.00019, 0.00019, 0.00018, 0.00018, 0.00018, 0.00018, 0.00018, 0.00022, 0.00018, 0.00019, 0.00019, 0.00016, 0.00019, 0.00019, 0.00019, 0.00018, 0.00019, 0.00018, 0.00019, 0.00018, 0.00019, 0.00018, 0.00017, 0.00018, 0.00018, 0.00027, 0.00018, 0.00019, 0.00018, 0.00019, 0.00018, 0.00018, 0.00019, 0.00021, 0.00018, 0.00019, 0.00016, 0.00019, 0.00016, 0.00019, 0.00023, 0.00017, 0.00016, 0.00018, 0.00019, 0.00019, 0.00019, 0.00021, 0.00016, 0.00018, 0.00017, 0.00018, 0.00017, 0.00018, 0.00017, 0.00019, 0.00018, 0.00018, 0.00018, 0.00017, 0.00018, 0.00018, 0.00016, 0.00018, 0.00018, 0.00018, 0.00018, 0.00019, 0.00018, 0.00018, 0.00019, 0.00021, 0.00018, 0.00018, 0.00018, 0.00018, 0.00018, 0.00019, 0.00023, 0.00018, 0.00016, 0.00019, 0.00018, 0.00016, 0.00018, 0.00019, 0.00017, 0.00019, 0.00018, 0.00016, 0.00017, 0.00018, 0.00018, 0.00016, 0.00018, 0.00017, 0.00016, 0.00019, 0.00017, 0.00018, 0.00018, 0.00017, 0.00018, 0.00017, 0.00017, 0.00018, 0.00017, 0.00018, 0.00017, 0.00016, 0.00016, 0.00018, 0.00018, 0.00018, 0.00016, 0.00018, 0.00025, 0.00018, 0.00017, 0.00018, 0.00017, 0.00018, 0.00017, 0.00019, 0.00018, 0.00018, 0.00018, 0.00018, 0.00018, 0.00017, 0.00018, 0.00016, 0.00016, 0.00017, 0.00018, 0.00018, 0.00018, 0.00018, 0.00019, 0.00017, 0.00018, 0.00017, 0.00018, 0.00018, 0.00018, 0.00018, 0.00019, 0.00019, 0.00018, 0.00018, 0.00016, 0.00018, 0.00018, 0.00016, 0.00016, 0.00016, 0.00019, 0.00017, 0.00017, 0.00018, 0.00017, 0.00018, 0.00017, 0.00018, 0.00017, 0.00018, 0.00017, 0.00016, 0.00018, 0.00018, 0.00018, 0.00021, 0.00016, 0.00016, 0.00018, 0.00018, 0.00018, 0.00016, 0.00018, 0.00018, 0.00016, 0.00016, 0.00018, 0.00017, 0.00019, 0.00018, 0.00018, 0.00018, 0.00018, 0.00017, 0.00019, 0.00018, 0.00017, 0.00017, 0.00018, 0.00017, 0.00018, 0.00018, 0.00017, 0.00018, 0.00019, 0.00018, 0.00016, 0.00019, 0.00018, 0.00018, 0.00018, 0.00016, 0.00018, 0.00018, 0.00018, 0.00018, 0.00017, 0.00018, 0.00016, 0.00018, 0.00019, 0.00018, 0.00018, 0.00016, 0.00016, 0.00017, 0.00021, 0.00016, 0.00018, 0.00018, 0.00017, 0.00018, 0.00018, 0.00018, 0.00018, 0.00018, 0.00019, 0.00018, 0.00017, 0.00017, 0.00018, 0.00017, 0.00018]}, "forward-send-backward-recv-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [7.26791, 0.08664, 0.09388, 0.09112, 0.08445, 0.09357, 0.09373, 0.09614, 0.09989, 0.10112, 0.08956, 0.08704, 0.09001, 0.09155, 0.09857, 0.09953, 0.0961, 0.10113, 0.10125, 0.11004, 0.10313, 0.09862, 0.10585, 0.10919, 0.10583, 0.10172, 0.10458, 0.10404, 0.1052, 0.09641, 0.10412, 0.09781, 0.09972, 
0.10136, 0.10163, 0.09609, 0.09969, 0.10085, 0.10306, 0.10325, 0.10455, 0.10533, 0.1025, 0.09569, 0.09963, 0.11379, 0.10728, 0.10291, 0.10638, 0.1012, 0.09514, 0.10381, 0.10024, 0.10547, 0.10487, 0.11789, 0.11734, 0.11997, 0.113, 0.10597, 0.11163, 0.11506, 0.12069, 0.12521, 0.12131, 0.11375, 0.10345, 0.10129, 0.10181, 0.10088, 0.0947, 0.09723, 0.09642, 0.10255, 0.10466, 0.09713, 0.10564, 0.10312, 0.10025, 0.09561, 0.09512, 0.09519, 0.08816, 0.09549, 0.09265, 0.09294, 0.10255, 0.09939, 0.10544, 0.10344, 0.10858, 0.1088, 0.10697, 0.09761, 0.09215, 0.09749, 0.10389, 0.09421, 0.09597, 0.09688, 0.10356, 0.10031, 0.10358, 0.10022, 0.09494, 0.09521, 0.08777, 0.09024, 0.09559, 0.08704, 0.09044, 0.08853, 0.09387, 0.09487, 0.09496, 0.0917, 0.09224, 0.08543, 0.08296, 0.0931, 0.08686, 0.09041, 0.08634, 0.0838, 0.07721, 0.08382, 0.08905, 0.07994, 0.08964, 0.09067, 0.08724, 0.09031, 0.09142, 0.08955, 0.08642, 0.08734, 0.09313, 0.0892, 0.08811, 0.08748, 0.10918, 0.10445, 0.10103, 0.10406, 0.10336, 0.10399, 0.11053, 0.10502, 0.1058, 0.10377, 0.10177, 0.10263, 0.10865, 0.10227, 0.1032, 0.10523, 0.08465, 0.08812, 0.09221, 0.0869, 0.09106, 0.09518, 0.08366, 0.09187, 0.09167, 0.09065, 0.08392, 0.08171, 0.08992, 0.09232, 0.08837, 0.08382, 0.08792, 0.08609, 0.08649, 0.09183, 0.09528, 0.08861, 0.08269, 0.07853, 0.08798, 0.08353, 0.08436, 0.09088, 0.08495, 0.08552, 0.08561, 0.08913, 0.08612, 0.08093, 0.08731, 0.08686, 0.08376, 0.09109, 0.08222, 0.08599, 0.08546, 0.09351, 0.09605, 0.09994, 0.05805, 0.06314, 0.06773, 0.06769, 0.07278, 0.07311, 0.07124, 0.07502, 0.06435, 0.06762, 0.06901, 0.0791, 0.0778, 0.07332, 0.07358, 0.07456, 0.08054, 0.08433, 0.07505, 0.07588, 0.08407, 0.0787, 0.08207, 0.0796, 0.07151, 0.06957, 0.07132, 0.06499, 0.06604, 0.07296, 0.07397, 0.067, 0.07615, 0.07913, 0.07517, 0.07077, 0.07248, 0.07492, 0.07227, 0.07335, 0.0763, 0.07019, 0.07546, 0.07774, 0.07407, 0.0729, 0.07638, 0.07126, 0.07892, 0.09584, 0.09387, 0.09457, 0.09277, 0.0883, 0.08843, 0.09465, 0.09754, 0.09491, 0.09011, 0.08659, 0.08508, 0.08604, 0.09074, 0.08671, 0.08822, 0.08652, 0.10003, 0.09872, 0.09528, 0.09138, 0.09197, 0.09145, 0.09609, 0.09717, 0.09187, 0.08329, 0.07444, 0.08501, 0.09292, 0.07912, 0.09086, 0.06371, 0.06325, 0.06657, 0.06269, 0.0684, 0.06721, 0.07116, 0.07046, 0.0677, 0.06735, 0.06869, 0.06628, 0.06387, 0.06598, 0.06628, 0.06315, 0.07014, 0.06138, 0.06023, 0.06541, 0.06746, 0.07002, 0.07338, 0.06917, 0.06109, 0.06706, 0.07059, 0.07159, 0.07375, 0.08229, 0.07701, 0.07396, 0.07568, 0.07085, 0.07045, 0.06836, 0.06539, 0.0665, 0.07089, 0.0709, 0.06602, 0.0697, 0.07478, 0.0684, 0.0647, 0.0626, 0.06703, 0.06836, 0.06571, 0.07061, 0.07022, 0.0716, 0.06385, 0.06344, 0.05399, 0.06182, 0.0629, 0.06795, 0.07021, 0.06979, 0.06991, 0.07026, 0.06139, 0.06342, 0.06547, 0.06176, 0.06228, 0.07216, 0.07562, 0.07274, 0.07226, 0.08023, 0.07444, 0.04375, 0.0697, 0.07621, 0.07857, 0.07477, 0.07791, 0.08106, 0.08001, 0.07886, 0.07928, 0.08279, 0.07305, 0.08365, 0.08546, 0.08515, 0.08206, 0.08649, 0.09308, 0.09213, 0.08788, 0.08419, 0.0881, 0.09226, 0.08474, 0.08747, 0.08269, 0.08805, 0.08503, 0.08089, 0.08025, 0.07691, 0.07938, 0.07913, 0.08725, 0.08008, 0.08335, 0.0882, 0.08124, 0.08869, 0.08118, 0.08321, 0.08276, 0.07892, 0.08691, 0.07849, 0.08318]}, "backward-send-forward-recv-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [3.02438, 0.02964, 0.02158, 0.02612, 0.02742, 0.02646, 0.02144, 0.01953, 0.02104, 0.01973, 0.0221, 0.02679, 0.02821, 0.0292, 0.02641, 0.02434, 0.02851, 0.02189, 0.02401, 0.02493, 
0.02324, 0.02474, 0.02466, 0.01958, 0.02074, 0.02324, 0.02406, 0.02422, 0.02172, 0.02415, 0.02078, 0.02874, 0.02875, 0.02888, 0.03126, 0.03155, 0.0297, 0.0288, 0.03235, 0.02835, 0.02837, 0.02808, 0.02869, 0.03298, 0.03478, 0.02725, 0.02531, 0.02971, 0.0248, 0.02835, 0.03171, 0.02666, 0.02768, 0.0316, 0.11725, 0.02233, 0.01927, 0.01846, 0.02324, 0.0208, 0.02765, 0.02234, 0.02152, 0.02055, 0.0218, 0.02092, 0.02617, 0.02621, 0.02575, 0.02487, 0.02854, 0.02512, 0.02754, 0.02441, 0.02799, 0.02601, 0.02443, 0.02664, 0.02842, 0.02747, 0.02197, 0.02705, 0.0286, 0.02828, 0.03081, 0.02999, 0.03156, 0.02772, 0.02622, 0.02462, 0.02412, 0.02594, 0.02264, 0.03102, 0.02956, 0.02597, 0.02756, 0.03008, 0.02803, 0.02913, 0.02661, 0.02374, 0.02365, 0.02578, 0.02542, 0.03028, 0.03098, 0.02753, 0.02526, 0.02933, 0.02658, 0.02632, 0.02526, 0.02436, 0.02205, 0.02173, 0.02147, 0.02635, 0.02715, 0.01835, 0.02341, 0.02286, 0.02713, 0.03176, 0.03552, 0.02684, 0.02459, 0.03111, 0.02691, 0.02888, 0.02912, 0.02835, 0.02868, 0.0319, 0.02488, 0.02699, 0.02738, 0.02288, 0.03107, 0.03026, 0.02374, 0.02063, 0.02531, 0.02048, 0.02199, 0.02504, 0.01991, 0.03009, 0.02384, 0.02452, 0.02777, 0.02276, 0.02322, 0.02545, 0.02596, 0.02803, 0.03054, 0.03445, 0.02978, 0.02853, 0.02578, 0.02477, 0.03074, 0.02951, 0.03089, 0.03187, 0.02945, 0.03462, 0.02761, 0.03327, 0.03222, 0.03039, 0.03257, 0.02712, 0.02729, 0.02863, 0.02412, 0.02627, 0.03209, 0.03064, 0.02986, 0.02923, 0.03127, 0.02881, 0.03666, 0.03233, 0.03454, 0.03286, 0.03299, 0.03171, 0.03363, 0.03637, 0.03532, 0.02997, 0.03427, 0.03447, 0.03788, 0.03045, 0.02935, 0.02785, 0.06375, 0.04913, 0.04593, 0.04639, 0.04315, 0.04609, 0.04022, 0.04069, 0.0458, 0.04145, 0.04193, 0.03809, 0.03122, 0.0379, 0.04024, 0.03151, 0.03065, 0.03028, 0.03812, 0.03701, 0.03342, 0.03675, 0.03239, 0.0438, 0.03695, 0.0419, 0.04267, 0.04585, 0.04997, 0.04424, 0.04745, 0.04667, 0.04464, 0.03917, 0.03907, 0.03699, 0.04231, 0.03898, 0.04045, 0.03812, 0.0373, 0.04307, 0.03851, 0.03799, 0.04077, 0.0409, 0.04045, 0.04407, 0.0328, 0.02602, 0.03043, 0.0238, 0.02775, 0.03236, 0.02827, 0.02216, 0.02607, 0.02209, 0.02438, 0.02661, 0.02817, 0.0302, 0.02384, 0.02743, 0.03022, 0.02263, 0.02281, 0.02357, 0.02756, 0.02656, 0.02806, 0.02726, 0.02917, 0.02779, 0.04648, 0.03625, 0.03939, 0.03798, 0.03027, 0.03365, 0.03112, 0.0507, 0.05041, 0.0488, 0.0478, 0.04287, 0.04273, 0.03793, 0.04099, 0.0473, 0.04686, 0.04606, 0.04653, 0.04791, 0.0434, 0.04395, 0.04672, 0.03952, 0.04338, 0.05238, 0.05084, 0.0447, 0.04529, 0.04014, 0.04009, 0.04618, 0.03869, 0.04044, 0.04097, 0.04238, 0.03044, 0.04364, 0.04057, 0.03549, 0.03892, 0.03761, 0.03631, 0.04319, 0.04214, 0.04271, 0.04566, 0.04209, 0.0419, 0.03476, 0.04175, 0.03736, 0.04126, 0.04073, 0.04268, 0.04088, 0.03755, 0.04007, 0.0375, 0.03951, 0.04011, 0.04621, 0.04174, 0.04428, 0.03833, 0.03393, 0.03343, 0.03715, 0.03224, 0.0391, 0.03809, 0.0352, 0.04357, 0.04052, 0.02489, 0.02136, 0.02147, 0.01936, 0.01974, 0.01753, 0.1141, 0.01901, 0.02217, 0.02537, 0.01881, 0.01782, 0.01594, 0.01966, 0.01818, 0.02087, 0.02147, 0.02626, 0.01794, 0.01552, 0.01646, 0.01963, 0.01985, 0.02306, 0.02056, 0.01929, 0.0188, 0.02041, 0.01882, 0.01934, 0.01928, 0.01858, 0.01964, 0.01987, 0.02011, 0.01922, 0.01909, 0.02055, 0.01875, 0.02072, 0.02181, 0.02052, 0.01786, 0.01986, 0.01947, 0.02245, 0.01734, 0.01752, 0.01965, 0.02295, 0.02233, 0.01907]}, "layernorm-grads-all-reduce-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [3e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 
2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 1e-05, 2e-05, 1e-05, 2e-05, 2e-05, 2e-05, 2e-05, 1e-05, 2e-05, 2e-05, 2e-05, 1e-05, 2e-05, 1e-05, 2e-05, 2e-05, 1e-05, 2e-05, 1e-05, 1e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 1e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 1e-05, 1e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 1e-05, 2e-05, 1e-05, 2e-05, 2e-05, 1e-05, 1e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 1e-05, 2e-05, 2e-05, 2e-05, 2e-05, 1e-05, 2e-05, 2e-05, 2e-05, 2e-05, 1e-05, 2e-05, 2e-05, 1e-05, 1e-05, 2e-05, 2e-05, 1e-05, 2e-05, 1e-05, 2e-05, 2e-05, 2e-05, 2e-05, 1e-05, 2e-05, 2e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 2e-05, 2e-05, 2e-05, 2e-05, 1e-05, 2e-05, 1e-05, 2e-05, 2e-05, 2e-05, 2e-05, 1e-05, 2e-05, 1e-05, 1e-05, 2e-05, 2e-05, 1e-05, 2e-05, 2e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 2e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 1e-05, 2e-05, 2e-05, 2e-05, 2e-05, 1e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 1e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 1e-05, 1e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 1e-05, 2e-05, 2e-05, 2e-05, 2e-05, 1e-05, 2e-05, 2e-05, 1e-05, 2e-05, 2e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05, 2e-05, 1e-05, 1e-05, 1e-05]}, "embedding-grads-all-reduce-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.00057, 0.00025, 0.00022, 0.00022, 0.00022, 0.00022, 0.00025, 0.00022, 0.00022, 0.00034, 0.00022, 0.00024, 0.00022, 0.00026, 0.00022, 0.00022, 0.00022, 0.00022, 0.00022, 0.00026, 0.00026, 0.00025, 0.00022, 0.00025, 0.00022, 0.00022, 0.00024, 0.00022, 0.00022, 0.00022, 0.00022, 0.00022, 0.00022, 0.00026, 0.00022, 0.00025, 0.00022, 0.00023, 0.00025, 0.00022, 0.00022, 0.00022, 0.00022, 0.00025, 0.00022, 0.00022, 0.00021, 0.00022, 0.00022, 0.00022, 0.00022, 0.00022, 0.00022, 0.00025, 0.00025, 0.00022, 0.00022, 0.00022, 0.00022, 0.00022, 0.00022, 0.00022, 0.00026, 0.00025, 0.00025, 0.00021, 0.00022, 0.00022, 0.00022, 0.00025, 0.00025, 0.00025, 0.00022, 0.00023, 0.00022, 0.00022, 0.00023, 0.00022, 
0.00025, 0.00022, 0.00022, 0.00022, 0.00025, 0.00021, 0.00022, 0.00022, 0.00025, 0.00022, 0.00022, 0.00025, 0.00025, 0.00025, 0.00022, 0.00033, 0.00022, 0.00022, 0.00023, 0.00025, 0.00025, 0.00022, 0.00022, 0.00022, 0.00022, 0.00022, 0.00022, 0.00022, 0.00022, 0.00022, 0.00022, 0.00022, 0.00022, 0.00022, 0.00022, 0.00022, 0.00022, 0.00022, 0.00025, 0.00022, 0.00022, 0.00026, 0.00022, 0.00022, 0.00023, 0.00022, 0.00022, 0.00022, 0.00022, 0.00022, 0.00026, 0.00022, 0.00026, 0.00025, 0.00024, 0.00025, 0.00022, 0.00025, 0.00022, 0.00022, 0.00026, 0.00025, 0.00022, 0.00022, 0.00022, 0.00025, 0.00025, 0.00025, 0.00025, 0.00022, 0.00022, 0.00023, 0.00022, 0.00022, 0.00022, 0.00022, 0.00023, 0.00022, 0.00022, 0.00024, 0.00023, 0.00022, 0.00023, 0.00022, 0.00021, 0.00022, 0.00025, 0.00022, 0.00022, 0.00022, 0.00022, 0.00022, 0.00022, 0.00025, 0.00025, 0.00024, 0.00022, 0.00024, 0.00022, 0.00025, 0.00022, 0.00022, 0.00026, 0.00025, 0.00024, 0.00022, 0.00022, 0.00025, 0.00022, 0.00022, 0.00022, 0.00025, 0.00022, 0.00026, 0.00022, 0.00022, 0.00022, 0.00022, 0.00027, 0.00022, 0.00025, 0.00022, 0.00026, 0.00025, 0.00021, 0.00022, 0.00022, 0.00022, 0.00023, 0.00022, 0.00026, 0.00022, 0.00021, 0.00026, 0.00025, 0.00022, 0.00025, 0.00022, 0.00022, 0.00022, 0.00022, 0.00022, 0.00022, 0.00022, 0.00022, 0.00026, 0.00025, 0.00021, 0.00022, 0.00026, 0.00025, 0.00022, 0.00022, 0.00025, 0.00022, 0.00022, 0.00022, 0.00022, 0.00025, 0.00021, 0.00021, 0.00022, 0.00022, 0.00025, 0.00022, 0.00022, 0.00025, 0.00025, 0.00025, 0.00022, 0.00022, 0.00023, 0.00023, 0.00023, 0.00023, 0.00023, 0.00023, 0.00023, 0.00023, 0.00024, 0.00024, 0.00024, 0.00023, 0.00023, 0.00023, 0.00023, 0.00023, 0.00023, 0.00023, 0.00023, 0.00023, 0.00023, 0.00023, 0.00023, 0.00023, 0.00023, 0.00023, 0.00023, 0.00023, 0.00023, 0.00023, 0.00023, 0.00023, 0.00025, 0.00025, 0.00022, 0.00021, 0.00021, 0.00023, 0.00021, 0.00021, 0.00025, 0.00021, 0.00021, 0.00025, 0.00022, 0.00021, 0.00025, 0.00022, 0.00021, 0.00021, 0.00025, 0.00021, 0.00021, 0.00021, 0.00025, 0.00025, 0.00022, 0.00022, 0.00021, 0.00025, 0.00021, 0.00021, 0.00021, 0.00021, 0.00021, 0.00021, 0.00022, 0.00022, 0.00021, 0.00021, 0.00021, 0.00022, 0.00021, 0.00022, 0.00022, 0.00022, 0.00024, 0.00021, 0.00022, 0.00022, 0.00024, 0.00021, 0.00025, 0.00021, 0.00025, 0.00021, 0.00025, 0.00022, 0.00021, 0.00021, 0.00021, 0.00025, 0.00023, 0.00021, 0.00021, 0.00025, 0.00021, 0.00021, 0.00022, 0.00025, 0.00021, 0.00021, 0.00022, 0.00022, 0.00021, 0.00021, 0.00022, 0.00025, 0.00022, 0.00022, 0.00025, 0.00022, 0.00022, 0.00022, 0.00022, 0.00022, 0.00022, 0.00022, 0.00023, 0.00022, 0.00022, 0.00022, 0.00022, 0.00021, 0.00022, 0.00022, 0.00022, 0.00022, 0.00022, 0.00025, 0.00022, 0.00022, 0.00025, 0.00022, 0.00021, 0.00022, 0.00022, 0.00025, 0.00022, 0.00022, 0.00022, 0.00022, 0.00022, 0.00033, 0.00022, 0.00021, 0.00022, 0.00022, 0.00022, 0.00021, 0.00024]}, "all-grads-sync-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.66214, 0.00023, 0.00022, 0.00023, 0.00028, 0.00028, 0.00027, 0.00028, 0.00025, 0.00023, 0.00024, 0.00023, 0.00023, 0.00023, 0.00024, 0.00023, 0.00023, 0.00024, 0.00023, 0.00023, 0.00023, 0.0003, 0.00028, 0.00028, 0.00034, 0.00028, 0.00028, 0.00028, 0.00028, 0.00022, 0.00026, 0.00023, 0.00022, 0.00028, 0.00032, 0.00023, 0.00028, 0.00023, 0.00028, 0.00022, 0.00022, 0.00028, 0.00023, 0.00037, 0.00023, 0.00023, 0.00028, 0.00028, 0.00023, 0.00022, 0.00024, 0.00024, 0.00022, 0.00022, 0.00029, 0.00023, 0.00023, 0.00029, 0.00023, 0.00023, 0.00028, 0.00023, 
0.00029, 0.00023, 0.00027, 0.00023, 0.00023, 0.00023, 0.00023, 0.00023, 0.00028, 0.00022, 0.00024, 0.00024, 0.00034, 0.00036, 0.00026, 0.00027, 0.00028, 0.00023, 0.00024, 0.00024, 0.00028, 0.00028, 0.00028, 0.00025, 0.00023, 0.00028, 0.00027, 0.00022, 0.00023, 0.00029, 0.00022, 0.00024, 0.00027, 0.00023, 0.00029, 0.00024, 0.00028, 0.00028, 0.00028, 0.00028, 0.00023, 0.00028, 0.00023, 0.00023, 0.00028, 0.00028, 0.0003, 0.00023, 0.00027, 0.00025, 0.00023, 0.00023, 0.00028, 0.00024, 0.00023, 0.00023, 0.00023, 0.00023, 0.00023, 0.00023, 0.00028, 0.00027, 0.00023, 0.00023, 0.00029, 0.00023, 0.00023, 0.00029, 0.00028, 0.00028, 0.00028, 0.00024, 0.00028, 0.00024, 0.00023, 0.00025, 0.00026, 0.00029, 0.00028, 0.00028, 0.00028, 0.00028, 0.00028, 0.00023, 0.00023, 0.00023, 0.00024, 0.00023, 0.0003, 0.00024, 0.00028, 0.00028, 0.00023, 0.00023, 0.00022, 0.00027, 0.00023, 0.00028, 0.00024, 0.00024, 0.00023, 0.00023, 0.00023, 0.00028, 0.00022, 0.00029, 0.00029, 0.00028, 0.00022, 0.00024, 0.0003, 0.00025, 0.00028, 0.00023, 0.00022, 0.00028, 0.00024, 0.00029, 0.00029, 0.00028, 0.00025, 0.00028, 0.00029, 0.00028, 0.00029, 0.00029, 0.00023, 0.00028, 0.00028, 0.00028, 0.00024, 0.0003, 0.00028, 0.00025, 0.00028, 0.00025, 0.00023, 0.00023, 0.00023, 0.00023, 0.00028, 0.00023, 0.00028, 0.00028, 0.00022, 0.00028, 0.00022, 0.00029, 0.00022, 0.00023, 0.00022, 0.00022, 0.00022, 0.00028, 0.00022, 0.00023, 0.00022, 0.00028, 0.00022, 0.00023, 0.00027, 0.00022, 0.00024, 0.00022, 0.00028, 0.00022, 0.00022, 0.00022, 0.00027, 0.00022, 0.00022, 0.00028, 0.00028, 0.00022, 0.00023, 0.00022, 0.00022, 0.00028, 0.00024, 0.00028, 0.00022, 0.00022, 0.00022, 0.00027, 0.00022, 0.00024, 0.00024, 0.00023, 0.00028, 0.00022, 0.00028, 0.00022, 0.00028, 0.00028, 0.00023, 0.00025, 0.00025, 0.00035, 0.00023, 0.00023, 0.00028, 0.00024, 0.00025, 0.00028, 0.00023, 0.00023, 0.00023, 0.00028, 0.00025, 0.00022, 0.00029, 0.00023, 0.00023, 0.00022, 0.00022, 0.00024, 0.00027, 0.00027, 0.00028, 0.00022, 0.00022, 0.00025, 0.00022, 0.00022, 0.00028, 0.00021, 0.00027, 0.00021, 0.00023, 0.00023, 0.00021, 0.00022, 0.00021, 0.00028, 0.00027, 0.00027, 0.00028, 0.00022, 0.00027, 0.00023, 0.00022, 0.00022, 0.00024, 0.00027, 0.00028, 0.00027, 0.00022, 0.00022, 0.00027, 0.00022, 0.00027, 0.00022, 0.00023, 0.00022, 0.00021, 0.00021, 0.00022, 0.00022, 0.00027, 0.00024, 0.00027, 0.00023, 0.00022, 0.00021, 0.00021, 0.00021, 0.00028, 0.00022, 0.00023, 0.00022, 0.00028, 0.00023, 0.00027, 0.00022, 0.00028, 0.00023, 0.00028, 0.00021, 0.00023, 0.00022, 0.00022, 0.00027, 0.00022, 0.00027, 0.00034, 0.00021, 0.00023, 0.00021, 0.00023, 0.00022, 0.00022, 0.00028, 0.00025, 0.00023, 0.00023, 0.00022, 0.00022, 0.00022, 0.00023, 0.00022, 0.00022, 0.00022, 0.00023, 0.00022, 0.00028, 0.00022, 0.00022, 0.00022, 0.00028, 0.00022, 0.00022, 0.00022, 0.00028, 0.00021, 0.00029, 0.00022, 0.00022, 0.00022, 0.00022, 0.00022, 0.00023, 0.00022, 0.00023, 0.0003, 0.00022, 0.00023, 0.00022, 0.00022, 0.00022, 0.00022, 0.00024, 0.00022, 0.00022, 0.00028, 0.00022, 0.00022, 0.00024, 0.00022]}, "optimizer-copy-to-main-grad-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.00016, 0.00015, 0.00015, 0.00015, 0.00016, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 
0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00016, 0.00015, 0.00015, 0.00015, 0.00015, 0.00016, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00016, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00016, 0.00015, 0.00015, 0.00015, 0.00015, 0.00014, 0.00016, 0.00014, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00016, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00016, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00016, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00016, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00015, 0.00014, 0.00015, 0.00015, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00015, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00015, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00015, 0.00014, 0.00015, 0.00014, 0.00014, 0.00014, 0.00014, 0.00015, 0.00014, 0.00014, 0.0002, 0.00019, 0.00019, 0.00019, 0.00019, 0.00019, 0.00018, 0.0002, 0.0002, 0.00019, 0.00019, 0.00019, 0.0002, 0.00019, 0.0002, 0.00019, 0.00019, 0.00019, 0.00019, 0.0002, 0.00019, 0.00019, 0.00019, 0.00018, 0.0002, 0.00019, 0.0002, 0.00019, 0.00019, 0.00019, 0.00019, 0.00019, 0.00015, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00013, 0.00013, 0.00013, 0.00014, 0.00013, 0.00013, 0.00014, 0.00013, 0.00015, 0.00014, 0.00014, 0.00014, 0.00013, 0.00014, 0.00013, 0.00013, 0.00014, 0.00013, 0.00014, 0.00013, 0.00013, 0.00013, 0.00013, 0.00013, 0.00014, 0.00013, 0.00014, 0.00013, 0.00014, 0.00013, 0.00013, 0.00014, 0.00013, 0.00013, 0.00013, 0.00013, 0.00014, 0.00014, 0.00013, 0.00013, 0.00015, 0.00014, 0.00013, 0.00013, 0.00014, 0.00013, 0.00014, 0.00013, 0.00013, 0.00014, 0.00014, 0.00013, 0.00013, 0.00013, 0.00014, 0.00013, 0.00013, 0.00013, 0.00013, 0.00014, 0.00013, 0.00013, 0.00013, 0.00013, 0.00015, 0.00013, 0.00013, 0.00014, 0.00014, 0.00014, 0.00013, 0.00013, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00013, 0.00014, 0.00013, 0.00014, 0.00014, 0.00014, 0.00014, 0.00013, 0.00014, 0.00014, 0.00013, 0.00014, 0.00014, 0.00014, 0.00014, 0.00013, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00015, 0.00013, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014, 0.00014]}, "optimizer-clip-main-grad-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [6.52041, 0.00055, 0.00054, 0.00054, 0.00054, 0.00054, 0.00054, 0.00054, 0.00057, 0.00059, 0.00059, 0.00055, 0.00058, 0.00055, 0.00059, 0.00056, 0.00055, 0.00054, 0.00055, 0.00054, 0.00054, 0.00054, 0.00055, 0.00055, 0.00055, 0.00054, 0.00053, 0.00054, 0.00069, 
0.00054, 0.00071, 0.00057, 0.00073, 0.00055, 0.00054, 0.00054, 0.00054, 0.00056, 0.00055, 0.00054, 0.00054, 0.00054, 0.00054, 0.00057, 0.00059, 0.00054, 0.00054, 0.00054, 0.00055, 0.00055, 0.00055, 0.00056, 0.00054, 0.00056, 0.00055, 0.00054, 0.00054, 0.00054, 0.00054, 0.00058, 0.00054, 0.00054, 0.00054, 0.00054, 0.00054, 0.00054, 0.0007, 0.00055, 0.00055, 0.00055, 0.00056, 0.00056, 0.00056, 0.00054, 0.00054, 0.00056, 0.00057, 0.00054, 0.00054, 0.00056, 0.00054, 0.0006, 0.00055, 0.00054, 0.00054, 0.00054, 0.00054, 0.00055, 0.00054, 0.00054, 0.00054, 0.00053, 0.00053, 0.00058, 0.00049, 0.00054, 0.00048, 0.00055, 0.00054, 0.00055, 0.00054, 0.00057, 0.00054, 0.00057, 0.00069, 0.00054, 0.00055, 0.00048, 0.00054, 0.00048, 0.00048, 0.0005, 0.00056, 0.00055, 0.00054, 0.00055, 0.00054, 0.00054, 0.00048, 0.00055, 0.00054, 0.00055, 0.00058, 0.00054, 0.00054, 0.00054, 0.00054, 0.00055, 0.00054, 0.00054, 0.00054, 0.00055, 0.00054, 0.00054, 0.00054, 0.00054, 0.00054, 0.00058, 0.00055, 0.00054, 0.00054, 0.00055, 0.00053, 0.00054, 0.00055, 0.00054, 0.00054, 0.00054, 0.00055, 0.00048, 0.00054, 0.00054, 0.00055, 0.00054, 0.00056, 0.00056, 0.00054, 0.00054, 0.00054, 0.00057, 0.00054, 0.00054, 0.00055, 0.00054, 0.00056, 0.00056, 0.00054, 0.00055, 0.00055, 0.00054, 0.00054, 0.00048, 0.00054, 0.00056, 0.00055, 0.00054, 0.00058, 0.00054, 0.00054, 0.00054, 0.00054, 0.00057, 0.00066, 0.00058, 0.00056, 0.00055, 0.00055, 0.00055, 0.00055, 0.00058, 0.00055, 0.00055, 0.00054, 0.00054, 0.00054, 0.00054, 0.00071, 0.00055, 0.00054, 0.00054, 0.0006, 0.00054, 0.00053, 0.00056, 0.00054, 0.00053, 0.00054, 0.00054, 0.00054, 0.00054, 0.00054, 0.00054, 0.00056, 0.00053, 0.00053, 0.00053, 0.00054, 0.00056, 0.00054, 0.00054, 0.00053, 0.00053, 0.00053, 0.00054, 0.00055, 0.00053, 0.00054, 0.00053, 0.00054, 0.00057, 0.00054, 0.00054, 0.00054, 0.00054, 0.00053, 0.00056, 0.00054, 0.00056, 0.00053, 0.00054, 0.00065, 0.00054, 0.00053, 0.00054, 0.00054, 0.00055, 0.00054, 0.00054, 0.00055, 0.00072, 0.00073, 0.00073, 0.00074, 0.00073, 0.00072, 0.00071, 0.00072, 0.0008, 0.00072, 0.00072, 0.00072, 0.00072, 0.00072, 0.00073, 0.00116, 0.00072, 0.00072, 0.00073, 0.00073, 0.00074, 0.00072, 0.00072, 0.00072, 0.00073, 0.00075, 0.00077, 0.00072, 0.00072, 0.00072, 0.00072, 0.00072, 0.00054, 0.00053, 0.00059, 0.00053, 0.00053, 0.00052, 0.00053, 0.00053, 0.00055, 0.00053, 0.00052, 0.00053, 0.00054, 0.00053, 0.00055, 0.00053, 0.00052, 0.00052, 0.00053, 0.00055, 0.00053, 0.00057, 0.00053, 0.00053, 0.00055, 0.00052, 0.00054, 0.00052, 0.00053, 0.00053, 0.00052, 0.00052, 0.00053, 0.00052, 0.00053, 0.00053, 0.00052, 0.00054, 0.00056, 0.00052, 0.00052, 0.00052, 0.00053, 0.00054, 0.00054, 0.00053, 0.00052, 0.00055, 0.00052, 0.00057, 0.00052, 0.00053, 0.00053, 0.00053, 0.00055, 0.00053, 0.00052, 0.00052, 0.00053, 0.00052, 0.00055, 0.00052, 0.00053, 0.00053, 0.00052, 0.00054, 0.00054, 0.00058, 0.00051, 0.00054, 0.00053, 0.00053, 0.00053, 0.00056, 0.00056, 0.00054, 0.00053, 0.00054, 0.00055, 0.00053, 0.00054, 0.00057, 0.00054, 0.00056, 0.00054, 0.00055, 0.00054, 0.00053, 0.00053, 0.00053, 0.00054, 0.00055, 0.00053, 0.00054, 0.00055, 0.00055, 0.00068, 0.00053, 0.00053, 0.00054, 0.00053, 0.00059, 0.00054, 0.00057, 0.00053, 0.00054, 0.00056, 0.00054, 0.00056, 0.00059, 0.00054, 0.00066, 0.00053, 0.00053, 0.00053, 0.00053, 0.00056, 0.0007, 0.00055]}, "optimizer-count-zeros-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.00377, 0.00267, 0.00263, 0.00264, 0.00263, 0.00264, 0.00267, 0.00265, 0.00264, 0.00265, 0.00266, 0.00266, 0.00264, 
0.00267, 0.00266, 0.00265, 0.00263, 0.00266, 0.00263, 0.00264, 0.00264, 0.00264, 0.00264, 0.00262, 0.00264, 0.00265, 0.00265, 0.00264, 0.00279, 0.00265, 0.0029, 0.00265, 0.00467, 0.00274, 0.00266, 0.00265, 0.00264, 0.00264, 0.00264, 0.00267, 0.00265, 0.00263, 0.00264, 0.00264, 0.00264, 0.00265, 0.00264, 0.00264, 0.00266, 0.00265, 0.00272, 0.00265, 0.00266, 0.00265, 0.00264, 0.00266, 0.00266, 0.00265, 0.00266, 0.00277, 0.00266, 0.00267, 0.00266, 0.00266, 0.00266, 0.00265, 0.00264, 0.00266, 0.00269, 0.00259, 0.00261, 0.00261, 0.0026, 0.00263, 0.00275, 0.00259, 0.00263, 0.00262, 0.0026, 0.00262, 0.00262, 0.0026, 0.00273, 0.00262, 0.00261, 0.00261, 0.0026, 0.0026, 0.00262, 0.00262, 0.00259, 0.0026, 0.0026, 0.00292, 0.00276, 0.00261, 0.00262, 0.00262, 0.00262, 0.00261, 0.00261, 0.0026, 0.0026, 0.00261, 0.00292, 0.00264, 0.00266, 0.0026, 0.00263, 0.00261, 0.00259, 0.00261, 0.0026, 0.00261, 0.00259, 0.0026, 0.00261, 0.00262, 0.00261, 0.0026, 0.00264, 0.00262, 0.00288, 0.00263, 0.00258, 0.00261, 0.00266, 0.00274, 0.00261, 0.0026, 0.00263, 0.00261, 0.0026, 0.00262, 0.00262, 0.00261, 0.00262, 0.00262, 0.00261, 0.0026, 0.00268, 0.00264, 0.00265, 0.00266, 0.00266, 0.00265, 0.00272, 0.00264, 0.00278, 0.00265, 0.00266, 0.00266, 0.00267, 0.00264, 0.00264, 0.00272, 0.0026, 0.00261, 0.00261, 0.00261, 0.00262, 0.00262, 0.00263, 0.00261, 0.00262, 0.00259, 0.00261, 0.00262, 0.00269, 0.0026, 0.00262, 0.00262, 0.00261, 0.00262, 0.00261, 0.00261, 0.00263, 0.0026, 0.00262, 0.0026, 0.00263, 0.00262, 0.0034, 0.00265, 0.00259, 0.00259, 0.0026, 0.00261, 0.00261, 0.0026, 0.00277, 0.0026, 0.00262, 0.00261, 0.00264, 0.00261, 0.00263, 0.00268, 0.00261, 0.0026, 0.00239, 0.00238, 0.0024, 0.00237, 0.00238, 0.00237, 0.00239, 0.00237, 0.0024, 0.0024, 0.00243, 0.00239, 0.0024, 0.0024, 0.00238, 0.00241, 0.00242, 0.00239, 0.00246, 0.00242, 0.0024, 0.00238, 0.00238, 0.00239, 0.00239, 0.00239, 0.00239, 0.0024, 0.0024, 0.00239, 0.00239, 0.00244, 0.00238, 0.00237, 0.00238, 0.0024, 0.00242, 0.00238, 0.00238, 0.00241, 0.00268, 0.00241, 0.00241, 0.00239, 0.00242, 0.00238, 0.00241, 0.00243, 0.00467, 0.00362, 0.00363, 0.0036, 0.00366, 0.00361, 0.00362, 0.00363, 0.00361, 0.00375, 0.00372, 0.00364, 0.0036, 0.00364, 0.00361, 0.00361, 0.00363, 0.00364, 0.00364, 0.00363, 0.00364, 0.00363, 0.00387, 0.00363, 0.00364, 0.00363, 0.00362, 0.00364, 0.00362, 0.00361, 0.00361, 0.00362, 0.00365, 0.00238, 0.00239, 0.00237, 0.0024, 0.0024, 0.00237, 0.00239, 0.00239, 0.00236, 0.00239, 0.00239, 0.00239, 0.00237, 0.00241, 0.00242, 0.00243, 0.00239, 0.0024, 0.00238, 0.00239, 0.00239, 0.00237, 0.00239, 0.00243, 0.00239, 0.00243, 0.00238, 0.00238, 0.00238, 0.00239, 0.00236, 0.0024, 0.00241, 0.00237, 0.00241, 0.0024, 0.00241, 0.00239, 0.00237, 0.0024, 0.00239, 0.0024, 0.00239, 0.00237, 0.00241, 0.00239, 0.00237, 0.00237, 0.0024, 0.00239, 0.00238, 0.00238, 0.0024, 0.00254, 0.00238, 0.00239, 0.00238, 0.00238, 0.00239, 0.00238, 0.00243, 0.00239, 0.00239, 0.00245, 0.00239, 0.00238, 0.00238, 0.00263, 0.00238, 0.00243, 0.00236, 0.00238, 0.00238, 0.00237, 0.00238, 0.00239, 0.0026, 0.00242, 0.0024, 0.0024, 0.0024, 0.0024, 0.00238, 0.00238, 0.00243, 0.00242, 0.0024, 0.00239, 0.0024, 0.0024, 0.00239, 0.00243, 0.00238, 0.0024, 0.00237, 0.00237, 0.00297, 0.0024, 0.0024, 0.00238, 0.00239, 0.00241, 0.00238, 0.00239, 0.00237, 0.00239, 0.00239, 0.00273, 0.00252, 0.00238, 0.00239, 0.00239, 0.00238, 0.00236, 0.0024, 0.0024, 0.00241, 0.00253, 0.00238]}, "optimizer-inner-step-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.0039, 0.00046, 0.00046, 
0.00046, 0.00046, 0.00046, 0.00045, 0.00044, 0.00045, 0.00046, 0.00046, 0.00045, 0.00045, 0.00045, 0.00046, 0.00046, 0.00045, 0.00046, 0.00047, 0.00045, 0.00046, 0.00045, 0.00046, 0.00059, 0.00046, 0.00046, 0.00045, 0.00046, 0.00062, 0.00046, 0.00061, 0.00045, 0.00047, 0.00046, 0.00045, 0.00046, 0.00045, 0.00045, 0.00047, 0.00046, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00046, 0.00052, 0.00045, 0.00045, 0.00046, 0.00046, 0.00044, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00046, 0.00046, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00046, 0.00046, 0.00046, 0.00046, 0.00047, 0.00045, 0.00045, 0.00047, 0.00046, 0.00046, 0.00045, 0.00046, 0.00046, 0.00046, 0.00046, 0.00046, 0.00045, 0.00046, 0.00046, 0.00045, 0.00053, 0.00046, 0.00045, 0.00046, 0.00046, 0.00047, 0.00045, 0.00045, 0.00054, 0.00045, 0.00046, 0.00047, 0.00047, 0.00046, 0.00046, 0.00047, 0.00046, 0.00047, 0.00046, 0.00064, 0.00046, 0.00046, 0.00046, 0.00046, 0.00046, 0.00046, 0.00046, 0.00046, 0.00046, 0.00046, 0.00046, 0.00047, 0.00047, 0.00046, 0.00046, 0.00046, 0.00046, 0.00049, 0.00047, 0.00047, 0.00046, 0.00048, 0.00046, 0.00046, 0.00046, 0.00046, 0.00047, 0.00046, 0.00047, 0.00046, 0.00047, 0.00047, 0.00046, 0.00046, 0.00046, 0.00047, 0.00048, 0.00046, 0.00047, 0.00046, 0.00047, 0.00059, 0.00048, 0.00046, 0.00046, 0.00046, 0.00046, 0.00047, 0.00047, 0.00046, 0.00047, 0.00047, 0.00046, 0.00047, 0.00047, 0.00046, 0.00047, 0.00055, 0.00046, 0.00046, 0.00046, 0.00047, 0.00047, 0.00047, 0.00046, 0.00047, 0.00047, 0.00047, 0.00047, 0.00047, 0.00048, 0.00046, 0.00047, 0.00047, 0.00047, 0.00046, 0.00046, 0.00046, 0.00047, 0.00047, 0.00047, 0.00048, 0.00046, 0.00046, 0.00048, 0.00047, 0.00047, 0.00047, 0.00047, 0.00048, 0.00046, 0.00049, 0.00047, 0.00046, 0.00047, 0.00046, 0.00048, 0.00045, 0.00045, 0.00046, 0.00046, 0.00047, 0.00046, 0.00045, 0.00045, 0.00045, 0.00047, 0.00046, 0.00047, 0.00063, 0.00046, 0.00045, 0.00045, 0.00045, 0.00045, 0.00046, 0.00046, 0.00047, 0.00045, 0.00048, 0.00046, 0.00046, 0.00045, 0.00045, 0.00045, 0.00045, 0.00046, 0.00046, 0.00046, 0.00045, 0.00049, 0.00046, 0.00048, 0.00045, 0.00047, 0.00057, 0.00045, 0.00047, 0.00045, 0.00046, 0.00047, 0.00045, 0.00046, 0.00051, 0.00059, 0.00058, 0.0006, 0.00059, 0.00058, 0.00058, 0.00058, 0.00059, 0.00058, 0.00058, 0.00058, 0.00058, 0.00059, 0.00058, 0.00059, 0.00059, 0.00058, 0.00059, 0.00059, 0.00059, 0.00061, 0.00059, 0.00058, 0.00058, 0.0006, 0.00059, 0.00058, 0.00058, 0.00059, 0.0006, 0.0006, 0.0006, 0.00045, 0.00045, 0.00045, 0.00043, 0.00044, 0.00045, 0.00043, 0.00045, 0.00043, 0.00045, 0.00043, 0.00044, 0.00045, 0.00044, 0.00044, 0.00044, 0.00044, 0.00044, 0.00044, 0.00045, 0.00043, 0.00043, 0.00044, 0.00061, 0.00046, 0.00045, 0.00043, 0.00045, 0.00043, 0.00044, 0.00044, 0.00045, 0.00044, 0.00044, 0.0006, 0.00044, 0.00044, 0.00044, 0.00044, 0.00045, 0.00042, 0.00043, 0.00043, 0.00043, 0.00045, 0.00045, 0.00044, 0.00046, 0.00044, 0.00044, 0.00043, 0.00043, 0.00047, 0.00043, 0.00043, 0.00044, 0.00043, 0.00044, 0.00044, 0.00043, 0.00045, 0.00044, 0.00044, 0.00044, 0.00043, 0.00044, 0.00044, 0.00045, 0.00045, 0.00044, 0.00045, 0.00045, 0.00044, 0.00046, 0.00044, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00046, 0.00045, 0.00044, 0.00046, 0.00044, 0.00045, 0.00059, 0.00045, 0.00045, 0.00045, 0.00045, 0.00045, 0.00044, 0.00045, 0.00046, 0.00046, 0.00052, 0.00046, 0.00045, 0.00044, 0.00044, 0.00045, 0.00043, 0.00046, 0.00045, 0.00045, 0.00046, 0.00049, 0.00046, 0.00045, 0.00046, 0.00049, 0.00045, 0.00043, 0.00044, 0.00044, 0.00046, 
0.00056, 0.00044]}, "optimizer-copy-main-to-model-params-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.00074, 0.00046, 0.00046, 0.00047, 0.00046, 0.00046, 0.00046, 0.00046, 0.00047, 0.00047, 0.00046, 0.00046, 0.00046, 0.00046, 0.00047, 0.00046, 0.00046, 0.00046, 0.00046, 0.00046, 0.00046, 0.00046, 0.00046, 0.00047, 0.00046, 0.00047, 0.00046, 0.00047, 0.00057, 0.00047, 0.00067, 0.00046, 0.0005, 0.00046, 0.00046, 0.00046, 0.00049, 0.00047, 0.00046, 0.00046, 0.00047, 0.00046, 0.00047, 0.00046, 0.00046, 0.00046, 0.00046, 0.00064, 0.00046, 0.00049, 0.00047, 0.00047, 0.00053, 0.00046, 0.00046, 0.00046, 0.00046, 0.00046, 0.00047, 0.00046, 0.00047, 0.00046, 0.00046, 0.00047, 0.00046, 0.00047, 0.00047, 0.0005, 0.00046, 0.00047, 0.00047, 0.00046, 0.00046, 0.00046, 0.00047, 0.00046, 0.00046, 0.00046, 0.00046, 0.00047, 0.00046, 0.00046, 0.00046, 0.00047, 0.00046, 0.00047, 0.00047, 0.00046, 0.00047, 0.00047, 0.00046, 0.00047, 0.00046, 0.00072, 0.00048, 0.00046, 0.00046, 0.00047, 0.00047, 0.00047, 0.00046, 0.00047, 0.00047, 0.00046, 0.00053, 0.00046, 0.00046, 0.00046, 0.00047, 0.00047, 0.00046, 0.00047, 0.00047, 0.00046, 0.00046, 0.00047, 0.00047, 0.00047, 0.00047, 0.00047, 0.00047, 0.00049, 0.00047, 0.00047, 0.00046, 0.00047, 0.0005, 0.00047, 0.00047, 0.00047, 0.00046, 0.00047, 0.00046, 0.00047, 0.00048, 0.00048, 0.00047, 0.00047, 0.00047, 0.00048, 0.00046, 0.0005, 0.00046, 0.00046, 0.00047, 0.00046, 0.00066, 0.00046, 0.00046, 0.00047, 0.00046, 0.00048, 0.00048, 0.00046, 0.00046, 0.00047, 0.00047, 0.00047, 0.00046, 0.00046, 0.00047, 0.00047, 0.0007, 0.00046, 0.00047, 0.00046, 0.00047, 0.0005, 0.00047, 0.00047, 0.00047, 0.00047, 0.00047, 0.00047, 0.00048, 0.00046, 0.00046, 0.00047, 0.00047, 0.00047, 0.00047, 0.00047, 0.00048, 0.00046, 0.00047, 0.00046, 0.00047, 0.00046, 0.00046, 0.00047, 0.00047, 0.00048, 0.00047, 0.00047, 0.00048, 0.00047, 0.00049, 0.00046, 0.00047, 0.00046, 0.00047, 0.00049, 0.00046, 0.00046, 0.00047, 0.00047, 0.00046, 0.00047, 0.00046, 0.00046, 0.00057, 0.00046, 0.00046, 0.00046, 0.00072, 0.00046, 0.00046, 0.00046, 0.00046, 0.00047, 0.00046, 0.00046, 0.00046, 0.00047, 0.00051, 0.00047, 0.00047, 0.00047, 0.00046, 0.00046, 0.00046, 0.00048, 0.00047, 0.00047, 0.00047, 0.00047, 0.00047, 0.0005, 0.00047, 0.00047, 0.00048, 0.00046, 0.00046, 0.00047, 0.00047, 0.00047, 0.00046, 0.00047, 0.00069, 0.00061, 0.00061, 0.00062, 0.00063, 0.00063, 0.00061, 0.00062, 0.00062, 0.00062, 0.00061, 0.00062, 0.00062, 0.00063, 0.00062, 0.00062, 0.00074, 0.00062, 0.00061, 0.00062, 0.00062, 0.00064, 0.00062, 0.00061, 0.00062, 0.00062, 0.00061, 0.00062, 0.00063, 0.00062, 0.00062, 0.00062, 0.00062, 0.00046, 0.00046, 0.00046, 0.00046, 0.00046, 0.00047, 0.00046, 0.00046, 0.00046, 0.00049, 0.00047, 0.00049, 0.00046, 0.00049, 0.00046, 0.00046, 0.00046, 0.00047, 0.00046, 0.00047, 0.00046, 0.00046, 0.00046, 0.00072, 0.00049, 0.00047, 0.00046, 0.00046, 0.00046, 0.00046, 0.00046, 0.00047, 0.00046, 0.00046, 0.00064, 0.00048, 0.00046, 0.00046, 0.00046, 0.00047, 0.00046, 0.00046, 0.00046, 0.00047, 0.00046, 0.00046, 0.00046, 0.00046, 0.00047, 0.00046, 0.00046, 0.00046, 0.00051, 0.00046, 0.00046, 0.00046, 0.00046, 0.00046, 0.00046, 0.00046, 0.0005, 0.00046, 0.00047, 0.00047, 0.00046, 0.00046, 0.00046, 0.00047, 0.00047, 0.00047, 0.00047, 0.00048, 0.00046, 0.00046, 0.00046, 0.00046, 0.00046, 0.00048, 0.00047, 0.00047, 0.00047, 0.00047, 0.00046, 0.00046, 0.00046, 0.00047, 0.0007, 0.00046, 0.00046, 0.00046, 0.00047, 0.00046, 0.00046, 0.00046, 0.00047, 0.00046, 0.00051, 0.00048, 0.00047, 0.00046, 
0.00047, 0.00046, 0.00047, 0.00047, 0.00046, 0.00046, 0.00047, 0.00047, 0.00048, 0.00046, 0.00047, 0.0005, 0.00046, 0.00047, 0.00046, 0.00046, 0.00046, 0.00065, 0.00047]}, "optimizer-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [6.53084, 0.00464, 0.00458, 0.0046, 0.00463, 0.00462, 0.00461, 0.0046, 0.00462, 0.00466, 0.00468, 0.00464, 0.00464, 0.00464, 0.00466, 0.00465, 0.00461, 0.00462, 0.0046, 0.00459, 0.00462, 0.00459, 0.0046, 0.00474, 0.0046, 0.0046, 0.00459, 0.00461, 0.00533, 0.00461, 0.00562, 0.00464, 0.00716, 0.00471, 0.00463, 0.00461, 0.00461, 0.00462, 0.00462, 0.00465, 0.00464, 0.00461, 0.00459, 0.00463, 0.00464, 0.0046, 0.00459, 0.00494, 0.00461, 0.00464, 0.00472, 0.00463, 0.00467, 0.00463, 0.00461, 0.00461, 0.00461, 0.00459, 0.00465, 0.00478, 0.00462, 0.00464, 0.0046, 0.00464, 0.00461, 0.00462, 0.00484, 0.00467, 0.00469, 0.00458, 0.00458, 0.00458, 0.00459, 0.00459, 0.00474, 0.00455, 0.00464, 0.00458, 0.00457, 0.0046, 0.00458, 0.0046, 0.0047, 0.00458, 0.00459, 0.00468, 0.00458, 0.00456, 0.00459, 0.00458, 0.00454, 0.00457, 0.00454, 0.00535, 0.00469, 0.00459, 0.00457, 0.0046, 0.00459, 0.00459, 0.00458, 0.0046, 0.00456, 0.00459, 0.00551, 0.00461, 0.00463, 0.00451, 0.00459, 0.00451, 0.00449, 0.00453, 0.00459, 0.00458, 0.00454, 0.00456, 0.00458, 0.00462, 0.00451, 0.00457, 0.00461, 0.0046, 0.00497, 0.00461, 0.00455, 0.00458, 0.00469, 0.00472, 0.0046, 0.00459, 0.00459, 0.0046, 0.00457, 0.0046, 0.00462, 0.00461, 0.00458, 0.00464, 0.00459, 0.0046, 0.00465, 0.00469, 0.00462, 0.00463, 0.00463, 0.00463, 0.00518, 0.00462, 0.00478, 0.00458, 0.00463, 0.00462, 0.00466, 0.00465, 0.00463, 0.0048, 0.00458, 0.00458, 0.00458, 0.00461, 0.00458, 0.00461, 0.00505, 0.00457, 0.00461, 0.00456, 0.00461, 0.00463, 0.00467, 0.00457, 0.0046, 0.00454, 0.00459, 0.00462, 0.00461, 0.00459, 0.00465, 0.00457, 0.0046, 0.00457, 0.00459, 0.00461, 0.00563, 0.00466, 0.00459, 0.00456, 0.00458, 0.00457, 0.00457, 0.00462, 0.00476, 0.00461, 0.00459, 0.00458, 0.00478, 0.00458, 0.00498, 0.00465, 0.00458, 0.00462, 0.00441, 0.00438, 0.00432, 0.00434, 0.00433, 0.00431, 0.00434, 0.00431, 0.00433, 0.00433, 0.00454, 0.00435, 0.00437, 0.00435, 0.00489, 0.00436, 0.00436, 0.00435, 0.00438, 0.00436, 0.00432, 0.00433, 0.00433, 0.00437, 0.00441, 0.00434, 0.00434, 0.00432, 0.00434, 0.0044, 0.00432, 0.0044, 0.00432, 0.00431, 0.00433, 0.00442, 0.00438, 0.00454, 0.00434, 0.00437, 0.00523, 0.00436, 0.00437, 0.00435, 0.00437, 0.00436, 0.00435, 0.00441, 0.00694, 0.00622, 0.00624, 0.00622, 0.00629, 0.00622, 0.0062, 0.0062, 0.00622, 0.00645, 0.00629, 0.00622, 0.00619, 0.00626, 0.0062, 0.00622, 0.00688, 0.00622, 0.00622, 0.00623, 0.00625, 0.00629, 0.00647, 0.00622, 0.00622, 0.00625, 0.00625, 0.00629, 0.00622, 0.0062, 0.00624, 0.00622, 0.00626, 0.00434, 0.00431, 0.00435, 0.0043, 0.00431, 0.00428, 0.00427, 0.00431, 0.00429, 0.00435, 0.00428, 0.00431, 0.00431, 0.00433, 0.00435, 0.00433, 0.00428, 0.00432, 0.00428, 0.00432, 0.00427, 0.00434, 0.0043, 0.00485, 0.00439, 0.00433, 0.00428, 0.0043, 0.00428, 0.00429, 0.00428, 0.0043, 0.00432, 0.00427, 0.00475, 0.00433, 0.0043, 0.00434, 0.00432, 0.00436, 0.00428, 0.00429, 0.00429, 0.00429, 0.00433, 0.0043, 0.00428, 0.00433, 0.0043, 0.00433, 0.00427, 0.00427, 0.00439, 0.00443, 0.00428, 0.00431, 0.00426, 0.00429, 0.0043, 0.00426, 0.00441, 0.00428, 0.0043, 0.00436, 0.00429, 0.00431, 0.00428, 0.00462, 0.00436, 0.00436, 0.00431, 0.00439, 0.00429, 0.00433, 0.00433, 0.00433, 0.00453, 0.00436, 0.00436, 0.00432, 0.00435, 0.00441, 0.00431, 0.00437, 0.00436, 0.00437, 0.00495, 0.00431, 0.00434, 
0.00433, 0.00433, 0.00438, 0.00429, 0.00433, 0.00433, 0.00431, 0.0054, 0.00436, 0.00437, 0.00433, 0.0043, 0.0044, 0.0043, 0.00436, 0.00431, 0.00431, 0.00435, 0.00472, 0.00451, 0.00436, 0.00433, 0.0047, 0.00432, 0.00427, 0.00432, 0.00431, 0.0044, 0.00518, 0.00433]}, "learning-rate": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05]}, "learning-rate vs samples": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 
1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 5e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 6e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 7e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 8e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05, 9e-05]}, "batch-size": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 
128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0]}, "batch-size vs samples": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 
128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0, 128.0]}, "lm loss": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [10.89904, 10.90777, 10.89232, 10.83544, 10.6834, 10.65974, 10.44873, 10.16308, 9.95831, 9.85932, 9.60254, 9.85446, 9.88893, 9.63287, 9.79405, 9.51078, 9.46463, 9.65471, 9.39306, 9.33895, 9.24972, 9.15413, 9.17988, 9.0065, 9.19899, 9.06474, 9.16249, 9.16631, 9.30043, 8.98957, 8.93842, 9.05744, 9.05222, 8.66356, 8.72626, 8.7667, 8.70006, 8.74817, 8.67179, 8.78274, 8.67795, 8.86767, 8.84929, 8.51536, 8.40624, 8.45093, 8.51004, 8.40653, 8.45216, 8.6026, 8.38502, 8.21394, 8.24297, 8.23879, 8.28518, 7.93123, 8.10705, 7.90575, 8.25948, 8.24016, 8.01415, 7.97894, 7.93174, 7.74864, 7.74918, 7.65293, 7.52384, 7.91349, 7.70509, 7.46214, 7.74596, 7.77384, 7.5447, 7.30561, 7.45871, 7.34545, 7.46856, 7.23017, 7.64088, 7.27983, 7.34981, 7.21134, 7.21081, 7.42102, 7.17384, 7.28052, 6.99786, 7.00152, 7.03624, 7.13136, 6.82298, 6.98762, 7.08699, 6.99714, 6.87231, 6.75444, 6.98392, 7.05773, 6.69999, 6.57801, 6.72248, 6.73865, 6.73005, 6.73698, 6.65374, 6.40729, 6.6365, 6.61972, 6.44423, 6.62637, 6.74067, 6.60551, 6.72345, 6.68935, 6.62052, 6.50773, 6.59703, 6.40181, 6.66219, 6.24576, 6.24815, 6.29992, 6.38652, 6.34284, 6.44395, 6.2868, 6.33137, 6.23064, 6.19419, 6.38932, 6.31955, 6.31115, 6.15595, 6.14904, 6.23012, 6.37609, 6.19108, 6.14016, 6.17443, 6.108, 6.05677, 6.07051, 6.2515, 6.40359, 6.25653, 6.30179, 6.09464, 6.1786, 6.00393, 6.03024, 5.95456, 6.25097, 6.18949, 5.96652, 5.78509, 6.12471, 5.85239, 6.09954, 5.78907, 6.1634, 6.14662, 6.08899, 5.93324, 6.11629, 5.94863, 6.19744, 5.89699, 5.79464, 5.78508, 5.6887, 6.01484, 5.99513, 6.06793, 5.88964, 6.04218, 5.96664, 5.9946, 5.98873, 5.94909, 5.83777, 5.94965, 5.62073, 5.70203, 5.88937, 5.84442, 5.86415, 5.75977, 5.83426, 5.72464, 5.56351, 5.71986, 5.62642, 5.83426, 5.60742, 5.71258, 5.70976, 5.8987, 5.64295, 5.85277, 5.73889, 5.87053, 5.32966, 5.89533, 5.87205, 5.85426, 5.41037, 5.40663, 5.62114, 5.59572, 5.48482, 5.57586, 5.67197, 5.4726, 5.74298, 5.50672, 5.5935, 5.61776, 5.6179, 5.51203, 5.61413, 5.67291, 5.68327, 5.58724, 5.66009, 5.37678, 5.68099, 5.62359, 5.42053, 5.57867, 5.62946, 5.54954, 5.33822, 5.53445, 5.48149, 5.47842, 5.37511, 5.5464, 5.60351, 5.38706, 5.51715, 5.48729, 5.33094, 5.50178, 5.40732, 5.44712, 5.31548, 5.06617, 5.47969, 5.56831, 5.7133, 5.41401, 5.59841, 5.63558, 5.2322, 5.27319, 5.38792, 5.39306, 5.32904, 5.49509, 5.17834, 5.29764, 5.24393, 5.37614, 5.25456, 5.44258, 5.54017, 5.31017, 
5.43225, 5.33341, 5.07298, 5.31187, 5.2557, 5.30514, 5.10844, 5.27459, 5.26496, 5.47616, 5.16669, 5.26555, 5.21176, 5.355, 4.98377, 4.91178, 5.33096, 5.38935, 5.23414, 5.31329, 5.10388, 5.16417, 5.26356, 5.06801, 5.27045, 5.07377, 5.34602, 5.24563, 5.15001, 5.24094, 5.04069, 5.31488, 5.04958, 5.02979, 5.13788, 5.11434, 5.26734, 5.14852, 5.27369, 5.08851, 5.09324, 5.24624, 5.32324, 5.25443, 5.19052, 5.14435, 5.29055, 4.94885, 5.20441, 5.0907, 5.29874, 5.17267, 5.18858, 5.11677, 4.98159, 4.99122, 5.22123, 5.30764, 5.10222, 5.0544, 4.91358, 5.12177, 5.11614, 4.92915, 5.33612, 5.01913, 5.10051, 5.16573, 4.99929, 5.06049, 5.06814, 4.99437, 5.07642, 5.16464, 4.98109, 5.1825, 4.92945, 4.92916, 5.06868, 4.99902, 4.90979, 4.77687, 4.94499, 5.11671, 5.01541, 5.02126, 5.32954, 4.95713, 4.99895, 5.05055, 4.81011, 4.73872, 5.00091, 5.04398, 4.87805, 4.95233, 5.04347, 5.02539, 4.82104, 4.90025, 4.90912, 4.83747, 4.75039, 5.01482, 4.74829, 5.21037, 4.79047, 5.00245, 4.74175, 4.79189, 4.82107, 4.65381, 4.66051, 4.84616, 4.81073, 4.8078, 4.92405, 4.88723, 4.93597, 4.77468, 4.88361, 4.74125, 4.92209, 4.96252, 4.87874, 4.71289, 4.79114, 4.90017, 4.7175, 4.87202, 4.69846, 4.70626, 4.65256]}, "lm loss vs samples": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [10.89904, 10.90777, 10.89232, 10.83544, 10.6834, 10.65974, 10.44873, 10.16308, 9.95831, 9.85932, 9.60254, 9.85446, 9.88893, 9.63287, 9.79405, 9.51078, 9.46463, 9.65471, 9.39306, 9.33895, 9.24972, 9.15413, 9.17988, 9.0065, 9.19899, 9.06474, 9.16249, 9.16631, 9.30043, 8.98957, 8.93842, 9.05744, 9.05222, 8.66356, 8.72626, 8.7667, 8.70006, 8.74817, 8.67179, 8.78274, 8.67795, 8.86767, 8.84929, 8.51536, 8.40624, 8.45093, 8.51004, 8.40653, 8.45216, 8.6026, 8.38502, 8.21394, 8.24297, 8.23879, 8.28518, 7.93123, 8.10705, 7.90575, 8.25948, 8.24016, 8.01415, 7.97894, 7.93174, 7.74864, 7.74918, 7.65293, 7.52384, 7.91349, 7.70509, 7.46214, 7.74596, 7.77384, 7.5447, 7.30561, 7.45871, 7.34545, 7.46856, 7.23017, 7.64088, 7.27983, 7.34981, 7.21134, 7.21081, 7.42102, 7.17384, 7.28052, 6.99786, 7.00152, 7.03624, 7.13136, 6.82298, 6.98762, 7.08699, 6.99714, 6.87231, 6.75444, 6.98392, 7.05773, 6.69999, 6.57801, 6.72248, 6.73865, 6.73005, 6.73698, 6.65374, 6.40729, 6.6365, 6.61972, 6.44423, 6.62637, 6.74067, 6.60551, 6.72345, 6.68935, 6.62052, 6.50773, 6.59703, 6.40181, 6.66219, 6.24576, 6.24815, 6.29992, 6.38652, 6.34284, 6.44395, 6.2868, 6.33137, 6.23064, 6.19419, 6.38932, 6.31955, 6.31115, 6.15595, 6.14904, 6.23012, 6.37609, 6.19108, 6.14016, 6.17443, 6.108, 6.05677, 6.07051, 6.2515, 6.40359, 6.25653, 6.30179, 6.09464, 6.1786, 6.00393, 6.03024, 5.95456, 6.25097, 6.18949, 5.96652, 5.78509, 6.12471, 5.85239, 6.09954, 5.78907, 6.1634, 6.14662, 6.08899, 5.93324, 6.11629, 5.94863, 6.19744, 5.89699, 5.79464, 5.78508, 5.6887, 6.01484, 5.99513, 6.06793, 5.88964, 6.04218, 5.96664, 5.9946, 5.98873, 5.94909, 5.83777, 5.94965, 5.62073, 5.70203, 5.88937, 5.84442, 5.86415, 5.75977, 5.83426, 5.72464, 5.56351, 5.71986, 5.62642, 5.83426, 5.60742, 5.71258, 5.70976, 5.8987, 5.64295, 5.85277, 5.73889, 5.87053, 5.32966, 5.89533, 5.87205, 5.85426, 5.41037, 5.40663, 5.62114, 5.59572, 5.48482, 5.57586, 5.67197, 5.4726, 5.74298, 5.50672, 5.5935, 5.61776, 5.6179, 5.51203, 5.61413, 5.67291, 5.68327, 5.58724, 5.66009, 5.37678, 5.68099, 5.62359, 5.42053, 5.57867, 5.62946, 5.54954, 5.33822, 5.53445, 5.48149, 5.47842, 5.37511, 5.5464, 5.60351, 5.38706, 5.51715, 5.48729, 5.33094, 5.50178, 5.40732, 5.44712, 5.31548, 5.06617, 5.47969, 5.56831, 5.7133, 5.41401, 5.59841, 5.63558, 5.2322, 
5.27319, 5.38792, 5.39306, 5.32904, 5.49509, 5.17834, 5.29764, 5.24393, 5.37614, 5.25456, 5.44258, 5.54017, 5.31017, 5.43225, 5.33341, 5.07298, 5.31187, 5.2557, 5.30514, 5.10844, 5.27459, 5.26496, 5.47616, 5.16669, 5.26555, 5.21176, 5.355, 4.98377, 4.91178, 5.33096, 5.38935, 5.23414, 5.31329, 5.10388, 5.16417, 5.26356, 5.06801, 5.27045, 5.07377, 5.34602, 5.24563, 5.15001, 5.24094, 5.04069, 5.31488, 5.04958, 5.02979, 5.13788, 5.11434, 5.26734, 5.14852, 5.27369, 5.08851, 5.09324, 5.24624, 5.32324, 5.25443, 5.19052, 5.14435, 5.29055, 4.94885, 5.20441, 5.0907, 5.29874, 5.17267, 5.18858, 5.11677, 4.98159, 4.99122, 5.22123, 5.30764, 5.10222, 5.0544, 4.91358, 5.12177, 5.11614, 4.92915, 5.33612, 5.01913, 5.10051, 5.16573, 4.99929, 5.06049, 5.06814, 4.99437, 5.07642, 5.16464, 4.98109, 5.1825, 4.92945, 4.92916, 5.06868, 4.99902, 4.90979, 4.77687, 4.94499, 5.11671, 5.01541, 5.02126, 5.32954, 4.95713, 4.99895, 5.05055, 4.81011, 4.73872, 5.00091, 5.04398, 4.87805, 4.95233, 5.04347, 5.02539, 4.82104, 4.90025, 4.90912, 4.83747, 4.75039, 5.01482, 4.74829, 5.21037, 4.79047, 5.00245, 4.74175, 4.79189, 4.82107, 4.65381, 4.66051, 4.84616, 4.81073, 4.8078, 4.92405, 4.88723, 4.93597, 4.77468, 4.88361, 4.74125, 4.92209, 4.96252, 4.87874, 4.71289, 4.79114, 4.90017, 4.7175, 4.87202, 4.69846, 4.70626, 4.65256]}, "loss-scale": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]}, "loss-scale vs samples": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 
1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]}, "grad-norm": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [13.85752, 13.16701, 13.66167, 12.68371, 12.08638, 9.51321, 6.94209, 7.08694, 6.10814, 4.68821, 4.2751, 2.87984, 2.44435, 2.3806, 2.05602, 2.21803, 2.17031, 1.89335, 2.22351, 2.07816, 2.13217, 2.16577, 2.02595, 2.23917, 2.00742, 2.14445, 1.91002, 1.89231, 1.93089, 2.06379, 2.16765, 2.23679, 1.89668, 2.34753, 2.35194, 2.16267, 2.15162, 1.83098, 2.05276, 1.74395, 2.36831, 1.97031, 1.80751, 1.87923, 1.94701, 1.80892, 1.71885, 1.77109, 1.75698, 1.55174, 1.76422, 1.75578, 1.7467, 1.926, 1.6754, 1.89063, 1.76173, 1.82379, 1.52589, 1.48723, 1.63648, 1.49118, 1.79292, 1.82033, 1.59591, 1.62383, 1.63898, 1.62368, 1.43237, 1.62305, 1.35226, 1.37441, 1.77832, 1.4053, 1.36387, 1.43489, 1.33927, 1.41507, 1.32726, 1.26584, 1.3881, 1.23171, 1.40194, 1.20354, 1.1842, 1.32033, 1.50387, 1.25756, 1.20187, 1.05786, 1.15737, 1.22128, 1.02487, 1.08879, 0.98695, 1.28999, 0.98417, 1.58629, 1.03703, 1.06213, 1.55961, 1.47669, 0.90784, 1.45527, 1.29065, 1.13286, 1.14779, 0.95484, 1.09964, 0.89588, 0.84205, 0.91582, 1.04481, 1.01608, 1.02993, 1.12143, 1.08948, 1.31986, 0.92092, 1.1799, 1.09173, 1.10393, 1.19122, 1.03752, 1.03062, 1.19126, 1.02231, 1.0955, 1.05064, 1.06655, 1.1517, 1.11568, 1.37446, 1.21005, 1.53165, 1.24599, 1.03436, 1.56617, 1.39613, 1.20613, 1.59751, 1.76157, 1.17134, 1.06152, 1.22514, 1.97917, 1.11879, 1.62597, 1.18846, 0.95412, 1.17247, 1.50913, 1.42049, 1.32267, 1.02991, 1.60853, 1.51052, 1.23861, 1.4438, 1.81637, 1.43133, 1.52934, 1.66869, 1.18507, 1.38099, 1.44638, 1.56369, 1.1851, 1.63779, 1.22939, 1.13585, 0.93198, 
1.58024, 1.61619, 1.48199, 1.39642, 1.72479, 1.20982, 1.33257, 1.14605, 1.14908, 1.46659, 1.41611, 1.64334, 1.40953, 1.89405, 1.62101, 1.55, 1.25036, 1.73578, 1.20849, 1.16164, 2.00175, 1.79359, 1.54068, 1.27095, 1.51292, 1.45211, 1.55181, 1.38317, 1.19552, 1.41924, 1.0843, 1.11099, 1.49128, 1.31175, 1.31568, 1.31643, 1.38944, 1.83714, 1.51633, 1.66291, 1.32027, 1.40224, 1.23381, 1.24726, 1.17329, 1.41173, 1.41298, 1.21975, 1.40395, 1.29766, 1.647, 1.77185, 1.70549, 1.66243, 1.35144, 1.53811, 1.34558, 1.49398, 1.11503, 1.29778, 1.74207, 1.44213, 1.53886, 1.63632, 1.20482, 1.57111, 1.4054, 1.21748, 1.63569, 1.23136, 1.58159, 1.59579, 1.48012, 1.5323, 1.55081, 1.4194, 1.57228, 1.48387, 1.38849, 1.27392, 1.46178, 1.25824, 1.36062, 1.39751, 1.30771, 1.33147, 1.56583, 1.32709, 1.3646, 1.55907, 1.61002, 1.45173, 1.42035, 2.16284, 1.75737, 1.67782, 1.31786, 1.45228, 1.59778, 1.56015, 1.4983, 1.23696, 1.35268, 1.40317, 1.37404, 1.67666, 1.49364, 1.47162, 1.50218, 1.40879, 1.26151, 1.53009, 1.2357, 1.52653, 1.16029, 1.37287, 1.45359, 1.43811, 1.48164, 1.84101, 1.47755, 1.57834, 1.61834, 1.37842, 1.4784, 1.5761, 1.25832, 1.22282, 1.47102, 1.22564, 1.24267, 1.4204, 1.52394, 1.4913, 1.42263, 1.42192, 1.14735, 1.34499, 1.41439, 1.29824, 1.69085, 1.44146, 1.55667, 1.25423, 1.36428, 1.18219, 1.19336, 1.33449, 1.6401, 1.40383, 1.31292, 1.52789, 1.3215, 1.5794, 1.52614, 1.22037, 1.55665, 1.33214, 1.42978, 1.54699, 1.14418, 1.6388, 1.34807, 1.3749, 1.28337, 1.39417, 1.59994, 1.36359, 1.36119, 1.19917, 1.33658, 1.27596, 1.44996, 1.61368, 1.41282, 1.45175, 1.23245, 1.34616, 1.42121, 1.22977, 1.59453, 1.46628, 1.2612, 1.66869, 1.34891, 1.38326, 1.54549, 1.62587, 1.50361, 1.33282, 1.30675, 1.24628, 1.22264, 1.39221, 1.62236, 1.59048, 1.51538, 1.71681, 1.34251, 1.22656, 1.61992, 1.40775, 1.39241, 1.37966, 1.26457, 1.31626, 1.23459, 1.33073, 1.25512, 1.32646, 1.32216, 1.2607, 1.26972, 1.41721, 1.4656, 1.22975, 1.33206, 1.36899, 1.3651, 1.49566, 1.54131, 1.24469, 1.32355, 1.39775, 1.35713, 1.23875, 1.37455, 1.14642]}, "grad-norm vs samples": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [13.85752, 13.16701, 13.66167, 12.68371, 12.08638, 9.51321, 6.94209, 7.08694, 6.10814, 4.68821, 4.2751, 2.87984, 2.44435, 2.3806, 2.05602, 2.21803, 2.17031, 1.89335, 2.22351, 2.07816, 2.13217, 2.16577, 2.02595, 2.23917, 2.00742, 2.14445, 1.91002, 1.89231, 1.93089, 2.06379, 2.16765, 2.23679, 1.89668, 2.34753, 2.35194, 2.16267, 2.15162, 1.83098, 2.05276, 1.74395, 2.36831, 1.97031, 1.80751, 1.87923, 1.94701, 1.80892, 1.71885, 1.77109, 1.75698, 1.55174, 1.76422, 1.75578, 1.7467, 1.926, 1.6754, 1.89063, 1.76173, 1.82379, 1.52589, 1.48723, 1.63648, 1.49118, 1.79292, 1.82033, 1.59591, 1.62383, 1.63898, 1.62368, 1.43237, 1.62305, 1.35226, 1.37441, 1.77832, 1.4053, 1.36387, 1.43489, 1.33927, 1.41507, 1.32726, 1.26584, 1.3881, 1.23171, 1.40194, 1.20354, 1.1842, 1.32033, 1.50387, 1.25756, 1.20187, 1.05786, 1.15737, 1.22128, 1.02487, 1.08879, 0.98695, 1.28999, 0.98417, 1.58629, 1.03703, 1.06213, 1.55961, 1.47669, 0.90784, 1.45527, 1.29065, 1.13286, 1.14779, 0.95484, 1.09964, 0.89588, 0.84205, 0.91582, 1.04481, 1.01608, 1.02993, 1.12143, 1.08948, 1.31986, 0.92092, 1.1799, 1.09173, 1.10393, 1.19122, 1.03752, 1.03062, 1.19126, 1.02231, 1.0955, 1.05064, 1.06655, 1.1517, 1.11568, 1.37446, 1.21005, 1.53165, 1.24599, 1.03436, 1.56617, 1.39613, 1.20613, 1.59751, 1.76157, 1.17134, 1.06152, 1.22514, 1.97917, 1.11879, 1.62597, 1.18846, 0.95412, 1.17247, 1.50913, 1.42049, 1.32267, 1.02991, 1.60853, 1.51052, 1.23861, 1.4438, 1.81637, 
1.43133, 1.52934, 1.66869, 1.18507, 1.38099, 1.44638, 1.56369, 1.1851, 1.63779, 1.22939, 1.13585, 0.93198, 1.58024, 1.61619, 1.48199, 1.39642, 1.72479, 1.20982, 1.33257, 1.14605, 1.14908, 1.46659, 1.41611, 1.64334, 1.40953, 1.89405, 1.62101, 1.55, 1.25036, 1.73578, 1.20849, 1.16164, 2.00175, 1.79359, 1.54068, 1.27095, 1.51292, 1.45211, 1.55181, 1.38317, 1.19552, 1.41924, 1.0843, 1.11099, 1.49128, 1.31175, 1.31568, 1.31643, 1.38944, 1.83714, 1.51633, 1.66291, 1.32027, 1.40224, 1.23381, 1.24726, 1.17329, 1.41173, 1.41298, 1.21975, 1.40395, 1.29766, 1.647, 1.77185, 1.70549, 1.66243, 1.35144, 1.53811, 1.34558, 1.49398, 1.11503, 1.29778, 1.74207, 1.44213, 1.53886, 1.63632, 1.20482, 1.57111, 1.4054, 1.21748, 1.63569, 1.23136, 1.58159, 1.59579, 1.48012, 1.5323, 1.55081, 1.4194, 1.57228, 1.48387, 1.38849, 1.27392, 1.46178, 1.25824, 1.36062, 1.39751, 1.30771, 1.33147, 1.56583, 1.32709, 1.3646, 1.55907, 1.61002, 1.45173, 1.42035, 2.16284, 1.75737, 1.67782, 1.31786, 1.45228, 1.59778, 1.56015, 1.4983, 1.23696, 1.35268, 1.40317, 1.37404, 1.67666, 1.49364, 1.47162, 1.50218, 1.40879, 1.26151, 1.53009, 1.2357, 1.52653, 1.16029, 1.37287, 1.45359, 1.43811, 1.48164, 1.84101, 1.47755, 1.57834, 1.61834, 1.37842, 1.4784, 1.5761, 1.25832, 1.22282, 1.47102, 1.22564, 1.24267, 1.4204, 1.52394, 1.4913, 1.42263, 1.42192, 1.14735, 1.34499, 1.41439, 1.29824, 1.69085, 1.44146, 1.55667, 1.25423, 1.36428, 1.18219, 1.19336, 1.33449, 1.6401, 1.40383, 1.31292, 1.52789, 1.3215, 1.5794, 1.52614, 1.22037, 1.55665, 1.33214, 1.42978, 1.54699, 1.14418, 1.6388, 1.34807, 1.3749, 1.28337, 1.39417, 1.59994, 1.36359, 1.36119, 1.19917, 1.33658, 1.27596, 1.44996, 1.61368, 1.41282, 1.45175, 1.23245, 1.34616, 1.42121, 1.22977, 1.59453, 1.46628, 1.2612, 1.66869, 1.34891, 1.38326, 1.54549, 1.62587, 1.50361, 1.33282, 1.30675, 1.24628, 1.22264, 1.39221, 1.62236, 1.59048, 1.51538, 1.71681, 1.34251, 1.22656, 1.61992, 1.40775, 1.39241, 1.37966, 1.26457, 1.31626, 1.23459, 1.33073, 1.25512, 1.32646, 1.32216, 1.2607, 1.26972, 1.41721, 1.4656, 1.22975, 1.33206, 1.36899, 1.3651, 1.49566, 1.54131, 1.24469, 1.32355, 1.39775, 1.35713, 1.23875, 1.37455, 1.14642]}, "num-zeros": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [58.0, 87.0, 81.0, 84.0, 84.0, 90.0, 104.0, 124.0, 102.0, 132.0, 129.0, 152.0, 143.0, 181.0, 202.0, 161.0, 161.0, 177.0, 184.0, 189.0, 151.0, 167.0, 183.0, 182.0, 186.0, 154.0, 178.0, 163.0, 167.0, 148.0, 145.0, 138.0, 187.0, 168.0, 140.0, 142.0, 167.0, 204.0, 169.0, 203.0, 148.0, 155.0, 141.0, 200.0, 190.0, 169.0, 187.0, 196.0, 175.0, 229.0, 207.0, 188.0, 199.0, 157.0, 186.0, 178.0, 154.0, 138.0, 248.0, 232.0, 174.0, 186.0, 188.0, 193.0, 201.0, 239.0, 207.0, 166.0, 208.0, 203.0, 208.0, 254.0, 168.0, 251.0, 210.0, 201.0, 239.0, 211.0, 241.0, 211.0, 204.0, 215.0, 193.0, 225.0, 213.0, 184.0, 182.0, 191.0, 206.0, 206.0, 188.0, 218.0, 214.0, 205.0, 203.0, 166.0, 206.0, 174.0, 195.0, 174.0, 140.0, 154.0, 176.0, 165.0, 129.0, 148.0, 168.0, 157.0, 137.0, 180.0, 175.0, 163.0, 175.0, 145.0, 138.0, 134.0, 159.0, 128.0, 173.0, 161.0, 151.0, 113.0, 133.0, 129.0, 177.0, 125.0, 153.0, 137.0, 120.0, 142.0, 148.0, 143.0, 100.0, 113.0, 106.0, 124.0, 129.0, 93.0, 119.0, 125.0, 107.0, 107.0, 141.0, 141.0, 122.0, 91.0, 142.0, 120.0, 101.0, 141.0, 130.0, 112.0, 107.0, 110.0, 132.0, 105.0, 102.0, 116.0, 115.0, 122.0, 96.0, 122.0, 87.0, 104.0, 112.0, 91.0, 110.0, 107.0, 101.0, 103.0, 107.0, 117.0, 83.0, 102.0, 105.0, 133.0, 96.0, 115.0, 93.0, 128.0, 129.0, 113.0, 112.0, 104.0, 104.0, 90.0, 85.0, 92.0, 96.0, 79.0, 140.0, 112.0, 103.0, 85.0, 
96.0, 103.0, 104.0, 90.0, 109.0, 115.0, 113.0, 82.0, 123.0, 128.0, 86.0, 113.0, 103.0, 100.0, 129.0, 90.0, 96.0, 92.0, 106.0, 106.0, 113.0, 127.0, 112.0, 118.0, 96.0, 106.0, 114.0, 93.0, 85.0, 74.0, 105.0, 113.0, 97.0, 113.0, 107.0, 97.0, 109.0, 87.0, 89.0, 108.0, 106.0, 87.0, 120.0, 115.0, 109.0, 111.0, 100.0, 114.0, 102.0, 106.0, 94.0, 106.0, 77.0, 124.0, 112.0, 102.0, 104.0, 111.0, 109.0, 125.0, 114.0, 109.0, 120.0, 120.0, 103.0, 107.0, 86.0, 111.0, 95.0, 102.0, 108.0, 78.0, 100.0, 90.0, 107.0, 101.0, 104.0, 119.0, 100.0, 113.0, 110.0, 113.0, 90.0, 101.0, 107.0, 106.0, 111.0, 88.0, 125.0, 93.0, 106.0, 103.0, 116.0, 127.0, 100.0, 84.0, 102.0, 97.0, 97.0, 94.0, 120.0, 109.0, 110.0, 98.0, 97.0, 113.0, 108.0, 106.0, 143.0, 104.0, 111.0, 106.0, 103.0, 99.0, 110.0, 106.0, 130.0, 121.0, 112.0, 103.0, 101.0, 97.0, 115.0, 127.0, 117.0, 116.0, 109.0, 101.0, 129.0, 101.0, 99.0, 112.0, 91.0, 113.0, 104.0, 122.0, 91.0, 120.0, 124.0, 89.0, 106.0, 106.0, 119.0, 101.0, 98.0, 102.0, 129.0, 107.0, 116.0, 126.0, 127.0, 112.0, 86.0, 106.0, 136.0, 135.0, 107.0, 93.0, 102.0, 118.0, 117.0, 104.0, 123.0, 99.0, 114.0, 92.0, 128.0, 92.0, 107.0, 92.0, 124.0, 106.0, 101.0, 112.0, 106.0, 99.0, 107.0, 110.0, 97.0, 108.0, 117.0, 119.0, 102.0, 116.0, 116.0, 118.0, 108.0, 130.0, 116.0, 118.0, 122.0, 105.0, 104.0, 126.0, 123.0, 118.0, 124.0, 126.0, 97.0, 123.0, 133.0, 101.0, 117.0, 114.0, 120.0, 139.0]}, "num-zeros vs samples": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [58.0, 87.0, 81.0, 84.0, 84.0, 90.0, 104.0, 124.0, 102.0, 132.0, 129.0, 152.0, 143.0, 181.0, 202.0, 161.0, 161.0, 177.0, 184.0, 189.0, 151.0, 167.0, 183.0, 182.0, 186.0, 154.0, 178.0, 163.0, 167.0, 148.0, 145.0, 138.0, 187.0, 168.0, 140.0, 142.0, 167.0, 204.0, 169.0, 203.0, 148.0, 155.0, 141.0, 200.0, 190.0, 169.0, 187.0, 196.0, 175.0, 229.0, 207.0, 188.0, 199.0, 157.0, 186.0, 178.0, 154.0, 138.0, 248.0, 232.0, 174.0, 186.0, 188.0, 193.0, 201.0, 239.0, 207.0, 166.0, 208.0, 203.0, 208.0, 254.0, 168.0, 251.0, 210.0, 201.0, 239.0, 211.0, 241.0, 211.0, 204.0, 215.0, 193.0, 225.0, 213.0, 184.0, 182.0, 191.0, 206.0, 206.0, 188.0, 218.0, 214.0, 205.0, 203.0, 166.0, 206.0, 174.0, 195.0, 174.0, 140.0, 154.0, 176.0, 165.0, 129.0, 148.0, 168.0, 157.0, 137.0, 180.0, 175.0, 163.0, 175.0, 145.0, 138.0, 134.0, 159.0, 128.0, 173.0, 161.0, 151.0, 113.0, 133.0, 129.0, 177.0, 125.0, 153.0, 137.0, 120.0, 142.0, 148.0, 143.0, 100.0, 113.0, 106.0, 124.0, 129.0, 93.0, 119.0, 125.0, 107.0, 107.0, 141.0, 141.0, 122.0, 91.0, 142.0, 120.0, 101.0, 141.0, 130.0, 112.0, 107.0, 110.0, 132.0, 105.0, 102.0, 116.0, 115.0, 122.0, 96.0, 122.0, 87.0, 104.0, 112.0, 91.0, 110.0, 107.0, 101.0, 103.0, 107.0, 117.0, 83.0, 102.0, 105.0, 133.0, 96.0, 115.0, 93.0, 128.0, 129.0, 113.0, 112.0, 104.0, 104.0, 90.0, 85.0, 92.0, 96.0, 79.0, 140.0, 112.0, 103.0, 85.0, 96.0, 103.0, 104.0, 90.0, 109.0, 115.0, 113.0, 82.0, 123.0, 128.0, 86.0, 113.0, 103.0, 100.0, 129.0, 90.0, 96.0, 92.0, 106.0, 106.0, 113.0, 127.0, 112.0, 118.0, 96.0, 106.0, 114.0, 93.0, 85.0, 74.0, 105.0, 113.0, 97.0, 113.0, 107.0, 97.0, 109.0, 87.0, 89.0, 108.0, 106.0, 87.0, 120.0, 115.0, 109.0, 111.0, 100.0, 114.0, 102.0, 106.0, 94.0, 106.0, 77.0, 124.0, 112.0, 102.0, 104.0, 111.0, 109.0, 125.0, 114.0, 109.0, 120.0, 120.0, 103.0, 107.0, 86.0, 111.0, 95.0, 102.0, 108.0, 78.0, 100.0, 90.0, 107.0, 101.0, 104.0, 119.0, 100.0, 113.0, 110.0, 113.0, 90.0, 101.0, 107.0, 106.0, 111.0, 88.0, 125.0, 93.0, 106.0, 103.0, 116.0, 127.0, 100.0, 84.0, 102.0, 97.0, 97.0, 94.0, 120.0, 109.0, 110.0, 98.0, 97.0, 113.0, 108.0, 106.0, 
143.0, 104.0, 111.0, 106.0, 103.0, 99.0, 110.0, 106.0, 130.0, 121.0, 112.0, 103.0, 101.0, 97.0, 115.0, 127.0, 117.0, 116.0, 109.0, 101.0, 129.0, 101.0, 99.0, 112.0, 91.0, 113.0, 104.0, 122.0, 91.0, 120.0, 124.0, 89.0, 106.0, 106.0, 119.0, 101.0, 98.0, 102.0, 129.0, 107.0, 116.0, 126.0, 127.0, 112.0, 86.0, 106.0, 136.0, 135.0, 107.0, 93.0, 102.0, 118.0, 117.0, 104.0, 123.0, 99.0, 114.0, 92.0, 128.0, 92.0, 107.0, 92.0, 124.0, 106.0, 101.0, 112.0, 106.0, 99.0, 107.0, 110.0, 97.0, 108.0, 117.0, 119.0, 102.0, 116.0, 116.0, 118.0, 108.0, 130.0, 116.0, 118.0, 122.0, 105.0, 104.0, 126.0, 123.0, 118.0, 124.0, 126.0, 97.0, 123.0, 133.0, 101.0, 117.0, 114.0, 120.0, 139.0]}, "params-norm": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [180.15739, 180.15739, 180.15739, 180.15739, 180.15739, 180.15738, 180.15736, 180.15726, 180.15707, 180.15691, 180.15549, 180.15459, 180.15424, 180.15187, 180.15096, 180.15027, 180.14986, 180.14993, 180.15019, 180.15031, 180.15027, 180.14986, 180.14978, 180.15002, 180.15096, 180.15236, 180.15356, 180.15433, 180.15535, 180.15683, 180.15872, 180.16106, 180.16333, 180.16548, 180.16803, 180.17111, 180.17455, 180.1783, 180.18213, 180.18637, 180.19121, 180.19637, 180.20183, 180.20786, 180.21451, 180.22182, 180.22966, 180.23802, 180.24725, 180.25742, 180.2684, 180.28008, 180.29228, 180.30507, 180.31865, 180.33281, 180.34721, 180.36223, 180.37819, 180.39531, 180.41338, 180.43228, 180.45262, 180.47394, 180.49564, 180.51866, 180.54247, 180.56686, 180.59306, 180.6189, 180.64566, 180.6731, 180.70131, 180.72955, 180.75832, 180.78758, 180.81717, 180.84805, 180.8793, 180.91136, 180.94365, 180.97591, 181.00896, 181.04247, 181.07669, 181.11148, 181.14615, 181.18118, 181.2169, 181.25371, 181.29126, 181.32945, 181.36674, 181.40437, 181.4427, 181.4816, 181.51944, 181.5558, 181.59123, 181.62697, 181.66261, 181.69635, 181.73094, 181.76637, 181.8006, 181.83632, 181.87393, 181.91217, 181.95012, 181.9888, 182.0287, 182.06952, 182.11082, 182.15179, 182.19136, 182.23178, 182.27216, 182.31206, 182.35109, 182.39093, 182.43059, 182.47116, 182.51115, 182.55157, 182.59242, 182.63356, 182.67308, 182.71248, 182.75157, 182.79005, 182.8289, 182.86778, 182.90854, 182.9481, 182.98575, 183.02332, 183.0623, 183.0995, 183.13556, 183.17046, 183.20383, 183.23506, 183.26553, 183.2989, 183.33479, 183.37086, 183.40509, 183.44055, 183.47644, 183.51241, 183.54857, 183.58354, 183.61832, 183.65422, 183.69316, 183.73344, 183.77179, 183.80856, 183.84579, 183.88249, 183.91859, 183.95512, 183.99037, 184.02548, 184.063, 184.10135, 184.13824, 184.17474, 184.21408, 184.25304, 184.29404, 184.33496, 184.37621, 184.41531, 184.4537, 184.4928, 184.53014, 184.56731, 184.60611, 184.64619, 184.68703, 184.72823, 184.77042, 184.81314, 184.85387, 184.89021, 184.92393, 184.95621, 184.99136, 185.02664, 185.06209, 185.10019, 185.14125, 185.18129, 185.22131, 185.26175, 185.30276, 185.34607, 185.38876, 185.43182, 185.47507, 185.51636, 185.55836, 185.60168, 185.64523, 185.68893, 185.73134, 185.77113, 185.80952, 185.84686, 185.88496, 185.92491, 185.96541, 186.00458, 186.04584, 186.08769, 186.13078, 186.17444, 186.2169, 186.25897, 186.30052, 186.34146, 186.38252, 186.42355, 186.46315, 186.50108, 186.53908, 186.57777, 186.61641, 186.65698, 186.69749, 186.73779, 186.776, 186.81406, 186.85432, 186.89455, 186.93593, 186.97723, 187.02032, 187.06329, 187.10561, 187.14796, 187.19154, 187.23483, 187.27914, 187.32254, 187.36426, 187.40421, 187.44449, 187.48557, 187.52713, 187.5705, 187.61469, 187.65993, 187.70628, 187.75299, 
187.79915, 187.84256, 187.8851, 187.92828, 187.97391, 188.02026, 188.06656, 188.11136, 188.15483, 188.19771, 188.23875, 188.28041, 188.32339, 188.36717, 188.41173, 188.4559, 188.49995, 188.54559, 188.59273, 188.64139, 188.68826, 188.73679, 188.7838, 188.82909, 188.87553, 188.92162, 188.96811, 189.01474, 189.06255, 189.10872, 189.15393, 189.19994, 189.24557, 189.29164, 189.3381, 189.38397, 189.42863, 189.47279, 189.51843, 189.5647, 189.61183, 189.66019, 189.7094, 189.7603, 189.81245, 189.86432, 189.91537, 189.96579, 190.01378, 190.06058, 190.10844, 190.15665, 190.20692, 190.2585, 190.31071, 190.36349, 190.41649, 190.46754, 190.51726, 190.56802, 190.62105, 190.67397, 190.72807, 190.78218, 190.8349, 190.88562, 190.93848, 190.99274, 191.04617, 191.0997, 191.15161, 191.20273, 191.25496, 191.30672, 191.35922, 191.41141, 191.46227, 191.51437, 191.56682, 191.6205, 191.67529, 191.73068, 191.78505, 191.8385, 191.89308, 191.94789, 192.0024, 192.05864, 192.11432, 192.1684, 192.22186, 192.27574, 192.33052, 192.38582, 192.44121, 192.49785, 192.55418, 192.60825, 192.66292, 192.71729, 192.77345, 192.82953, 192.88582, 192.94179, 192.99664, 193.05156, 193.1075, 193.16364, 193.22198, 193.27934, 193.33693, 193.3927, 193.44841, 193.50385, 193.55917, 193.61432, 193.67184, 193.72919, 193.78648, 193.8439, 193.90105, 193.95886, 194.0177, 194.07675, 194.13638, 194.19586, 194.25424, 194.31471, 194.37587, 194.43796, 194.50008, 194.56322, 194.62543, 194.68716, 194.74808, 194.80829, 194.8662, 194.92447, 194.9838, 195.04256, 195.10059, 195.16046, 195.22166, 195.2832]}, "params-norm vs samples": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [180.15739, 180.15739, 180.15739, 180.15739, 180.15739, 180.15738, 180.15736, 180.15726, 180.15707, 180.15691, 180.15549, 180.15459, 180.15424, 180.15187, 180.15096, 180.15027, 180.14986, 180.14993, 180.15019, 180.15031, 180.15027, 180.14986, 180.14978, 180.15002, 180.15096, 180.15236, 180.15356, 180.15433, 180.15535, 180.15683, 180.15872, 180.16106, 180.16333, 180.16548, 180.16803, 180.17111, 180.17455, 180.1783, 180.18213, 180.18637, 180.19121, 180.19637, 180.20183, 180.20786, 180.21451, 180.22182, 180.22966, 180.23802, 180.24725, 180.25742, 180.2684, 180.28008, 180.29228, 180.30507, 180.31865, 180.33281, 180.34721, 180.36223, 180.37819, 180.39531, 180.41338, 180.43228, 180.45262, 180.47394, 180.49564, 180.51866, 180.54247, 180.56686, 180.59306, 180.6189, 180.64566, 180.6731, 180.70131, 180.72955, 180.75832, 180.78758, 180.81717, 180.84805, 180.8793, 180.91136, 180.94365, 180.97591, 181.00896, 181.04247, 181.07669, 181.11148, 181.14615, 181.18118, 181.2169, 181.25371, 181.29126, 181.32945, 181.36674, 181.40437, 181.4427, 181.4816, 181.51944, 181.5558, 181.59123, 181.62697, 181.66261, 181.69635, 181.73094, 181.76637, 181.8006, 181.83632, 181.87393, 181.91217, 181.95012, 181.9888, 182.0287, 182.06952, 182.11082, 182.15179, 182.19136, 182.23178, 182.27216, 182.31206, 182.35109, 182.39093, 182.43059, 182.47116, 182.51115, 182.55157, 182.59242, 182.63356, 182.67308, 182.71248, 182.75157, 182.79005, 182.8289, 182.86778, 182.90854, 182.9481, 182.98575, 183.02332, 183.0623, 183.0995, 183.13556, 183.17046, 183.20383, 183.23506, 183.26553, 183.2989, 183.33479, 183.37086, 183.40509, 183.44055, 183.47644, 183.51241, 183.54857, 183.58354, 183.61832, 183.65422, 183.69316, 183.73344, 183.77179, 183.80856, 183.84579, 183.88249, 183.91859, 183.95512, 183.99037, 184.02548, 184.063, 184.10135, 184.13824, 184.17474, 184.21408, 184.25304, 184.29404, 184.33496, 184.37621, 184.41531, 
184.4537, 184.4928, 184.53014, 184.56731, 184.60611, 184.64619, 184.68703, 184.72823, 184.77042, 184.81314, 184.85387, 184.89021, 184.92393, 184.95621, 184.99136, 185.02664, 185.06209, 185.10019, 185.14125, 185.18129, 185.22131, 185.26175, 185.30276, 185.34607, 185.38876, 185.43182, 185.47507, 185.51636, 185.55836, 185.60168, 185.64523, 185.68893, 185.73134, 185.77113, 185.80952, 185.84686, 185.88496, 185.92491, 185.96541, 186.00458, 186.04584, 186.08769, 186.13078, 186.17444, 186.2169, 186.25897, 186.30052, 186.34146, 186.38252, 186.42355, 186.46315, 186.50108, 186.53908, 186.57777, 186.61641, 186.65698, 186.69749, 186.73779, 186.776, 186.81406, 186.85432, 186.89455, 186.93593, 186.97723, 187.02032, 187.06329, 187.10561, 187.14796, 187.19154, 187.23483, 187.27914, 187.32254, 187.36426, 187.40421, 187.44449, 187.48557, 187.52713, 187.5705, 187.61469, 187.65993, 187.70628, 187.75299, 187.79915, 187.84256, 187.8851, 187.92828, 187.97391, 188.02026, 188.06656, 188.11136, 188.15483, 188.19771, 188.23875, 188.28041, 188.32339, 188.36717, 188.41173, 188.4559, 188.49995, 188.54559, 188.59273, 188.64139, 188.68826, 188.73679, 188.7838, 188.82909, 188.87553, 188.92162, 188.96811, 189.01474, 189.06255, 189.10872, 189.15393, 189.19994, 189.24557, 189.29164, 189.3381, 189.38397, 189.42863, 189.47279, 189.51843, 189.5647, 189.61183, 189.66019, 189.7094, 189.7603, 189.81245, 189.86432, 189.91537, 189.96579, 190.01378, 190.06058, 190.10844, 190.15665, 190.20692, 190.2585, 190.31071, 190.36349, 190.41649, 190.46754, 190.51726, 190.56802, 190.62105, 190.67397, 190.72807, 190.78218, 190.8349, 190.88562, 190.93848, 190.99274, 191.04617, 191.0997, 191.15161, 191.20273, 191.25496, 191.30672, 191.35922, 191.41141, 191.46227, 191.51437, 191.56682, 191.6205, 191.67529, 191.73068, 191.78505, 191.8385, 191.89308, 191.94789, 192.0024, 192.05864, 192.11432, 192.1684, 192.22186, 192.27574, 192.33052, 192.38582, 192.44121, 192.49785, 192.55418, 192.60825, 192.66292, 192.71729, 192.77345, 192.82953, 192.88582, 192.94179, 192.99664, 193.05156, 193.1075, 193.16364, 193.22198, 193.27934, 193.33693, 193.3927, 193.44841, 193.50385, 193.55917, 193.61432, 193.67184, 193.72919, 193.78648, 193.8439, 193.90105, 193.95886, 194.0177, 194.07675, 194.13638, 194.19586, 194.25424, 194.31471, 194.37587, 194.43796, 194.50008, 194.56322, 194.62543, 194.68716, 194.74808, 194.80829, 194.8662, 194.92447, 194.9838, 195.04256, 195.10059, 195.16046, 195.22166, 195.2832]}, "iteration-time": {"start_step": 0, "end_step": 2000, "step_interval": 5, "values": [30.41341, 2.8046, 2.79928, 2.80445, 2.79909, 2.80635, 2.79849, 2.79809, 2.80876, 2.80642, 2.79859, 2.80408, 2.80282, 2.80528, 2.80514, 2.80807, 2.80806, 2.80751, 2.80996, 2.80978, 2.80663, 2.80424, 2.81097, 2.81307, 2.81122, 2.80264, 2.80542, 2.80789, 2.81202, 2.80175, 2.80699, 2.81063, 2.81844, 2.82302, 2.81854, 2.8107, 2.81902, 2.8157, 2.82159, 2.81915, 2.81816, 2.82321, 2.81751, 2.82121, 2.82517, 2.83278, 2.81862, 2.81687, 2.82205, 2.8171, 2.81951, 2.81838, 2.81328, 2.82805, 2.91883, 2.83795, 2.82853, 2.82715, 2.82978, 2.83004, 2.83565, 2.83193, 2.83679, 2.83184, 2.83322, 2.83292, 2.82436, 2.82807, 2.82713, 2.82297, 2.82207, 2.81925, 2.82219, 2.82388, 2.82547, 2.82046, 2.82554, 2.82609, 2.81973, 2.81555, 2.80902, 2.81328, 2.81723, 2.81808, 2.8209, 2.81658, 2.82868, 2.82046, 2.82766, 2.82547, 2.82306, 2.82434, 2.82165, 2.82182, 2.82079, 2.8171, 2.82456, 2.81695, 2.81958, 2.81888, 2.82274, 2.82232, 2.82111, 2.81589, 2.81554, 2.82411, 2.82116, 2.81529, 2.82499, 2.81696, 2.81507, 2.81149, 
2.81848, 2.81732, 2.81615, 2.81512, 2.81829, 2.8116, 2.80978, 2.81506, 2.81764, 2.8198, 2.81632, 2.81606, 2.80897, 2.81568, 2.82245, 2.81885, 2.82606, 2.81987, 2.8158, 2.82143, 2.8193, 2.82472, 2.81111, 2.81631, 2.83592, 2.81315, 2.82779, 2.82235, 2.83714, 2.8297, 2.837, 2.83586, 2.83284, 2.83636, 2.83258, 2.83915, 2.83419, 2.83824, 2.84049, 2.84197, 2.84072, 2.83281, 2.82944, 2.8375, 2.81702, 2.84669, 2.82923, 2.81781, 2.82019, 2.82199, 2.81611, 2.82377, 2.82298, 2.82195, 2.81502, 2.81982, 2.8244, 2.83221, 2.82765, 2.81874, 2.82405, 2.81662, 2.82101, 2.8221, 2.81703, 2.81771, 2.81876, 2.81927, 2.8219, 2.81857, 2.82075, 2.8191, 2.82229, 2.82063, 2.82301, 2.82242, 2.82223, 2.81908, 2.82481, 2.82407, 2.82328, 2.82304, 2.8156, 2.8223, 2.8283, 2.82746, 2.83015, 2.82908, 2.79797, 2.79998, 2.78923, 2.79503, 2.80833, 2.79099, 2.78989, 2.78911, 2.78508, 2.78213, 2.78209, 2.79677, 2.78643, 2.78646, 2.78817, 2.77762, 2.78837, 2.78968, 2.78321, 2.78471, 2.78732, 2.79108, 2.78484, 2.79823, 2.78713, 2.78768, 2.78784, 2.78488, 2.7883, 2.78899, 2.79726, 2.78764, 2.79575, 2.7903, 2.7943, 2.78923, 2.79105, 2.78913, 2.78266, 2.78538, 2.78833, 2.79805, 2.78908, 2.79905, 2.79128, 2.79609, 2.79756, 2.78663, 2.79377, 2.83553, 2.82821, 2.82975, 2.82985, 2.8276, 2.83102, 2.82461, 2.83883, 2.82299, 2.82069, 2.82305, 2.81459, 2.82648, 2.82175, 2.82728, 2.82733, 2.82099, 2.83858, 2.83126, 2.83115, 2.82847, 2.83258, 2.83579, 2.83969, 2.83857, 2.86059, 2.84207, 2.84007, 2.84684, 2.84306, 2.84137, 2.84087, 2.79807, 2.79644, 2.79588, 2.79211, 2.79479, 2.80066, 2.79173, 2.79944, 2.79749, 2.80704, 2.79981, 2.79552, 2.79711, 2.7928, 2.79311, 2.78965, 2.78698, 2.78443, 2.78879, 2.79821, 2.79383, 2.79253, 2.79447, 2.78491, 2.77925, 2.78353, 2.78445, 2.79082, 2.79857, 2.80414, 2.80257, 2.78642, 2.78648, 2.78739, 2.78471, 2.78001, 2.78196, 2.78327, 2.78431, 2.791, 2.78454, 2.78713, 2.78803, 2.78024, 2.776, 2.77716, 2.78213, 2.78774, 2.78732, 2.78532, 2.78606, 2.78414, 2.77758, 2.78443, 2.77071, 2.77741, 2.78603, 2.78774, 2.78521, 2.78444, 2.78878, 2.774, 2.78293, 2.78129, 2.78025, 2.78828, 2.78815, 2.78075, 2.78504, 2.77911, 2.77515, 2.77671, 2.77649, 2.88175, 2.77346, 2.78223, 2.78354, 2.77649, 2.78232, 2.77496, 2.78767, 2.7835, 2.77767, 2.7876, 2.78256, 2.77263, 2.77761, 2.77618, 2.782, 2.78046, 2.7906, 2.78832, 2.78117, 2.77888, 2.79122, 2.79084, 2.78287, 2.77695, 2.77599, 2.78415, 2.77982, 2.77929, 2.77879, 2.77575, 2.77152, 2.77167, 2.78528, 2.77604, 2.785, 2.78948, 2.7772, 2.78592, 2.77735, 2.77812, 2.80061, 2.78402, 2.79223, 2.78189, 2.78928]}, "lm loss validation": {"start_step": 0, "end_step": 4, "step_interval": 5, "values": [5.60622]}, "lm loss validation vs samples": {"start_step": 0, "end_step": 4, "step_interval": 5, "values": [5.60622]}, "lm loss validation ppl": {"start_step": 0, "end_step": 4, "step_interval": 5, "values": [272.11401]}, "lm loss validation ppl vs samples": {"start_step": 0, "end_step": 4, "step_interval": 5, "values": [272.11401]}} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp4_pp2_fp8_tp_pp/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp4_pp2_fp8_tp_pp/model_config.yaml new file mode 100644 index 0000000000..aa529c3316 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp4_pp2_fp8_tp_pp/model_config.yaml @@ -0,0 +1,52 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NVTE_FUSED_ATTN: 0 + 
NCCL_ALGO: Tree + CUBLAS_WORKSPACE_CONFIG: :4096:8 +MODEL_ARGS: + --num-layers: 12 + --hidden-size: 512 + --num-attention-heads: 8 + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --tensorboard-dir: ${TENSORBOARD_PATH} + --micro-batch-size: 2 + --global-batch-size: 128 + --seq-length: 1024 + --max-position-embeddings: 1024 + --train-iters: 2000 + --timing-log-level: 2 + --lr-decay-iters: 320000 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --vocab-file: ${DATA_PATH}/bpe/vocab.json + --merge-file: ${DATA_PATH}/bpe/merges.txt + --split: 949,50,1 + --distributed-backend: nccl + --lr: 0.00015 + --lr-decay-style: cosine + --min-lr: 1.0e-5 + --weight-decay: 1e-2 + --clip-grad: 1.0 + --lr-warmup-fraction: .01 + --log-interval: 1 + --save-interval: 10000 + --eval-interval: 1000 + --eval-iters: 10 + --transformer-impl: transformer_engine + --tensor-model-parallel-size: 4 + --pipeline-model-parallel-size: 2 + --deterministic-mode: true + --no-gradient-accumulation-fusion: true + --fp8-format: hybrid + --fp8-amax-history-len: 1024 + --fp8-amax-compute-algo: max + --attention-softmax-in-fp32: true + --ckpt-format: torch_dist + --data-cache-path: ${DATA_CACHE_PATH} + --bf16: true +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G/golden_values_dev.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G/golden_values_dev.json new file mode 100644 index 0000000000..e51c439962 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G/golden_values_dev.json @@ -0,0 +1,53 @@ +{ + "lm loss": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 10.84023, + 10.87155, + 10.85055, + 10.79652, + 10.68174, + 10.60636, + 10.12763, + 10.22194, + 10.13822, + 9.82359 + ] + }, + "num-zeros": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 1659.0, + 1902.0, + 1912.0, + 1887.0, + 1968.0, + 1827.0, + 1689.0, + 1944.0, + 2371.0, + 2342.0 + ] + }, + "iteration-time": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 6.28261, + 0.08657, + 0.08474, + 0.09247, + 0.10393, + 0.12224, + 0.08752, + 0.08709, + 0.08465, + 0.0841 + ] + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G/golden_values_lts.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G/golden_values_lts.json new file mode 100644 index 0000000000..87e9341e6a --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G/golden_values_lts.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.8401, 10.87262, 10.85025, 10.79646, 10.68152, 10.60614, 10.12765, 10.22184, 10.13787, 9.82312]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1670.0, 1901.0, 1954.0, 1932.0, 1998.0, 1768.0, 1651.0, 2063.0, 2348.0, 2324.0]}, "iteration_timing_avg": 0.06904588235294119} \ No newline at end of file diff --git 
a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G/model_config.yaml new file mode 100644 index 0000000000..ee84d93de2 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G/model_config.yaml @@ -0,0 +1,52 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: Tree + CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 +MODEL_ARGS: + --num-layers: 12 + --hidden-size: 512 + --num-attention-heads: 8 + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --tensorboard-dir: ${TENSORBOARD_PATH} + --micro-batch-size: 4 + --global-batch-size: 32 + --seq-length: 1024 + --max-position-embeddings: 1024 + --train-iters: 50 + --timing-log-level: 2 + --lr-decay-iters: 320000 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --vocab-file: ${DATA_PATH}/bpe/vocab.json + --merge-file: ${DATA_PATH}/bpe/merges.txt + --split: 949,50,1 + --distributed-backend: nccl + --lr: 0.00015 + --lr-decay-style: cosine + --min-lr: 1.0e-5 + --weight-decay: 1e-2 + --clip-grad: 1.0 + --lr-warmup-fraction: .01 + --log-interval: 1 + --save-interval: 10000 + --eval-interval: 1000 + --eval-iters: 10 + --transformer-impl: transformer_engine + --tensor-model-parallel-size: 1 + --pipeline-model-parallel-size: 1 + --use-distributed-optimizer: true + --no-mmap-bin-files: true + --deterministic-mode: true + --no-gradient-accumulation-fusion: true + --attention-softmax-in-fp32: true + --use-mcore-models: true + --ckpt-format: torch_dist + --data-cache-path: ${DATA_CACHE_PATH} + --bf16: true +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_dgx_a100_1N8G/model_config.yaml new file mode 100644 index 0000000000..ffdaec80ad --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_dgx_a100_1N8G/model_config.yaml @@ -0,0 +1,54 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: Tree + CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 +MODEL_ARGS: + --num-layers: 12 + --hidden-size: 512 + --num-attention-heads: 8 + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --tensorboard-dir: ${TENSORBOARD_PATH} + --micro-batch-size: 4 + --global-batch-size: 32 + --seq-length: 1024 + --max-position-embeddings: 1024 + --train-iters: 100 + --timing-log-level: 2 + --lr-decay-iters: 320000 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --vocab-file: ${DATA_PATH}/bpe/vocab.json + --merge-file: ${DATA_PATH}/bpe/merges.txt + --split: 949,50,1 + --distributed-backend: nccl + --lr: 0.00015 + --lr-decay-style: cosine + --min-lr: 1.0e-5 + --weight-decay: 1e-2 + --clip-grad: 1.0 + --lr-warmup-fraction: .01 + --log-interval: 1 + --save-interval: 50 + --eval-interval: 1000 + --eval-iters: 10 + --transformer-impl: transformer_engine + --tensor-model-parallel-size: 1 
+ --pipeline-model-parallel-size: 1 + --use-distributed-optimizer: true + --no-ckpt-fully-parallel-save: true + --async-save: true + --deterministic-mode: true + --no-gradient-accumulation-fusion: true + --attention-softmax-in-fp32: true + --use-checkpoint-opt_param-scheduler: true + --use-mcore-models: true + --ckpt-format: torch_dist + --data-cache-path: ${DATA_CACHE_PATH} + --bf16: true +TEST_TYPE: ckpt-resume diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G/model_config.yaml new file mode 100644 index 0000000000..9dd9e9ecd0 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G/model_config.yaml @@ -0,0 +1,53 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: Tree + CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 +MODEL_ARGS: + --num-layers: 12 + --hidden-size: 512 + --num-attention-heads: 8 + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --tensorboard-dir: ${TENSORBOARD_PATH} + --micro-batch-size: 4 + --global-batch-size: 32 + --seq-length: 1024 + --max-position-embeddings: 1024 + --train-iters: 100 + --timing-log-level: 2 + --lr-decay-iters: 320000 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --vocab-file: ${DATA_PATH}/bpe/vocab.json + --merge-file: ${DATA_PATH}/bpe/merges.txt + --split: 949,50,1 + --distributed-backend: nccl + --lr: 0.00015 + --lr-decay-style: cosine + --min-lr: 1.0e-5 + --weight-decay: 1e-2 + --clip-grad: 1.0 + --lr-warmup-fraction: .01 + --log-interval: 1 + --save-interval: 50 + --eval-interval: 1000 + --eval-iters: 10 + --transformer-impl: transformer_engine + --tensor-model-parallel-size: 1 + --pipeline-model-parallel-size: 1 + --use-distributed-optimizer: true + --no-mmap-bin-files: true + --deterministic-mode: true + --no-gradient-accumulation-fusion: true + --attention-softmax-in-fp32: true + --use-checkpoint-opt_param-scheduler: true + --use-mcore-models: true + --ckpt-format: torch_dist + --data-cache-path: ${DATA_CACHE_PATH} + --bf16: true +TEST_TYPE: ckpt-resume diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute_dgx_a100_1N8G/model_config.yaml new file mode 100644 index 0000000000..470ba6f926 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute_dgx_a100_1N8G/model_config.yaml @@ -0,0 +1,54 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: Tree + CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 +MODEL_ARGS: + --num-layers: 12 + --hidden-size: 512 + --num-attention-heads: 8 + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --tensorboard-dir: ${TENSORBOARD_PATH} + --micro-batch-size: 4 + --global-batch-size: 32 + --seq-length: 1024 + --max-position-embeddings: 1024 + --train-iters: 100 + --timing-log-level: 2 + 
--lr-decay-iters: 320000 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --vocab-file: ${DATA_PATH}/bpe/vocab.json + --merge-file: ${DATA_PATH}/bpe/merges.txt + --split: 949,50,1 + --distributed-backend: nccl + --lr: 0.00015 + --lr-decay-style: cosine + --min-lr: 1.0e-5 + --weight-decay: 1e-2 + --clip-grad: 1.0 + --lr-warmup-fraction: .01 + --log-interval: 1 + --save-interval: 50 + --eval-interval: 1000 + --eval-iters: 10 + --transformer-impl: transformer_engine + --tensor-model-parallel-size: 1 + --pipeline-model-parallel-size: 1 + --recompute-granularity: full + --recompute-method: uniform + --recompute-num-layers: 1 + --deterministic-mode: true + --no-gradient-accumulation-fusion: true + --attention-softmax-in-fp32: true + --use-checkpoint-opt_param-scheduler: true + --use-mcore-models: true + --ckpt-format: torch_dist + --data-cache-path: ${DATA_CACHE_PATH} + --bf16: true +TEST_TYPE: ckpt-resume diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_uniform_full_recompute_dgx_a100_1N8G/golden_values_dev.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_uniform_full_recompute_dgx_a100_1N8G/golden_values_dev.json new file mode 100644 index 0000000000..81b3c96c4e --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_uniform_full_recompute_dgx_a100_1N8G/golden_values_dev.json @@ -0,0 +1,53 @@ +{ + "lm loss": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 10.84023, + 10.87155, + 10.85054, + 10.79648, + 10.68178, + 10.60635, + 10.12766, + 10.22201, + 10.13823, + 9.82362 + ] + }, + "num-zeros": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 1659.0, + 1902.0, + 1846.0, + 1951.0, + 1993.0, + 1810.0, + 1697.0, + 1952.0, + 2348.0, + 2258.0 + ] + }, + "iteration-time": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 6.51506, + 0.12227, + 0.1189, + 0.12098, + 0.11904, + 0.12003, + 0.11939, + 0.11848, + 0.11884, + 0.11924 + ] + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_uniform_full_recompute_dgx_a100_1N8G/golden_values_lts.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_uniform_full_recompute_dgx_a100_1N8G/golden_values_lts.json new file mode 100644 index 0000000000..94554bb448 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_uniform_full_recompute_dgx_a100_1N8G/golden_values_lts.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.8401, 10.87262, 10.85023, 10.79645, 10.68149, 10.60617, 10.1277, 10.22183, 10.13794, 9.8231]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1670.0, 1901.0, 1923.0, 1922.0, 2020.0, 1815.0, 1713.0, 1963.0, 2266.0, 2324.0]}, "iteration_timing_avg": 0.09164500000000002} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_uniform_full_recompute_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_uniform_full_recompute_dgx_a100_1N8G/model_config.yaml new file mode 100644 index 0000000000..fb07f9d30c --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_uniform_full_recompute_dgx_a100_1N8G/model_config.yaml @@ -0,0 +1,53 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: Tree + 
CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 +MODEL_ARGS: + --num-layers: 12 + --hidden-size: 512 + --num-attention-heads: 8 + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --tensorboard-dir: ${TENSORBOARD_PATH} + --micro-batch-size: 4 + --global-batch-size: 32 + --seq-length: 1024 + --max-position-embeddings: 1024 + --train-iters: 50 + --timing-log-level: 2 + --lr-decay-iters: 320000 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --vocab-file: ${DATA_PATH}/bpe/vocab.json + --merge-file: ${DATA_PATH}/bpe/merges.txt + --split: 949,50,1 + --distributed-backend: nccl + --lr: 0.00015 + --lr-decay-style: cosine + --min-lr: 1.0e-5 + --weight-decay: 1e-2 + --clip-grad: 1.0 + --lr-warmup-fraction: .01 + --log-interval: 1 + --save-interval: 10000 + --eval-interval: 1000 + --eval-iters: 10 + --transformer-impl: transformer_engine + --tensor-model-parallel-size: 1 + --pipeline-model-parallel-size: 1 + --recompute-granularity: full + --recompute-method: uniform + --recompute-num-layers: 1 + --deterministic-mode: true + --no-gradient-accumulation-fusion: true + --attention-softmax-in-fp32: true + --use-mcore-models: true + --ckpt-format: torch_dist + --data-cache-path: ${DATA_CACHE_PATH} + --bf16: true +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_reshard_2x1x4_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_reshard_2x1x4_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/model_config.yaml new file mode 100644 index 0000000000..7cdb56dd00 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_reshard_2x1x4_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/model_config.yaml @@ -0,0 +1,56 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 1 + N_REPEATS: 5 +MODEL_ARGS: + --num-layers: 12 + --hidden-size: 512 + --num-attention-heads: 8 + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --tensorboard-dir: ${TENSORBOARD_PATH} + --micro-batch-size: 4 + --global-batch-size: 32 + --seq-length: 1024 + --max-position-embeddings: 1024 + --train-iters: 100 + --timing-log-level: 2 + --lr-decay-iters: 320000 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --vocab-file: ${DATA_PATH}/bpe/vocab.json + --merge-file: ${DATA_PATH}/bpe/merges.txt + --split: 949,50,1 + --distributed-backend: nccl + --lr: 0.00015 + --lr-decay-style: cosine + --min-lr: 1.0e-5 + --weight-decay: 1e-2 + --clip-grad: 1.0 + --lr-warmup-fraction: .01 + --log-interval: 1 + --save-interval: 50 + --eval-interval: 1000 + --eval-iters: 10 + --transformer-impl: transformer_engine + --tensor-model-parallel-size: 1 + --pipeline-model-parallel-size: 2 + --expert-model-parallel-size: 2 + --sequence-parallel: true + --num-experts: 8 + --use-distributed-optimizer: true + --moe-router-load-balancing-type: sinkhorn + --moe-router-topk: 1 + --ckpt-fully-parallel-save: true + --ckpt-fully-parallel-load: true + --no-gradient-accumulation-fusion: true + --attention-softmax-in-fp32: true + --use-checkpoint-opt_param-scheduler: true + --use-mcore-models: true + --ckpt-format: torch_dist + --data-cache-path: 
${DATA_CACHE_PATH} + --bf16: true +TEST_TYPE: ckpt-resume diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_dgx_a100_1N8G/model_config.yaml new file mode 100644 index 0000000000..7bdd0c46e2 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_dgx_a100_1N8G/model_config.yaml @@ -0,0 +1,53 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: Tree + CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 +MODEL_ARGS: + --num-layers: 12 + --hidden-size: 512 + --num-attention-heads: 8 + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --tensorboard-dir: ${TENSORBOARD_PATH} + --micro-batch-size: 4 + --global-batch-size: 32 + --seq-length: 1024 + --max-position-embeddings: 1024 + --train-iters: 100 + --timing-log-level: 2 + --lr-decay-iters: 320000 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --vocab-file: ${DATA_PATH}/bpe/vocab.json + --merge-file: ${DATA_PATH}/bpe/merges.txt + --split: 949,50,1 + --distributed-backend: nccl + --lr: 0.00015 + --lr-decay-style: cosine + --min-lr: 1.0e-5 + --weight-decay: 1e-2 + --clip-grad: 1.0 + --lr-warmup-fraction: .01 + --log-interval: 1 + --save-interval: 50 + --eval-interval: 1000 + --eval-iters: 10 + --transformer-impl: transformer_engine + --tensor-model-parallel-size: 1 + --pipeline-model-parallel-size: 2 + --position-embedding-type: rope + --no-ckpt-fully-parallel-save: true + --deterministic-mode: true + --no-gradient-accumulation-fusion: true + --attention-softmax-in-fp32: true + --use-checkpoint-opt_param-scheduler: true + --use-mcore-models: true + --ckpt-format: torch_dist + --data-cache-path: ${DATA_CACHE_PATH} + --bf16: true +TEST_TYPE: ckpt-resume diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G/model_config.yaml new file mode 100644 index 0000000000..b014fdabc0 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G/model_config.yaml @@ -0,0 +1,54 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: Tree + CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 +MODEL_ARGS: + --num-layers: 12 + --hidden-size: 512 + --num-attention-heads: 8 + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --tensorboard-dir: ${TENSORBOARD_PATH} + --micro-batch-size: 4 + --global-batch-size: 32 + --seq-length: 1024 + --max-position-embeddings: 1024 + --train-iters: 100 + --timing-log-level: 2 + --lr-decay-iters: 320000 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --vocab-file: ${DATA_PATH}/bpe/vocab.json + --merge-file: ${DATA_PATH}/bpe/merges.txt + --split: 949,50,1 + --distributed-backend: nccl + --lr: 0.00015 + --lr-decay-style: cosine + --min-lr: 1.0e-5 + --weight-decay: 1e-2 + 
--clip-grad: 1.0 + --lr-warmup-fraction: .01 + --log-interval: 1 + --save-interval: 50 + --eval-interval: 1000 + --eval-iters: 10 + --transformer-impl: transformer_engine + --tensor-model-parallel-size: 1 + --pipeline-model-parallel-size: 2 + --position-embedding-type: rope + --rotary-interleaved: true + --no-rope-fusion: true + --deterministic-mode: true + --no-gradient-accumulation-fusion: true + --attention-softmax-in-fp32: true + --use-checkpoint-opt_param-scheduler: true + --use-mcore-models: true + --ckpt-format: torch_dist + --data-cache-path: ${DATA_CACHE_PATH} + --bf16: true +TEST_TYPE: ckpt-resume diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_dgx_a100_1N8G/golden_values_dev.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_dgx_a100_1N8G/golden_values_dev.json new file mode 100644 index 0000000000..7e9cd7113b --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_dgx_a100_1N8G/golden_values_dev.json @@ -0,0 +1,53 @@ +{ + "lm loss": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 10.84764, + 10.87731, + 10.90275, + 10.82072, + 10.67949, + 10.60184, + 10.06545, + 10.19304, + 10.11419, + 9.76015 + ] + }, + "num-zeros": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 1736.0, + 2079.0, + 1956.0, + 1911.0, + 1949.0, + 1814.0, + 1629.0, + 2059.0, + 2268.0, + 2291.0 + ] + }, + "iteration-time": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 12.77945, + 0.1334, + 0.12654, + 0.12546, + 0.12505, + 0.12667, + 0.12644, + 0.12524, + 0.12609, + 0.1254 + ] + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_dgx_a100_1N8G/golden_values_lts.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_dgx_a100_1N8G/golden_values_lts.json new file mode 100644 index 0000000000..2778958a4b --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_dgx_a100_1N8G/golden_values_lts.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.8468, 10.87772, 10.90302, 10.82024, 10.67979, 10.60157, 10.06448, 10.19311, 10.1141, 9.76008]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1707.0, 2086.0, 2030.0, 2000.0, 1910.0, 1894.0, 1744.0, 2071.0, 2344.0, 2377.0]}, "iteration_timing_avg": 0.11051617647058823} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_dgx_a100_1N8G/model_config.yaml new file mode 100644 index 0000000000..b2a1643ec8 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_dgx_a100_1N8G/model_config.yaml @@ -0,0 +1,52 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: Tree + CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 +MODEL_ARGS: + --num-layers: 12 + --hidden-size: 512 + --num-attention-heads: 8 + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --tensorboard-dir: ${TENSORBOARD_PATH} + --micro-batch-size: 4 + --global-batch-size: 32 + --seq-length: 1024 + --max-position-embeddings: 1024 + --train-iters: 50 + 
--timing-log-level: 2 + --lr-decay-iters: 320000 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --vocab-file: ${DATA_PATH}/bpe/vocab.json + --merge-file: ${DATA_PATH}/bpe/merges.txt + --split: 949,50,1 + --distributed-backend: nccl + --lr: 0.00015 + --lr-decay-style: cosine + --min-lr: 1.0e-5 + --weight-decay: 1e-2 + --clip-grad: 1.0 + --lr-warmup-fraction: .01 + --log-interval: 1 + --save-interval: 10000 + --eval-interval: 1000 + --eval-iters: 10 + --transformer-impl: transformer_engine + --tensor-model-parallel-size: 1 + --pipeline-model-parallel-size: 2 + --position-embedding-type: rope + --no-ckpt-fully-parallel-save: true + --deterministic-mode: true + --no-gradient-accumulation-fusion: true + --attention-softmax-in-fp32: true + --use-mcore-models: true + --ckpt-format: torch_dist + --data-cache-path: ${DATA_CACHE_PATH} + --bf16: true +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G/golden_values_dev.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G/golden_values_dev.json new file mode 100644 index 0000000000..fb0e744efe --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G/golden_values_dev.json @@ -0,0 +1,53 @@ +{ + "lm loss": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 10.84554, + 10.87656, + 10.90228, + 10.81911, + 10.67825, + 10.601, + 10.06457, + 10.1925, + 10.11357, + 9.75985 + ] + }, + "num-zeros": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 1700.0, + 2112.0, + 2053.0, + 1898.0, + 1941.0, + 1899.0, + 1814.0, + 2030.0, + 2283.0, + 2327.0 + ] + }, + "iteration-time": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 8.59015, + 0.15146, + 0.15003, + 0.1497, + 0.14973, + 0.14788, + 0.14821, + 0.14842, + 0.14869, + 0.14835 + ] + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G/golden_values_lts.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G/golden_values_lts.json new file mode 100644 index 0000000000..33a65cca16 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G/golden_values_lts.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.84474, 10.87687, 10.90254, 10.81872, 10.67848, 10.60075, 10.06363, 10.19268, 10.11342, 9.75986]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1776.0, 2161.0, 2052.0, 1892.0, 1971.0, 1946.0, 1701.0, 1985.0, 2295.0, 2293.0]}, "iteration_timing_avg": 0.11052176470588236} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G/model_config.yaml new file mode 100644 index 0000000000..6c2c9e51ab --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G/model_config.yaml @@ -0,0 +1,53 @@ +ENV_VARS: + 
CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: Tree + CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 +MODEL_ARGS: + --num-layers: 12 + --hidden-size: 512 + --num-attention-heads: 8 + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --tensorboard-dir: ${TENSORBOARD_PATH} + --micro-batch-size: 4 + --global-batch-size: 32 + --seq-length: 1024 + --max-position-embeddings: 1024 + --train-iters: 50 + --timing-log-level: 2 + --lr-decay-iters: 320000 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --vocab-file: ${DATA_PATH}/bpe/vocab.json + --merge-file: ${DATA_PATH}/bpe/merges.txt + --split: 949,50,1 + --distributed-backend: nccl + --lr: 0.00015 + --lr-decay-style: cosine + --min-lr: 1.0e-5 + --weight-decay: 1e-2 + --clip-grad: 1.0 + --lr-warmup-fraction: .01 + --log-interval: 1 + --save-interval: 10000 + --eval-interval: 1000 + --eval-iters: 10 + --transformer-impl: transformer_engine + --tensor-model-parallel-size: 1 + --pipeline-model-parallel-size: 2 + --position-embedding-type: rope + --rotary-interleaved: true + --no-rope-fusion: true + --deterministic-mode: true + --no-gradient-accumulation-fusion: true + --attention-softmax-in-fp32: true + --use-mcore-models: true + --ckpt-format: torch_dist + --data-cache-path: ${DATA_CACHE_PATH} + --bf16: true +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_disable_bias_linear_dgx_a100_1N8G/golden_values_dev.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_disable_bias_linear_dgx_a100_1N8G/golden_values_dev.json new file mode 100644 index 0000000000..dd3edb44d6 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_disable_bias_linear_dgx_a100_1N8G/golden_values_dev.json @@ -0,0 +1,53 @@ +{ + "lm loss": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 10.79196, + 10.86773, + 10.89184, + 10.78351, + 10.66166, + 10.58279, + 10.08537, + 10.19442, + 10.13771, + 9.81474 + ] + }, + "num-zeros": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 1605.0, + 1799.0, + 1895.0, + 1949.0, + 1789.0, + 1675.0, + 1616.0, + 1849.0, + 2353.0, + 2365.0 + ] + }, + "iteration-time": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 11.50222, + 0.14899, + 0.15017, + 0.14635, + 0.14834, + 0.14836, + 0.14862, + 0.14731, + 0.14874, + 0.14738 + ] + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_disable_bias_linear_dgx_a100_1N8G/golden_values_lts.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_disable_bias_linear_dgx_a100_1N8G/golden_values_lts.json new file mode 100644 index 0000000000..cdabc8e9d3 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_disable_bias_linear_dgx_a100_1N8G/golden_values_lts.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.79205, 10.86789, 10.89149, 10.78328, 10.66126, 10.58275, 10.08467, 10.19448, 10.13785, 9.81454]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1580.0, 1778.0, 1849.0, 1841.0, 1884.0, 1679.0, 1544.0, 1953.0, 2449.0, 2335.0]}, "iteration_timing_avg": 0.12243558823529416} \ No newline at end of file diff --git 
a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_disable_bias_linear_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_disable_bias_linear_dgx_a100_1N8G/model_config.yaml new file mode 100644 index 0000000000..2e0188551a --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_disable_bias_linear_dgx_a100_1N8G/model_config.yaml @@ -0,0 +1,52 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: Tree + CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 +MODEL_ARGS: + --num-layers: 12 + --hidden-size: 512 + --num-attention-heads: 8 + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --tensorboard-dir: ${TENSORBOARD_PATH} + --micro-batch-size: 4 + --global-batch-size: 32 + --seq-length: 1024 + --max-position-embeddings: 1024 + --train-iters: 50 + --timing-log-level: 2 + --lr-decay-iters: 320000 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --vocab-file: ${DATA_PATH}/bpe/vocab.json + --merge-file: ${DATA_PATH}/bpe/merges.txt + --split: 949,50,1 + --distributed-backend: nccl + --lr: 0.00015 + --lr-decay-style: cosine + --min-lr: 1.0e-5 + --weight-decay: 1e-2 + --clip-grad: 1.0 + --lr-warmup-fraction: .01 + --log-interval: 1 + --save-interval: 10000 + --eval-interval: 1000 + --eval-iters: 10 + --transformer-impl: transformer_engine + --tensor-model-parallel-size: 1 + --pipeline-model-parallel-size: 4 + --disable-bias-linear: true + --async-save: true + --deterministic-mode: true + --no-gradient-accumulation-fusion: true + --attention-softmax-in-fp32: true + --use-mcore-models: true + --ckpt-format: torch_dist + --data-cache-path: ${DATA_CACHE_PATH} + --bf16: true +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_disable_bias_linear_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_disable_bias_linear_dgx_a100_1N8G/model_config.yaml new file mode 100644 index 0000000000..8fa10f4b9d --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_disable_bias_linear_dgx_a100_1N8G/model_config.yaml @@ -0,0 +1,53 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: Tree + CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 +MODEL_ARGS: + --num-layers: 12 + --hidden-size: 512 + --num-attention-heads: 8 + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --tensorboard-dir: ${TENSORBOARD_PATH} + --micro-batch-size: 4 + --global-batch-size: 32 + --seq-length: 1024 + --max-position-embeddings: 1024 + --train-iters: 100 + --timing-log-level: 2 + --lr-decay-iters: 320000 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --vocab-file: ${DATA_PATH}/bpe/vocab.json + --merge-file: ${DATA_PATH}/bpe/merges.txt + --split: 949,50,1 + --distributed-backend: nccl + --lr: 0.00015 + --lr-decay-style: cosine + --min-lr: 1.0e-5 + --weight-decay: 1e-2 + --clip-grad: 1.0 + --lr-warmup-fraction: .01 + --log-interval: 1 + --save-interval: 50 + --eval-interval: 1000 + --eval-iters: 10 + --transformer-impl: transformer_engine + --tensor-model-parallel-size: 1 + --pipeline-model-parallel-size: 4 + 
--disable-bias-linear: true + --async-save: true + --deterministic-mode: true + --no-gradient-accumulation-fusion: true + --attention-softmax-in-fp32: true + --use-checkpoint-opt_param-scheduler: true + --use-mcore-models: true + --ckpt-format: torch_dist + --data-cache-path: ${DATA_CACHE_PATH} + --bf16: true +TEST_TYPE: ckpt-resume diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_sequence_parallel_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_sequence_parallel_dgx_a100_1N8G/model_config.yaml new file mode 100644 index 0000000000..c64a4ef5e7 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_sequence_parallel_dgx_a100_1N8G/model_config.yaml @@ -0,0 +1,53 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: Tree + CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 +MODEL_ARGS: + --num-layers: 12 + --hidden-size: 512 + --num-attention-heads: 8 + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --tensorboard-dir: ${TENSORBOARD_PATH} + --micro-batch-size: 4 + --global-batch-size: 32 + --seq-length: 1024 + --max-position-embeddings: 1024 + --train-iters: 100 + --timing-log-level: 2 + --lr-decay-iters: 320000 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --vocab-file: ${DATA_PATH}/bpe/vocab.json + --merge-file: ${DATA_PATH}/bpe/merges.txt + --split: 949,50,1 + --distributed-backend: nccl + --lr: 0.00015 + --lr-decay-style: cosine + --min-lr: 1.0e-5 + --weight-decay: 1e-2 + --clip-grad: 1.0 + --lr-warmup-fraction: .01 + --log-interval: 1 + --save-interval: 50 + --eval-interval: 1000 + --eval-iters: 10 + --transformer-impl: transformer_engine + --tensor-model-parallel-size: 1 + --pipeline-model-parallel-size: 4 + --sequence-parallel: true + --ckpt-fully-parallel-load: true + --deterministic-mode: true + --no-gradient-accumulation-fusion: true + --attention-softmax-in-fp32: true + --use-checkpoint-opt_param-scheduler: true + --use-mcore-models: true + --ckpt-format: torch_dist + --data-cache-path: ${DATA_CACHE_PATH} + --bf16: true +TEST_TYPE: ckpt-resume diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_swiglu_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_swiglu_dgx_a100_1N8G/model_config.yaml new file mode 100644 index 0000000000..dda1876e1a --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_swiglu_dgx_a100_1N8G/model_config.yaml @@ -0,0 +1,54 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: Tree + CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 +MODEL_ARGS: + --num-layers: 12 + --hidden-size: 512 + --num-attention-heads: 8 + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --tensorboard-dir: ${TENSORBOARD_PATH} + --micro-batch-size: 4 + --global-batch-size: 32 + --seq-length: 1024 + --max-position-embeddings: 1024 + --train-iters: 100 + --timing-log-level: 2 + --lr-decay-iters: 320000 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --vocab-file: ${DATA_PATH}/bpe/vocab.json + 
--merge-file: ${DATA_PATH}/bpe/merges.txt + --split: 949,50,1 + --distributed-backend: nccl + --lr: 0.00015 + --lr-decay-style: cosine + --min-lr: 1.0e-5 + --weight-decay: 1e-2 + --clip-grad: 1.0 + --lr-warmup-fraction: .01 + --log-interval: 1 + --save-interval: 50 + --eval-interval: 1000 + --eval-iters: 10 + --transformer-impl: transformer_engine + --tensor-model-parallel-size: 1 + --pipeline-model-parallel-size: 4 + --swiglu: true + --ckpt-fully-parallel-load: true + --async-save: true + --deterministic-mode: true + --no-gradient-accumulation-fusion: true + --attention-softmax-in-fp32: true + --use-checkpoint-opt_param-scheduler: true + --use-mcore-models: true + --ckpt-format: torch_dist + --data-cache-path: ${DATA_CACHE_PATH} + --bf16: true +TEST_TYPE: ckpt-resume diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs_dgx_a100_1N8G/model_config.yaml new file mode 100644 index 0000000000..df7ba9fb3b --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs_dgx_a100_1N8G/model_config.yaml @@ -0,0 +1,52 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: Tree + CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 +MODEL_ARGS: + --num-layers: 12 + --hidden-size: 512 + --num-attention-heads: 8 + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --tensorboard-dir: ${TENSORBOARD_PATH} + --micro-batch-size: 4 + --global-batch-size: 32 + --seq-length: 1024 + --max-position-embeddings: 1024 + --train-iters: 100 + --timing-log-level: 2 + --lr-decay-iters: 320000 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --vocab-file: ${DATA_PATH}/bpe/vocab.json + --merge-file: ${DATA_PATH}/bpe/merges.txt + --split: 949,50,1 + --distributed-backend: nccl + --lr: 0.00015 + --lr-decay-style: cosine + --min-lr: 1.0e-5 + --weight-decay: 1e-2 + --clip-grad: 1.0 + --lr-warmup-fraction: .01 + --log-interval: 1 + --save-interval: 50 + --eval-interval: 1000 + --eval-iters: 10 + --transformer-impl: transformer_engine + --tensor-model-parallel-size: 1 + --pipeline-model-parallel-size: 4 + --untie-embeddings-and-output-weights: true + --deterministic-mode: true + --no-gradient-accumulation-fusion: true + --attention-softmax-in-fp32: true + --use-checkpoint-opt_param-scheduler: true + --use-mcore-models: true + --ckpt-format: torch_dist + --data-cache-path: ${DATA_CACHE_PATH} + --bf16: true +TEST_TYPE: ckpt-resume diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_sequence_parallel_dgx_a100_1N8G/golden_values_dev.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_sequence_parallel_dgx_a100_1N8G/golden_values_dev.json new file mode 100644 index 0000000000..0ee531577c --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_sequence_parallel_dgx_a100_1N8G/golden_values_dev.json @@ -0,0 +1,53 @@ +{ + "lm loss": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 10.79196, + 10.86679, + 10.89085, + 10.78206, + 10.65999, + 10.58008, + 10.08261, + 10.19125, + 10.13465, + 9.81171 + ] + }, + "num-zeros": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + 
"values": [ + 1613.0, + 1818.0, + 1858.0, + 1810.0, + 1856.0, + 1720.0, + 1644.0, + 1892.0, + 2329.0, + 2395.0 + ] + }, + "iteration-time": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 15.1637, + 0.16095, + 0.15953, + 0.15875, + 0.15733, + 0.15765, + 0.15696, + 0.15947, + 0.15779, + 0.15614 + ] + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_sequence_parallel_dgx_a100_1N8G/golden_values_lts.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_sequence_parallel_dgx_a100_1N8G/golden_values_lts.json new file mode 100644 index 0000000000..6123f3ca4f --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_sequence_parallel_dgx_a100_1N8G/golden_values_lts.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.79208, 10.86687, 10.89062, 10.78178, 10.65967, 10.58006, 10.08189, 10.19133, 10.13481, 9.81153]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1633.0, 1860.0, 1755.0, 1886.0, 1874.0, 1796.0, 1586.0, 1926.0, 2330.0, 2361.0]}, "iteration_timing_avg": 0.12348235294117646} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_sequence_parallel_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_sequence_parallel_dgx_a100_1N8G/model_config.yaml new file mode 100644 index 0000000000..479916c654 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_sequence_parallel_dgx_a100_1N8G/model_config.yaml @@ -0,0 +1,52 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: Tree + CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 +MODEL_ARGS: + --num-layers: 12 + --hidden-size: 512 + --num-attention-heads: 8 + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --tensorboard-dir: ${TENSORBOARD_PATH} + --micro-batch-size: 4 + --global-batch-size: 32 + --seq-length: 1024 + --max-position-embeddings: 1024 + --train-iters: 50 + --timing-log-level: 2 + --lr-decay-iters: 320000 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --vocab-file: ${DATA_PATH}/bpe/vocab.json + --merge-file: ${DATA_PATH}/bpe/merges.txt + --split: 949,50,1 + --distributed-backend: nccl + --lr: 0.00015 + --lr-decay-style: cosine + --min-lr: 1.0e-5 + --weight-decay: 1e-2 + --clip-grad: 1.0 + --lr-warmup-fraction: .01 + --log-interval: 1 + --save-interval: 10000 + --eval-interval: 1000 + --eval-iters: 10 + --transformer-impl: transformer_engine + --tensor-model-parallel-size: 1 + --pipeline-model-parallel-size: 4 + --sequence-parallel: true + --ckpt-fully-parallel-load: true + --deterministic-mode: true + --no-gradient-accumulation-fusion: true + --attention-softmax-in-fp32: true + --use-mcore-models: true + --ckpt-format: torch_dist + --data-cache-path: ${DATA_CACHE_PATH} + --bf16: true +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_swiglu_dgx_a100_1N8G/golden_values_dev.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_swiglu_dgx_a100_1N8G/golden_values_dev.json new file mode 100644 index 0000000000..f12807d602 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_swiglu_dgx_a100_1N8G/golden_values_dev.json @@ -0,0 +1,53 
@@ +{ + "lm loss": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 10.74036, + 10.81703, + 10.84134, + 10.75628, + 10.69559, + 10.62957, + 10.20355, + 10.36111, + 10.25566, + 9.94185 + ] + }, + "num-zeros": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 2496.0, + 2855.0, + 3001.0, + 2810.0, + 2625.0, + 2656.0, + 2274.0, + 2513.0, + 2546.0, + 2430.0 + ] + }, + "iteration-time": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 16.15292, + 0.16367, + 0.15632, + 0.15503, + 0.15497, + 0.15498, + 0.15472, + 0.15372, + 0.1535, + 0.15422 + ] + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_swiglu_dgx_a100_1N8G/golden_values_lts.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_swiglu_dgx_a100_1N8G/golden_values_lts.json new file mode 100644 index 0000000000..02520951bb --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_swiglu_dgx_a100_1N8G/golden_values_lts.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.74049, 10.81937, 10.84178, 10.75551, 10.69818, 10.63091, 10.20265, 10.36288, 10.25632, 9.94256]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [2527.0, 2937.0, 2975.0, 2749.0, 2580.0, 2593.0, 2320.0, 2616.0, 2541.0, 2393.0]}, "iteration_timing_avg": 0.12725500000000006} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_swiglu_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_swiglu_dgx_a100_1N8G/model_config.yaml new file mode 100644 index 0000000000..20c57f0c95 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_swiglu_dgx_a100_1N8G/model_config.yaml @@ -0,0 +1,53 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: Tree + CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 +MODEL_ARGS: + --num-layers: 12 + --hidden-size: 512 + --num-attention-heads: 8 + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --tensorboard-dir: ${TENSORBOARD_PATH} + --micro-batch-size: 4 + --global-batch-size: 32 + --seq-length: 1024 + --max-position-embeddings: 1024 + --train-iters: 50 + --timing-log-level: 2 + --lr-decay-iters: 320000 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --vocab-file: ${DATA_PATH}/bpe/vocab.json + --merge-file: ${DATA_PATH}/bpe/merges.txt + --split: 949,50,1 + --distributed-backend: nccl + --lr: 0.00015 + --lr-decay-style: cosine + --min-lr: 1.0e-5 + --weight-decay: 1e-2 + --clip-grad: 1.0 + --lr-warmup-fraction: .01 + --log-interval: 1 + --save-interval: 10000 + --eval-interval: 1000 + --eval-iters: 10 + --transformer-impl: transformer_engine + --tensor-model-parallel-size: 1 + --pipeline-model-parallel-size: 4 + --swiglu: true + --ckpt-fully-parallel-load: true + --async-save: true + --deterministic-mode: true + --no-gradient-accumulation-fusion: true + --attention-softmax-in-fp32: true + --use-mcore-models: true + --ckpt-format: torch_dist + --data-cache-path: ${DATA_CACHE_PATH} + --bf16: true +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_untie_embeddings_and_outputs_dgx_a100_1N8G/golden_values_dev.json 
b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_untie_embeddings_and_outputs_dgx_a100_1N8G/golden_values_dev.json new file mode 100644 index 0000000000..a16146d7f7 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_untie_embeddings_and_outputs_dgx_a100_1N8G/golden_values_dev.json @@ -0,0 +1,53 @@ +{ + "lm loss": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 10.90084, + 10.91069, + 10.91584, + 10.84814, + 10.70705, + 10.63102, + 10.15359, + 10.26095, + 10.16041, + 9.83157 + ] + }, + "num-zeros": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 22726994.0, + 23021698.0, + 22501118.0, + 22830752.0, + 22739448.0, + 22547214.0, + 22955480.0, + 22589960.0, + 22659556.0, + 22884632.0 + ] + }, + "iteration-time": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 13.91217, + 0.15925, + 0.16084, + 0.15713, + 0.15337, + 0.15329, + 0.15378, + 0.15301, + 0.15333, + 0.15296 + ] + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_untie_embeddings_and_outputs_dgx_a100_1N8G/golden_values_lts.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_untie_embeddings_and_outputs_dgx_a100_1N8G/golden_values_lts.json new file mode 100644 index 0000000000..2039e2f498 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_untie_embeddings_and_outputs_dgx_a100_1N8G/golden_values_lts.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.90105, 10.91104, 10.91635, 10.84822, 10.70727, 10.63018, 10.15241, 10.26052, 10.15994, 9.83162]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [22727086.0, 23021732.0, 22500940.0, 22830674.0, 22739332.0, 22547236.0, 22955516.0, 22590012.0, 22659588.0, 22884630.0]}, "iteration_timing_avg": 0.1246464705882353} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_untie_embeddings_and_outputs_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_untie_embeddings_and_outputs_dgx_a100_1N8G/model_config.yaml new file mode 100644 index 0000000000..f7c52c997f --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_untie_embeddings_and_outputs_dgx_a100_1N8G/model_config.yaml @@ -0,0 +1,51 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: Tree + CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 +MODEL_ARGS: + --num-layers: 12 + --hidden-size: 512 + --num-attention-heads: 8 + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --tensorboard-dir: ${TENSORBOARD_PATH} + --micro-batch-size: 4 + --global-batch-size: 32 + --seq-length: 1024 + --max-position-embeddings: 1024 + --train-iters: 50 + --timing-log-level: 2 + --lr-decay-iters: 320000 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --vocab-file: ${DATA_PATH}/bpe/vocab.json + --merge-file: ${DATA_PATH}/bpe/merges.txt + --split: 949,50,1 + --distributed-backend: nccl + --lr: 0.00015 + --lr-decay-style: cosine + --min-lr: 1.0e-5 + --weight-decay: 1e-2 + --clip-grad: 1.0 + --lr-warmup-fraction: .01 + --log-interval: 1 + --save-interval: 10000 + --eval-interval: 1000 + --eval-iters: 10 + --transformer-impl: transformer_engine + 
--tensor-model-parallel-size: 1 + --pipeline-model-parallel-size: 4 + --untie-embeddings-and-output-weights: true + --deterministic-mode: true + --no-gradient-accumulation-fusion: true + --attention-softmax-in-fp32: true + --use-mcore-models: true + --ckpt-format: torch_dist + --data-cache-path: ${DATA_CACHE_PATH} + --bf16: true +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_calculate_per_token_loss_dgx_a100_1N8G/golden_values_dev.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_calculate_per_token_loss_dgx_a100_1N8G/golden_values_dev.json new file mode 100644 index 0000000000..23063db970 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_calculate_per_token_loss_dgx_a100_1N8G/golden_values_dev.json @@ -0,0 +1,53 @@ +{ + "lm loss": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 10.81873, + 10.87454, + 10.87863, + 10.79574, + 10.68112, + 10.59511, + 10.10041, + 10.21268, + 10.13892, + 9.80847 + ] + }, + "num-zeros": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 1541.0, + 1772.0, + 1858.0, + 1801.0, + 1906.0, + 1716.0, + 1550.0, + 1839.0, + 2367.0, + 2271.0 + ] + }, + "iteration-time": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 18.02446, + 0.16375, + 0.14912, + 0.14978, + 0.1495, + 0.14922, + 0.15031, + 0.14892, + 0.149, + 0.15001 + ] + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_calculate_per_token_loss_dgx_a100_1N8G/golden_values_lts.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_calculate_per_token_loss_dgx_a100_1N8G/golden_values_lts.json new file mode 100644 index 0000000000..939863d9d8 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_calculate_per_token_loss_dgx_a100_1N8G/golden_values_lts.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.82005, 10.87449, 10.87798, 10.79509, 10.68164, 10.59517, 10.10046, 10.21236, 10.13863, 9.80877]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1559.0, 1719.0, 1856.0, 1791.0, 1900.0, 1709.0, 1627.0, 1831.0, 2272.0, 2312.0]}, "iteration_timing_avg": 0.12502588235294115} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_calculate_per_token_loss_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_calculate_per_token_loss_dgx_a100_1N8G/model_config.yaml new file mode 100644 index 0000000000..210febf448 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_calculate_per_token_loss_dgx_a100_1N8G/model_config.yaml @@ -0,0 +1,52 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: Tree + CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 +MODEL_ARGS: + --num-layers: 12 + --hidden-size: 512 + --num-attention-heads: 8 + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --tensorboard-dir: ${TENSORBOARD_PATH} + --micro-batch-size: 4 + --global-batch-size: 32 + --seq-length: 1024 + --max-position-embeddings: 1024 + --train-iters: 50 + --timing-log-level: 2 + --lr-decay-iters: 320000 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --data-path: 
${DATA_PATH}/my-gpt3_00_text_document + --vocab-file: ${DATA_PATH}/bpe/vocab.json + --merge-file: ${DATA_PATH}/bpe/merges.txt + --split: 949,50,1 + --distributed-backend: nccl + --lr: 0.00015 + --lr-decay-style: cosine + --min-lr: 1.0e-5 + --weight-decay: 1e-2 + --clip-grad: 1.0 + --lr-warmup-fraction: .01 + --log-interval: 1 + --save-interval: 10000 + --eval-interval: 1000 + --eval-iters: 10 + --transformer-impl: transformer_engine + --tensor-model-parallel-size: 1 + --pipeline-model-parallel-size: 4 + --num-layers-per-virtual-pipeline-stage: 1 + --calculate-per-token-loss: true + --deterministic-mode: true + --no-gradient-accumulation-fusion: true + --attention-softmax-in-fp32: true + --use-mcore-models: true + --ckpt-format: torch_dist + --data-cache-path: ${DATA_CACHE_PATH} + --bf16: true +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_decoupled_lr_dgx_a100_1N8G/golden_values_dev.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_decoupled_lr_dgx_a100_1N8G/golden_values_dev.json new file mode 100644 index 0000000000..2bec4985c5 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_decoupled_lr_dgx_a100_1N8G/golden_values_dev.json @@ -0,0 +1,53 @@ +{ + "lm loss": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 10.81873, + 10.87453, + 10.87859, + 10.7957, + 10.681, + 10.5941, + 10.09982, + 10.20983, + 10.13667, + 9.79979 + ] + }, + "num-zeros": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 1541.0, + 1751.0, + 1852.0, + 1767.0, + 1890.0, + 1830.0, + 1637.0, + 1901.0, + 2234.0, + 2261.0 + ] + }, + "iteration-time": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 14.03783, + 0.15431, + 0.15263, + 0.15176, + 0.15147, + 0.1516, + 0.15291, + 0.15327, + 0.15243, + 0.15189 + ] + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_decoupled_lr_dgx_a100_1N8G/golden_values_lts.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_decoupled_lr_dgx_a100_1N8G/golden_values_lts.json new file mode 100644 index 0000000000..460f463a0a --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_decoupled_lr_dgx_a100_1N8G/golden_values_lts.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.82005, 10.87448, 10.87794, 10.79507, 10.68154, 10.59412, 10.09987, 10.20952, 10.13639, 9.80012]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1559.0, 1734.0, 1884.0, 1684.0, 1815.0, 1766.0, 1601.0, 1904.0, 2361.0, 2347.0]}, "iteration_timing_avg": 0.12273676470588235} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_decoupled_lr_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_decoupled_lr_dgx_a100_1N8G/model_config.yaml new file mode 100644 index 0000000000..fd67df60ca --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_decoupled_lr_dgx_a100_1N8G/model_config.yaml @@ -0,0 +1,51 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: Tree + CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 +MODEL_ARGS: + --num-layers: 12 + --hidden-size: 512 + --num-attention-heads: 8 + --log-params-norm: true + --log-num-zeros-in-grad: true + 
--log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --tensorboard-dir: ${TENSORBOARD_PATH} + --micro-batch-size: 4 + --global-batch-size: 32 + --seq-length: 1024 + --max-position-embeddings: 1024 + --train-iters: 50 + --timing-log-level: 2 + --lr-decay-iters: 320000 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --vocab-file: ${DATA_PATH}/bpe/vocab.json + --merge-file: ${DATA_PATH}/bpe/merges.txt + --split: 949,50,1 + --distributed-backend: nccl + --lr: 0.00015 + --lr-decay-style: cosine + --min-lr: 1.0e-5 + --weight-decay: 1e-2 + --clip-grad: 1.0 + --lr-warmup-fraction: .01 + --log-interval: 1 + --save-interval: 10000 + --eval-interval: 1000 + --eval-iters: 10 + --transformer-impl: transformer_engine + --tensor-model-parallel-size: 1 + --pipeline-model-parallel-size: 4 + --num-layers-per-virtual-pipeline-stage: 1 + --decoupled-lr: 0.0002 + --deterministic-mode: true + --no-gradient-accumulation-fusion: true + --attention-softmax-in-fp32: true + --ckpt-format: torch + --data-cache-path: ${DATA_CACHE_PATH} + --bf16: true +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dgx_a100_1N8G/golden_values_dev.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dgx_a100_1N8G/golden_values_dev.json new file mode 100644 index 0000000000..2d10551b46 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dgx_a100_1N8G/golden_values_dev.json @@ -0,0 +1,53 @@ +{ + "lm loss": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 10.81873, + 10.87454, + 10.87863, + 10.79574, + 10.68112, + 10.59511, + 10.10041, + 10.21268, + 10.13892, + 9.80847 + ] + }, + "num-zeros": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 1541.0, + 1772.0, + 1858.0, + 1801.0, + 1906.0, + 1716.0, + 1550.0, + 1839.0, + 2367.0, + 2271.0 + ] + }, + "iteration-time": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 17.5936, + 0.15713, + 0.15692, + 0.15724, + 0.15684, + 0.15618, + 0.15852, + 0.1578, + 0.15764, + 0.15655 + ] + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dgx_a100_1N8G/golden_values_lts.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dgx_a100_1N8G/golden_values_lts.json new file mode 100644 index 0000000000..939863d9d8 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dgx_a100_1N8G/golden_values_lts.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.82005, 10.87449, 10.87798, 10.79509, 10.68164, 10.59517, 10.10046, 10.21236, 10.13863, 9.80877]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1559.0, 1719.0, 1856.0, 1791.0, 1900.0, 1709.0, 1627.0, 1831.0, 2272.0, 2312.0]}, "iteration_timing_avg": 0.12502588235294115} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dgx_a100_1N8G/model_config.yaml new file mode 100644 index 0000000000..0c0bc85f61 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dgx_a100_1N8G/model_config.yaml @@ -0,0 +1,51 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: Tree + 
CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 +MODEL_ARGS: + --num-layers: 12 + --hidden-size: 512 + --num-attention-heads: 8 + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --tensorboard-dir: ${TENSORBOARD_PATH} + --micro-batch-size: 4 + --global-batch-size: 32 + --seq-length: 1024 + --max-position-embeddings: 1024 + --train-iters: 50 + --timing-log-level: 2 + --lr-decay-iters: 320000 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --vocab-file: ${DATA_PATH}/bpe/vocab.json + --merge-file: ${DATA_PATH}/bpe/merges.txt + --split: 949,50,1 + --distributed-backend: nccl + --lr: 0.00015 + --lr-decay-style: cosine + --min-lr: 1.0e-5 + --weight-decay: 1e-2 + --clip-grad: 1.0 + --lr-warmup-fraction: .01 + --log-interval: 1 + --save-interval: 10000 + --eval-interval: 1000 + --eval-iters: 10 + --transformer-impl: transformer_engine + --tensor-model-parallel-size: 1 + --pipeline-model-parallel-size: 4 + --num-layers-per-virtual-pipeline-stage: 1 + --deterministic-mode: true + --no-gradient-accumulation-fusion: true + --attention-softmax-in-fp32: true + --use-mcore-models: true + --ckpt-format: torch_dist + --data-cache-path: ${DATA_CACHE_PATH} + --bf16: true +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/golden_values_dev.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/golden_values_dev.json new file mode 100644 index 0000000000..93786325b4 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/golden_values_dev.json @@ -0,0 +1,53 @@ +{ + "lm loss": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 10.81873, + 10.87454, + 10.87861, + 10.79574, + 10.68113, + 10.59509, + 10.10038, + 10.21266, + 10.13893, + 9.80846 + ] + }, + "num-zeros": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 1541.0, + 1772.0, + 1881.0, + 1769.0, + 1797.0, + 1694.0, + 1585.0, + 1910.0, + 2390.0, + 2332.0 + ] + }, + "iteration-time": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 15.92171, + 0.15319, + 0.1555, + 0.14739, + 0.14905, + 0.15095, + 0.15403, + 0.1498, + 0.15281, + 0.15013 + ] + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/golden_values_lts.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/golden_values_lts.json new file mode 100644 index 0000000000..2d807f5ac2 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/golden_values_lts.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.82005, 10.87449, 10.87798, 10.79511, 10.68164, 10.59513, 10.10043, 10.21239, 10.13865, 9.80879]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1559.0, 1719.0, 1856.0, 1735.0, 1873.0, 1765.0, 1535.0, 1910.0, 2278.0, 2247.0]}, "iteration_timing_avg": 0.12168999999999999} \ No newline at end of file diff --git 
a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/model_config.yaml new file mode 100644 index 0000000000..7a92bfd8cd --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/model_config.yaml @@ -0,0 +1,53 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: Tree + CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 +MODEL_ARGS: + --num-layers: 12 + --hidden-size: 512 + --num-attention-heads: 8 + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --tensorboard-dir: ${TENSORBOARD_PATH} + --micro-batch-size: 4 + --global-batch-size: 32 + --seq-length: 1024 + --max-position-embeddings: 1024 + --train-iters: 50 + --timing-log-level: 2 + --lr-decay-iters: 320000 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --vocab-file: ${DATA_PATH}/bpe/vocab.json + --merge-file: ${DATA_PATH}/bpe/merges.txt + --split: 949,50,1 + --distributed-backend: nccl + --lr: 0.00015 + --lr-decay-style: cosine + --min-lr: 1.0e-5 + --weight-decay: 1e-2 + --clip-grad: 1.0 + --lr-warmup-fraction: .01 + --log-interval: 1 + --save-interval: 10000 + --eval-interval: 1000 + --eval-iters: 10 + --transformer-impl: transformer_engine + --tensor-model-parallel-size: 1 + --pipeline-model-parallel-size: 4 + --num-layers-per-virtual-pipeline-stage: 1 + --use-distributed-optimizer: true + --overlap-grad-reduce: true + --deterministic-mode: true + --no-gradient-accumulation-fusion: true + --attention-softmax-in-fp32: true + --use-mcore-models: true + --ckpt-format: torch_dist + --data-cache-path: ${DATA_CACHE_PATH} + --bf16: true +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_dev.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_dev.json new file mode 100644 index 0000000000..ad76b6a8ff --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_dev.json @@ -0,0 +1,53 @@ +{ + "lm loss": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 10.81873, + 10.87454, + 10.87861, + 10.79574, + 10.68113, + 10.59509, + 10.10038, + 10.21266, + 10.13893, + 9.80846 + ] + }, + "num-zeros": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 1541.0, + 1772.0, + 1881.0, + 1769.0, + 1797.0, + 1694.0, + 1585.0, + 1910.0, + 2390.0, + 2332.0 + ] + }, + "iteration-time": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 14.17765, + 0.15486, + 0.33332, + 0.15908, + 0.32072, + 0.15738, + 0.32195, + 0.15809, + 0.32044, + 0.15366 + ] + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_lts.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_lts.json new file 
mode 100644 index 0000000000..f23c85a133 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_lts.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.82005, 10.87449, 10.87798, 10.79511, 10.68164, 10.59513, 10.10043, 10.21239, 10.13865, 9.80879]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1559.0, 1719.0, 1856.0, 1735.0, 1873.0, 1765.0, 1535.0, 1910.0, 2278.0, 2247.0]}, "iteration_timing_avg": 0.12873676470588236} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/model_config.yaml new file mode 100644 index 0000000000..ef5b64d284 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/model_config.yaml @@ -0,0 +1,56 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: Tree + CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 +MODEL_ARGS: + --num-layers: 12 + --hidden-size: 512 + --num-attention-heads: 8 + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --tensorboard-dir: ${TENSORBOARD_PATH} + --micro-batch-size: 4 + --global-batch-size: 32 + --seq-length: 1024 + --max-position-embeddings: 1024 + --train-iters: 50 + --timing-log-level: 2 + --lr-decay-iters: 320000 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --vocab-file: ${DATA_PATH}/bpe/vocab.json + --merge-file: ${DATA_PATH}/bpe/merges.txt + --split: 949,50,1 + --distributed-backend: nccl + --lr: 0.00015 + --lr-decay-style: cosine + --min-lr: 1.0e-5 + --weight-decay: 1e-2 + --clip-grad: 1.0 + --lr-warmup-fraction: .01 + --log-interval: 1 + --save-interval: 10000 + --eval-interval: 1000 + --eval-iters: 10 + --transformer-impl: transformer_engine + --tensor-model-parallel-size: 1 + --pipeline-model-parallel-size: 4 + --num-layers-per-virtual-pipeline-stage: 1 + --use-distributed-optimizer: true + --overlap-grad-reduce: true + --overlap-param-gather: true + --check-weight-hash-across-dp-replicas-interval: 10 + --ckpt-fully-parallel-load: true + --deterministic-mode: true + --no-gradient-accumulation-fusion: true + --attention-softmax-in-fp32: true + --use-mcore-models: true + --ckpt-format: torch_dist + --data-cache-path: ${DATA_CACHE_PATH} + --bf16: true +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer_dgx_a100_1N8G/golden_values_dev.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer_dgx_a100_1N8G/golden_values_dev.json new file mode 100644 index 0000000000..a7676e88e4 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer_dgx_a100_1N8G/golden_values_dev.json @@ -0,0 +1,53 @@ +{ + "lm loss": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 10.81873, + 10.87454, + 10.87863, 
+ 10.79573, + 10.68112, + 10.5951, + 10.10042, + 10.21267, + 10.13896, + 9.80845 + ] + }, + "num-zeros": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 1541.0, + 1772.0, + 1858.0, + 1727.0, + 1898.0, + 1687.0, + 1576.0, + 1885.0, + 2366.0, + 2245.0 + ] + }, + "iteration-time": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 15.86625, + 0.15828, + 0.3133, + 0.1592, + 0.30692, + 0.1571, + 0.31058, + 0.15887, + 0.31333, + 0.15827 + ] + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer_dgx_a100_1N8G/golden_values_lts.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer_dgx_a100_1N8G/golden_values_lts.json new file mode 100644 index 0000000000..549ceb7eab --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer_dgx_a100_1N8G/golden_values_lts.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.82005, 10.87449, 10.87799, 10.79508, 10.68166, 10.59514, 10.10042, 10.21238, 10.13865, 9.80879]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1559.0, 1719.0, 1857.0, 1746.0, 1883.0, 1738.0, 1475.0, 1851.0, 2303.0, 2258.0]}, "iteration_timing_avg": 0.12873676470588236} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer_dgx_a100_1N8G/model_config.yaml new file mode 100644 index 0000000000..ca1de0ad37 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer_dgx_a100_1N8G/model_config.yaml @@ -0,0 +1,57 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: Tree + CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 +MODEL_ARGS: + --num-layers: 12 + --hidden-size: 512 + --num-attention-heads: 8 + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --tensorboard-dir: ${TENSORBOARD_PATH} + --micro-batch-size: 4 + --global-batch-size: 32 + --seq-length: 1024 + --max-position-embeddings: 1024 + --train-iters: 50 + --timing-log-level: 2 + --lr-decay-iters: 320000 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --vocab-file: ${DATA_PATH}/bpe/vocab.json + --merge-file: ${DATA_PATH}/bpe/merges.txt + --split: 949,50,1 + --distributed-backend: nccl + --lr: 0.00015 + --lr-decay-style: cosine + --min-lr: 1.0e-5 + --weight-decay: 1e-2 + --clip-grad: 1.0 + --lr-warmup-fraction: .01 + --log-interval: 1 + --save-interval: 10000 + --eval-interval: 1000 + --eval-iters: 10 + --transformer-impl: transformer_engine + --tensor-model-parallel-size: 1 + --pipeline-model-parallel-size: 4 + --num-layers-per-virtual-pipeline-stage: 1 + --use-distributed-optimizer: true + --overlap-grad-reduce: true + --overlap-param-gather: true + --overlap-param-gather-with-optimizer-step: true + 
--check-weight-hash-across-dp-replicas-interval: 10 + --ckpt-fully-parallel-load: true + --deterministic-mode: true + --no-gradient-accumulation-fusion: true + --attention-softmax-in-fp32: true + --use-mcore-models: true + --ckpt-format: torch + --data-cache-path: ${DATA_CACHE_PATH} + --bf16: true +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G/golden_values_dev.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G/golden_values_dev.json new file mode 100644 index 0000000000..4038eb02c5 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G/golden_values_dev.json @@ -0,0 +1,53 @@ +{ + "lm loss": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 10.93652, + 10.93558, + 10.94232, + 10.8808, + 10.757, + 10.66384, + 10.16729, + 10.27264, + 10.19596, + 9.86011 + ] + }, + "num-zeros": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 22727554.0, + 23020832.0, + 22501232.0, + 22830016.0, + 22739628.0, + 22548222.0, + 22955658.0, + 22589964.0, + 22659956.0, + 22884552.0 + ] + }, + "iteration-time": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 17.17984, + 0.15935, + 0.15614, + 0.15328, + 0.15161, + 0.15181, + 0.15359, + 0.15403, + 0.15298, + 0.15161 + ] + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G/golden_values_lts.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G/golden_values_lts.json new file mode 100644 index 0000000000..64f030d4bc --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G/golden_values_lts.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.9359, 10.93547, 10.94238, 10.88073, 10.75653, 10.66332, 10.1672, 10.27241, 10.19577, 9.86006]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [22727686.0, 23020980.0, 22501260.0, 22830024.0, 22739772.0, 22548148.0, 22955712.0, 22589816.0, 22660000.0, 22884332.0]}, "iteration_timing_avg": 0.12799705882352944} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G/model_config.yaml new file mode 100644 index 0000000000..30137a040d --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G/model_config.yaml @@ -0,0 +1,54 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: Tree + CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 +MODEL_ARGS: + --num-layers: 12 + --hidden-size: 512 + --num-attention-heads: 8 + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --tensorboard-dir: ${TENSORBOARD_PATH} + --micro-batch-size: 4 + --global-batch-size: 32 + --seq-length: 1024 + 
--max-position-embeddings: 1024 + --train-iters: 50 + --timing-log-level: 2 + --lr-decay-iters: 320000 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --vocab-file: ${DATA_PATH}/bpe/vocab.json + --merge-file: ${DATA_PATH}/bpe/merges.txt + --split: 949,50,1 + --distributed-backend: nccl + --lr: 0.00015 + --lr-decay-style: cosine + --min-lr: 1.0e-5 + --weight-decay: 1e-2 + --clip-grad: 1.0 + --lr-warmup-fraction: .01 + --log-interval: 1 + --save-interval: 10000 + --eval-interval: 1000 + --eval-iters: 10 + --transformer-impl: transformer_engine + --tensor-model-parallel-size: 1 + --pipeline-model-parallel-size: 4 + --num-layers-per-virtual-pipeline-stage: 1 + --use-distributed-optimizer: true + --overlap-grad-reduce: true + --untie-embeddings-and-output-weights: true + --deterministic-mode: true + --no-gradient-accumulation-fusion: true + --attention-softmax-in-fp32: true + --use-mcore-models: true + --ckpt-format: torch_dist + --data-cache-path: ${DATA_CACHE_PATH} + --bf16: true +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr_dgx_a100_1N8G/model_config.yaml new file mode 100644 index 0000000000..1513a18192 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr_dgx_a100_1N8G/model_config.yaml @@ -0,0 +1,52 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: Tree + CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 +MODEL_ARGS: + --num-layers: 12 + --hidden-size: 512 + --num-attention-heads: 8 + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --tensorboard-dir: ${TENSORBOARD_PATH} + --micro-batch-size: 4 + --global-batch-size: 32 + --seq-length: 1024 + --max-position-embeddings: 1024 + --train-iters: 100 + --timing-log-level: 2 + --lr-decay-iters: 320000 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --vocab-file: ${DATA_PATH}/bpe/vocab.json + --merge-file: ${DATA_PATH}/bpe/merges.txt + --split: 949,50,1 + --distributed-backend: nccl + --lr: 0.00015 + --lr-decay-style: cosine + --min-lr: 1.0e-5 + --weight-decay: 1e-2 + --clip-grad: 1.0 + --lr-warmup-fraction: .01 + --log-interval: 1 + --save-interval: 50 + --eval-interval: 1000 + --eval-iters: 10 + --transformer-impl: transformer_engine + --tensor-model-parallel-size: 1 + --pipeline-model-parallel-size: 4 + --num-layers-per-virtual-pipeline-stage: 1 + --decoupled-lr: 0.0002 + --deterministic-mode: true + --no-gradient-accumulation-fusion: true + --attention-softmax-in-fp32: true + --use-checkpoint-opt_param-scheduler: true + --ckpt-format: torch + --data-cache-path: ${DATA_CACHE_PATH} + --bf16: true +TEST_TYPE: ckpt-resume diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss_dgx_a100_1N8G/model_config.yaml new file mode 100644 index 0000000000..077c9a36e8 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss_dgx_a100_1N8G/model_config.yaml @@ 
-0,0 +1,53 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: Tree + CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 +MODEL_ARGS: + --num-layers: 12 + --hidden-size: 512 + --num-attention-heads: 8 + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --tensorboard-dir: ${TENSORBOARD_PATH} + --micro-batch-size: 4 + --global-batch-size: 32 + --seq-length: 1024 + --max-position-embeddings: 1024 + --train-iters: 100 + --timing-log-level: 2 + --lr-decay-iters: 320000 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --vocab-file: ${DATA_PATH}/bpe/vocab.json + --merge-file: ${DATA_PATH}/bpe/merges.txt + --split: 949,50,1 + --distributed-backend: nccl + --lr: 0.00015 + --lr-decay-style: cosine + --min-lr: 1.0e-5 + --weight-decay: 1e-2 + --clip-grad: 1.0 + --lr-warmup-fraction: .01 + --log-interval: 1 + --save-interval: 50 + --eval-interval: 1000 + --eval-iters: 10 + --transformer-impl: transformer_engine + --tensor-model-parallel-size: 1 + --pipeline-model-parallel-size: 4 + --num-layers-per-virtual-pipeline-stage: 1 + --calculate-per-token-loss: true + --deterministic-mode: true + --no-gradient-accumulation-fusion: true + --attention-softmax-in-fp32: true + --use-checkpoint-opt_param-scheduler: true + --use-mcore-models: true + --ckpt-format: torch_dist + --data-cache-path: ${DATA_CACHE_PATH} + --bf16: true +TEST_TYPE: ckpt-resume diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dgx_a100_1N8G/model_config.yaml new file mode 100644 index 0000000000..1ccbe1ae31 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dgx_a100_1N8G/model_config.yaml @@ -0,0 +1,52 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: Tree + CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 +MODEL_ARGS: + --num-layers: 12 + --hidden-size: 512 + --num-attention-heads: 8 + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --tensorboard-dir: ${TENSORBOARD_PATH} + --micro-batch-size: 4 + --global-batch-size: 32 + --seq-length: 1024 + --max-position-embeddings: 1024 + --train-iters: 100 + --timing-log-level: 2 + --lr-decay-iters: 320000 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --vocab-file: ${DATA_PATH}/bpe/vocab.json + --merge-file: ${DATA_PATH}/bpe/merges.txt + --split: 949,50,1 + --distributed-backend: nccl + --lr: 0.00015 + --lr-decay-style: cosine + --min-lr: 1.0e-5 + --weight-decay: 1e-2 + --clip-grad: 1.0 + --lr-warmup-fraction: .01 + --log-interval: 1 + --save-interval: 50 + --eval-interval: 1000 + --eval-iters: 10 + --transformer-impl: transformer_engine + --tensor-model-parallel-size: 1 + --pipeline-model-parallel-size: 4 + --num-layers-per-virtual-pipeline-stage: 1 + --deterministic-mode: true + --no-gradient-accumulation-fusion: true + --attention-softmax-in-fp32: true + --use-checkpoint-opt_param-scheduler: true + --use-mcore-models: true + --ckpt-format: torch_dist + --data-cache-path: ${DATA_CACHE_PATH} + --bf16: true +TEST_TYPE: ckpt-resume diff --git 
a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/model_config.yaml new file mode 100644 index 0000000000..b9ca819495 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/model_config.yaml @@ -0,0 +1,54 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: Tree + CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 +MODEL_ARGS: + --num-layers: 12 + --hidden-size: 512 + --num-attention-heads: 8 + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --tensorboard-dir: ${TENSORBOARD_PATH} + --micro-batch-size: 4 + --global-batch-size: 32 + --seq-length: 1024 + --max-position-embeddings: 1024 + --train-iters: 100 + --timing-log-level: 2 + --lr-decay-iters: 320000 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --vocab-file: ${DATA_PATH}/bpe/vocab.json + --merge-file: ${DATA_PATH}/bpe/merges.txt + --split: 949,50,1 + --distributed-backend: nccl + --lr: 0.00015 + --lr-decay-style: cosine + --min-lr: 1.0e-5 + --weight-decay: 1e-2 + --clip-grad: 1.0 + --lr-warmup-fraction: .01 + --log-interval: 1 + --save-interval: 50 + --eval-interval: 1000 + --eval-iters: 10 + --transformer-impl: transformer_engine + --tensor-model-parallel-size: 1 + --pipeline-model-parallel-size: 4 + --num-layers-per-virtual-pipeline-stage: 1 + --use-distributed-optimizer: true + --overlap-grad-reduce: true + --deterministic-mode: true + --no-gradient-accumulation-fusion: true + --attention-softmax-in-fp32: true + --use-checkpoint-opt_param-scheduler: true + --use-mcore-models: true + --ckpt-format: torch_dist + --data-cache-path: ${DATA_CACHE_PATH} + --bf16: true +TEST_TYPE: ckpt-resume diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/model_config.yaml new file mode 100644 index 0000000000..25ea6c933b --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/model_config.yaml @@ -0,0 +1,57 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: Tree + CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 +MODEL_ARGS: + --num-layers: 12 + --hidden-size: 512 + --num-attention-heads: 8 + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --tensorboard-dir: ${TENSORBOARD_PATH} + --micro-batch-size: 4 + --global-batch-size: 32 + --seq-length: 1024 + --max-position-embeddings: 1024 + --train-iters: 100 + --timing-log-level: 2 + --lr-decay-iters: 320000 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --vocab-file: ${DATA_PATH}/bpe/vocab.json + --merge-file: ${DATA_PATH}/bpe/merges.txt + --split: 949,50,1 + --distributed-backend: nccl + --lr: 0.00015 
+ --lr-decay-style: cosine + --min-lr: 1.0e-5 + --weight-decay: 1e-2 + --clip-grad: 1.0 + --lr-warmup-fraction: .01 + --log-interval: 1 + --save-interval: 50 + --eval-interval: 1000 + --eval-iters: 10 + --transformer-impl: transformer_engine + --tensor-model-parallel-size: 1 + --pipeline-model-parallel-size: 4 + --num-layers-per-virtual-pipeline-stage: 1 + --use-distributed-optimizer: true + --overlap-grad-reduce: true + --overlap-param-gather: true + --check-weight-hash-across-dp-replicas-interval: 10 + --ckpt-fully-parallel-load: true + --deterministic-mode: true + --no-gradient-accumulation-fusion: true + --attention-softmax-in-fp32: true + --use-checkpoint-opt_param-scheduler: true + --use-mcore-models: true + --ckpt-format: torch_dist + --data-cache-path: ${DATA_CACHE_PATH} + --bf16: true +TEST_TYPE: ckpt-resume diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G/model_config.yaml new file mode 100644 index 0000000000..7b7bc27f4b --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G/model_config.yaml @@ -0,0 +1,55 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: Tree + CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 +MODEL_ARGS: + --num-layers: 12 + --hidden-size: 512 + --num-attention-heads: 8 + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --tensorboard-dir: ${TENSORBOARD_PATH} + --micro-batch-size: 4 + --global-batch-size: 32 + --seq-length: 1024 + --max-position-embeddings: 1024 + --train-iters: 100 + --timing-log-level: 2 + --lr-decay-iters: 320000 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --vocab-file: ${DATA_PATH}/bpe/vocab.json + --merge-file: ${DATA_PATH}/bpe/merges.txt + --split: 949,50,1 + --distributed-backend: nccl + --lr: 0.00015 + --lr-decay-style: cosine + --min-lr: 1.0e-5 + --weight-decay: 1e-2 + --clip-grad: 1.0 + --lr-warmup-fraction: .01 + --log-interval: 1 + --save-interval: 50 + --eval-interval: 1000 + --eval-iters: 10 + --transformer-impl: transformer_engine + --tensor-model-parallel-size: 1 + --pipeline-model-parallel-size: 4 + --num-layers-per-virtual-pipeline-stage: 1 + --use-distributed-optimizer: true + --overlap-grad-reduce: true + --untie-embeddings-and-output-weights: true + --deterministic-mode: true + --no-gradient-accumulation-fusion: true + --attention-softmax-in-fp32: true + --use-checkpoint-opt_param-scheduler: true + --use-mcore-models: true + --ckpt-format: torch_dist + --data-cache-path: ${DATA_CACHE_PATH} + --bf16: true +TEST_TYPE: ckpt-resume diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_uneven_pipeline_dgx_a100_1N8G/golden_values_dev.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_uneven_pipeline_dgx_a100_1N8G/golden_values_dev.json new file mode 100644 index 0000000000..c54f356abb --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_uneven_pipeline_dgx_a100_1N8G/golden_values_dev.json @@ -0,0 +1,53 @@ +{ + "lm loss": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + 
"values": [ + 10.97322, + 10.96026, + 10.95554, + 10.91036, + 10.78829, + 10.71161, + 10.22425, + 10.28927, + 10.19078, + 9.86422 + ] + }, + "num-zeros": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 22727092.0, + 23021952.0, + 22501020.0, + 22831056.0, + 22740126.0, + 22547804.0, + 22955336.0, + 22589332.0, + 22658910.0, + 22885098.0 + ] + }, + "iteration-time": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 16.34154, + 0.1456, + 0.14396, + 0.14478, + 0.14447, + 0.1447, + 0.14477, + 0.14342, + 0.14486, + 0.14486 + ] + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_uneven_pipeline_dgx_a100_1N8G/golden_values_lts.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_uneven_pipeline_dgx_a100_1N8G/golden_values_lts.json new file mode 100644 index 0000000000..48bbcc3792 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_uneven_pipeline_dgx_a100_1N8G/golden_values_lts.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.9735, 10.96043, 10.95576, 10.91038, 10.78791, 10.71201, 10.22424, 10.28926, 10.19049, 9.86378]},"num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [22727052.0, 23021930.0, 22501022.0, 22831208.0, 22740024.0, 22547916.0, 22955210.0, 22589344.0, 22658940.0, 22884970.0]},"iteration_timing_avg": 0.1367805882352941} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_uneven_pipeline_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_uneven_pipeline_dgx_a100_1N8G/model_config.yaml new file mode 100644 index 0000000000..059265a079 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_uneven_pipeline_dgx_a100_1N8G/model_config.yaml @@ -0,0 +1,52 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: Tree + CUBLAS_WORKSPACE_CONFIG: :4096:8 +MODEL_ARGS: + --num-layers: 12 + --hidden-size: 512 + --num-attention-heads: 8 + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --tensorboard-dir: ${TENSORBOARD_PATH} + --micro-batch-size: 4 + --global-batch-size: 32 + --seq-length: 1024 + --max-position-embeddings: 1024 + --train-iters: 50 + --timing-log-level: 2 + --lr-decay-iters: 320000 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --vocab-file: ${DATA_PATH}/bpe/vocab.json + --merge-file: ${DATA_PATH}/bpe/merges.txt + --split: 949,50,1 + --distributed-backend: nccl + --lr: 0.00015 + --lr-decay-style: cosine + --min-lr: 1.0e-5 + --weight-decay: 1e-2 + --clip-grad: 1.0 + --lr-warmup-fraction: .01 + --log-interval: 1 + --save-interval: 10000 + --eval-interval: 1000 + --eval-iters: 10 + --transformer-impl: transformer_engine + --tensor-model-parallel-size: 1 + --pipeline-model-parallel-size: 4 + --untie-embeddings-and-output-weights: true + --deterministic-mode: true + --no-gradient-accumulation-fusion: true + --attention-softmax-in-fp32: true + --use-mcore-models: true + --ckpt-format: torch_dist + --data-cache-path: ${DATA_CACHE_PATH} + --bf16: true + --decoder-first-pipeline-num-layers: 2 + --decoder-last-pipeline-num-layers: 2 +TEST_TYPE: regular \ No newline at end of file diff --git 
a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_cp2_nondeterministic_dgx_a100_1N8G/golden_values_dev.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_cp2_nondeterministic_dgx_a100_1N8G/golden_values_dev.json new file mode 100644 index 0000000000..b87c0bca78 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_cp2_nondeterministic_dgx_a100_1N8G/golden_values_dev.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.88759, 10.90846, 10.88099, 10.84518, 10.69285, 10.6019, 10.09544, 10.18239, 10.08764, 9.76749]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [578.0, 659.0, 683.0, 700.0, 697.0, 620.0, 572.0, 774.0, 807.0, 837.0]}, "iteration_timing_avg": 0.3462723529411765} diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_cp2_nondeterministic_dgx_a100_1N8G/golden_values_lts.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_cp2_nondeterministic_dgx_a100_1N8G/golden_values_lts.json new file mode 100644 index 0000000000..b87c0bca78 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_cp2_nondeterministic_dgx_a100_1N8G/golden_values_lts.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.88759, 10.90846, 10.88099, 10.84518, 10.69285, 10.6019, 10.09544, 10.18239, 10.08764, 9.76749]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [578.0, 659.0, 683.0, 700.0, 697.0, 620.0, 572.0, 774.0, 807.0, 837.0]}, "iteration_timing_avg": 0.3462723529411765} diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_cp2_nondeterministic_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_cp2_nondeterministic_dgx_a100_1N8G/model_config.yaml new file mode 100644 index 0000000000..7da0cc5ddd --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_cp2_nondeterministic_dgx_a100_1N8G/model_config.yaml @@ -0,0 +1,51 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 1 + N_REPEATS: 5 +MODEL_ARGS: + --num-layers: 12 + --hidden-size: 512 + --num-attention-heads: 8 + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --tensorboard-dir: ${TENSORBOARD_PATH} + --micro-batch-size: 4 + --global-batch-size: 32 + --seq-length: 1024 + --max-position-embeddings: 1024 + --train-iters: 50 + --timing-log-level: 2 + --lr-decay-iters: 320000 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --vocab-file: ${DATA_PATH}/bpe/vocab.json + --merge-file: ${DATA_PATH}/bpe/merges.txt + --split: 949,50,1 + --distributed-backend: nccl + --lr: 0.00015 + --lr-decay-style: cosine + --min-lr: 1.0e-5 + --weight-decay: 1e-2 + --clip-grad: 1.0 + --lr-warmup-fraction: .01 + --log-interval: 1 + --save-interval: 10000 + --eval-interval: 1000 + --eval-iters: 10 + --transformer-impl: transformer_engine + --tensor-model-parallel-size: 2 + --pipeline-model-parallel-size: 1 + --context-parallel-size: 2 + --sequence-parallel: true + --hidden-dropout: 0.0 + --attention-dropout: 0.0 + --no-gradient-accumulation-fusion: true + --attention-softmax-in-fp32: true + --use-mcore-models: true + --ckpt-format: torch_dist + --data-cache-path: ${DATA_CACHE_PATH} + --bf16: true +TEST_TYPE: regular diff --git 
a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/model_config.yaml new file mode 100644 index 0000000000..476a1b6b93 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/model_config.yaml @@ -0,0 +1,52 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 1 + N_REPEATS: 5 +MODEL_ARGS: + --num-layers: 12 + --hidden-size: 512 + --num-attention-heads: 8 + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --tensorboard-dir: ${TENSORBOARD_PATH} + --micro-batch-size: 4 + --global-batch-size: 32 + --seq-length: 1024 + --max-position-embeddings: 1024 + --train-iters: 100 + --timing-log-level: 2 + --lr-decay-iters: 320000 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --vocab-file: ${DATA_PATH}/bpe/vocab.json + --merge-file: ${DATA_PATH}/bpe/merges.txt + --split: 949,50,1 + --distributed-backend: nccl + --lr: 0.00015 + --lr-decay-style: cosine + --min-lr: 1.0e-5 + --weight-decay: 1e-2 + --clip-grad: 1.0 + --lr-warmup-fraction: .01 + --log-interval: 1 + --save-interval: 50 + --eval-interval: 1000 + --eval-iters: 10 + --transformer-impl: transformer_engine + --tensor-model-parallel-size: 2 + --pipeline-model-parallel-size: 1 + --context-parallel-size: 2 + --sequence-parallel: true + --hidden-dropout: 0.0 + --attention-dropout: 0.0 + --no-gradient-accumulation-fusion: true + --attention-softmax-in-fp32: true + --use-checkpoint-opt_param-scheduler: true + --use-mcore-models: true + --ckpt-format: torch_dist + --data-cache-path: ${DATA_CACHE_PATH} + --bf16: true +TEST_TYPE: ckpt-resume diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_dgx_a100_1N8G/model_config.yaml new file mode 100644 index 0000000000..613559a96e --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_dgx_a100_1N8G/model_config.yaml @@ -0,0 +1,57 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: Tree + CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 +MODEL_ARGS: + --num-layers: 12 + --hidden-size: 512 + --num-attention-heads: 8 + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --tensorboard-dir: ${TENSORBOARD_PATH} + --micro-batch-size: 4 + --global-batch-size: 32 + --seq-length: 1024 + --max-position-embeddings: 1024 + --train-iters: 100 + --timing-log-level: 2 + --lr-decay-iters: 320000 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --vocab-file: ${DATA_PATH}/bpe/vocab.json + --merge-file: ${DATA_PATH}/bpe/merges.txt + --split: 949,50,1 + --distributed-backend: nccl + --lr: 0.00015 + --lr-decay-style: cosine + --min-lr: 1.0e-5 + --weight-decay: 1e-2 + --clip-grad: 1.0 + --lr-warmup-fraction: .01 + --log-interval: 1 + --save-interval: 50 + --eval-interval: 1000 + --eval-iters: 10 + --transformer-impl: 
transformer_engine + --tensor-model-parallel-size: 2 + --pipeline-model-parallel-size: 1 + --expert-model-parallel-size: 2 + --sequence-parallel: true + --num-experts: 8 + --moe-router-load-balancing-type: sinkhorn + --moe-router-topk: 1 + --ckpt-fully-parallel-load: true + --deterministic-mode: true + --no-gradient-accumulation-fusion: true + --attention-softmax-in-fp32: true + --use-checkpoint-opt_param-scheduler: true + --use-mcore-models: true + --ckpt-format: torch_dist + --data-cache-path: ${DATA_CACHE_PATH} + --bf16: true +TEST_TYPE: ckpt-resume diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/model_config.yaml new file mode 100644 index 0000000000..a1f86a64c7 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/model_config.yaml @@ -0,0 +1,58 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: Tree + CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 +MODEL_ARGS: + --num-layers: 12 + --hidden-size: 512 + --num-attention-heads: 8 + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --tensorboard-dir: ${TENSORBOARD_PATH} + --micro-batch-size: 4 + --global-batch-size: 32 + --seq-length: 1024 + --max-position-embeddings: 1024 + --train-iters: 100 + --timing-log-level: 2 + --lr-decay-iters: 320000 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --vocab-file: ${DATA_PATH}/bpe/vocab.json + --merge-file: ${DATA_PATH}/bpe/merges.txt + --split: 949,50,1 + --distributed-backend: nccl + --lr: 0.00015 + --lr-decay-style: cosine + --min-lr: 1.0e-5 + --weight-decay: 1e-2 + --clip-grad: 1.0 + --lr-warmup-fraction: .01 + --log-interval: 1 + --save-interval: 50 + --eval-interval: 1000 + --eval-iters: 10 + --transformer-impl: transformer_engine + --tensor-model-parallel-size: 2 + --pipeline-model-parallel-size: 1 + --expert-model-parallel-size: 2 + --sequence-parallel: true + --num-experts: 8 + --use-distributed-optimizer: true + --moe-router-load-balancing-type: sinkhorn + --moe-router-topk: 1 + --ckpt-fully-parallel-load: true + --deterministic-mode: true + --no-gradient-accumulation-fusion: true + --attention-softmax-in-fp32: true + --use-checkpoint-opt_param-scheduler: true + --use-mcore-models: true + --ckpt-format: torch_dist + --data-cache-path: ${DATA_CACHE_PATH} + --bf16: true +TEST_TYPE: ckpt-resume diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_groupedGEMM_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_groupedGEMM_dgx_a100_1N8G/model_config.yaml new file mode 100644 index 0000000000..6c454ecca7 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_groupedGEMM_dgx_a100_1N8G/model_config.yaml @@ -0,0 +1,58 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: Tree + CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 +MODEL_ARGS: + --num-layers: 12 + --hidden-size: 512 + --num-attention-heads: 8 + 
--log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --tensorboard-dir: ${TENSORBOARD_PATH} + --micro-batch-size: 4 + --global-batch-size: 32 + --seq-length: 1024 + --max-position-embeddings: 1024 + --train-iters: 100 + --timing-log-level: 2 + --lr-decay-iters: 320000 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --vocab-file: ${DATA_PATH}/bpe/vocab.json + --merge-file: ${DATA_PATH}/bpe/merges.txt + --split: 949,50,1 + --distributed-backend: nccl + --lr: 0.00015 + --lr-decay-style: cosine + --min-lr: 1.0e-5 + --weight-decay: 1e-2 + --clip-grad: 1.0 + --lr-warmup-fraction: .01 + --log-interval: 1 + --save-interval: 50 + --eval-interval: 1000 + --eval-iters: 10 + --transformer-impl: transformer_engine + --tensor-model-parallel-size: 2 + --pipeline-model-parallel-size: 1 + --expert-model-parallel-size: 2 + --moe-grouped-gemm: true + --disable-bias-linear: true + --sequence-parallel: true + --num-experts: 8 + --moe-router-load-balancing-type: sinkhorn + --moe-router-topk: 1 + --deterministic-mode: true + --no-gradient-accumulation-fusion: true + --attention-softmax-in-fp32: true + --use-checkpoint-opt_param-scheduler: true + --use-mcore-models: true + --ckpt-format: torch_dist + --data-cache-path: ${DATA_CACHE_PATH} + --bf16: true +TEST_TYPE: ckpt-resume diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM_dgx_a100_1N8G/model_config.yaml new file mode 100644 index 0000000000..cf4a90e410 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM_dgx_a100_1N8G/model_config.yaml @@ -0,0 +1,62 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: Tree + CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 +MODEL_ARGS: + --num-layers: 12 + --hidden-size: 512 + --num-attention-heads: 8 + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --tensorboard-dir: ${TENSORBOARD_PATH} + --micro-batch-size: 4 + --global-batch-size: 32 + --seq-length: 1024 + --max-position-embeddings: 1024 + --train-iters: 100 + --timing-log-level: 2 + --lr-decay-iters: 320000 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --vocab-file: ${DATA_PATH}/bpe/vocab.json + --merge-file: ${DATA_PATH}/bpe/merges.txt + --split: 949,50,1 + --distributed-backend: nccl + --lr: 0.00015 + --lr-decay-style: cosine + --min-lr: 1.0e-5 + --weight-decay: 1e-2 + --clip-grad: 1.0 + --lr-warmup-fraction: .01 + --log-interval: 1 + --save-interval: 50 + --eval-interval: 1000 + --eval-iters: 10 + --transformer-impl: transformer_engine + --tensor-model-parallel-size: 2 + --pipeline-model-parallel-size: 1 + --expert-model-parallel-size: 2 + --no-ckpt-fully-parallel-save: true + --moe-grouped-gemm: true + --disable-bias-linear: true + --sequence-parallel: true + --num-experts: 8 + --use-distributed-optimizer: true + --moe-router-load-balancing-type: sinkhorn + --moe-router-topk: 1 + --overlap-grad-reduce: true + 
--overlap-param-gather: true + --deterministic-mode: true + --no-gradient-accumulation-fusion: true + --attention-softmax-in-fp32: true + --use-checkpoint-opt_param-scheduler: true + --use-mcore-models: true + --ckpt-format: torch_dist + --data-cache-path: ${DATA_CACHE_PATH} + --bf16: true +TEST_TYPE: ckpt-resume diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_top2router_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_top2router_dgx_a100_1N8G/model_config.yaml new file mode 100644 index 0000000000..793bfb21d4 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_top2router_dgx_a100_1N8G/model_config.yaml @@ -0,0 +1,59 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: Tree + CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 +MODEL_ARGS: + --num-layers: 12 + --hidden-size: 512 + --num-attention-heads: 8 + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --tensorboard-dir: ${TENSORBOARD_PATH} + --micro-batch-size: 4 + --global-batch-size: 32 + --seq-length: 1024 + --max-position-embeddings: 1024 + --train-iters: 100 + --timing-log-level: 2 + --lr-decay-iters: 320000 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --vocab-file: ${DATA_PATH}/bpe/vocab.json + --merge-file: ${DATA_PATH}/bpe/merges.txt + --split: 949,50,1 + --distributed-backend: nccl + --lr: 0.00015 + --lr-decay-style: cosine + --min-lr: 1.0e-5 + --weight-decay: 1e-2 + --clip-grad: 1.0 + --lr-warmup-fraction: .01 + --log-interval: 1 + --save-interval: 50 + --eval-interval: 1000 + --eval-iters: 10 + --transformer-impl: transformer_engine + --tensor-model-parallel-size: 2 + --pipeline-model-parallel-size: 1 + --expert-model-parallel-size: 2 + --disable-bias-linear: true + --sequence-parallel: true + --num-experts: 8 + --moe-router-load-balancing-type: aux_loss + --moe-router-topk: 2 + --moe-aux-loss-coeff: 1e-2 + --deterministic-mode: true + --no-gradient-accumulation-fusion: true + --moe-grouped-gemm: true + --attention-softmax-in-fp32: true + --use-checkpoint-opt_param-scheduler: true + --use-mcore-models: true + --ckpt-format: torch_dist + --data-cache-path: ${DATA_CACHE_PATH} + --bf16: true +TEST_TYPE: ckpt-resume diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_dgx_a100_1N8G/golden_values_dev.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_dgx_a100_1N8G/golden_values_dev.json new file mode 100644 index 0000000000..58da8cc58f --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_dgx_a100_1N8G/golden_values_dev.json @@ -0,0 +1,53 @@ +{ + "lm loss": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 10.80636, + 10.86329, + 10.86543, + 10.80292, + 10.71495, + 10.63908, + 10.19523, + 10.30868, + 10.21881, + 9.91605 + ] + }, + "num-zeros": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 30624.0, + 37092.0, + 37682.0, + 35847.0, + 33454.0, + 34950.0, + 30874.0, + 35631.0, + 36594.0, + 37604.0 + ] + }, + "iteration-time": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 13.94166, + 0.60018, + 
0.59665, + 0.59556, + 0.59626, + 0.59829, + 0.60898, + 0.60665, + 0.60729, + 0.60397 + ] + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_dgx_a100_1N8G/golden_values_lts.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_dgx_a100_1N8G/golden_values_lts.json new file mode 100644 index 0000000000..7e38f08536 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_dgx_a100_1N8G/golden_values_lts.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.79806, 10.86466, 10.87219, 10.80704, 10.71201, 10.63836, 10.19365, 10.30955, 10.22074, 9.91587]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [31010.0, 37271.0, 37922.0, 36177.0, 33568.0, 34619.0, 31252.0, 34977.0, 36315.0, 37480.0]}, "iteration_timing_avg": 0.35529294117647064} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_dgx_a100_1N8G/model_config.yaml new file mode 100644 index 0000000000..29b87e9073 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_dgx_a100_1N8G/model_config.yaml @@ -0,0 +1,56 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: Tree + CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 +MODEL_ARGS: + --num-layers: 12 + --hidden-size: 512 + --num-attention-heads: 8 + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --tensorboard-dir: ${TENSORBOARD_PATH} + --micro-batch-size: 4 + --global-batch-size: 32 + --seq-length: 1024 + --max-position-embeddings: 1024 + --train-iters: 50 + --timing-log-level: 2 + --lr-decay-iters: 320000 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --vocab-file: ${DATA_PATH}/bpe/vocab.json + --merge-file: ${DATA_PATH}/bpe/merges.txt + --split: 949,50,1 + --distributed-backend: nccl + --lr: 0.00015 + --lr-decay-style: cosine + --min-lr: 1.0e-5 + --weight-decay: 1e-2 + --clip-grad: 1.0 + --lr-warmup-fraction: .01 + --log-interval: 1 + --save-interval: 10000 + --eval-interval: 1000 + --eval-iters: 10 + --transformer-impl: transformer_engine + --tensor-model-parallel-size: 2 + --pipeline-model-parallel-size: 1 + --expert-model-parallel-size: 2 + --sequence-parallel: true + --num-experts: 8 + --moe-router-load-balancing-type: sinkhorn + --moe-router-topk: 1 + --ckpt-fully-parallel-load: true + --deterministic-mode: true + --no-gradient-accumulation-fusion: true + --attention-softmax-in-fp32: true + --use-mcore-models: true + --ckpt-format: torch_dist + --data-cache-path: ${DATA_CACHE_PATH} + --bf16: true +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/golden_values_dev.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/golden_values_dev.json new file mode 100644 index 0000000000..94a76546a8 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/golden_values_dev.json @@ -0,0 +1,53 @@ 
+{ + "lm loss": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 10.80636, + 10.86329, + 10.86571, + 10.8026, + 10.7141, + 10.63888, + 10.19509, + 10.30815, + 10.21888, + 9.9159 + ] + }, + "num-zeros": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 30624.0, + 37092.0, + 37247.0, + 36055.0, + 33117.0, + 34947.0, + 30805.0, + 35186.0, + 36773.0, + 37592.0 + ] + }, + "iteration-time": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 9.92901, + 0.59358, + 0.59144, + 0.59107, + 0.59173, + 0.59173, + 0.59581, + 0.59219, + 0.59163, + 0.59599 + ] + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/golden_values_lts.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/golden_values_lts.json new file mode 100644 index 0000000000..c7739ce696 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/golden_values_lts.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.79806, 10.86508, 10.87232, 10.80773, 10.71115, 10.63886, 10.19259, 10.30975, 10.22077, 9.9157]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [31010.0, 37093.0, 37540.0, 35923.0, 33445.0, 34824.0, 30686.0, 35286.0, 36691.0, 37420.0]}, "iteration_timing_avg": 0.3566726470588235} diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/model_config.yaml new file mode 100644 index 0000000000..c4b791a9d4 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/model_config.yaml @@ -0,0 +1,57 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: Tree + CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 +MODEL_ARGS: + --num-layers: 12 + --hidden-size: 512 + --num-attention-heads: 8 + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --tensorboard-dir: ${TENSORBOARD_PATH} + --micro-batch-size: 4 + --global-batch-size: 32 + --seq-length: 1024 + --max-position-embeddings: 1024 + --train-iters: 50 + --timing-log-level: 2 + --lr-decay-iters: 320000 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --vocab-file: ${DATA_PATH}/bpe/vocab.json + --merge-file: ${DATA_PATH}/bpe/merges.txt + --split: 949,50,1 + --distributed-backend: nccl + --lr: 0.00015 + --lr-decay-style: cosine + --min-lr: 1.0e-5 + --weight-decay: 1e-2 + --clip-grad: 1.0 + --lr-warmup-fraction: .01 + --log-interval: 1 + --save-interval: 10000 + --eval-interval: 1000 + --eval-iters: 10 + --transformer-impl: transformer_engine + --tensor-model-parallel-size: 2 + --pipeline-model-parallel-size: 1 + --expert-model-parallel-size: 2 + --sequence-parallel: true + --num-experts: 8 + --use-distributed-optimizer: true + --moe-router-load-balancing-type: sinkhorn + --moe-router-topk: 1 + --ckpt-fully-parallel-load: true + --deterministic-mode: true + --no-gradient-accumulation-fusion: true + --attention-softmax-in-fp32: true + 
--use-mcore-models: true + --ckpt-format: torch_dist + --data-cache-path: ${DATA_CACHE_PATH} + --bf16: true +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_groupedGEMM_dgx_a100_1N8G/golden_values_dev.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_groupedGEMM_dgx_a100_1N8G/golden_values_dev.json new file mode 100644 index 0000000000..a868ef2477 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_groupedGEMM_dgx_a100_1N8G/golden_values_dev.json @@ -0,0 +1,53 @@ +{ + "lm loss": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 10.8068, + 10.85847, + 10.86845, + 10.803, + 10.71773, + 10.6467, + 10.20917, + 10.3267, + 10.22478, + 9.93069 + ] + }, + "num-zeros": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 31086.0, + 37745.0, + 38183.0, + 36578.0, + 33138.0, + 34639.0, + 30196.0, + 34818.0, + 36041.0, + 37408.0 + ] + }, + "iteration-time": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 8.01738, + 0.3967, + 0.40469, + 0.39646, + 0.39763, + 0.39581, + 0.39805, + 0.39688, + 0.39585, + 0.39707 + ] + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_groupedGEMM_dgx_a100_1N8G/golden_values_lts.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_groupedGEMM_dgx_a100_1N8G/golden_values_lts.json new file mode 100644 index 0000000000..787d84d479 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_groupedGEMM_dgx_a100_1N8G/golden_values_lts.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.80392, 10.86451, 10.86393, 10.80306, 10.71669, 10.64561, 10.21267, 10.32342, 10.22503, 9.92985]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [31227.0, 37874.0, 38070.0, 36215.0, 33120.0, 34374.0, 30579.0, 35192.0, 36094.0, 37183.0]}, "iteration_timing_avg": 0.2153429411764706} diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_groupedGEMM_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_groupedGEMM_dgx_a100_1N8G/model_config.yaml new file mode 100644 index 0000000000..c2631e84e0 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_groupedGEMM_dgx_a100_1N8G/model_config.yaml @@ -0,0 +1,58 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: Tree + CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 +MODEL_ARGS: + --num-layers: 12 + --hidden-size: 512 + --num-attention-heads: 8 + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --tensorboard-dir: ${TENSORBOARD_PATH} + --micro-batch-size: 4 + --global-batch-size: 32 + --seq-length: 1024 + --max-position-embeddings: 1024 + --train-iters: 50 + --timing-log-level: 2 + --lr-decay-iters: 320000 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --vocab-file: ${DATA_PATH}/bpe/vocab.json + --merge-file: ${DATA_PATH}/bpe/merges.txt + --split: 949,50,1 + --distributed-backend: nccl + --lr: 0.00015 + --lr-decay-style: cosine + 
--min-lr: 1.0e-5 + --weight-decay: 1e-2 + --clip-grad: 1.0 + --lr-warmup-fraction: .01 + --log-interval: 1 + --save-interval: 10000 + --eval-interval: 1000 + --eval-iters: 10 + --transformer-impl: transformer_engine + --tensor-model-parallel-size: 2 + --pipeline-model-parallel-size: 1 + --expert-model-parallel-size: 2 + --no-ckpt-fully-parallel-save: true + --moe-grouped-gemm: true + --disable-bias-linear: true + --sequence-parallel: true + --num-experts: 8 + --moe-router-load-balancing-type: sinkhorn + --moe-router-topk: 1 + --deterministic-mode: true + --no-gradient-accumulation-fusion: true + --attention-softmax-in-fp32: true + --use-mcore-models: true + --ckpt-format: torch_dist + --data-cache-path: ${DATA_CACHE_PATH} + --bf16: true +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM_dgx_a100_1N8G/golden_values_dev.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM_dgx_a100_1N8G/golden_values_dev.json new file mode 100644 index 0000000000..0845354088 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM_dgx_a100_1N8G/golden_values_dev.json @@ -0,0 +1,53 @@ +{ + "lm loss": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 10.8068, + 10.85847, + 10.86885, + 10.80298, + 10.71737, + 10.64505, + 10.20965, + 10.32635, + 10.22509, + 9.93052 + ] + }, + "num-zeros": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 31086.0, + 37745.0, + 38026.0, + 36288.0, + 33181.0, + 34769.0, + 30277.0, + 35007.0, + 35753.0, + 36883.0 + ] + }, + "iteration-time": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 6.27283, + 0.39789, + 0.39404, + 0.39365, + 0.39408, + 0.39452, + 0.3971, + 0.39296, + 0.39484, + 0.39485 + ] + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM_dgx_a100_1N8G/golden_values_lts.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM_dgx_a100_1N8G/golden_values_lts.json new file mode 100644 index 0000000000..a8f23f172a --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM_dgx_a100_1N8G/golden_values_lts.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.80392, 10.86451, 10.86407, 10.80254, 10.71523, 10.64479, 10.21223, 10.32267, 10.22495, 9.93003]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [31227.0, 37874.0, 37773.0, 35936.0, 33255.0, 34279.0, 30117.0, 35460.0, 36069.0, 36785.0]}, "iteration_timing_avg": 0.21900323529411767} diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM_dgx_a100_1N8G/model_config.yaml new file mode 100644 index 0000000000..bc5da0c312 --- /dev/null +++ 
b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM_dgx_a100_1N8G/model_config.yaml @@ -0,0 +1,61 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: Tree + CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 +MODEL_ARGS: + --num-layers: 12 + --hidden-size: 512 + --num-attention-heads: 8 + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --tensorboard-dir: ${TENSORBOARD_PATH} + --micro-batch-size: 4 + --global-batch-size: 32 + --seq-length: 1024 + --max-position-embeddings: 1024 + --train-iters: 50 + --timing-log-level: 2 + --lr-decay-iters: 320000 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --vocab-file: ${DATA_PATH}/bpe/vocab.json + --merge-file: ${DATA_PATH}/bpe/merges.txt + --split: 949,50,1 + --distributed-backend: nccl + --lr: 0.00015 + --lr-decay-style: cosine + --min-lr: 1.0e-5 + --weight-decay: 1e-2 + --clip-grad: 1.0 + --lr-warmup-fraction: .01 + --log-interval: 1 + --save-interval: 10000 + --eval-interval: 1000 + --eval-iters: 10 + --transformer-impl: transformer_engine + --tensor-model-parallel-size: 2 + --pipeline-model-parallel-size: 1 + --expert-model-parallel-size: 2 + --no-ckpt-fully-parallel-save: true + --moe-grouped-gemm: true + --disable-bias-linear: true + --sequence-parallel: true + --num-experts: 8 + --use-distributed-optimizer: true + --moe-router-load-balancing-type: sinkhorn + --moe-router-topk: 1 + --overlap-grad-reduce: true + --overlap-param-gather: true + --deterministic-mode: true + --no-gradient-accumulation-fusion: true + --attention-softmax-in-fp32: true + --use-mcore-models: true + --ckpt-format: torch_dist + --data-cache-path: ${DATA_CACHE_PATH} + --bf16: true +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_top2router_dgx_a100_1N8G/golden_values_dev.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_top2router_dgx_a100_1N8G/golden_values_dev.json new file mode 100644 index 0000000000..99e329fb8f --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_top2router_dgx_a100_1N8G/golden_values_dev.json @@ -0,0 +1,53 @@ +{ + "lm loss": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 10.8361, + 10.87864, + 10.87768, + 10.815, + 10.68778, + 10.5999, + 10.08699, + 10.21759, + 10.10765, + 9.78311 + ] + }, + "num-zeros": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 26861.0, + 33006.0, + 33252.0, + 31834.0, + 29098.0, + 30998.0, + 28585.0, + 33169.0, + 33964.0, + 35288.0 + ] + }, + "iteration-time": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 8.94204, + 0.58644, + 0.5851, + 0.58477, + 0.59242, + 0.59936, + 0.60913, + 0.62007, + 0.62455, + 0.62817 + ] + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_top2router_dgx_a100_1N8G/golden_values_lts.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_top2router_dgx_a100_1N8G/golden_values_lts.json new file mode 100644 index 0000000000..5b81d07061 --- /dev/null +++ 
b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_top2router_dgx_a100_1N8G/golden_values_lts.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.83503, 10.88475, 10.87872, 10.81608, 10.69357, 10.60024, 10.08934, 10.21378, 10.10871, 9.78568]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [26744.0, 33099.0, 33750.0, 31697.0, 28979.0, 30817.0, 28713.0, 33425.0, 33927.0, 35074.0]}, "iteration_timing_avg": 0.28211852941176474} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_top2router_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_top2router_dgx_a100_1N8G/model_config.yaml new file mode 100644 index 0000000000..7c437e0b10 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_top2router_dgx_a100_1N8G/model_config.yaml @@ -0,0 +1,58 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: Tree + CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 +MODEL_ARGS: + --num-layers: 12 + --hidden-size: 512 + --num-attention-heads: 8 + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --tensorboard-dir: ${TENSORBOARD_PATH} + --micro-batch-size: 4 + --global-batch-size: 32 + --seq-length: 1024 + --max-position-embeddings: 1024 + --train-iters: 50 + --timing-log-level: 2 + --lr-decay-iters: 320000 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --vocab-file: ${DATA_PATH}/bpe/vocab.json + --merge-file: ${DATA_PATH}/bpe/merges.txt + --split: 949,50,1 + --distributed-backend: nccl + --lr: 0.00015 + --lr-decay-style: cosine + --min-lr: 1.0e-5 + --weight-decay: 1e-2 + --clip-grad: 1.0 + --lr-warmup-fraction: .01 + --log-interval: 1 + --save-interval: 10000 + --eval-interval: 1000 + --eval-iters: 10 + --transformer-impl: transformer_engine + --tensor-model-parallel-size: 2 + --pipeline-model-parallel-size: 1 + --expert-model-parallel-size: 2 + --disable-bias-linear: true + --sequence-parallel: true + --num-experts: 8 + --moe-router-load-balancing-type: aux_loss + --moe-router-topk: 2 + --moe-aux-loss-coeff: 1e-2 + --deterministic-mode: true + --no-gradient-accumulation-fusion: true + --moe-grouped-gemm: true + --attention-softmax-in-fp32: true + --use-mcore-models: true + --ckpt-format: torch_dist + --data-cache-path: ${DATA_CACHE_PATH} + --bf16: true +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_nondeterministic_dgx_a100_1N8G/golden_values_dev.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_nondeterministic_dgx_a100_1N8G/golden_values_dev.json new file mode 100644 index 0000000000..4c8008e6ac --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_nondeterministic_dgx_a100_1N8G/golden_values_dev.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.93292, 10.93657, 10.88788, 10.86131, 10.71505, 10.61066, 10.06697, 10.17616, 10.07539, 9.74965]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [607.0, 638.0, 643.0, 649.0, 648.0, 590.0, 548.0, 772.0, 834.0, 836.0]}, "iteration_timing_avg": 0.3993126470588235} diff --git 
a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_nondeterministic_dgx_a100_1N8G/golden_values_lts.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_nondeterministic_dgx_a100_1N8G/golden_values_lts.json new file mode 100644 index 0000000000..4c8008e6ac --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_nondeterministic_dgx_a100_1N8G/golden_values_lts.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.93292, 10.93657, 10.88788, 10.86131, 10.71505, 10.61066, 10.06697, 10.17616, 10.07539, 9.74965]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [607.0, 638.0, 643.0, 649.0, 648.0, 590.0, 548.0, 772.0, 834.0, 836.0]}, "iteration_timing_avg": 0.3993126470588235} diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_nondeterministic_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_nondeterministic_dgx_a100_1N8G/model_config.yaml new file mode 100644 index 0000000000..dde8a620d3 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cp2_nondeterministic_dgx_a100_1N8G/model_config.yaml @@ -0,0 +1,51 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 1 + N_REPEATS: 5 +MODEL_ARGS: + --num-layers: 12 + --hidden-size: 512 + --num-attention-heads: 8 + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --tensorboard-dir: ${TENSORBOARD_PATH} + --micro-batch-size: 4 + --global-batch-size: 32 + --seq-length: 1024 + --max-position-embeddings: 1024 + --train-iters: 50 + --timing-log-level: 2 + --lr-decay-iters: 320000 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --vocab-file: ${DATA_PATH}/bpe/vocab.json + --merge-file: ${DATA_PATH}/bpe/merges.txt + --split: 949,50,1 + --distributed-backend: nccl + --lr: 0.00015 + --lr-decay-style: cosine + --min-lr: 1.0e-5 + --weight-decay: 1e-2 + --clip-grad: 1.0 + --lr-warmup-fraction: .01 + --log-interval: 1 + --save-interval: 10000 + --eval-interval: 1000 + --eval-iters: 10 + --transformer-impl: transformer_engine + --tensor-model-parallel-size: 2 + --pipeline-model-parallel-size: 2 + --context-parallel-size: 2 + --sequence-parallel: true + --hidden-dropout: 0.0 + --attention-dropout: 0.0 + --no-gradient-accumulation-fusion: true + --attention-softmax-in-fp32: true + --use-mcore-models: true + --ckpt-format: torch_dist + --data-cache-path: ${DATA_CACHE_PATH} + --bf16: true +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cross_entropy_loss_fusion_dgx_a100_1N8G/golden_values_dev.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cross_entropy_loss_fusion_dgx_a100_1N8G/golden_values_dev.json new file mode 100644 index 0000000000..98ff45e7db --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cross_entropy_loss_fusion_dgx_a100_1N8G/golden_values_dev.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.92705, 10.93627, 10.89332, 10.87322, 10.74871, 10.65375, 10.15756, 10.24634, 10.15177, 9.83799]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1707.0, 1885.0, 1986.0, 1760.0, 1773.0, 1859.0, 1598.0, 1965.0, 2199.0, 2316.0]}, "iteration_timing_avg": 
0.20321264705882353} diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cross_entropy_loss_fusion_dgx_a100_1N8G/golden_values_lts.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cross_entropy_loss_fusion_dgx_a100_1N8G/golden_values_lts.json new file mode 100644 index 0000000000..98ff45e7db --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cross_entropy_loss_fusion_dgx_a100_1N8G/golden_values_lts.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.92705, 10.93627, 10.89332, 10.87322, 10.74871, 10.65375, 10.15756, 10.24634, 10.15177, 9.83799]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1707.0, 1885.0, 1986.0, 1760.0, 1773.0, 1859.0, 1598.0, 1965.0, 2199.0, 2316.0]}, "iteration_timing_avg": 0.20321264705882353} diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cross_entropy_loss_fusion_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cross_entropy_loss_fusion_dgx_a100_1N8G/model_config.yaml new file mode 100644 index 0000000000..303182bcaf --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_cross_entropy_loss_fusion_dgx_a100_1N8G/model_config.yaml @@ -0,0 +1,48 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 1 + N_REPEATS: 5 +MODEL_ARGS: + --num-layers: 12 + --hidden-size: 512 + --num-attention-heads: 8 + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --tensorboard-dir: ${TENSORBOARD_PATH} + --micro-batch-size: 4 + --global-batch-size: 32 + --seq-length: 1024 + --max-position-embeddings: 1024 + --train-iters: 50 + --timing-log-level: 2 + --lr-decay-iters: 320000 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --vocab-file: ${DATA_PATH}/bpe/vocab.json + --merge-file: ${DATA_PATH}/bpe/merges.txt + --split: 949,50,1 + --distributed-backend: nccl + --lr: 0.00015 + --lr-decay-style: cosine + --min-lr: 1.0e-5 + --weight-decay: 1e-2 + --clip-grad: 1.0 + --lr-warmup-fraction: .01 + --log-interval: 1 + --save-interval: 10000 + --eval-interval: 1000 + --eval-iters: 10 + --transformer-impl: transformer_engine + --tensor-model-parallel-size: 2 + --pipeline-model-parallel-size: 2 + --cross-entropy-loss-fusion: true + --no-gradient-accumulation-fusion: true + --attention-softmax-in-fp32: true + --use-mcore-models: true + --ckpt-format: torch_dist + --data-cache-path: ${DATA_CACHE_PATH} + --bf16: true +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_ddp_average_in_collective_dgx_a100_1N8G/golden_values_dev.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_ddp_average_in_collective_dgx_a100_1N8G/golden_values_dev.json new file mode 100644 index 0000000000..81ebe32310 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_ddp_average_in_collective_dgx_a100_1N8G/golden_values_dev.json @@ -0,0 +1,53 @@ +{ + "lm loss": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 10.92655, + 10.9356, + 10.89279, + 10.87309, + 10.74892, + 10.65436, + 10.15723, + 10.2467, + 10.15196, + 9.83834 + ] + }, + "num-zeros": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 1637.0, + 1871.0, + 1961.0, + 1750.0, + 1831.0, + 1817.0, + 
1600.0, + 2009.0, + 2300.0, + 2398.0 + ] + }, + "iteration-time": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 11.00345, + 0.20167, + 0.199, + 0.19854, + 0.19914, + 0.19625, + 0.19812, + 0.19792, + 0.19797, + 0.19742 + ] + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_ddp_average_in_collective_dgx_a100_1N8G/golden_values_lts.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_ddp_average_in_collective_dgx_a100_1N8G/golden_values_lts.json new file mode 100644 index 0000000000..265ad7c9b9 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_ddp_average_in_collective_dgx_a100_1N8G/golden_values_lts.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.92705, 10.93626, 10.89335, 10.87325, 10.74869, 10.65372, 10.15755, 10.24642, 10.15177, 9.83802]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1627.0, 1889.0, 1973.0, 1785.0, 1797.0, 1836.0, 1602.0, 2034.0, 2316.0, 2307.0]}, "iteration_timing_avg": 0.15396205882352942} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_ddp_average_in_collective_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_ddp_average_in_collective_dgx_a100_1N8G/model_config.yaml new file mode 100644 index 0000000000..c08ce2e01c --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_ddp_average_in_collective_dgx_a100_1N8G/model_config.yaml @@ -0,0 +1,51 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: Tree + CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 +MODEL_ARGS: + --num-layers: 12 + --hidden-size: 512 + --num-attention-heads: 8 + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --tensorboard-dir: ${TENSORBOARD_PATH} + --micro-batch-size: 4 + --global-batch-size: 32 + --seq-length: 1024 + --max-position-embeddings: 1024 + --train-iters: 50 + --timing-log-level: 2 + --lr-decay-iters: 320000 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --vocab-file: ${DATA_PATH}/bpe/vocab.json + --merge-file: ${DATA_PATH}/bpe/merges.txt + --split: 949,50,1 + --distributed-backend: nccl + --lr: 0.00015 + --lr-decay-style: cosine + --min-lr: 1.0e-5 + --weight-decay: 1e-2 + --clip-grad: 1.0 + --lr-warmup-fraction: .01 + --log-interval: 1 + --save-interval: 10000 + --eval-interval: 1000 + --eval-iters: 10 + --transformer-impl: transformer_engine + --tensor-model-parallel-size: 2 + --pipeline-model-parallel-size: 2 + --ddp-average-in-collective: true + --deterministic-mode: true + --no-gradient-accumulation-fusion: true + --attention-softmax-in-fp32: true + --use-mcore-models: true + --ckpt-format: torch_dist + --data-cache-path: ${DATA_CACHE_PATH} + --bf16: true +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_defer_embedding_wgrad_compute_dgx_a100_1N8G/golden_values_dev.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_defer_embedding_wgrad_compute_dgx_a100_1N8G/golden_values_dev.json new file mode 100644 index 0000000000..1911ec077e --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_defer_embedding_wgrad_compute_dgx_a100_1N8G/golden_values_dev.json 
@@ -0,0 +1,53 @@ +{ + "lm loss": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 10.92655, + 10.93561, + 10.89281, + 10.87309, + 10.74898, + 10.65438, + 10.15724, + 10.24667, + 10.15195, + 9.83831 + ] + }, + "num-zeros": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 61.0, + 64.0, + 72.0, + 63.0, + 56.0, + 68.0, + 59.0, + 66.0, + 80.0, + 77.0 + ] + }, + "iteration-time": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 11.97051, + 0.19754, + 0.1983, + 0.19901, + 0.19738, + 0.19644, + 0.19868, + 0.19807, + 0.19845, + 0.19669 + ] + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_defer_embedding_wgrad_compute_dgx_a100_1N8G/golden_values_lts.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_defer_embedding_wgrad_compute_dgx_a100_1N8G/golden_values_lts.json new file mode 100644 index 0000000000..517c935c6a --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_defer_embedding_wgrad_compute_dgx_a100_1N8G/golden_values_lts.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.92705, 10.93628, 10.89335, 10.87322, 10.7487, 10.65379, 10.15754, 10.2464, 10.15175, 9.83801]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [68.0, 64.0, 61.0, 58.0, 55.0, 85.0, 77.0, 68.0, 78.0, 63.0]}} diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_defer_embedding_wgrad_compute_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_defer_embedding_wgrad_compute_dgx_a100_1N8G/model_config.yaml new file mode 100644 index 0000000000..959c286a50 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_defer_embedding_wgrad_compute_dgx_a100_1N8G/model_config.yaml @@ -0,0 +1,51 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: Tree + CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 +MODEL_ARGS: + --num-layers: 12 + --hidden-size: 512 + --num-attention-heads: 8 + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --tensorboard-dir: ${TENSORBOARD_PATH} + --micro-batch-size: 4 + --global-batch-size: 32 + --seq-length: 1024 + --max-position-embeddings: 1024 + --train-iters: 50 + --timing-log-level: 2 + --lr-decay-iters: 320000 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --vocab-file: ${DATA_PATH}/bpe/vocab.json + --merge-file: ${DATA_PATH}/bpe/merges.txt + --split: 949,50,1 + --distributed-backend: nccl + --lr: 0.00015 + --lr-decay-style: cosine + --min-lr: 1.0e-5 + --weight-decay: 1e-2 + --clip-grad: 1.0 + --lr-warmup-fraction: .01 + --log-interval: 1 + --save-interval: 10000 + --eval-interval: 1000 + --eval-iters: 10 + --transformer-impl: transformer_engine + --tensor-model-parallel-size: 2 + --pipeline-model-parallel-size: 2 + --defer-embedding-wgrad-compute: true + --wgrad-deferral-limit: 2 + --deterministic-mode: true + --attention-softmax-in-fp32: true + --use-mcore-models: true + --ckpt-format: torch_dist + --data-cache-path: ${DATA_CACHE_PATH} + --bf16: true +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_dgx_a100_1N8G/golden_values_dev.json 
b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_dgx_a100_1N8G/golden_values_dev.json new file mode 100644 index 0000000000..cd3b25b704 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_dgx_a100_1N8G/golden_values_dev.json @@ -0,0 +1,53 @@ +{ + "lm loss": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 10.92655, + 10.9356, + 10.89279, + 10.87309, + 10.74892, + 10.65436, + 10.15723, + 10.2467, + 10.15196, + 9.83834 + ] + }, + "num-zeros": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 1637.0, + 1871.0, + 1961.0, + 1750.0, + 1831.0, + 1817.0, + 1600.0, + 2009.0, + 2300.0, + 2398.0 + ] + }, + "iteration-time": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 9.39534, + 0.21178, + 0.20637, + 0.22478, + 0.19747, + 0.19618, + 0.19587, + 0.19616, + 0.2033, + 0.19787 + ] + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_dgx_a100_1N8G/golden_values_lts.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_dgx_a100_1N8G/golden_values_lts.json new file mode 100644 index 0000000000..265ad7c9b9 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_dgx_a100_1N8G/golden_values_lts.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.92705, 10.93626, 10.89335, 10.87325, 10.74869, 10.65372, 10.15755, 10.24642, 10.15177, 9.83802]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1627.0, 1889.0, 1973.0, 1785.0, 1797.0, 1836.0, 1602.0, 2034.0, 2316.0, 2307.0]}, "iteration_timing_avg": 0.15396205882352942} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_dgx_a100_1N8G/model_config.yaml new file mode 100644 index 0000000000..c9938b5ee1 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_dgx_a100_1N8G/model_config.yaml @@ -0,0 +1,50 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: Tree + CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 +MODEL_ARGS: + --num-layers: 12 + --hidden-size: 512 + --num-attention-heads: 8 + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --tensorboard-dir: ${TENSORBOARD_PATH} + --micro-batch-size: 4 + --global-batch-size: 32 + --seq-length: 1024 + --max-position-embeddings: 1024 + --train-iters: 50 + --timing-log-level: 2 + --lr-decay-iters: 320000 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --vocab-file: ${DATA_PATH}/bpe/vocab.json + --merge-file: ${DATA_PATH}/bpe/merges.txt + --split: 949,50,1 + --distributed-backend: nccl + --lr: 0.00015 + --lr-decay-style: cosine + --min-lr: 1.0e-5 + --weight-decay: 1e-2 + --clip-grad: 1.0 + --lr-warmup-fraction: .01 + --log-interval: 1 + --save-interval: 10000 + --eval-interval: 1000 + --eval-iters: 10 + --transformer-impl: transformer_engine + --tensor-model-parallel-size: 2 + --pipeline-model-parallel-size: 2 + --deterministic-mode: true + --no-gradient-accumulation-fusion: true + --attention-softmax-in-fp32: true + --use-mcore-models: true + --ckpt-format: torch_dist + --data-cache-path: ${DATA_CACHE_PATH} + --bf16: true +TEST_TYPE: regular diff 
--git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_no_create_attention_mask_in_dataloader_dgx_a100_1N8G/golden_values_dev.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_no_create_attention_mask_in_dataloader_dgx_a100_1N8G/golden_values_dev.json new file mode 100644 index 0000000000..c6e707304f --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_no_create_attention_mask_in_dataloader_dgx_a100_1N8G/golden_values_dev.json @@ -0,0 +1,53 @@ +{ + "lm loss": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 10.92655, + 10.9356, + 10.89279, + 10.87309, + 10.74892, + 10.65436, + 10.15723, + 10.2467, + 10.15196, + 9.83834 + ] + }, + "num-zeros": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 1637.0, + 1871.0, + 1961.0, + 1750.0, + 1831.0, + 1817.0, + 1600.0, + 2009.0, + 2300.0, + 2398.0 + ] + }, + "iteration-time": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 12.32922, + 0.19767, + 0.19574, + 0.19487, + 0.19442, + 0.1953, + 0.19438, + 0.19481, + 0.19385, + 0.19537 + ] + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_no_create_attention_mask_in_dataloader_dgx_a100_1N8G/golden_values_lts.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_no_create_attention_mask_in_dataloader_dgx_a100_1N8G/golden_values_lts.json new file mode 100644 index 0000000000..265ad7c9b9 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_no_create_attention_mask_in_dataloader_dgx_a100_1N8G/golden_values_lts.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.92705, 10.93626, 10.89335, 10.87325, 10.74869, 10.65372, 10.15755, 10.24642, 10.15177, 9.83802]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1627.0, 1889.0, 1973.0, 1785.0, 1797.0, 1836.0, 1602.0, 2034.0, 2316.0, 2307.0]}, "iteration_timing_avg": 0.15396205882352942} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_no_create_attention_mask_in_dataloader_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_no_create_attention_mask_in_dataloader_dgx_a100_1N8G/model_config.yaml new file mode 100644 index 0000000000..23060e55e4 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_no_create_attention_mask_in_dataloader_dgx_a100_1N8G/model_config.yaml @@ -0,0 +1,51 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: Tree + CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 +MODEL_ARGS: + --num-layers: 12 + --hidden-size: 512 + --num-attention-heads: 8 + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --tensorboard-dir: ${TENSORBOARD_PATH} + --micro-batch-size: 4 + --global-batch-size: 32 + --seq-length: 1024 + --max-position-embeddings: 1024 + --train-iters: 50 + --timing-log-level: 2 + --lr-decay-iters: 320000 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --vocab-file: ${DATA_PATH}/bpe/vocab.json + --merge-file: ${DATA_PATH}/bpe/merges.txt + --split: 949,50,1 + --distributed-backend: nccl + --lr: 0.00015 + --lr-decay-style: cosine + --min-lr: 1.0e-5 + --weight-decay: 1e-2 + --clip-grad: 1.0 + 
--lr-warmup-fraction: .01 + --log-interval: 1 + --save-interval: 10000 + --eval-interval: 1000 + --eval-iters: 10 + --transformer-impl: transformer_engine + --tensor-model-parallel-size: 2 + --pipeline-model-parallel-size: 2 + --no-create-attention-mask-in-dataloader: true + --deterministic-mode: true + --no-gradient-accumulation-fusion: true + --attention-softmax-in-fp32: true + --use-mcore-models: true + --ckpt-format: torch_dist + --data-cache-path: ${DATA_CACHE_PATH} + --bf16: true +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_no_mmap_bin_files_dgx_a100_1N8G/golden_values_dev.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_no_mmap_bin_files_dgx_a100_1N8G/golden_values_dev.json new file mode 100644 index 0000000000..6e255054a1 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_no_mmap_bin_files_dgx_a100_1N8G/golden_values_dev.json @@ -0,0 +1,53 @@ +{ + "lm loss": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 10.92655, + 10.9356, + 10.89279, + 10.87309, + 10.74892, + 10.65436, + 10.15723, + 10.2467, + 10.15196, + 9.83834 + ] + }, + "num-zeros": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 1637.0, + 1871.0, + 1961.0, + 1750.0, + 1831.0, + 1817.0, + 1600.0, + 2009.0, + 2300.0, + 2398.0 + ] + }, + "iteration-time": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 9.28903, + 0.20065, + 0.20159, + 0.20207, + 0.20263, + 0.19738, + 0.19961, + 0.199, + 0.19954, + 0.19791 + ] + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_no_mmap_bin_files_dgx_a100_1N8G/golden_values_lts.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_no_mmap_bin_files_dgx_a100_1N8G/golden_values_lts.json new file mode 100644 index 0000000000..265ad7c9b9 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_no_mmap_bin_files_dgx_a100_1N8G/golden_values_lts.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.92705, 10.93626, 10.89335, 10.87325, 10.74869, 10.65372, 10.15755, 10.24642, 10.15177, 9.83802]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1627.0, 1889.0, 1973.0, 1785.0, 1797.0, 1836.0, 1602.0, 2034.0, 2316.0, 2307.0]}, "iteration_timing_avg": 0.15396205882352942} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_no_mmap_bin_files_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_no_mmap_bin_files_dgx_a100_1N8G/model_config.yaml new file mode 100644 index 0000000000..32bd642deb --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_no_mmap_bin_files_dgx_a100_1N8G/model_config.yaml @@ -0,0 +1,51 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: Tree + CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 +MODEL_ARGS: + --num-layers: 12 + --hidden-size: 512 + --num-attention-heads: 8 + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --tensorboard-dir: ${TENSORBOARD_PATH} + --micro-batch-size: 4 + --global-batch-size: 32 + --seq-length: 1024 + --max-position-embeddings: 1024 + --train-iters: 50 + --timing-log-level: 2 + --lr-decay-iters: 320000 + --save: ${CHECKPOINT_PATH} + --load: 
${CHECKPOINT_PATH} + --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --vocab-file: ${DATA_PATH}/bpe/vocab.json + --merge-file: ${DATA_PATH}/bpe/merges.txt + --split: 949,50,1 + --distributed-backend: nccl + --lr: 0.00015 + --lr-decay-style: cosine + --min-lr: 1.0e-5 + --weight-decay: 1e-2 + --clip-grad: 1.0 + --lr-warmup-fraction: .01 + --log-interval: 1 + --save-interval: 10000 + --eval-interval: 1000 + --eval-iters: 10 + --transformer-impl: transformer_engine + --tensor-model-parallel-size: 2 + --pipeline-model-parallel-size: 2 + --no-mmap-bin-files: true + --deterministic-mode: true + --no-gradient-accumulation-fusion: true + --attention-softmax-in-fp32: true + --use-mcore-models: true + --ckpt-format: torch_dist + --data-cache-path: ${DATA_CACHE_PATH} + --bf16: true +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/model_config.yaml new file mode 100644 index 0000000000..7d64cf477f --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G/model_config.yaml @@ -0,0 +1,52 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 1 + N_REPEATS: 5 +MODEL_ARGS: + --num-layers: 12 + --hidden-size: 512 + --num-attention-heads: 8 + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --tensorboard-dir: ${TENSORBOARD_PATH} + --micro-batch-size: 4 + --global-batch-size: 32 + --seq-length: 1024 + --max-position-embeddings: 1024 + --train-iters: 100 + --timing-log-level: 2 + --lr-decay-iters: 320000 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --vocab-file: ${DATA_PATH}/bpe/vocab.json + --merge-file: ${DATA_PATH}/bpe/merges.txt + --split: 949,50,1 + --distributed-backend: nccl + --lr: 0.00015 + --lr-decay-style: cosine + --min-lr: 1.0e-5 + --weight-decay: 1e-2 + --clip-grad: 1.0 + --lr-warmup-fraction: .01 + --log-interval: 1 + --save-interval: 50 + --eval-interval: 1000 + --eval-iters: 10 + --transformer-impl: transformer_engine + --tensor-model-parallel-size: 2 + --pipeline-model-parallel-size: 2 + --context-parallel-size: 2 + --sequence-parallel: true + --hidden-dropout: 0.0 + --attention-dropout: 0.0 + --no-gradient-accumulation-fusion: true + --attention-softmax-in-fp32: true + --use-checkpoint-opt_param-scheduler: true + --use-mcore-models: true + --ckpt-format: torch_dist + --data-cache-path: ${DATA_CACHE_PATH} + --bf16: true +TEST_TYPE: ckpt-resume diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion_dgx_a100_1N8G/model_config.yaml new file mode 100644 index 0000000000..6014052dd6 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion_dgx_a100_1N8G/model_config.yaml @@ -0,0 +1,49 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 1 + N_REPEATS: 5 +MODEL_ARGS: + --num-layers: 12 + --hidden-size: 512 + --num-attention-heads: 8 + --log-params-norm: true + --log-num-zeros-in-grad: true + 
--log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --tensorboard-dir: ${TENSORBOARD_PATH} + --micro-batch-size: 4 + --global-batch-size: 32 + --seq-length: 1024 + --max-position-embeddings: 1024 + --train-iters: 100 + --timing-log-level: 2 + --lr-decay-iters: 320000 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --vocab-file: ${DATA_PATH}/bpe/vocab.json + --merge-file: ${DATA_PATH}/bpe/merges.txt + --split: 949,50,1 + --distributed-backend: nccl + --lr: 0.00015 + --lr-decay-style: cosine + --min-lr: 1.0e-5 + --weight-decay: 1e-2 + --clip-grad: 1.0 + --lr-warmup-fraction: .01 + --log-interval: 1 + --save-interval: 50 + --eval-interval: 1000 + --eval-iters: 10 + --transformer-impl: transformer_engine + --tensor-model-parallel-size: 2 + --pipeline-model-parallel-size: 2 + --cross-entropy-loss-fusion: true + --no-gradient-accumulation-fusion: true + --attention-softmax-in-fp32: true + --use-checkpoint-opt_param-scheduler: true + --use-mcore-models: true + --ckpt-format: torch_dist + --data-cache-path: ${DATA_CACHE_PATH} + --bf16: true +TEST_TYPE: ckpt-resume diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective_dgx_a100_1N8G/model_config.yaml new file mode 100644 index 0000000000..6d8a590974 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective_dgx_a100_1N8G/model_config.yaml @@ -0,0 +1,52 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: Tree + CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 +MODEL_ARGS: + --num-layers: 12 + --hidden-size: 512 + --num-attention-heads: 8 + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --tensorboard-dir: ${TENSORBOARD_PATH} + --micro-batch-size: 4 + --global-batch-size: 32 + --seq-length: 1024 + --max-position-embeddings: 1024 + --train-iters: 100 + --timing-log-level: 2 + --lr-decay-iters: 320000 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --vocab-file: ${DATA_PATH}/bpe/vocab.json + --merge-file: ${DATA_PATH}/bpe/merges.txt + --split: 949,50,1 + --distributed-backend: nccl + --lr: 0.00015 + --lr-decay-style: cosine + --min-lr: 1.0e-5 + --weight-decay: 1e-2 + --clip-grad: 1.0 + --lr-warmup-fraction: .01 + --log-interval: 1 + --save-interval: 50 + --eval-interval: 1000 + --eval-iters: 10 + --transformer-impl: transformer_engine + --tensor-model-parallel-size: 2 + --pipeline-model-parallel-size: 2 + --ddp-average-in-collective: true + --deterministic-mode: true + --no-gradient-accumulation-fusion: true + --attention-softmax-in-fp32: true + --use-checkpoint-opt_param-scheduler: true + --use-mcore-models: true + --ckpt-format: torch_dist + --data-cache-path: ${DATA_CACHE_PATH} + --bf16: true +TEST_TYPE: ckpt-resume diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute_dgx_a100_1N8G/model_config.yaml new file mode 100644 index 0000000000..c304692d62 --- /dev/null +++ 
b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute_dgx_a100_1N8G/model_config.yaml @@ -0,0 +1,52 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: Tree + CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 +MODEL_ARGS: + --num-layers: 12 + --hidden-size: 512 + --num-attention-heads: 8 + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --tensorboard-dir: ${TENSORBOARD_PATH} + --micro-batch-size: 4 + --global-batch-size: 32 + --seq-length: 1024 + --max-position-embeddings: 1024 + --train-iters: 100 + --timing-log-level: 2 + --lr-decay-iters: 320000 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --vocab-file: ${DATA_PATH}/bpe/vocab.json + --merge-file: ${DATA_PATH}/bpe/merges.txt + --split: 949,50,1 + --distributed-backend: nccl + --lr: 0.00015 + --lr-decay-style: cosine + --min-lr: 1.0e-5 + --weight-decay: 1e-2 + --clip-grad: 1.0 + --lr-warmup-fraction: .01 + --log-interval: 1 + --save-interval: 50 + --eval-interval: 1000 + --eval-iters: 10 + --transformer-impl: transformer_engine + --tensor-model-parallel-size: 2 + --pipeline-model-parallel-size: 2 + --defer-embedding-wgrad-compute: true + --wgrad-deferral-limit: 2 + --deterministic-mode: true + --attention-softmax-in-fp32: true + --use-checkpoint-opt_param-scheduler: true + --use-mcore-models: true + --ckpt-format: torch_dist + --data-cache-path: ${DATA_CACHE_PATH} + --bf16: true +TEST_TYPE: ckpt-resume diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_dgx_a100_1N8G/model_config.yaml new file mode 100644 index 0000000000..d8f1585ae2 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_dgx_a100_1N8G/model_config.yaml @@ -0,0 +1,51 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: Tree + CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 +MODEL_ARGS: + --num-layers: 12 + --hidden-size: 512 + --num-attention-heads: 8 + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --tensorboard-dir: ${TENSORBOARD_PATH} + --micro-batch-size: 4 + --global-batch-size: 32 + --seq-length: 1024 + --max-position-embeddings: 1024 + --train-iters: 100 + --timing-log-level: 2 + --lr-decay-iters: 320000 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --vocab-file: ${DATA_PATH}/bpe/vocab.json + --merge-file: ${DATA_PATH}/bpe/merges.txt + --split: 949,50,1 + --distributed-backend: nccl + --lr: 0.00015 + --lr-decay-style: cosine + --min-lr: 1.0e-5 + --weight-decay: 1e-2 + --clip-grad: 1.0 + --lr-warmup-fraction: .01 + --log-interval: 1 + --save-interval: 50 + --eval-interval: 1000 + --eval-iters: 10 + --transformer-impl: transformer_engine + --tensor-model-parallel-size: 2 + --pipeline-model-parallel-size: 2 + --deterministic-mode: true + --no-gradient-accumulation-fusion: true + --attention-softmax-in-fp32: true + --use-checkpoint-opt_param-scheduler: true + --use-mcore-models: true + --ckpt-format: torch_dist + --data-cache-path: ${DATA_CACHE_PATH} + --bf16: true +TEST_TYPE: ckpt-resume diff --git 
a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_no_create_attention_mask_in_dataloader_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_no_create_attention_mask_in_dataloader_dgx_a100_1N8G/model_config.yaml new file mode 100644 index 0000000000..c02d1fdc67 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_no_create_attention_mask_in_dataloader_dgx_a100_1N8G/model_config.yaml @@ -0,0 +1,52 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: Tree + CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 +MODEL_ARGS: + --num-layers: 12 + --hidden-size: 512 + --num-attention-heads: 8 + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --tensorboard-dir: ${TENSORBOARD_PATH} + --micro-batch-size: 4 + --global-batch-size: 32 + --seq-length: 1024 + --max-position-embeddings: 1024 + --train-iters: 100 + --timing-log-level: 2 + --lr-decay-iters: 320000 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --vocab-file: ${DATA_PATH}/bpe/vocab.json + --merge-file: ${DATA_PATH}/bpe/merges.txt + --split: 949,50,1 + --distributed-backend: nccl + --lr: 0.00015 + --lr-decay-style: cosine + --min-lr: 1.0e-5 + --weight-decay: 1e-2 + --clip-grad: 1.0 + --lr-warmup-fraction: .01 + --log-interval: 1 + --save-interval: 50 + --eval-interval: 1000 + --eval-iters: 10 + --transformer-impl: transformer_engine + --tensor-model-parallel-size: 2 + --pipeline-model-parallel-size: 2 + --no-create-attention-mask-in-dataloader: true + --deterministic-mode: true + --no-gradient-accumulation-fusion: true + --attention-softmax-in-fp32: true + --use-checkpoint-opt_param-scheduler: true + --use-mcore-models: true + --ckpt-format: torch_dist + --data-cache-path: ${DATA_CACHE_PATH} + --bf16: true +TEST_TYPE: ckpt-resume diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_no_mmap_bin_files_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_no_mmap_bin_files_dgx_a100_1N8G/model_config.yaml new file mode 100644 index 0000000000..7d5b13b753 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_no_mmap_bin_files_dgx_a100_1N8G/model_config.yaml @@ -0,0 +1,52 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: Tree + CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 +MODEL_ARGS: + --num-layers: 12 + --hidden-size: 512 + --num-attention-heads: 8 + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --tensorboard-dir: ${TENSORBOARD_PATH} + --micro-batch-size: 4 + --global-batch-size: 32 + --seq-length: 1024 + --max-position-embeddings: 1024 + --train-iters: 100 + --timing-log-level: 2 + --lr-decay-iters: 320000 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --vocab-file: ${DATA_PATH}/bpe/vocab.json + --merge-file: ${DATA_PATH}/bpe/merges.txt + --split: 949,50,1 + --distributed-backend: nccl + --lr: 0.00015 + --lr-decay-style: cosine + --min-lr: 1.0e-5 + --weight-decay: 1e-2 + --clip-grad: 1.0 + --lr-warmup-fraction: .01 + --log-interval: 1 + --save-interval: 50 + 
--eval-interval: 1000 + --eval-iters: 10 + --transformer-impl: transformer_engine + --tensor-model-parallel-size: 2 + --pipeline-model-parallel-size: 2 + --no-mmap-bin-files: true + --deterministic-mode: true + --no-gradient-accumulation-fusion: true + --attention-softmax-in-fp32: true + --use-checkpoint-opt_param-scheduler: true + --use-mcore-models: true + --ckpt-format: torch_dist + --data-cache-path: ${DATA_CACHE_PATH} + --bf16: true +TEST_TYPE: ckpt-resume diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone_dgx_a100_1N8G/model_config.yaml new file mode 100644 index 0000000000..cff824669b --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone_dgx_a100_1N8G/model_config.yaml @@ -0,0 +1,48 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 1 + N_REPEATS: 5 +MODEL_ARGS: + --num-layers: 12 + --hidden-size: 512 + --num-attention-heads: 8 + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --tensorboard-dir: ${TENSORBOARD_PATH} + --micro-batch-size: 4 + --global-batch-size: 32 + --seq-length: 1024 + --max-position-embeddings: 1024 + --train-iters: 100 + --timing-log-level: 2 + --lr-decay-iters: 320000 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --vocab-file: ${DATA_PATH}/bpe/vocab.json + --merge-file: ${DATA_PATH}/bpe/merges.txt + --split: 949,50,1 + --distributed-backend: nccl + --lr: 0.00015 + --lr-decay-style: cosine + --min-lr: 1.0e-5 + --weight-decay: 1e-2 + --clip-grad: 1.0 + --lr-warmup-fraction: .01 + --log-interval: 1 + --save-interval: 50 + --eval-interval: 1000 + --eval-iters: 10 + --transformer-impl: transformer_engine + --tensor-model-parallel-size: 2 + --pipeline-model-parallel-size: 2 + --no-gradient-accumulation-fusion: true + --attention-softmax-in-fp32: true + --use-checkpoint-opt_param-scheduler: true + --use-mcore-models: true + --ckpt-format: torch_dist + --data-cache-path: ${DATA_CACHE_PATH} + --bf16: true +TEST_TYPE: ckpt-resume diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/golden_values_dev.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/golden_values_dev.json new file mode 100644 index 0000000000..ccc25b4383 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/golden_values_dev.json @@ -0,0 +1,53 @@ +{ + "lm loss": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 10.86065, + 10.88608, + 10.87727, + 10.831, + 10.71671, + 10.60631, + 10.1308, + 10.22732, + 10.1594, + 9.8346 + ] + }, + "num-zeros": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 1716.0, + 2142.0, + 2183.0, + 2043.0, + 2005.0, + 1914.0, + 1805.0, + 2190.0, + 2454.0, + 2611.0 + ] + }, + "iteration-time": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 6.28231, + 0.28547, + 0.28705, + 0.28165, + 0.28136, + 0.28266, + 0.28035, + 0.27874, + 0.27939, + 0.28144 + ] + } +} \ No newline at end of file diff --git 
a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/golden_values_lts.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/golden_values_lts.json new file mode 100644 index 0000000000..196e4b2905 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/golden_values_lts.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.86126, 10.88643, 10.87768, 10.83108, 10.71635, 10.60599, 10.13124, 10.2275, 10.15914, 9.83465]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1752.0, 2067.0, 2123.0, 2072.0, 1999.0, 1941.0, 1784.0, 2229.0, 2546.0, 2567.0]}, "iteration_timing_avg": 0.2256223529411765} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/model_config.yaml new file mode 100644 index 0000000000..8846dacb40 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/model_config.yaml @@ -0,0 +1,52 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: Tree + CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 +MODEL_ARGS: + --num-layers: 12 + --hidden-size: 512 + --num-attention-heads: 8 + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --tensorboard-dir: ${TENSORBOARD_PATH} + --micro-batch-size: 4 + --global-batch-size: 32 + --seq-length: 1024 + --max-position-embeddings: 1024 + --train-iters: 50 + --timing-log-level: 2 + --lr-decay-iters: 320000 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --vocab-file: ${DATA_PATH}/bpe/vocab.json + --merge-file: ${DATA_PATH}/bpe/merges.txt + --split: 949,50,1 + --distributed-backend: nccl + --lr: 0.00015 + --lr-decay-style: cosine + --min-lr: 1.0e-5 + --weight-decay: 1e-2 + --clip-grad: 1.0 + --lr-warmup-fraction: .01 + --log-interval: 1 + --save-interval: 10000 + --eval-interval: 1000 + --eval-iters: 10 + --transformer-impl: transformer_engine + --tensor-model-parallel-size: 4 + --pipeline-model-parallel-size: 1 + --use-distributed-optimizer: true + --overlap-grad-reduce: true + --deterministic-mode: true + --no-gradient-accumulation-fusion: true + --attention-softmax-in-fp32: true + --use-mcore-models: true + --ckpt-format: torch_dist + --data-cache-path: ${DATA_CACHE_PATH} + --bf16: true +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_dev.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_dev.json new file mode 100644 index 0000000000..277df1af52 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_dev.json @@ -0,0 +1,53 @@ +{ + "lm loss": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 10.86065, + 10.88608, + 10.87727, + 10.831, + 
10.71671, + 10.60631, + 10.1308, + 10.22732, + 10.1594, + 9.8346 + ] + }, + "num-zeros": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 1716.0, + 2142.0, + 2183.0, + 2043.0, + 2005.0, + 1914.0, + 1805.0, + 2190.0, + 2454.0, + 2611.0 + ] + }, + "iteration-time": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 8.96702, + 0.28691, + 0.2858, + 0.28546, + 0.2831, + 0.28282, + 0.28235, + 0.28247, + 0.28212, + 0.2825 + ] + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_lts.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_lts.json new file mode 100644 index 0000000000..49917fe78d --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_lts.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.86126, 10.88643, 10.87768, 10.83108, 10.71635, 10.60599, 10.13124, 10.2275, 10.15914, 9.83465]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1752.0, 2067.0, 2123.0, 2072.0, 1999.0, 1941.0, 1784.0, 2229.0, 2546.0, 2567.0]}, "iteration_timing_avg": 0.22043823529411763} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/model_config.yaml new file mode 100644 index 0000000000..9295cdc580 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/model_config.yaml @@ -0,0 +1,53 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: Tree + CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 +MODEL_ARGS: + --num-layers: 12 + --hidden-size: 512 + --num-attention-heads: 8 + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --tensorboard-dir: ${TENSORBOARD_PATH} + --micro-batch-size: 4 + --global-batch-size: 32 + --seq-length: 1024 + --max-position-embeddings: 1024 + --train-iters: 50 + --timing-log-level: 2 + --lr-decay-iters: 320000 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --vocab-file: ${DATA_PATH}/bpe/vocab.json + --merge-file: ${DATA_PATH}/bpe/merges.txt + --split: 949,50,1 + --distributed-backend: nccl + --lr: 0.00015 + --lr-decay-style: cosine + --min-lr: 1.0e-5 + --weight-decay: 1e-2 + --clip-grad: 1.0 + --lr-warmup-fraction: .01 + --log-interval: 1 + --save-interval: 10000 + --eval-interval: 1000 + --eval-iters: 10 + --transformer-impl: transformer_engine + --tensor-model-parallel-size: 4 + --pipeline-model-parallel-size: 1 + --use-distributed-optimizer: true + --overlap-grad-reduce: true + --overlap-param-gather: true + --deterministic-mode: true + --no-gradient-accumulation-fusion: true + --attention-softmax-in-fp32: true + --use-mcore-models: true + --ckpt-format: torch_dist + --data-cache-path: ${DATA_CACHE_PATH} + --bf16: true +TEST_TYPE: regular diff --git 
a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_qk_layernorm_test_mode_dgx_a100_1N8G/golden_values_dev.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_qk_layernorm_test_mode_dgx_a100_1N8G/golden_values_dev.json new file mode 100644 index 0000000000..87fec5135d --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_qk_layernorm_test_mode_dgx_a100_1N8G/golden_values_dev.json @@ -0,0 +1,53 @@ +{ + "lm loss": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 10.86068, + 10.88629, + 10.87817, + 10.83284, + 10.72061, + 10.61155, + 10.14139, + 10.23429, + 10.16623, + 9.8443 + ] + }, + "num-zeros": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 1759.0, + 2157.0, + 2237.0, + 2082.0, + 2118.0, + 1941.0, + 1757.0, + 2223.0, + 2527.0, + 2641.0 + ] + }, + "iteration-time": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 9.54719, + 0.37979, + 0.38002, + 0.37952, + 0.38133, + 0.37848, + 0.38021, + 0.37925, + 0.37876, + 0.37987 + ] + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_qk_layernorm_test_mode_dgx_a100_1N8G/golden_values_lts.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_qk_layernorm_test_mode_dgx_a100_1N8G/golden_values_lts.json new file mode 100644 index 0000000000..8718207e0d --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_qk_layernorm_test_mode_dgx_a100_1N8G/golden_values_lts.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.86217, 10.88641, 10.8786, 10.83291, 10.72031, 10.6109, 10.1418, 10.23434, 10.16605, 9.84445]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1769.0, 2019.0, 2145.0, 2058.0, 2166.0, 2060.0, 1776.0, 2174.0, 2524.0, 2645.0]}, "iteration_timing_avg": 0.2256223529411765} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_qk_layernorm_test_mode_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_qk_layernorm_test_mode_dgx_a100_1N8G/model_config.yaml new file mode 100644 index 0000000000..b8f1667cdb --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_qk_layernorm_test_mode_dgx_a100_1N8G/model_config.yaml @@ -0,0 +1,52 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: Tree + CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 +MODEL_ARGS: + --num-layers: 12 + --hidden-size: 512 + --num-attention-heads: 8 + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --tensorboard-dir: ${TENSORBOARD_PATH} + --micro-batch-size: 4 + --global-batch-size: 32 + --seq-length: 1024 + --max-position-embeddings: 1024 + --train-iters: 50 + --timing-log-level: 2 + --lr-decay-iters: 320000 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --vocab-file: ${DATA_PATH}/bpe/vocab.json + --merge-file: ${DATA_PATH}/bpe/merges.txt + --split: 949,50,1 + --distributed-backend: nccl + --lr: 0.00015 + --lr-decay-style: cosine + --min-lr: 1.0e-5 + --weight-decay: 1e-2 + --clip-grad: 1.0 + --lr-warmup-fraction: .01 + --log-interval: 1 + --save-interval: 10000 + --eval-interval: 1000 + --eval-iters: 10 + --transformer-impl: transformer_engine + 
--tensor-model-parallel-size: 4 + --pipeline-model-parallel-size: 1 + --qk-layernorm: true + --test-mode: true + --deterministic-mode: true + --no-gradient-accumulation-fusion: true + --attention-softmax-in-fp32: true + --use-mcore-models: true + --ckpt-format: torch_dist + --data-cache-path: ${DATA_CACHE_PATH} + --bf16: true +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/model_config.yaml new file mode 100644 index 0000000000..d2888f767c --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/model_config.yaml @@ -0,0 +1,53 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: Tree + CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 +MODEL_ARGS: + --num-layers: 12 + --hidden-size: 512 + --num-attention-heads: 8 + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --tensorboard-dir: ${TENSORBOARD_PATH} + --micro-batch-size: 4 + --global-batch-size: 32 + --seq-length: 1024 + --max-position-embeddings: 1024 + --train-iters: 100 + --timing-log-level: 2 + --lr-decay-iters: 320000 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --vocab-file: ${DATA_PATH}/bpe/vocab.json + --merge-file: ${DATA_PATH}/bpe/merges.txt + --split: 949,50,1 + --distributed-backend: nccl + --lr: 0.00015 + --lr-decay-style: cosine + --min-lr: 1.0e-5 + --weight-decay: 1e-2 + --clip-grad: 1.0 + --lr-warmup-fraction: .01 + --log-interval: 1 + --save-interval: 50 + --eval-interval: 1000 + --eval-iters: 10 + --transformer-impl: transformer_engine + --tensor-model-parallel-size: 4 + --pipeline-model-parallel-size: 1 + --use-distributed-optimizer: true + --overlap-grad-reduce: true + --deterministic-mode: true + --no-gradient-accumulation-fusion: true + --attention-softmax-in-fp32: true + --use-checkpoint-opt_param-scheduler: true + --use-mcore-models: true + --ckpt-format: torch_dist + --data-cache-path: ${DATA_CACHE_PATH} + --bf16: true +TEST_TYPE: ckpt-resume diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/model_config.yaml new file mode 100644 index 0000000000..27acfbee86 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/model_config.yaml @@ -0,0 +1,54 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: Tree + CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 +MODEL_ARGS: + --num-layers: 12 + --hidden-size: 512 + --num-attention-heads: 8 + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --tensorboard-dir: ${TENSORBOARD_PATH} + --micro-batch-size: 4 + --global-batch-size: 32 + --seq-length: 1024 + --max-position-embeddings: 1024 + --train-iters: 100 + --timing-log-level: 2 + 
--lr-decay-iters: 320000 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --vocab-file: ${DATA_PATH}/bpe/vocab.json + --merge-file: ${DATA_PATH}/bpe/merges.txt + --split: 949,50,1 + --distributed-backend: nccl + --lr: 0.00015 + --lr-decay-style: cosine + --min-lr: 1.0e-5 + --weight-decay: 1e-2 + --clip-grad: 1.0 + --lr-warmup-fraction: .01 + --log-interval: 1 + --save-interval: 50 + --eval-interval: 1000 + --eval-iters: 10 + --transformer-impl: transformer_engine + --tensor-model-parallel-size: 4 + --pipeline-model-parallel-size: 1 + --use-distributed-optimizer: true + --overlap-grad-reduce: true + --overlap-param-gather: true + --deterministic-mode: true + --no-gradient-accumulation-fusion: true + --attention-softmax-in-fp32: true + --use-checkpoint-opt_param-scheduler: true + --use-mcore-models: true + --ckpt-format: torch_dist + --data-cache-path: ${DATA_CACHE_PATH} + --bf16: true +TEST_TYPE: ckpt-resume diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode_dgx_a100_1N8G/model_config.yaml new file mode 100644 index 0000000000..1ea30bae73 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode_dgx_a100_1N8G/model_config.yaml @@ -0,0 +1,53 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: Tree + CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 +MODEL_ARGS: + --num-layers: 12 + --hidden-size: 512 + --num-attention-heads: 8 + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --tensorboard-dir: ${TENSORBOARD_PATH} + --micro-batch-size: 4 + --global-batch-size: 32 + --seq-length: 1024 + --max-position-embeddings: 1024 + --train-iters: 100 + --timing-log-level: 2 + --lr-decay-iters: 320000 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --vocab-file: ${DATA_PATH}/bpe/vocab.json + --merge-file: ${DATA_PATH}/bpe/merges.txt + --split: 949,50,1 + --distributed-backend: nccl + --lr: 0.00015 + --lr-decay-style: cosine + --min-lr: 1.0e-5 + --weight-decay: 1e-2 + --clip-grad: 1.0 + --lr-warmup-fraction: .01 + --log-interval: 1 + --save-interval: 50 + --eval-interval: 1000 + --eval-iters: 10 + --transformer-impl: transformer_engine + --tensor-model-parallel-size: 4 + --pipeline-model-parallel-size: 1 + --qk-layernorm: true + --test-mode: true + --deterministic-mode: true + --no-gradient-accumulation-fusion: true + --attention-softmax-in-fp32: true + --use-checkpoint-opt_param-scheduler: true + --use-mcore-models: true + --ckpt-format: torch_dist + --data-cache-path: ${DATA_CACHE_PATH} + --bf16: true +TEST_TYPE: ckpt-resume diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone_dgx_a100_1N8G/model_config.yaml new file mode 100644 index 0000000000..f3348d608d --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone_dgx_a100_1N8G/model_config.yaml @@ -0,0 +1,51 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + 
NVTE_ALLOW_NONDETERMINISTIC_ALGO: 1 + N_REPEATS: 5 +MODEL_ARGS: + --num-layers: 12 + --hidden-size: 512 + --num-attention-heads: 8 + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --tensorboard-dir: ${TENSORBOARD_PATH} + --micro-batch-size: 4 + --global-batch-size: 32 + --seq-length: 1024 + --max-position-embeddings: 1024 + --train-iters: 100 + --timing-log-level: 2 + --lr-decay-iters: 320000 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --vocab-file: ${DATA_PATH}/bpe/vocab.json + --merge-file: ${DATA_PATH}/bpe/merges.txt + --split: 949,50,1 + --distributed-backend: nccl + --lr: 0.00015 + --lr-decay-style: cosine + --min-lr: 1.0e-5 + --weight-decay: 1e-2 + --clip-grad: 1.0 + --lr-warmup-fraction: .01 + --log-interval: 1 + --save-interval: 50 + --eval-interval: 1000 + --eval-iters: 10 + --transformer-impl: transformer_engine + --tensor-model-parallel-size: 4 + --pipeline-model-parallel-size: 2 + --use-distributed-optimizer: true + --async-save: true + --ckpt-fully-parallel-save: true + --no-gradient-accumulation-fusion: true + --attention-softmax-in-fp32: true + --use-checkpoint-opt_param-scheduler: true + --use-mcore-models: true + --ckpt-format: torch_dist + --data-cache-path: ${DATA_CACHE_PATH} + --bf16: true +TEST_TYPE: ckpt-resume diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_tp2_pp2_resume_torch_dist_uninstall_te_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_tp2_pp2_resume_torch_dist_uninstall_te_dgx_a100_1N8G/model_config.yaml new file mode 100644 index 0000000000..fbb767cb14 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_tp2_pp2_resume_torch_dist_uninstall_te_dgx_a100_1N8G/model_config.yaml @@ -0,0 +1,53 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: Tree + CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 +MODEL_ARGS: + --num-layers: 12 + --hidden-size: 512 + --num-attention-heads: 8 + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --tensorboard-dir: ${TENSORBOARD_PATH} + --micro-batch-size: 4 + --global-batch-size: 32 + --seq-length: 1024 + --max-position-embeddings: 1024 + --train-iters: 100 + --timing-log-level: 2 + --lr-decay-iters: 320000 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --vocab-file: ${DATA_PATH}/bpe/vocab.json + --merge-file: ${DATA_PATH}/bpe/merges.txt + --split: 949,50,1 + --distributed-backend: nccl + --lr: 0.00015 + --lr-decay-style: cosine + --min-lr: 1.0e-5 + --weight-decay: 1e-2 + --clip-grad: 1.0 + --lr-warmup-fraction: .01 + --log-interval: 1 + --save-interval: 50 + --eval-interval: 1000 + --eval-iters: 10 + --transformer-impl: local + --tensor-model-parallel-size: 2 + --pipeline-model-parallel-size: 2 + --no-persist-layer-norm: true + --no-masked-softmax-fusion: true + --deterministic-mode: true + --no-gradient-accumulation-fusion: true + --use-checkpoint-opt_param-scheduler: true + --use-mcore-models: true + --ckpt-format: torch_dist + --data-cache-path: ${DATA_CACHE_PATH} + --fp16: true + --apply-query-key-layer-scaling: true +TEST_TYPE: ckpt-resume diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_tp2_pp2_uninstall_te_dgx_a100_1N8G/model_config.yaml 
b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_tp2_pp2_uninstall_te_dgx_a100_1N8G/model_config.yaml new file mode 100644 index 0000000000..cf65df920f --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_tp2_pp2_uninstall_te_dgx_a100_1N8G/model_config.yaml @@ -0,0 +1,54 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: Tree + CUBLAS_WORKSPACE_CONFIG: :4096:8 + SKIP_PYTEST: 1 + N_REPEATS: 1 +BEFORE_SCRIPT: pip uninstall -y transformer_engine pip uninstall -y Apex ## TODO: remove once Apex dependency has been removed completely +MODEL_ARGS: + --num-layers: 12 + --hidden-size: 512 + --num-attention-heads: 8 + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --tensorboard-dir: ${TENSORBOARD_PATH} + --micro-batch-size: 4 + --global-batch-size: 32 + --seq-length: 1024 + --max-position-embeddings: 1024 + --train-iters: 50 + --timing-log-level: 2 + --lr-decay-iters: 320000 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --vocab-file: ${DATA_PATH}/bpe/vocab.json + --merge-file: ${DATA_PATH}/bpe/merges.txt + --split: 949,50,1 + --distributed-backend: nccl + --lr: 0.00015 + --lr-decay-style: cosine + --min-lr: 1.0e-5 + --weight-decay: 1e-2 + --clip-grad: 1.0 + --lr-warmup-fraction: .01 + --log-interval: 1 + --save-interval: 10000 + --eval-interval: 1000 + --eval-iters: 10 + --transformer-impl: local + --tensor-model-parallel-size: 2 + --pipeline-model-parallel-size: 2 + --no-persist-layer-norm: true + --no-masked-softmax-fusion: true + --deterministic-mode: true + --no-gradient-accumulation-fusion: true + --use-mcore-models: true + --ckpt-format: torch_dist + --data-cache-path: ${DATA_CACHE_PATH} + --fp16: true + --apply-query-key-layer-scaling: true +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_te_tp2_pp2_dgx_a100_1N8G/golden_values_dev.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_te_tp2_pp2_dgx_a100_1N8G/golden_values_dev.json new file mode 100644 index 0000000000..e728823b4c --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_te_tp2_pp2_dgx_a100_1N8G/golden_values_dev.json @@ -0,0 +1,53 @@ +{ + "lm loss": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 10.85926, + 10.89117, + 10.86647, + 10.81416, + 10.70027, + 10.60761, + 10.10644, + 10.21377, + 10.12972, + 9.8041 + ] + }, + "num-zeros": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 1726.0, + 1922.0, + 2043.0, + 1879.0, + 1882.0, + 1821.0, + 1648.0, + 2039.0, + 2379.0, + 2451.0 + ] + }, + "iteration-time": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 11.65882, + 0.1955, + 0.19501, + 0.19146, + 0.19165, + 0.1903, + 0.19096, + 0.19025, + 0.1901, + 0.18996 + ] + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_te_tp2_pp2_dgx_a100_1N8G/golden_values_lts.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_te_tp2_pp2_dgx_a100_1N8G/golden_values_lts.json new file mode 100644 index 0000000000..5c516f0562 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_te_tp2_pp2_dgx_a100_1N8G/golden_values_lts.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.86208, 10.89137, 10.86731, 10.81652, 10.70126, 10.60816, 10.11007, 10.21889, 10.1294, 9.80326]}, "num-zeros": {"start_step": 0, 
"end_step": 50, "step_interval": 5, "values": [1659.0, 1944.0, 1974.0, 1920.0, 1918.0, 1855.0, 1621.0, 2018.0, 2436.0, 2304.0]}, "iteration_timing_avg": 0.14203264705882354} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_te_tp2_pp2_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_te_tp2_pp2_dgx_a100_1N8G/model_config.yaml new file mode 100644 index 0000000000..af105662a9 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_te_tp2_pp2_dgx_a100_1N8G/model_config.yaml @@ -0,0 +1,50 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: Tree + CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 +MODEL_ARGS: + --num-layers: 12 + --hidden-size: 512 + --num-attention-heads: 8 + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --tensorboard-dir: ${TENSORBOARD_PATH} + --micro-batch-size: 4 + --global-batch-size: 32 + --seq-length: 1024 + --max-position-embeddings: 1024 + --train-iters: 50 + --timing-log-level: 2 + --lr-decay-iters: 320000 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --vocab-file: ${DATA_PATH}/bpe/vocab.json + --merge-file: ${DATA_PATH}/bpe/merges.txt + --split: 949,50,1 + --distributed-backend: nccl + --lr: 0.00015 + --lr-decay-style: cosine + --min-lr: 1.0e-5 + --weight-decay: 1e-2 + --clip-grad: 1.0 + --lr-warmup-fraction: .01 + --log-interval: 1 + --save-interval: 10000 + --eval-interval: 1000 + --eval-iters: 10 + --transformer-impl: transformer_engine + --tensor-model-parallel-size: 2 + --pipeline-model-parallel-size: 2 + --deterministic-mode: true + --no-gradient-accumulation-fusion: true + --attention-softmax-in-fp32: true + --ckpt-format: torch + --use-legacy-models: true + --data-cache-path: ${DATA_CACHE_PATH} + --bf16: true +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_te_tp2_pp2_resume_torch_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_te_tp2_pp2_resume_torch_dgx_a100_1N8G/model_config.yaml new file mode 100644 index 0000000000..3d27f95aa6 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_te_tp2_pp2_resume_torch_dgx_a100_1N8G/model_config.yaml @@ -0,0 +1,51 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: Tree + CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 +MODEL_ARGS: + --num-layers: 12 + --hidden-size: 512 + --num-attention-heads: 8 + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --tensorboard-dir: ${TENSORBOARD_PATH} + --micro-batch-size: 4 + --global-batch-size: 32 + --seq-length: 1024 + --max-position-embeddings: 1024 + --train-iters: 100 + --timing-log-level: 2 + --lr-decay-iters: 320000 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --vocab-file: ${DATA_PATH}/bpe/vocab.json + --merge-file: ${DATA_PATH}/bpe/merges.txt + --split: 949,50,1 + --distributed-backend: nccl + --lr: 0.00015 + --lr-decay-style: cosine + --min-lr: 1.0e-5 + --weight-decay: 1e-2 + --clip-grad: 1.0 + --lr-warmup-fraction: .01 + --log-interval: 1 + --save-interval: 50 + --eval-interval: 1000 + --eval-iters: 10 + --transformer-impl: transformer_engine + --tensor-model-parallel-size: 2 + --pipeline-model-parallel-size: 2 + 
--deterministic-mode: true + --no-gradient-accumulation-fusion: true + --attention-softmax-in-fp32: true + --use-checkpoint-opt_param-scheduler: true + --ckpt-format: torch + --use-legacy-models: true + --data-cache-path: ${DATA_CACHE_PATH} + --bf16: true +TEST_TYPE: ckpt-resume diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_tp1_pp4_vp1_dgx_a100_1N8G/golden_values_dev.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_tp1_pp4_vp1_dgx_a100_1N8G/golden_values_dev.json new file mode 100644 index 0000000000..68d9fe822f --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_tp1_pp4_vp1_dgx_a100_1N8G/golden_values_dev.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.79311, 10.85248, 10.87281, 10.83016, 10.82949, 10.78726, 10.565, 10.57088, 10.4836, 10.19521]}, "num-zeros": {"start_step": 0, "end_step": 34, "step_interval": 5, "values": [2450.0, 2765.0, 2163.0, 2585.0, 2634.0, 2585.0, 2987.0]}, "iteration_timing_avg": 0.1211408823529412} diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_tp1_pp4_vp1_dgx_a100_1N8G/golden_values_lts.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_tp1_pp4_vp1_dgx_a100_1N8G/golden_values_lts.json new file mode 100644 index 0000000000..68d9fe822f --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_tp1_pp4_vp1_dgx_a100_1N8G/golden_values_lts.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.79311, 10.85248, 10.87281, 10.83016, 10.82949, 10.78726, 10.565, 10.57088, 10.4836, 10.19521]}, "num-zeros": {"start_step": 0, "end_step": 34, "step_interval": 5, "values": [2450.0, 2765.0, 2163.0, 2585.0, 2634.0, 2585.0, 2987.0]}, "iteration_timing_avg": 0.1211408823529412} diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_tp1_pp4_vp1_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_tp1_pp4_vp1_dgx_a100_1N8G/model_config.yaml new file mode 100644 index 0000000000..1e6b07a429 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_tp1_pp4_vp1_dgx_a100_1N8G/model_config.yaml @@ -0,0 +1,51 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: Tree + CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 +MODEL_ARGS: + --num-layers: 12 + --hidden-size: 512 + --num-attention-heads: 8 + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --tensorboard-dir: ${TENSORBOARD_PATH} + --micro-batch-size: 4 + --global-batch-size: 32 + --seq-length: 1024 + --max-position-embeddings: 1024 + --train-iters: 50 + --timing-log-level: 2 + --lr-decay-iters: 320000 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --vocab-file: ${DATA_PATH}/bpe/vocab.json + --merge-file: ${DATA_PATH}/bpe/merges.txt + --split: 949,50,1 + --distributed-backend: nccl + --lr: 0.00015 + --lr-decay-style: cosine + --min-lr: 1.0e-5 + --weight-decay: 1e-2 + --clip-grad: 1.0 + --lr-warmup-fraction: .01 + --log-interval: 1 + --save-interval: 10000 + --eval-interval: 1000 + --eval-iters: 10 + --transformer-impl: local + --tensor-model-parallel-size: 1 + --pipeline-model-parallel-size: 4 + --num-layers-per-virtual-pipeline-stage: 1 + --deterministic-mode: true + --no-gradient-accumulation-fusion: true + --ckpt-format: torch + --use-legacy-models: true + --data-cache-path: ${DATA_CACHE_PATH} + --fp16: true + --apply-query-key-layer-scaling: true 
+TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_tp1_pp4_vp1_resume_torch_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_tp1_pp4_vp1_resume_torch_dgx_a100_1N8G/model_config.yaml new file mode 100644 index 0000000000..2ff5fc2224 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_tp1_pp4_vp1_resume_torch_dgx_a100_1N8G/model_config.yaml @@ -0,0 +1,52 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: Tree + CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 +MODEL_ARGS: + --num-layers: 12 + --hidden-size: 512 + --num-attention-heads: 8 + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --tensorboard-dir: ${TENSORBOARD_PATH} + --micro-batch-size: 4 + --global-batch-size: 32 + --seq-length: 1024 + --max-position-embeddings: 1024 + --train-iters: 100 + --timing-log-level: 2 + --lr-decay-iters: 320000 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --vocab-file: ${DATA_PATH}/bpe/vocab.json + --merge-file: ${DATA_PATH}/bpe/merges.txt + --split: 949,50,1 + --distributed-backend: nccl + --lr: 0.00015 + --lr-decay-style: cosine + --min-lr: 1.0e-5 + --weight-decay: 1e-2 + --clip-grad: 1.0 + --lr-warmup-fraction: .01 + --log-interval: 1 + --save-interval: 50 + --eval-interval: 1000 + --eval-iters: 10 + --transformer-impl: local + --tensor-model-parallel-size: 1 + --pipeline-model-parallel-size: 4 + --num-layers-per-virtual-pipeline-stage: 1 + --deterministic-mode: true + --no-gradient-accumulation-fusion: true + --use-checkpoint-opt_param-scheduler: true + --ckpt-format: torch + --use-legacy-models: true + --data-cache-path: ${DATA_CACHE_PATH} + --fp16: true + --apply-query-key-layer-scaling: true +TEST_TYPE: ckpt-resume diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_tp2_pp2_dgx_a100_1N8G/golden_values_dev.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_tp2_pp2_dgx_a100_1N8G/golden_values_dev.json new file mode 100644 index 0000000000..87df9ed6c0 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_tp2_pp2_dgx_a100_1N8G/golden_values_dev.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.85929, 10.89211, 10.87639, 10.86988, 10.88179, 10.83898, 10.66589, 10.62691, 10.52461, 10.25708]}, "num-zeros": {"start_step": 0, "end_step": 33, "step_interval": 5, "values": [2458.0, 2527.0, 2467.0, 2148.0, 2250.0, 2467.0, 2528.0]}, "iteration_timing_avg": 0.14292588235294112} diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_tp2_pp2_dgx_a100_1N8G/golden_values_lts.json b/tests/functional_tests/test_cases/gpt/gpt3_mr_tp2_pp2_dgx_a100_1N8G/golden_values_lts.json new file mode 100644 index 0000000000..87df9ed6c0 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_tp2_pp2_dgx_a100_1N8G/golden_values_lts.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.85929, 10.89211, 10.87639, 10.86988, 10.88179, 10.83898, 10.66589, 10.62691, 10.52461, 10.25708]}, "num-zeros": {"start_step": 0, "end_step": 33, "step_interval": 5, "values": [2458.0, 2527.0, 2467.0, 2148.0, 2250.0, 2467.0, 2528.0]}, "iteration_timing_avg": 0.14292588235294112} diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_tp2_pp2_dgx_a100_1N8G/model_config.yaml 
b/tests/functional_tests/test_cases/gpt/gpt3_mr_tp2_pp2_dgx_a100_1N8G/model_config.yaml new file mode 100644 index 0000000000..4e4a963417 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_tp2_pp2_dgx_a100_1N8G/model_config.yaml @@ -0,0 +1,50 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: Tree + CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 +MODEL_ARGS: + --num-layers: 12 + --hidden-size: 512 + --num-attention-heads: 8 + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --tensorboard-dir: ${TENSORBOARD_PATH} + --micro-batch-size: 4 + --global-batch-size: 32 + --seq-length: 1024 + --max-position-embeddings: 1024 + --train-iters: 50 + --timing-log-level: 2 + --lr-decay-iters: 320000 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --vocab-file: ${DATA_PATH}/bpe/vocab.json + --merge-file: ${DATA_PATH}/bpe/merges.txt + --split: 949,50,1 + --distributed-backend: nccl + --lr: 0.00015 + --lr-decay-style: cosine + --min-lr: 1.0e-5 + --weight-decay: 1e-2 + --clip-grad: 1.0 + --lr-warmup-fraction: .01 + --log-interval: 1 + --save-interval: 10000 + --eval-interval: 1000 + --eval-iters: 10 + --transformer-impl: local + --tensor-model-parallel-size: 2 + --pipeline-model-parallel-size: 2 + --deterministic-mode: true + --no-gradient-accumulation-fusion: true + --ckpt-format: torch + --use-legacy-models: true + --data-cache-path: ${DATA_CACHE_PATH} + --fp16: true + --apply-query-key-layer-scaling: true +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mr_tp2_pp2_resume_torch_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mr_tp2_pp2_resume_torch_dgx_a100_1N8G/model_config.yaml new file mode 100644 index 0000000000..8d11e207e7 --- /dev/null +++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_tp2_pp2_resume_torch_dgx_a100_1N8G/model_config.yaml @@ -0,0 +1,51 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: Tree + CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 +MODEL_ARGS: + --num-layers: 12 + --hidden-size: 512 + --num-attention-heads: 8 + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --tensorboard-dir: ${TENSORBOARD_PATH} + --micro-batch-size: 4 + --global-batch-size: 32 + --seq-length: 1024 + --max-position-embeddings: 1024 + --train-iters: 100 + --timing-log-level: 2 + --lr-decay-iters: 320000 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --data-path: ${DATA_PATH}/my-gpt3_00_text_document + --vocab-file: ${DATA_PATH}/bpe/vocab.json + --merge-file: ${DATA_PATH}/bpe/merges.txt + --split: 949,50,1 + --distributed-backend: nccl + --lr: 0.00015 + --lr-decay-style: cosine + --min-lr: 1.0e-5 + --weight-decay: 1e-2 + --clip-grad: 1.0 + --lr-warmup-fraction: .01 + --log-interval: 1 + --save-interval: 50 + --eval-interval: 1000 + --eval-iters: 10 + --transformer-impl: local + --tensor-model-parallel-size: 2 + --pipeline-model-parallel-size: 2 + --deterministic-mode: true + --no-gradient-accumulation-fusion: true + --use-checkpoint-opt_param-scheduler: true + --ckpt-format: torch + --use-legacy-models: true + --data-cache-path: ${DATA_CACHE_PATH} + --fp16: true + --apply-query-key-layer-scaling: true +TEST_TYPE: ckpt-resume diff --git 
a/tests/functional_tests/test_cases/mixtral/mixtral_8x22b_tp2pp8ep8vpp1_release/model_config.yaml b/tests/functional_tests/test_cases/mixtral/mixtral_8x22b_tp2pp8ep8vpp1_release/model_config.yaml new file mode 100644 index 0000000000..9516076dc6 --- /dev/null +++ b/tests/functional_tests/test_cases/mixtral/mixtral_8x22b_tp2pp8ep8vpp1_release/model_config.yaml @@ -0,0 +1,110 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + TORCH_NCCL_AVOID_RECORD_STREAMS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 1 + PYTORCH_CUDA_ALLOC_CONF: expandable_segments:True + NCCL_NVLS_ENABLE: 0 + +TEST_TYPE: "release" + +MODEL_ARGS: + # Distributed args + --distributed-timeout-minutes: 60 + --tensor-model-parallel-size: 2 + --pipeline-model-parallel-size: 8 + --use-distributed-optimizer: true + --overlap-grad-reduce: true + --overlap-param-gather: true + + # Training args + --use-mcore-models: true + --sequence-parallel: true + --use-flash-attn: true + --disable-bias-linear: true + --micro-batch-size: 1 + --global-batch-size: 256 + --train-samples: 38400 + --exit-duration-in-mins: 230 + + # Transformer Engine args + --transformer-impl: transformer_engine + + # Data args + --data-cache-path: ${DATA_CACHE_PATH} + --tokenizer-type: Llama2Tokenizer + --tokenizer-model: ${DATA_PATH}/tokenizer.model + --data-path: ${DATA_BLEND} + --split: 99,1,0 + --no-mmap-bin-files: true + --num-workers: 6 + + # Add network size args + --untie-embeddings-and-output-weights: true + --no-position-embedding: true + --position-embedding-type: rope + --rotary-percent: 1.0 + --normalization: RMSNorm + --swiglu: true + --num-layers: 56 + --hidden-size: 6144 + --ffn-hidden-size: 16384 + --num-attention-heads: 48 + --group-query-attention: true + --num-query-groups: 8 + --seq-length: 4096 + --max-position-embeddings: 4096 + --make-vocab-size-divisible-by: 128 + + # Add regularization args + --attention-dropout: 0.0 + --hidden-dropout: 0.0 + --clip-grad: 1.0 + --weight-decay: 0.1 + + # Add learning rate args + --lr-decay-samples: 255126953 + --lr-warmup-samples: 162761 + --lr: 1.2e-5 + --min-lr: 1.2e-6 + --lr-decay-style: cosine + --adam-beta1: 0.9 + --adam-beta2: 0.95 + + # Add MoE args + --expert-model-parallel-size: 8 + --num-experts: 8 + --moe-router-load-balancing-type: aux_loss + --moe-router-topk: 2 + --moe-grouped-gemm: true + --moe-aux-loss-coeff: 1e-2 + --moe-token-dispatcher-type: alltoall + + # Add validation args + --eval-iters: 32 + --eval-interval: 500 + + # Add checkpointing args + --finetune: true + --auto-detect-ckpt-format: true + --load: ${LOAD_PATH} + --save: ${OUTPUT_PATH}/checkpoints + --no-ckpt-fully-parallel-save: true + --save-interval: 500 + + # Add initialization args + --init-method-std: 0.008 + + # Add logging args + --log-timers-to-tensorboard: true + --log-memory-to-tensorboard: true + --log-num-zeros-in-grad: true + --log-params-norm: true + --log-validation-ppl-to-tensorboard: true + --log-throughput: true + --log-interval: 1 + --tensorboard-dir: ${OUTPUT_PATH}/tensorboard + --wandb-project: megatron-core-release-runs + --wandb-exp-name: ${WANDB_EXPERIMENT} + + # Add mixed precision args + --bf16: true diff --git a/tests/functional_tests/test_cases/mixtral/mixtral_8x7b_alltoall_tp2pp4ep4_release/golden_values_0.8.0.json b/tests/functional_tests/test_cases/mixtral/mixtral_8x7b_alltoall_tp2pp4ep4_release/golden_values_0.8.0.json new file mode 100644 index 0000000000..fd05d12398 --- /dev/null +++ b/tests/functional_tests/test_cases/mixtral/mixtral_8x7b_alltoall_tp2pp4ep4_release/golden_values_0.8.0.json @@ -0,0 +1,326 
@@ +{ + "lm loss": { + "start_step": 0, + "end_step": 502, + "step_interval": 5, + "values": [ + 12.66411, + 12.57516, + 11.54354, + 10.6032, + 10.16449, + 9.88042, + 9.63438, + 9.41891, + 9.20503, + 9.03148, + 8.87789, + 8.67233, + 8.53839, + 8.43406, + 8.31108, + 8.16115, + 8.02824, + 7.92113, + 7.76569, + 7.64618, + 7.56482, + 7.423, + 7.33899, + 7.1926, + 7.12876, + 7.00496, + 6.94097, + 6.84124, + 6.75131, + 6.66666, + 6.61212, + 6.52689, + 6.46099, + 6.38008, + 6.33837, + 6.26728, + 6.21, + 6.11653, + 6.08526, + 5.99383, + 5.97289, + 5.87339, + 5.84685, + 5.8009, + 5.73867, + 5.66111, + 5.64924, + 5.61117, + 5.54497, + 5.52944, + 5.44052, + 5.4127, + 5.34505, + 5.32588, + 5.31378, + 5.21715, + 5.153, + 5.15225, + 5.1334, + 5.10311, + 5.06526, + 5.01847, + 4.98702, + 4.94667, + 4.91664, + 4.91943, + 4.87036, + 4.82483, + 4.81318, + 4.77824, + 4.74309, + 4.73812, + 4.66233, + 4.64263, + 4.66767, + 4.60771, + 4.59091, + 4.55776, + 4.51109, + 4.4562, + 4.4568, + 4.39769, + 4.39211, + 4.38708, + 4.32148, + 4.3179, + 4.25069, + 4.22698, + 4.18783, + 4.17126, + 4.15768, + 4.12308, + 4.10039, + 4.03635, + 4.04794, + 4.05032, + 3.98542, + 4.01068, + 3.96227, + 3.89516, + 3.91924 + ] + }, + "mem-allocated-bytes": { + "start_step": 0, + "end_step": 502, + "step_interval": 5, + "values": [ + 17448312832.0, + 17448214528.0, + 17448243200.0, + 17447923712.0, + 17448040448.0, + 17448124416.0, + 17448331264.0, + 17448151040.0, + 17448157184.0, + 17448271872.0, + 17448185856.0, + 17448304640.0, + 17448306688.0, + 17448359936.0, + 17448329216.0, + 17448173568.0, + 17448312832.0, + 17448181760.0, + 17448278016.0, + 17448253440.0, + 17448331264.0, + 17448394752.0, + 17448251392.0, + 17448341504.0, + 17448284160.0, + 17448210432.0, + 17448198144.0, + 17448226816.0, + 17448251392.0, + 17448212480.0, + 17448351744.0, + 17448347648.0, + 17448235008.0, + 17448189952.0, + 17448259584.0, + 17448318976.0, + 17448214528.0, + 17448271872.0, + 17448235008.0, + 17448286208.0, + 17448230912.0, + 17448288256.0, + 17448288256.0, + 17448230912.0, + 17448284160.0, + 17449197568.0, + 17448337408.0, + 17448259584.0, + 17448253440.0, + 17448259584.0, + 17448224768.0, + 17448280064.0, + 17448230912.0, + 17448224768.0, + 17448267776.0, + 17448263680.0, + 17448296448.0, + 17448230912.0, + 17448220672.0, + 17448257536.0, + 17448200192.0, + 17448306688.0, + 17448265728.0, + 17448226816.0, + 17448304640.0, + 17448230912.0, + 17448230912.0, + 17448310784.0, + 17448253440.0, + 17448253440.0, + 17448308736.0, + 17448243200.0, + 17448239104.0, + 17448294400.0, + 17448282112.0, + 17448296448.0, + 17448280064.0, + 17448251392.0, + 17448259584.0, + 17448282112.0, + 17448308736.0, + 17448294400.0, + 17448286208.0, + 17448290304.0, + 17448280064.0, + 17448288256.0, + 17448278016.0, + 17448284160.0, + 17448290304.0, + 17448308736.0, + 17448267776.0, + 17448259584.0, + 17448302592.0, + 17448284160.0, + 17448243200.0, + 17448298496.0, + 17448243200.0, + 17448286208.0, + 17448269824.0, + 17448267776.0, + 17448247296.0 + ] + }, + "iteration-time": { + "start_step": 0, + "end_step": 502, + "step_interval": 5, + "values": [ + 105.86866, + 27.56126, + 28.82349, + 29.53482, + 27.89586, + 28.03171, + 26.76686, + 27.44711, + 27.49381, + 26.2265, + 26.34585, + 26.49051, + 25.37542, + 25.01744, + 25.80256, + 25.40128, + 24.8858, + 25.58665, + 24.75191, + 25.04627, + 24.2937, + 24.7563, + 24.02316, + 24.34371, + 24.1251, + 23.96596, + 24.00971, + 23.89089, + 23.58458, + 24.4027, + 24.01048, + 23.99876, + 23.99977, + 23.84646, + 24.00587, + 24.41593, + 
23.62381, + 23.21431, + 23.60982, + 23.42319, + 23.37656, + 23.99874, + 23.14469, + 23.10061, + 23.28335, + 23.36868, + 23.1209, + 23.39396, + 23.47888, + 23.09894, + 23.64079, + 22.88334, + 23.72844, + 23.62627, + 22.73817, + 22.86507, + 23.453, + 23.09974, + 22.69251, + 24.12787, + 22.81395, + 22.66667, + 23.18731, + 22.85296, + 23.01887, + 23.04897, + 22.88361, + 22.74143, + 22.74174, + 22.75465, + 23.50667, + 23.00953, + 22.53933, + 22.55209, + 22.99388, + 22.5802, + 22.61953, + 23.25686, + 23.04985, + 22.48606, + 22.77353, + 23.16327, + 22.37138, + 22.76908, + 22.68125, + 22.87267, + 22.54488, + 22.61455, + 23.20255, + 22.35706, + 22.78544, + 22.51313, + 22.8067, + 22.63311, + 22.36641, + 22.93204, + 22.8089, + 22.69756, + 22.35847, + 22.84454, + 22.16427 + ] + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/mixtral/mixtral_8x7b_alltoall_tp2pp4ep4_release/model_config.yaml b/tests/functional_tests/test_cases/mixtral/mixtral_8x7b_alltoall_tp2pp4ep4_release/model_config.yaml new file mode 100644 index 0000000000..585d9bb2c7 --- /dev/null +++ b/tests/functional_tests/test_cases/mixtral/mixtral_8x7b_alltoall_tp2pp4ep4_release/model_config.yaml @@ -0,0 +1,110 @@ +ENV_VARS: + NCCL_IB_SL: 1 + NCCL_IB_TIMEOUT: 19 + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_FWD_LAYERNORM_SM_MARGIN: 16 + NVTE_BWD_LAYERNORM_SM_MARGIN: 16 + NCCL_P2P_NET_CHUNKSIZE: 2097152 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 1 + +TEST_TYPE: "release" + +MODEL_ARGS: + # Distributed args + --distributed-timeout-minutes: 60 + --tensor-model-parallel-size: 2 + --pipeline-model-parallel-size: 4 + --use-distributed-optimizer: true + --overlap-grad-reduce: true + --overlap-param-gather: true + --no-ckpt-fully-parallel-save: true + + # Training args + --use-mcore-models: true + --sequence-parallel: true + --use-flash-attn: true + --disable-bias-linear: true + --micro-batch-size: 1 + --global-batch-size: 1024 + --train-samples: 24414063 + --exit-duration-in-mins: 230 + + # Transformer Engine args + --transformer-impl: transformer_engine + + # Data args + --data-cache-path: ${DATA_CACHE_PATH} + --tokenizer-type: GPTSentencePieceTokenizer + --tokenizer-model: ${DATA_PATH}/utils/nemotron_2_256k.model + --data-path: $DATA_BLEND + --split: 99,1,0 + --no-mmap-bin-files: true + --num-workers: 6 + + # Add network size args + --untie-embeddings-and-output-weights: true + --no-position-embedding: true + --position-embedding-type: rope + --rotary-percent: 0.5 + --normalization: RMSNorm + --swiglu: true + --num-layers: 32 + --hidden-size: 4096 + --ffn-hidden-size: 14336 + --num-attention-heads: 32 + --group-query-attention: true + --num-query-groups: 8 + --seq-length: 4096 + --max-position-embeddings: 4096 + --make-vocab-size-divisible-by: 128 + + # Add regularization args + --attention-dropout: 0.0 + --hidden-dropout: 0.0 + --clip-grad: 1.0 + --weight-decay: 0.1 + + # Add learning rate args + --lr-decay-samples: 1949218748 + --lr-warmup-samples: 3906252 + --lr: 3.0e-4 + --min-lr: 3.0e-5 + --lr-decay-style: cosine + --adam-beta1: 0.9 + --adam-beta2: 0.95 + + # Add MoE args + --expert-model-parallel-size: 4 + --num-experts: 8 + --moe-router-load-balancing-type: aux_loss + --moe-router-topk: 2 + --moe-grouped-gemm: true + --moe-aux-loss-coeff: 1e-2 + --moe-token-dispatcher-type: alltoall + + # Add validation args + --eval-iters: 32 + --eval-interval: 200 + + # Add checkpointing args + --load: ${OUTPUT_PATH}/checkpoints + --save: ${OUTPUT_PATH}/checkpoints + --save-interval: 500 + + # Add initialization args + 
--init-method-std: 0.010 + + # Add logging args + --log-timers-to-tensorboard: true + --log-memory-to-tensorboard: true + --log-num-zeros-in-grad: true + --log-params-norm: true + --log-validation-ppl-to-tensorboard: true + --log-throughput: true + --log-interval: 1 + --tensorboard-dir: ${OUTPUT_PATH}/tensorboard + --wandb-project: megatron-core-release-runs + --wandb-exp-name: ${WANDB_EXPERIMENT} + + # Add mixed precision args + --bf16: true diff --git a/tests/functional_tests/test_cases/mixtral/mixtral_8x7b_alltoall_tp2pp4ep4_release_sm/model_config.yaml b/tests/functional_tests/test_cases/mixtral/mixtral_8x7b_alltoall_tp2pp4ep4_release_sm/model_config.yaml new file mode 100644 index 0000000000..22607416a3 --- /dev/null +++ b/tests/functional_tests/test_cases/mixtral/mixtral_8x7b_alltoall_tp2pp4ep4_release_sm/model_config.yaml @@ -0,0 +1,110 @@ +ENV_VARS: + NCCL_IB_SL: 1 + NCCL_IB_TIMEOUT: 19 + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_FWD_LAYERNORM_SM_MARGIN: 16 + NVTE_BWD_LAYERNORM_SM_MARGIN: 16 + NCCL_P2P_NET_CHUNKSIZE: 2097152 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 1 + +TEST_TYPE: "release" + +MODEL_ARGS: + # Distributed args + --distributed-timeout-minutes: 60 + --tensor-model-parallel-size: 2 + --pipeline-model-parallel-size: 4 + --use-distributed-optimizer: true + --overlap-grad-reduce: true + --overlap-param-gather: true + --no-ckpt-fully-parallel-save: true + + # Training args + --use-mcore-models: true + --sequence-parallel: true + --use-flash-attn: true + --disable-bias-linear: true + --micro-batch-size: 1 + --global-batch-size: 1024 + --train-samples: 6103515 + --exit-duration-in-mins: 230 + + # Transformer Engine args + --transformer-impl: transformer_engine + + # Data args + --data-cache-path: ${DATA_CACHE_PATH} + --tokenizer-type: GPTSentencePieceTokenizer + --tokenizer-model: ${DATA_PATH}/utils/nemotron_2_256k.model + --data-path: $DATA_BLEND + --split: 99,1,0 + --no-mmap-bin-files: true + --num-workers: 6 + + # Add network size args + --untie-embeddings-and-output-weights: true + --no-position-embedding: true + --position-embedding-type: rope + --rotary-percent: 0.5 + --normalization: RMSNorm + --swiglu: true + --num-layers: 32 + --hidden-size: 4096 + --ffn-hidden-size: 14336 + --num-attention-heads: 32 + --group-query-attention: true + --num-query-groups: 8 + --seq-length: 4096 + --max-position-embeddings: 4096 + --make-vocab-size-divisible-by: 128 + + # Add regularization args + --attention-dropout: 0.0 + --hidden-dropout: 0.0 + --clip-grad: 1.0 + --weight-decay: 0.1 + + # Add learning rate args + --lr-decay-samples: 1949218748 + --lr-warmup-samples: 3906252 + --lr: 3.0e-4 + --min-lr: 3.0e-5 + --lr-decay-style: cosine + --adam-beta1: 0.9 + --adam-beta2: 0.95 + + # Add MoE args + --expert-model-parallel-size: 4 + --num-experts: 8 + --moe-router-load-balancing-type: aux_loss + --moe-router-topk: 2 + --moe-grouped-gemm: true + --moe-aux-loss-coeff: 1e-2 + --moe-token-dispatcher-type: alltoall + + # Add validation args + --eval-iters: 32 + --eval-interval: 200 + + # Add checkpointing args + --load: ${OUTPUT_PATH}/checkpoints + --save: ${OUTPUT_PATH}/checkpoints + --save-interval: 500 + + # Add initialization args + --init-method-std: 0.010 + + # Add logging args + --log-timers-to-tensorboard: true + --log-memory-to-tensorboard: true + --log-num-zeros-in-grad: true + --log-params-norm: true + --log-validation-ppl-to-tensorboard: true + --log-throughput: true + --log-interval: 1 + --tensorboard-dir: ${OUTPUT_PATH}/tensorboard + --wandb-project: megatron-core-release-runs + 
--wandb-exp-name: ${WANDB_EXPERIMENT} + + # Add mixed precision args + --bf16: true diff --git a/tests/functional_tests/test_cases/mixtral/mixtral_8x7b_tp1pp4ep8vpp8_release/model_config.yaml b/tests/functional_tests/test_cases/mixtral/mixtral_8x7b_tp1pp4ep8vpp8_release/model_config.yaml new file mode 100644 index 0000000000..39421a887e --- /dev/null +++ b/tests/functional_tests/test_cases/mixtral/mixtral_8x7b_tp1pp4ep8vpp8_release/model_config.yaml @@ -0,0 +1,111 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + TORCH_NCCL_AVOID_RECORD_STREAMS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 1 + PYTORCH_CUDA_ALLOC_CONF: expandable_segments:True + NCCL_NVLS_ENABLE: 0 + +TEST_TYPE: "release" + +MODEL_ARGS: + # Distributed args + --distributed-timeout-minutes: 60 + --tensor-model-parallel-size: 1 + --pipeline-model-parallel-size: 4 + --num-layers-per-virtual-pipeline-stage: 1 + --use-distributed-optimizer: true + --overlap-grad-reduce: true + --overlap-param-gather: true + + # Training args + --use-mcore-models: true + --sequence-parallel: true + --use-flash-attn: true + --disable-bias-linear: true + --micro-batch-size: 1 + --global-batch-size: 256 + --train-samples: 51200 + --exit-duration-in-mins: 230 + + # Transformer Engine args + --transformer-impl: transformer_engine + + # Data args + --data-cache-path: ${DATA_CACHE_PATH} + --tokenizer-type: Llama2Tokenizer + --tokenizer-model: ${DATA_PATH}/tokenizer.model + --data-path: ${DATA_BLEND} + --split: 99,1,0 + --no-mmap-bin-files: true + --num-workers: 6 + + # Add network size args + --untie-embeddings-and-output-weights: true + --no-position-embedding: true + --position-embedding-type: rope + --rotary-percent: 1.0 + --normalization: RMSNorm + --swiglu: true + --num-layers: 32 + --hidden-size: 4096 + --ffn-hidden-size: 14336 + --num-attention-heads: 32 + --group-query-attention: true + --num-query-groups: 8 + --seq-length: 4096 + --max-position-embeddings: 4096 + --make-vocab-size-divisible-by: 128 + + # Add regularization args + --attention-dropout: 0.0 + --hidden-dropout: 0.0 + --clip-grad: 1.0 + --weight-decay: 0.1 + + # Add learning rate args + --lr-decay-samples: 255126953 + --lr-warmup-samples: 162761 + --lr: 1.2e-5 + --min-lr: 1.2e-6 + --lr-decay-style: cosine + --adam-beta1: 0.9 + --adam-beta2: 0.95 + + # Add MoE args + --expert-model-parallel-size: 8 + --num-experts: 8 + --moe-router-load-balancing-type: aux_loss + --moe-router-topk: 2 + --moe-grouped-gemm: true + --moe-aux-loss-coeff: 1e-2 + --moe-token-dispatcher-type: alltoall + + # Add validation args + --eval-iters: 32 + --eval-interval: 200 + + # Add checkpointing args + --finetune: true + --auto-detect-ckpt-format: true + --load: ${LOAD_PATH} + --save: ${OUTPUT_PATH}/checkpoints + --no-ckpt-fully-parallel-save: true + --save-interval: 500 + + # Add initialization args + --init-method-std: 0.008 + + # Add logging args + --log-timers-to-tensorboard: true + --log-memory-to-tensorboard: true + --log-num-zeros-in-grad: true + --log-params-norm: true + --log-validation-ppl-to-tensorboard: true + --log-throughput: true + --log-interval: 1 + --tensorboard-dir: ${OUTPUT_PATH}/tensorboard + --wandb-project: megatron-core-release-runs + --wandb-exp-name: ${WANDB_EXPERIMENT} + + # Add mixed precision args + --bf16: true diff --git a/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp1_pp1_dgx_a100_1N8G/golden_values_dev.json b/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp1_pp1_dgx_a100_1N8G/golden_values_dev.json new file mode 
100644 index 0000000000..fdcf15222e --- /dev/null +++ b/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp1_pp1_dgx_a100_1N8G/golden_values_dev.json @@ -0,0 +1,53 @@ +{ + "lm loss": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 9.1349, + 9.13328, + 9.129, + 9.11325, + 9.05402, + 9.0423, + 8.98255, + 8.93259, + 8.88939, + 8.78786 + ] + }, + "num-zeros": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 3477378.0, + 3584431.0, + 3475109.0, + 3382848.0, + 3699812.0, + 3478561.0, + 3397873.0, + 3453618.0, + 3424934.0, + 3585113.0 + ] + }, + "iteration-time": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 9.79473, + 0.31292, + 0.31229, + 0.31273, + 0.31218, + 0.31206, + 0.31234, + 0.3114, + 0.31226, + 0.31109 + ] + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp1_pp1_dgx_a100_1N8G/golden_values_lts.json b/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp1_pp1_dgx_a100_1N8G/golden_values_lts.json new file mode 100644 index 0000000000..f4b39082a6 --- /dev/null +++ b/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp1_pp1_dgx_a100_1N8G/golden_values_lts.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [9.13495, 9.13325, 9.12905, 9.11323, 9.05401, 9.04233, 8.98255, 8.93258, 8.88937, 8.78788]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [3477473.0, 3584371.0, 3475194.0, 3382773.0, 3699802.0, 3478715.0, 3397967.0, 3453615.0, 3424973.0, 3585127.0]},"iteration_timing_avg": 0.2253964705882353} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp1_pp1_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp1_pp1_dgx_a100_1N8G/model_config.yaml new file mode 100644 index 0000000000..6da0c3a85a --- /dev/null +++ b/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp1_pp1_dgx_a100_1N8G/model_config.yaml @@ -0,0 +1,53 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: Tree + CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 +MODEL_ARGS: + --num-layers: 12 + --hidden-size: 624 + --attention-dropout: 0.0 + --hidden-dropout: 0.0 + --num-attention-heads: 12 + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --tensorboard-dir: ${TENSORBOARD_PATH} + --micro-batch-size: 4 + --global-batch-size: 32 + --seq-length: 1024 + --max-position-embeddings: 1024 + --train-iters: 50 + --timing-log-level: 2 + --lr-decay-iters: 320000 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --split: 949,50,1 + --tokenizer-type: NullTokenizer + --vocab-size: 8192 + --distributed-backend: nccl + --lr: 0.00015 + --lr-decay-style: cosine + --min-lr: 1.0e-5 + --weight-decay: 1e-2 + --clip-grad: 1.0 + --lr-warmup-fraction: .01 + --log-interval: 1 + --save-interval: 10000 + --eval-interval: 1000 + --eval-iters: 10 + --transformer-impl: transformer_engine + --tensor-model-parallel-size: 1 + --pipeline-model-parallel-size: 1 + --deterministic-mode: true + --attention-softmax-in-fp32: true + --ckpt-format: torch + --no-gradient-accumulation-fusion: true + --bf16: true + --img-h: 336 + --img-w: 336 + 
--patch-dim: 14 + --mock-data: true +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp2_pp3_dgx_a100_1N8G/golden_values_dev.json b/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp2_pp3_dgx_a100_1N8G/golden_values_dev.json new file mode 100644 index 0000000000..e7b7b7ea3a --- /dev/null +++ b/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp2_pp3_dgx_a100_1N8G/golden_values_dev.json @@ -0,0 +1,52 @@ +{ "lm loss": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 9.16172, + 9.16209, + 9.15685, + 9.1402, + 9.09395, + 9.07144, + 9.01399, + 8.96508, + 8.91879, + 8.8258 + ] + }, + "num-zeros": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 3557267.0, + 3663904.0, + 3554934.0, + 3462955.0, + 3780144.0, + 3559102.0, + 3477361.0, + 3533886.0, + 3504942.0, + 3665022.0 + ] + }, + "iteration-time": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 19.95466, + 0.64533, + 0.64247, + 0.64737, + 0.64555, + 0.64863, + 0.64899, + 0.64814, + 0.64615, + 0.64499 + ] + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp2_pp3_dgx_a100_1N8G/golden_values_lts.json b/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp2_pp3_dgx_a100_1N8G/golden_values_lts.json new file mode 100644 index 0000000000..03e0dd0e9b --- /dev/null +++ b/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp2_pp3_dgx_a100_1N8G/golden_values_lts.json @@ -0,0 +1 @@ +{"num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [3557301.0, 3663955.0, 3555196.0, 3462888.0, 3780083.0, 3559007.0, 3477262.0, 3533752.0, 3505033.0, 3665096.0]},"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [9.16173, 9.16211, 9.15686, 9.14022, 9.09396, 9.07146, 9.01401, 8.9651, 8.91881, 8.82578]}} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp2_pp3_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp2_pp3_dgx_a100_1N8G/model_config.yaml new file mode 100644 index 0000000000..816aa8bf1f --- /dev/null +++ b/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp2_pp3_dgx_a100_1N8G/model_config.yaml @@ -0,0 +1,54 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: Tree + CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 +MODEL_ARGS: + --num-layers: 12 + --hidden-size: 624 + --attention-dropout: 0.0 + --hidden-dropout: 0.0 + --num-attention-heads: 12 + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --tensorboard-dir: ${TENSORBOARD_PATH} + --micro-batch-size: 4 + --global-batch-size: 32 + --seq-length: 1024 + --max-position-embeddings: 1024 + --train-iters: 50 + --timing-log-level: 2 + --lr-decay-iters: 320000 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --split: 949,50,1 + --tokenizer-type: NullTokenizer + --vocab-size: 8192 + --distributed-backend: nccl + --lr: 0.00015 + --lr-decay-style: cosine + --min-lr: 1.0e-5 + --weight-decay: 1e-2 + --clip-grad: 1.0 + --lr-warmup-fraction: .01 + --log-interval: 1 + --save-interval: 10000 + --eval-interval: 1000 + --eval-iters: 10 + 
--transformer-impl: transformer_engine + --tensor-model-parallel-size: 2 + --pipeline-model-parallel-size: 3 + --encoder-pipeline-model-parallel-size: 1 + --deterministic-mode: true + --attention-softmax-in-fp32: true + --ckpt-format: torch + --no-gradient-accumulation-fusion: true + --bf16: true + --img-h: 336 + --img-w: 336 + --patch-dim: 14 + --mock-data: true +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp4_pp1_etp3_dgx_a100_1N7G/golden_values_dev.json b/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp4_pp1_etp3_dgx_a100_1N7G/golden_values_dev.json new file mode 100644 index 0000000000..a7ef0e1fac --- /dev/null +++ b/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp4_pp1_etp3_dgx_a100_1N7G/golden_values_dev.json @@ -0,0 +1,53 @@ +{ + "lm loss": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 9.19864, + 9.20111, + 9.19601, + 9.17296, + 9.11705, + 9.10224, + 9.04016, + 8.98428, + 8.94016, + 8.8386 + ] + }, + "num-zeros": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 3717664.0, + 3824288.0, + 3714705.0, + 3622894.0, + 3939791.0, + 3718740.0, + 3637227.0, + 3694225.0, + 3665435.0, + 3825408.0 + ] + }, + "iteration-time": { + "start_step": 0, + "end_step": 50, + "step_interval": 5, + "values": [ + 12.72076, + 0.81802, + 0.8164, + 0.81573, + 0.81376, + 0.81495, + 0.81587, + 0.8178, + 0.82291, + 0.82279 + ] + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp4_pp1_etp3_dgx_a100_1N7G/golden_values_lts.json b/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp4_pp1_etp3_dgx_a100_1N7G/golden_values_lts.json new file mode 100644 index 0000000000..96f345a702 --- /dev/null +++ b/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp4_pp1_etp3_dgx_a100_1N7G/golden_values_lts.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [9.19864, 9.20112, 9.19598, 9.17297, 9.1171, 9.10232, 9.04013, 8.98432, 8.94016, 8.83862]},"num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [3717564.0, 3824205.0, 3714643.0, 3622971.0, 3939727.0, 3718836.0, 3637293.0, 3694227.0, 3665382.0, 3825257.0]}, "iteration_timing_avg": 0.5847132352941178} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp4_pp1_etp3_dgx_a100_1N7G/model_config.yaml b/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp4_pp1_etp3_dgx_a100_1N7G/model_config.yaml new file mode 100644 index 0000000000..180e6beedd --- /dev/null +++ b/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp4_pp1_etp3_dgx_a100_1N7G/model_config.yaml @@ -0,0 +1,56 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: Tree + CUBLAS_WORKSPACE_CONFIG: :4096:8 + GPUS_PER_NODE: 7 + N_REPEATS: 5 +MODEL_ARGS: + --num-layers: 12 + --hidden-size: 624 + --attention-dropout: 0.0 + --hidden-dropout: 0.0 + --num-attention-heads: 12 + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --tensorboard-dir: ${TENSORBOARD_PATH} + --micro-batch-size: 4 + --global-batch-size: 32 + --seq-length: 1024 + --max-position-embeddings: 1024 + --train-iters: 
50 + --timing-log-level: 2 + --lr-decay-iters: 320000 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --split: 949,50,1 + --tokenizer-type: NullTokenizer + --vocab-size: 8192 + --distributed-backend: nccl + --lr: 0.00015 + --lr-decay-style: cosine + --min-lr: 1.0e-5 + --weight-decay: 1e-2 + --clip-grad: 1.0 + --lr-warmup-fraction: .01 + --log-interval: 1 + --save-interval: 10000 + --eval-interval: 1000 + --eval-iters: 10 + --transformer-impl: transformer_engine + --tensor-model-parallel-size: 4 + --pipeline-model-parallel-size: 1 + --encoder-pipeline-model-parallel-size: 1 + --encoder-tensor-model-parallel-size: 3 + --deterministic-mode: true + --attention-softmax-in-fp32: true + --ckpt-format: torch + --no-gradient-accumulation-fusion: true + --bf16: true + --img-h: 336 + --img-w: 336 + --patch-dim: 14 + --mock-data: true +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp4_pp1_resume_torch_etp3_dgx_a100_1N7G/model_config.yaml b/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp4_pp1_resume_torch_etp3_dgx_a100_1N7G/model_config.yaml new file mode 100644 index 0000000000..1fade8fd4e --- /dev/null +++ b/tests/functional_tests/test_cases/multimodal-llava/multimodal_llava_mr_mcore_te_tp4_pp1_resume_torch_etp3_dgx_a100_1N7G/model_config.yaml @@ -0,0 +1,57 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: Tree + CUBLAS_WORKSPACE_CONFIG: :4096:8 + GPUS_PER_NODE: 7 + N_REPEATS: 5 +MODEL_ARGS: + --num-layers: 12 + --hidden-size: 624 + --attention-dropout: 0.0 + --hidden-dropout: 0.0 + --num-attention-heads: 12 + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --tensorboard-dir: ${TENSORBOARD_PATH} + --micro-batch-size: 4 + --global-batch-size: 32 + --seq-length: 1024 + --max-position-embeddings: 1024 + --train-iters: 100 + --timing-log-level: 2 + --lr-decay-iters: 320000 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --split: 949,50,1 + --tokenizer-type: NullTokenizer + --vocab-size: 8192 + --distributed-backend: nccl + --lr: 0.00015 + --lr-decay-style: cosine + --min-lr: 1.0e-5 + --weight-decay: 1e-2 + --clip-grad: 1.0 + --lr-warmup-fraction: .01 + --log-interval: 1 + --save-interval: 50 + --eval-interval: 1000 + --eval-iters: 10 + --transformer-impl: transformer_engine + --tensor-model-parallel-size: 4 + --pipeline-model-parallel-size: 1 + --encoder-pipeline-model-parallel-size: 1 + --encoder-tensor-model-parallel-size: 3 + --deterministic-mode: true + --attention-softmax-in-fp32: true + --use-checkpoint-opt_param-scheduler: true + --ckpt-format: torch + --no-gradient-accumulation-fusion: true + --bf16: true + --img-h: 336 + --img-w: 336 + --patch-dim: 14 + --mock-data: true +TEST_TYPE: ckpt-resume diff --git a/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp2_pp2_dgx_a100_1N8G/golden_values_dev.json b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp2_pp2_dgx_a100_1N8G/golden_values_dev.json new file mode 100644 index 0000000000..4db9298008 --- /dev/null +++ b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp2_pp2_dgx_a100_1N8G/golden_values_dev.json @@ -0,0 +1,83 @@ +{ + "lm loss": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 10.41565, + 9.20451, + 8.62182, + 8.34338, + 8.08299, + 7.96836, + 7.68095, + 7.39586, + 7.26027, + 7.1927, + 7.31152, + 7.16483, + 7.05906, + 6.99465, + 
6.8553, + 6.93156, + 6.95162, + 7.025, + 6.66761, + 6.9396 + ] + }, + "num-zeros": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 115743.0, + 111076.0, + 117069.0, + 112374.0, + 118724.0, + 116979.0, + 111370.0, + 114004.0, + 118473.0, + 116942.0, + 111516.0, + 115638.0, + 108510.0, + 119946.0, + 115729.0, + 116934.0, + 119852.0, + 120367.0, + 121411.0, + 118447.0 + ] + }, + "iteration-time": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 16.87868, + 0.6539, + 0.65018, + 0.65146, + 0.64779, + 0.66047, + 0.65067, + 0.65397, + 0.65676, + 0.64702, + 0.64712, + 0.64088, + 0.64576, + 0.64057, + 0.64318, + 0.6678, + 0.64034, + 0.67174, + 0.63871, + 0.83246 + ] + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp2_pp2_dgx_a100_1N8G/golden_values_lts.json b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp2_pp2_dgx_a100_1N8G/golden_values_lts.json new file mode 100644 index 0000000000..bcff777664 --- /dev/null +++ b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp2_pp2_dgx_a100_1N8G/golden_values_lts.json @@ -0,0 +1 @@ +{"forward-backward-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [19.39068, 0.66038, 0.65673, 0.66493, 0.65894, 0.6473, 0.65746, 0.64942, 0.66259, 0.65247, 0.65165, 0.64944, 0.81313, 0.65069, 0.64982, 0.65247, 0.65149, 0.65284, 0.64913, 0.6496]}, "forward-compute-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [3.63253, 0.27412, 0.26777, 0.27338, 0.26922, 0.26445, 0.27043, 0.26308, 0.27178, 0.26246, 0.26565, 0.26691, 0.42095, 0.26741, 0.26653, 0.26546, 0.26547, 0.26403, 0.26266, 0.26606]}, "backward-compute-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [2.0264, 0.24005, 0.23751, 0.24162, 0.24102, 0.23888, 0.24027, 0.23829, 0.24182, 0.24308, 0.24109, 0.23964, 0.23841, 0.24005, 0.23898, 0.23896, 0.24052, 0.23894, 0.24242, 0.23863]}, "forward-recv-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [8.32911, 0.07441, 0.07755, 0.07578, 0.07557, 0.07223, 0.0737, 0.07404, 0.07108, 0.07174, 0.07137, 0.07162, 0.07437, 0.07185, 0.07129, 0.07247, 0.0719, 0.07573, 0.07292, 0.07122]}, "forward-send-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [1.47287, 0.00053, 0.00063, 0.00048, 0.00045, 0.00047, 0.00046, 0.00045, 0.00046, 0.00063, 0.00044, 0.00046, 0.00047, 0.00045, 0.00056, 0.00046, 0.00045, 0.00046, 0.00045, 0.00044]}, "backward-recv-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [0.1444, 0.13179, 0.12767, 0.13592, 0.1279, 0.12912, 0.13033, 0.1328, 0.13106, 0.13249, 0.12957, 0.12877, 0.13334, 0.12829, 0.12815, 0.13128, 0.12985, 0.13117, 0.12901, 0.1277]}, "backward-send-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [0.00065, 0.00056, 0.00066, 0.00067, 0.0006, 0.00059, 0.00064, 0.00067, 0.00068, 0.0006, 0.00056, 0.00058, 0.00059, 0.00056, 0.00064, 0.00058, 0.00049, 0.00079, 0.00081, 0.0006]}, "forward-send-backward-recv-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [12.49425, 0.23291, 0.228, 0.22475, 0.22786, 0.22525, 0.22534, 0.22597, 0.23004, 0.22656, 0.22342, 0.22577, 0.38374, 0.22857, 0.22673, 0.22371, 0.22908, 0.23017, 0.23145, 0.23191]}, "backward-send-forward-recv-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [5.02478, 0.00608, 0.00441, 0.00414, 0.0093, 0.00347, 0.00363, 0.00527, 0.0093, 0.00705, 0.00369, 0.00633, 0.00834, 
0.00352, 0.0034, 0.00565, 0.00346, 0.00354, 0.00341, 0.0035]}, "layernorm-grads-all-reduce-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [5e-05, 2e-05, 2e-05, 3e-05, 3e-05, 2e-05, 3e-05, 2e-05, 2e-05, 2e-05, 2e-05, 3e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05, 2e-05]}, "embedding-grads-all-reduce-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [0.47745, 0.00052, 0.00064, 0.00053, 0.00052, 0.0006, 0.00052, 0.00062, 0.00052, 0.00056, 0.00065, 0.00056, 0.00054, 0.00053, 0.00058, 0.00052, 0.00052, 0.00052, 0.00055, 0.00053]}, "all-grads-sync-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [0.43086, 0.00036, 0.00041, 0.00037, 0.00032, 0.00037, 0.00048, 0.00044, 0.00043, 0.00045, 0.00034, 0.00044, 0.00037, 0.00043, 0.00044, 0.00032, 0.00032, 0.00045, 0.00045, 0.00045]}, "optimizer-copy-to-main-grad-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [0.00053, 0.00034, 0.00032, 0.00033, 0.00034, 0.00031, 0.00033, 0.00035, 0.00032, 0.00033, 0.00036, 0.00035, 0.00033, 0.00033, 0.00034, 0.00035, 0.00033, 0.00034, 0.00032, 0.00035]}, "optimizer-clip-main-grad-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [2.26638, 0.00127, 0.00123, 0.00144, 0.00125, 0.00123, 0.00128, 0.00162, 0.00128, 0.00131, 0.00138, 0.00133, 0.00142, 0.0013, 0.00136, 0.00137, 0.00133, 0.00135, 0.00129, 0.00136]}, "optimizer-count-zeros-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [0.01282, 0.00738, 0.00728, 0.00736, 0.00738, 0.00733, 0.00738, 0.00735, 0.00731, 0.00727, 0.00897, 0.00755, 0.0073, 0.00721, 0.00734, 0.00746, 0.00736, 0.00734, 0.00737, 0.00726]}, "optimizer-inner-step-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [0.00984, 0.00108, 0.00105, 0.00108, 0.00105, 0.00105, 0.00107, 0.00104, 0.00105, 0.00106, 0.00106, 0.00105, 0.0012, 0.00106, 0.00105, 0.00105, 0.00105, 0.00106, 0.00104, 0.00106]}, "optimizer-copy-main-to-model-params-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [0.0011, 0.00101, 0.00102, 0.00102, 0.00101, 0.00102, 0.00101, 0.00101, 0.00101, 0.00101, 0.00101, 0.00101, 0.0015, 0.00102, 0.00101, 0.00101, 0.00102, 0.00268, 0.00101, 0.00101]}, "optimizer-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [2.29197, 0.01172, 0.01152, 0.01191, 0.01165, 0.01156, 0.0117, 0.01199, 0.01159, 0.01161, 0.0134, 0.01194, 0.01269, 0.01155, 0.01172, 0.01186, 0.01173, 0.01343, 0.01172, 0.01165]}, "learning-rate": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [0.0001, 0.0001, 9e-05, 9e-05, 8e-05, 8e-05, 7e-05, 7e-05, 6e-05, 6e-05, 5e-05, 5e-05, 5e-05, 4e-05, 4e-05, 3e-05, 3e-05, 2e-05, 2e-05, 1e-05]}, "learning-rate vs samples": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [0.0001, 0.0001, 9e-05, 9e-05, 8e-05, 8e-05, 7e-05, 7e-05, 6e-05, 6e-05, 5e-05, 5e-05, 5e-05, 4e-05, 4e-05, 3e-05, 3e-05, 2e-05, 2e-05, 1e-05]}, "batch-size": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0]}, "batch-size vs samples": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0]}, "lm loss": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [10.41489, 9.20451, 8.62156, 8.34435, 8.08472, 7.96931, 7.68116, 
7.39495, 7.26108, 7.19145, 7.31028, 7.16653, 7.05979, 6.99436, 6.85568, 6.93225, 6.95525, 7.02522, 6.66561, 6.93924]}, "lm loss vs samples": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [10.41489, 9.20451, 8.62156, 8.34435, 8.08472, 7.96931, 7.68116, 7.39495, 7.26108, 7.19145, 7.31028, 7.16653, 7.05979, 6.99436, 6.85568, 6.93225, 6.95525, 7.02522, 6.66561, 6.93924]}, "loss-scale": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]}, "loss-scale vs samples": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]}, "grad-norm": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [31.51239, 2.98952, 3.27663, 2.61225, 2.39588, 1.99758, 1.81287, 1.93167, 1.62175, 1.51416, 1.16291, 1.32388, 1.20328, 1.10814, 1.5007, 2.15295, 1.65903, 1.42013, 2.08526, 1.2754]}, "grad-norm vs samples": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [31.51239, 2.98952, 3.27663, 2.61225, 2.39588, 1.99758, 1.81287, 1.93167, 1.62175, 1.51416, 1.16291, 1.32388, 1.20328, 1.10814, 1.5007, 2.15295, 1.65903, 1.42013, 2.08526, 1.2754]}, "num-zeros": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [115745.0, 111070.0, 117081.0, 112381.0, 118700.0, 116957.0, 111399.0, 114013.0, 118460.0, 116959.0, 111499.0, 115613.0, 108489.0, 119947.0, 115772.0, 116922.0, 119841.0, 120380.0, 121396.0, 118455.0]}, "num-zeros vs samples": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [115745.0, 111070.0, 117081.0, 112381.0, 118700.0, 116957.0, 111399.0, 114013.0, 118460.0, 116959.0, 111499.0, 115613.0, 108489.0, 119947.0, 115772.0, 116922.0, 119841.0, 120380.0, 121396.0, 118455.0]}, "params-norm": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [309.46707, 309.48447, 309.52603, 309.57944, 309.64523, 309.72018, 309.80231, 309.8884, 309.97391, 310.05591, 310.13483, 310.20755, 310.27094, 310.32535, 310.37161, 310.40887, 310.43597, 310.45648, 310.47238, 310.48444]}, "params-norm vs samples": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [309.46707, 309.48447, 309.52603, 309.57944, 309.64523, 309.72018, 309.80231, 309.8884, 309.97391, 310.05591, 310.13483, 310.20755, 310.27094, 310.32535, 310.37161, 310.40887, 310.43597, 310.45648, 310.47238, 310.48444]}, "iteration-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [21.7057, 0.68569, 0.68236, 0.69077, 0.68415, 0.67238, 0.68288, 0.67481, 0.6874, 0.67748, 0.6785, 0.67478, 0.83941, 0.6755, 0.67503, 0.67787, 0.67668, 0.67904, 0.67443, 0.67541]}, "lm loss validation": {"start_step": 0, "end_step": 2, "step_interval": 5, "values": [6.86582]}, "lm loss validation vs samples": {"start_step": 0, "end_step": 2, "step_interval": 5, "values": [6.86582]}, "lm loss validation ppl": {"start_step": 0, "end_step": 2, "step_interval": 5, "values": [958.93542]}, "lm loss validation ppl vs samples": {"start_step": 0, "end_step": 2, "step_interval": 5, "values": [958.93542]}} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp2_pp2_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp2_pp2_dgx_a100_1N8G/model_config.yaml new file mode 100644 index 0000000000..076389c3d6 --- /dev/null +++ 
b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp2_pp2_dgx_a100_1N8G/model_config.yaml @@ -0,0 +1,55 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: ^NVLS + CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 +MODEL_ARGS: + --encoder-num-layers: 12 + --decoder-num-layers: 12 + --hidden-size: 768 + --num-attention-heads: 12 + --kv-channels: 64 + --ffn-hidden-size: 3072 + --encoder-seq-length: 512 + --decoder-seq-length: 128 + --max-position-embeddings: 512 + --tensor-model-parallel-size: 2 + --pipeline-model-parallel-size: 2 + --micro-batch-size: 4 + --global-batch-size: 32 + --lr: 0.0001 + --train-iters: 100 + --lr-decay-iters: 100 + --lr-decay-style: linear + --min-lr: 0.00001 + --weight-decay: 1e-2 + --lr-warmup-fraction: .01 + --clip-grad: 1.0 + --bf16: true + --vocab-extra-ids: 100 + --init-method-std: 0.015 + --transformer-impl: transformer_engine + --data-path: ${DATA_PATH}/my-t5_00_text_document + --vocab-file: ${DATA_PATH}/bert-large-cased-vocab.txt + --tokenizer-type: BertWordPieceCase + --calculate-per-token-loss: true + --split: 99982,9,9 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --tensorboard-dir: ${TENSORBOARD_PATH} + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --timing-log-level: 2 + --log-interval: 1 + --save-interval: 10000 + --eval-interval: 1000 + --eval-iters: 10 + --distributed-backend: nccl + --data-cache-path: ${DATA_CACHE_PATH} + --encoder-pipeline-model-parallel-size: 2 + --deterministic-mode: true + --ckpt-format: torch +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp2_pp2_resume_torch_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp2_pp2_resume_torch_dgx_a100_1N8G/model_config.yaml new file mode 100644 index 0000000000..b0d00b8f83 --- /dev/null +++ b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp2_pp2_resume_torch_dgx_a100_1N8G/model_config.yaml @@ -0,0 +1,55 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: ^NVLS + CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 +MODEL_ARGS: + --encoder-num-layers: 12 + --decoder-num-layers: 12 + --hidden-size: 768 + --num-attention-heads: 12 + --kv-channels: 64 + --ffn-hidden-size: 3072 + --encoder-seq-length: 512 + --decoder-seq-length: 128 + --max-position-embeddings: 512 + --tensor-model-parallel-size: 2 + --pipeline-model-parallel-size: 2 + --micro-batch-size: 4 + --global-batch-size: 32 + --lr: 0.0001 + --train-iters: 100 + --lr-decay-iters: 100 + --lr-decay-style: linear + --min-lr: 0.00001 + --weight-decay: 1e-2 + --lr-warmup-fraction: .01 + --clip-grad: 1.0 + --bf16: true + --vocab-extra-ids: 100 + --init-method-std: 0.015 + --transformer-impl: transformer_engine + --data-path: ${DATA_PATH}/my-t5_00_text_document + --vocab-file: ${DATA_PATH}/bert-large-cased-vocab.txt + --tokenizer-type: BertWordPieceCase + --calculate-per-token-loss: true + --split: 99982,9,9 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --tensorboard-dir: ${TENSORBOARD_PATH} + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --timing-log-level: 2 + --log-interval: 1 + --save-interval: 50 + --eval-interval: 1000 + --eval-iters: 10 + --distributed-backend: nccl + --data-cache-path: ${DATA_CACHE_PATH} + 
--encoder-pipeline-model-parallel-size: 2 + --deterministic-mode: true + --ckpt-format: torch +TEST_TYPE: ckpt-resume diff --git a/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp4_pp1_dgx_a100_1N8G/golden_values_dev.json b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp4_pp1_dgx_a100_1N8G/golden_values_dev.json new file mode 100644 index 0000000000..4bba0e7121 --- /dev/null +++ b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp4_pp1_dgx_a100_1N8G/golden_values_dev.json @@ -0,0 +1,83 @@ +{ + "lm loss": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 10.32668, + 9.41285, + 8.86075, + 8.5652, + 8.28647, + 8.10344, + 7.83665, + 7.53871, + 7.39157, + 7.29181, + 7.37615, + 7.22178, + 7.11118, + 7.0631, + 6.91811, + 6.96318, + 6.96863, + 7.04288, + 6.71613, + 6.97797 + ] + }, + "num-zeros": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 43304.0, + 40973.0, + 43954.0, + 41624.0, + 44757.0, + 43925.0, + 41081.0, + 42466.0, + 44648.0, + 43893.0, + 41151.0, + 43235.0, + 39726.0, + 45370.0, + 43318.0, + 43918.0, + 45385.0, + 45715.0, + 46166.0, + 44701.0 + ] + }, + "iteration-time": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 9.84063, + 0.75775, + 0.76184, + 0.77131, + 0.77196, + 1.03215, + 0.77291, + 0.79059, + 0.80195, + 0.79537, + 0.79261, + 0.79067, + 0.77789, + 0.79081, + 0.79068, + 0.78627, + 0.79476, + 0.78587, + 0.78942, + 0.79045 + ] + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp4_pp1_dgx_a100_1N8G/golden_values_lts.json b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp4_pp1_dgx_a100_1N8G/golden_values_lts.json new file mode 100644 index 0000000000..c59b98b90a --- /dev/null +++ b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp4_pp1_dgx_a100_1N8G/golden_values_lts.json @@ -0,0 +1 @@ +{"forward-backward-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [11.55278, 0.77358, 0.76856, 0.77172, 0.75887, 0.76061, 0.75836, 0.76125, 0.76192, 0.76187, 0.76171, 0.76045, 0.7599, 0.76535, 0.76121, 0.76796, 0.76998, 0.76511, 0.76167, 0.75816]}, "forward-compute-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [6.97639, 0.39525, 0.3898, 0.39437, 0.37749, 0.38195, 0.37908, 0.37821, 0.38433, 0.38023, 0.38359, 0.37973, 0.37768, 0.37754, 0.38336, 0.38173, 0.39026, 0.38845, 0.38337, 0.37691]}, "backward-compute-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [3.32964, 0.37495, 0.37481, 0.37567, 0.37884, 0.37558, 0.37486, 0.37929, 0.37612, 0.37965, 0.37608, 0.37503, 0.37843, 0.38541, 0.37552, 0.38094, 0.37923, 0.37628, 0.37437, 0.37757]}, "layernorm-grads-all-reduce-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [5e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05, 3e-05]}, "embedding-grads-all-reduce-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [5e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 3e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05, 4e-05]}, "all-grads-sync-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [0.89543, 0.00188, 0.00211, 0.00164, 0.00165, 0.00162, 0.00162, 0.00162, 0.00184, 0.00165, 0.00164, 0.00208, 0.00162, 0.00167, 0.0016, 0.00168, 0.00165, 0.00163, 0.00164, 0.00161]}, "optimizer-copy-to-main-grad-time": {"start_step": 0, 
"end_step": 100, "step_interval": 5, "values": [0.00146, 0.00105, 0.00105, 0.00102, 0.00107, 0.00107, 0.00107, 0.00109, 0.00105, 0.00106, 0.00107, 0.00106, 0.00106, 0.00106, 0.00108, 0.00108, 0.00107, 0.00104, 0.00103, 0.0011]}, "optimizer-clip-main-grad-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [1.50022, 0.00376, 0.00381, 0.00329, 0.00321, 0.00354, 0.00371, 0.00375, 0.00366, 0.00301, 0.00349, 0.00372, 0.00349, 0.00369, 0.00297, 0.00283, 0.00369, 0.00377, 0.00388, 0.00369]}, "optimizer-count-zeros-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [0.04986, 0.02302, 0.02299, 0.02588, 0.02338, 0.0231, 0.02293, 0.0231, 0.02309, 0.02329, 0.02328, 0.02332, 0.02304, 0.02327, 0.02287, 0.02321, 0.02315, 0.0234, 0.02312, 0.02327]}, "optimizer-inner-step-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [0.0158, 0.00219, 0.00221, 0.00411, 0.0022, 0.0022, 0.00216, 0.0022, 0.00217, 0.00218, 0.00218, 0.00225, 0.00233, 0.00219, 0.00223, 0.00222, 0.00212, 0.0022, 0.00222, 0.00225]}, "optimizer-copy-main-to-model-params-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [0.00301, 0.00302, 0.00302, 0.00339, 0.003, 0.00302, 0.00302, 0.00301, 0.00301, 0.00301, 0.003, 0.00301, 0.00302, 0.00304, 0.003, 0.00301, 0.00299, 0.00304, 0.00303, 0.00303]}, "optimizer-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [1.57167, 0.03386, 0.03382, 0.03847, 0.03353, 0.03358, 0.03363, 0.03394, 0.03377, 0.03326, 0.03368, 0.03412, 0.03363, 0.03407, 0.03281, 0.03316, 0.03373, 0.03419, 0.03396, 0.034]}, "learning-rate": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [0.0001, 0.0001, 9e-05, 9e-05, 8e-05, 8e-05, 7e-05, 7e-05, 6e-05, 6e-05, 5e-05, 5e-05, 5e-05, 4e-05, 4e-05, 3e-05, 3e-05, 2e-05, 2e-05, 1e-05]}, "learning-rate vs samples": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [0.0001, 0.0001, 9e-05, 9e-05, 8e-05, 8e-05, 7e-05, 7e-05, 6e-05, 6e-05, 5e-05, 5e-05, 5e-05, 4e-05, 4e-05, 3e-05, 3e-05, 2e-05, 2e-05, 1e-05]}, "batch-size": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0]}, "batch-size vs samples": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0, 32.0]}, "lm loss": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [10.32677, 9.4141, 8.86401, 8.56564, 8.28782, 8.1035, 7.83676, 7.53769, 7.39294, 7.29345, 7.37746, 7.22535, 7.11277, 7.06759, 6.91832, 6.96664, 6.97845, 7.04885, 6.7213, 6.98241]}, "lm loss vs samples": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [10.32677, 9.4141, 8.86401, 8.56564, 8.28782, 8.1035, 7.83676, 7.53769, 7.39294, 7.29345, 7.37746, 7.22535, 7.11277, 7.06759, 6.91832, 6.96664, 6.97845, 7.04885, 6.7213, 6.98241]}, "loss-scale": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]}, "loss-scale vs samples": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]}, "grad-norm": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [21.26434, 2.17404, 2.50103, 2.08973, 1.92522, 1.69977, 1.63605, 1.57256, 1.48469, 
1.29632, 1.00932, 1.0148, 0.95539, 1.04571, 0.94482, 0.77816, 1.07456, 1.17593, 1.12335, 0.8491]}, "grad-norm vs samples": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [21.26434, 2.17404, 2.50103, 2.08973, 1.92522, 1.69977, 1.63605, 1.57256, 1.48469, 1.29632, 1.00932, 1.0148, 0.95539, 1.04571, 0.94482, 0.77816, 1.07456, 1.17593, 1.12335, 0.8491]}, "num-zeros": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [43306.0, 40955.0, 43967.0, 41614.0, 44764.0, 43923.0, 41108.0, 42464.0, 44664.0, 43899.0, 41152.0, 43230.0, 39719.0, 45367.0, 43334.0, 43903.0, 45349.0, 45688.0, 46166.0, 44691.0]}, "num-zeros vs samples": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [43306.0, 40955.0, 43967.0, 41614.0, 44764.0, 43923.0, 41108.0, 42464.0, 44664.0, 43899.0, 41152.0, 43230.0, 39719.0, 45367.0, 43334.0, 43903.0, 45349.0, 45688.0, 46166.0, 44691.0]}, "params-norm": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [283.80362, 283.8273, 283.86472, 283.9053, 283.95062, 284.00027, 284.05212, 284.1051, 284.15643, 284.20459, 284.25775, 284.30682, 284.34848, 284.38312, 284.41144, 284.43539, 284.45441, 284.46988, 284.48172, 284.49054]}, "params-norm vs samples": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [283.80362, 283.8273, 283.86472, 283.9053, 283.95062, 284.00027, 284.05212, 284.1051, 284.15643, 284.20459, 284.25775, 284.30682, 284.34848, 284.38312, 284.41144, 284.43539, 284.45441, 284.46988, 284.48172, 284.49054]}, "iteration-time": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [13.15856, 0.82951, 0.82427, 0.83168, 0.8147, 0.81581, 0.81386, 0.8171, 0.8176, 0.81664, 0.81719, 0.81685, 0.81547, 0.82136, 0.81551, 0.82315, 0.82591, 0.82132, 0.81777, 0.81414]}, "lm loss validation": {"start_step": 0, "end_step": 2, "step_interval": 5, "values": [6.9202]}, "lm loss validation vs samples": {"start_step": 0, "end_step": 2, "step_interval": 5, "values": [6.9202]}, "lm loss validation ppl": {"start_step": 0, "end_step": 2, "step_interval": 5, "values": [1012.5238]}, "lm loss validation ppl vs samples": {"start_step": 0, "end_step": 2, "step_interval": 5, "values": [1012.5238]}} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp4_pp1_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp4_pp1_dgx_a100_1N8G/model_config.yaml new file mode 100644 index 0000000000..d1b9e8429e --- /dev/null +++ b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp4_pp1_dgx_a100_1N8G/model_config.yaml @@ -0,0 +1,55 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: ^NVLS + CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 +MODEL_ARGS: + --encoder-num-layers: 12 + --decoder-num-layers: 12 + --hidden-size: 768 + --num-attention-heads: 12 + --kv-channels: 64 + --ffn-hidden-size: 3072 + --encoder-seq-length: 512 + --decoder-seq-length: 128 + --max-position-embeddings: 512 + --tensor-model-parallel-size: 4 + --pipeline-model-parallel-size: 1 + --micro-batch-size: 4 + --global-batch-size: 32 + --lr: 0.0001 + --train-iters: 100 + --lr-decay-iters: 100 + --lr-decay-style: linear + --min-lr: 0.00001 + --weight-decay: 1e-2 + --lr-warmup-fraction: .01 + --clip-grad: 1.0 + --bf16: true + --vocab-extra-ids: 100 + --init-method-std: 0.015 + --transformer-impl: transformer_engine + --data-path: ${DATA_PATH}/my-t5_00_text_document + --vocab-file: ${DATA_PATH}/bert-large-cased-vocab.txt + --tokenizer-type: 
BertWordPieceCase + --calculate-per-token-loss: true + --split: 99982,9,9 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --tensorboard-dir: ${TENSORBOARD_PATH} + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --timing-log-level: 2 + --log-interval: 1 + --save-interval: 10000 + --eval-interval: 1000 + --eval-iters: 10 + --distributed-backend: nccl + --data-cache-path: ${DATA_CACHE_PATH} + --encoder-pipeline-model-parallel-size: 0 + --deterministic-mode: true + --ckpt-format: torch_dist +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp4_pp1_resume_torch_dist_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp4_pp1_resume_torch_dist_dgx_a100_1N8G/model_config.yaml new file mode 100644 index 0000000000..540d4c1b73 --- /dev/null +++ b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_te_tp4_pp1_resume_torch_dist_dgx_a100_1N8G/model_config.yaml @@ -0,0 +1,55 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: ^NVLS + CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 +MODEL_ARGS: + --encoder-num-layers: 12 + --decoder-num-layers: 12 + --hidden-size: 768 + --num-attention-heads: 12 + --kv-channels: 64 + --ffn-hidden-size: 3072 + --encoder-seq-length: 512 + --decoder-seq-length: 128 + --max-position-embeddings: 512 + --tensor-model-parallel-size: 4 + --pipeline-model-parallel-size: 1 + --micro-batch-size: 4 + --global-batch-size: 32 + --lr: 0.0001 + --train-iters: 100 + --lr-decay-iters: 100 + --lr-decay-style: linear + --min-lr: 0.00001 + --weight-decay: 1e-2 + --lr-warmup-fraction: .01 + --clip-grad: 1.0 + --bf16: true + --vocab-extra-ids: 100 + --init-method-std: 0.015 + --transformer-impl: transformer_engine + --data-path: ${DATA_PATH}/my-t5_00_text_document + --vocab-file: ${DATA_PATH}/bert-large-cased-vocab.txt + --tokenizer-type: BertWordPieceCase + --calculate-per-token-loss: true + --split: 99982,9,9 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --tensorboard-dir: ${TENSORBOARD_PATH} + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --timing-log-level: 2 + --log-interval: 1 + --save-interval: 50 + --eval-interval: 1000 + --eval-iters: 10 + --distributed-backend: nccl + --data-cache-path: ${DATA_CACHE_PATH} + --encoder-pipeline-model-parallel-size: 0 + --deterministic-mode: true + --ckpt-format: torch_dist +TEST_TYPE: ckpt-resume diff --git a/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp2_pp2_dgx_a100_1N8G/golden_values_dev.json b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp2_pp2_dgx_a100_1N8G/golden_values_dev.json new file mode 100644 index 0000000000..290f72fa54 --- /dev/null +++ b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp2_pp2_dgx_a100_1N8G/golden_values_dev.json @@ -0,0 +1,83 @@ +{ + "lm loss": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 10.41501, + 9.20443, + 8.62112, + 8.34419, + 8.08444, + 7.96918, + 7.68094, + 7.39407, + 7.26111, + 7.1912, + 7.30986, + 7.16621, + 7.05948, + 6.99431, + 6.85598, + 6.93101, + 6.95451, + 7.02449, + 6.66498, + 6.93853 + ] + }, + "num-zeros": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 115751.0, + 111072.0, + 117055.0, + 112398.0, + 118711.0, + 116945.0, + 111371.0, + 114003.0, + 118481.0, + 116960.0, + 111515.0, + 
115593.0, + 108487.0, + 119963.0, + 115753.0, + 116928.0, + 119834.0, + 120372.0, + 121397.0, + 118441.0 + ] + }, + "iteration-time": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 18.38831, + 0.62692, + 0.62068, + 0.61881, + 0.61978, + 0.61894, + 0.62198, + 0.61769, + 0.61719, + 0.62601, + 0.61805, + 0.632, + 0.62219, + 0.63216, + 0.63182, + 0.63347, + 0.62385, + 0.62046, + 0.61824, + 0.61793 + ] + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp2_pp2_dgx_a100_1N8G/golden_values_lts.json b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp2_pp2_dgx_a100_1N8G/golden_values_lts.json new file mode 100644 index 0000000000..67e211c04f --- /dev/null +++ b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp2_pp2_dgx_a100_1N8G/golden_values_lts.json @@ -0,0 +1 @@ +{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.41501, 9.20443, 8.62112, 8.34419, 8.08454, 7.96905, 7.68086, 7.39418, 7.26109, 7.19122]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [115751.0, 111072.0, 117055.0, 112398.0, 118712.0, 116944.0, 111387.0, 114025.0, 118464.0, 116959.0]}, "iteration_timing_avg": 0.2253964705882353} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp2_pp2_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp2_pp2_dgx_a100_1N8G/model_config.yaml new file mode 100644 index 0000000000..8abace27d3 --- /dev/null +++ b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp2_pp2_dgx_a100_1N8G/model_config.yaml @@ -0,0 +1,55 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: ^NVLS + CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 +MODEL_ARGS: + --encoder-num-layers: 12 + --decoder-num-layers: 12 + --hidden-size: 768 + --num-attention-heads: 12 + --kv-channels: 64 + --ffn-hidden-size: 3072 + --encoder-seq-length: 512 + --decoder-seq-length: 128 + --max-position-embeddings: 512 + --tensor-model-parallel-size: 2 + --pipeline-model-parallel-size: 2 + --micro-batch-size: 4 + --global-batch-size: 32 + --lr: 0.0001 + --train-iters: 100 + --lr-decay-iters: 100 + --lr-decay-style: linear + --min-lr: 0.00001 + --weight-decay: 1e-2 + --lr-warmup-fraction: .01 + --clip-grad: 1.0 + --bf16: true + --vocab-extra-ids: 100 + --init-method-std: 0.015 + --transformer-impl: local + --data-path: ${DATA_PATH}/my-t5_00_text_document + --vocab-file: ${DATA_PATH}/bert-large-cased-vocab.txt + --tokenizer-type: BertWordPieceCase + --calculate-per-token-loss: true + --split: 99982,9,9 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --tensorboard-dir: ${TENSORBOARD_PATH} + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --timing-log-level: 2 + --log-interval: 1 + --save-interval: 10000 + --eval-interval: 1000 + --eval-iters: 10 + --distributed-backend: nccl + --data-cache-path: ${DATA_CACHE_PATH} + --encoder-pipeline-model-parallel-size: 2 + --deterministic-mode: true + --ckpt-format: torch +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp2_pp2_resume_torch_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp2_pp2_resume_torch_dgx_a100_1N8G/model_config.yaml new file mode 100644 index 0000000000..c1a6d51bf1 --- /dev/null +++ 
b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp2_pp2_resume_torch_dgx_a100_1N8G/model_config.yaml @@ -0,0 +1,55 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: ^NVLS + CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 +MODEL_ARGS: + --encoder-num-layers: 12 + --decoder-num-layers: 12 + --hidden-size: 768 + --num-attention-heads: 12 + --kv-channels: 64 + --ffn-hidden-size: 3072 + --encoder-seq-length: 512 + --decoder-seq-length: 128 + --max-position-embeddings: 512 + --tensor-model-parallel-size: 2 + --pipeline-model-parallel-size: 2 + --micro-batch-size: 4 + --global-batch-size: 32 + --lr: 0.0001 + --train-iters: 100 + --lr-decay-iters: 100 + --lr-decay-style: linear + --min-lr: 0.00001 + --weight-decay: 1e-2 + --lr-warmup-fraction: .01 + --clip-grad: 1.0 + --bf16: true + --vocab-extra-ids: 100 + --init-method-std: 0.015 + --transformer-impl: local + --data-path: ${DATA_PATH}/my-t5_00_text_document + --vocab-file: ${DATA_PATH}/bert-large-cased-vocab.txt + --tokenizer-type: BertWordPieceCase + --calculate-per-token-loss: true + --split: 99982,9,9 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --tensorboard-dir: ${TENSORBOARD_PATH} + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --timing-log-level: 2 + --log-interval: 1 + --save-interval: 50 + --eval-interval: 1000 + --eval-iters: 10 + --distributed-backend: nccl + --data-cache-path: ${DATA_CACHE_PATH} + --encoder-pipeline-model-parallel-size: 2 + --deterministic-mode: true + --ckpt-format: torch +TEST_TYPE: ckpt-resume diff --git a/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp4_pp1_dgx_a100_1N8G/golden_values_dev.json b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp4_pp1_dgx_a100_1N8G/golden_values_dev.json new file mode 100644 index 0000000000..d752d31b3a --- /dev/null +++ b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp4_pp1_dgx_a100_1N8G/golden_values_dev.json @@ -0,0 +1,83 @@ +{ + "lm loss": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 10.32658, + 9.41413, + 8.86432, + 8.56546, + 8.2877, + 8.1035, + 7.83646, + 7.5377, + 7.39282, + 7.29333, + 7.37736, + 7.22498, + 7.11249, + 7.06739, + 6.91817, + 6.96674, + 6.97821, + 7.0494, + 6.72101, + 6.98229 + ] + }, + "num-zeros": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 43310.0, + 40943.0, + 43952.0, + 41616.0, + 44789.0, + 43937.0, + 41093.0, + 42468.0, + 44652.0, + 43894.0, + 41154.0, + 43226.0, + 39719.0, + 45362.0, + 43332.0, + 43913.0, + 45362.0, + 45695.0, + 46170.0, + 44701.0 + ] + }, + "iteration-time": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 11.09527, + 0.74337, + 0.74502, + 0.74411, + 1.06685, + 0.74366, + 0.74354, + 0.74287, + 0.7419, + 0.74299, + 1.02516, + 0.74651, + 0.74175, + 0.74347, + 0.7457, + 0.74253, + 0.74391, + 0.74341, + 0.74261, + 0.74236 + ] + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp4_pp1_dgx_a100_1N8G/golden_values_lts.json b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp4_pp1_dgx_a100_1N8G/golden_values_lts.json new file mode 100644 index 0000000000..d932464f76 --- /dev/null +++ b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp4_pp1_dgx_a100_1N8G/golden_values_lts.json @@ -0,0 +1,763 @@ +{ + "forward-backward-time": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 
14.18678, + 0.67885, + 0.68278, + 0.68333, + 0.67855, + 0.68179, + 0.68809, + 0.67808, + 0.67889, + 0.69586, + 0.69577, + 0.67938, + 0.68076, + 0.68551, + 0.69108, + 0.67821, + 0.68422, + 0.68947, + 0.67891, + 0.68614 + ] + }, + "forward-compute-time": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 8.91183, + 0.31386, + 0.31455, + 0.31529, + 0.31399, + 0.31376, + 0.3168, + 0.31219, + 0.31205, + 0.32539, + 0.32943, + 0.31424, + 0.31569, + 0.32161, + 0.32188, + 0.31166, + 0.31627, + 0.31935, + 0.31029, + 0.32078 + ] + }, + "backward-compute-time": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 4.25414, + 0.3682, + 0.37658, + 0.37755, + 0.37333, + 0.37381, + 0.37727, + 0.37278, + 0.37206, + 0.37541, + 0.37183, + 0.37214, + 0.37101, + 0.37247, + 0.37485, + 0.36955, + 0.37359, + 0.3825, + 0.37545, + 0.37777 + ] + }, + "layernorm-grads-all-reduce-time": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 0.00004, + 0.00003, + 0.00003, + 0.00002, + 0.00002, + 0.00002, + 0.00002, + 0.00003, + 0.00002, + 0.00003, + 0.00002, + 0.00003, + 0.00002, + 0.00002, + 0.00004, + 0.00003, + 0.00002, + 0.00002, + 0.00002, + 0.00002 + ] + }, + "embedding-grads-all-reduce-time": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 0.00005, + 0.00004, + 0.00004, + 0.00004, + 0.00004, + 0.00003, + 0.00003, + 0.00004, + 0.00004, + 0.00003, + 0.00003, + 0.00004, + 0.00004, + 0.00004, + 0.00004, + 0.00003, + 0.00003, + 0.00003, + 0.00003, + 0.00003 + ] + }, + "all-grads-sync-time": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 0.9061, + 0.00163, + 0.00202, + 0.00163, + 0.00157, + 0.00156, + 0.00183, + 0.0016, + 0.00183, + 0.00157, + 0.00157, + 0.00158, + 0.00168, + 0.00158, + 0.00169, + 0.00156, + 0.00157, + 0.00157, + 0.00156, + 0.00185 + ] + }, + "optimizer-copy-to-main-grad-time": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 0.0011, + 0.00104, + 0.00102, + 0.00101, + 0.00097, + 0.00098, + 0.001, + 0.00096, + 0.00096, + 0.00099, + 0.00095, + 0.00097, + 0.00096, + 0.00098, + 0.00097, + 0.00098, + 0.00095, + 0.00099, + 0.00098, + 0.00099 + ] + }, + "optimizer-clip-main-grad-time": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 1.59317, + 0.00265, + 0.00282, + 0.00284, + 0.00289, + 0.00298, + 0.00282, + 0.00294, + 0.00302, + 0.00301, + 0.00304, + 0.00294, + 0.00253, + 0.00296, + 0.00251, + 0.00227, + 0.00282, + 0.00287, + 0.00308, + 0.00276 + ] + }, + "optimizer-count-zeros-time": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 0.04375, + 0.02396, + 0.02387, + 0.02381, + 0.02385, + 0.02393, + 0.0241, + 0.02406, + 0.02393, + 0.024, + 0.02396, + 0.024, + 0.0241, + 0.02397, + 0.024, + 0.02378, + 0.0238, + 0.02393, + 0.02395, + 0.02405 + ] + }, + "optimizer-inner-step-time": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 0.01715, + 0.00212, + 0.0021, + 0.00212, + 0.00212, + 0.00211, + 0.00218, + 0.00213, + 0.00212, + 0.00214, + 0.00211, + 0.00226, + 0.00211, + 0.00209, + 0.00211, + 0.00218, + 0.00207, + 0.00211, + 0.00213, + 0.00218 + ] + }, + "optimizer-copy-main-to-model-params-time": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 0.00281, + 0.00282, + 0.00281, + 0.00283, + 0.00281, + 0.00283, + 0.00289, + 0.00286, + 0.00281, + 0.00284, + 0.00282, + 0.00431, + 0.00295, + 0.00284, + 0.00283, + 0.00283, + 0.18259, + 0.00284, + 
0.00283, + 0.00295 + ] + }, + "optimizer-time": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 1.65881, + 0.03322, + 0.03326, + 0.03323, + 0.03329, + 0.03345, + 0.03361, + 0.03357, + 0.03352, + 0.03364, + 0.03349, + 0.03532, + 0.03332, + 0.03347, + 0.03313, + 0.03267, + 0.21285, + 0.03336, + 0.03358, + 0.03357 + ] + }, + "learning-rate": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 0.0001, + 0.0001, + 0.00009, + 0.00009, + 0.00008, + 0.00008, + 0.00007, + 0.00007, + 0.00006, + 0.00006, + 0.00005, + 0.00005, + 0.00005, + 0.00004, + 0.00004, + 0.00003, + 0.00003, + 0.00002, + 0.00002, + 0.00001 + ] + }, + "learning-rate vs samples": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 0.0001, + 0.0001, + 0.00009, + 0.00009, + 0.00008, + 0.00008, + 0.00007, + 0.00007, + 0.00006, + 0.00006, + 0.00005, + 0.00005, + 0.00005, + 0.00004, + 0.00004, + 0.00003, + 0.00003, + 0.00002, + 0.00002, + 0.00001 + ] + }, + "batch-size": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32 + ] + }, + "batch-size vs samples": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32, + 32 + ] + }, + "lm loss": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 10.3267, + 9.41409, + 8.86422, + 8.56557, + 8.28779, + 8.10356, + 7.83669, + 7.53761, + 7.39304, + 7.29344, + 7.37755, + 7.22522, + 7.11288, + 7.06761, + 6.91847, + 6.96686, + 6.97827, + 7.04883, + 6.72143, + 6.98255 + ] + }, + "lm loss vs samples": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 10.3267, + 9.41409, + 8.86422, + 8.56557, + 8.28779, + 8.10356, + 7.83669, + 7.53761, + 7.39304, + 7.29344, + 7.37755, + 7.22522, + 7.11288, + 7.06761, + 6.91847, + 6.96686, + 6.97827, + 7.04883, + 6.72143, + 6.98255 + ] + }, + "loss-scale": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1 + ] + }, + "loss-scale vs samples": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1 + ] + }, + "grad-norm": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 21.2635, + 2.17416, + 2.50475, + 2.08972, + 1.9252, + 1.69975, + 1.63606, + 1.57261, + 1.48503, + 1.29641, + 1.00944, + 1.01609, + 0.95592, + 1.04635, + 0.94502, + 0.7775, + 1.07117, + 1.16813, + 1.12672, + 0.85024 + ] + }, + "grad-norm vs samples": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 21.2635, + 2.17416, + 2.50475, + 2.08972, + 1.9252, + 1.69975, + 1.63606, + 1.57261, + 1.48503, + 1.29641, + 1.00944, + 1.01609, + 0.95592, + 1.04635, + 0.94502, + 0.7775, + 1.07117, + 1.16813, + 1.12672, + 0.85024 + ] + }, + "num-zeros": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 43318, + 40956, + 43957, + 41617, + 44756, + 43946, + 41064, + 42479, + 44668, + 43904, + 41151, + 43235, + 39712, + 45373, + 43360, + 43896, + 45353, + 45682, + 46166, + 44693 + ] + }, + "num-zeros vs samples": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 43318, + 
40956, + 43957, + 41617, + 44756, + 43946, + 41064, + 42479, + 44668, + 43904, + 41151, + 43235, + 39712, + 45373, + 43360, + 43896, + 45353, + 45682, + 46166, + 44693 + ] + }, + "params-norm": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 283.80362, + 283.8273, + 283.86469, + 283.90527, + 283.95059, + 284.00024, + 284.05206, + 284.10507, + 284.15643, + 284.20459, + 284.25775, + 284.30685, + 284.34851, + 284.38309, + 284.41144, + 284.43536, + 284.45441, + 284.46985, + 284.48169, + 284.49057 + ] + }, + "params-norm vs samples": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 283.80362, + 283.8273, + 283.86469, + 283.90527, + 283.95059, + 284.00024, + 284.05206, + 284.10507, + 284.15643, + 284.20459, + 284.25775, + 284.30685, + 284.34851, + 284.38309, + 284.41144, + 284.43536, + 284.45441, + 284.46985, + 284.48169, + 284.49057 + ] + }, + "iteration-time": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 15.87098, + 0.73261, + 0.73669, + 0.73696, + 0.73228, + 0.73561, + 0.74191, + 0.73193, + 0.73279, + 0.75004, + 0.74974, + 0.73772, + 0.73447, + 0.73951, + 0.74553, + 0.73119, + 0.9162, + 0.74318, + 0.73275, + 0.74014 + ] + }, + "lm loss validation": { + "start_step": 0, + "end_step": 2, + "step_interval": 5, + "values": [ + 6.92026 + ] + }, + "lm loss validation vs samples": { + "start_step": 0, + "end_step": 2, + "step_interval": 5, + "values": [ + 6.92026 + ] + }, + "lm loss validation ppl": { + "start_step": 0, + "end_step": 2, + "step_interval": 5, + "values": [ + 1012.58173 + ] + }, + "lm loss validation ppl vs samples": { + "start_step": 0, + "end_step": 2, + "step_interval": 5, + "values": [ + 1012.58173 + ] + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp4_pp1_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp4_pp1_dgx_a100_1N8G/model_config.yaml new file mode 100644 index 0000000000..6aae44ca71 --- /dev/null +++ b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp4_pp1_dgx_a100_1N8G/model_config.yaml @@ -0,0 +1,55 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: ^NVLS + CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 +MODEL_ARGS: + --encoder-num-layers: 12 + --decoder-num-layers: 12 + --hidden-size: 768 + --num-attention-heads: 12 + --kv-channels: 64 + --ffn-hidden-size: 3072 + --encoder-seq-length: 512 + --decoder-seq-length: 128 + --max-position-embeddings: 512 + --tensor-model-parallel-size: 4 + --pipeline-model-parallel-size: 1 + --micro-batch-size: 4 + --global-batch-size: 32 + --lr: 0.0001 + --train-iters: 100 + --lr-decay-iters: 100 + --lr-decay-style: linear + --min-lr: 0.00001 + --weight-decay: 1e-2 + --lr-warmup-fraction: .01 + --clip-grad: 1.0 + --bf16: true + --vocab-extra-ids: 100 + --init-method-std: 0.015 + --transformer-impl: local + --data-path: ${DATA_PATH}/my-t5_00_text_document + --vocab-file: ${DATA_PATH}/bert-large-cased-vocab.txt + --tokenizer-type: BertWordPieceCase + --calculate-per-token-loss: true + --split: 99982,9,9 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --tensorboard-dir: ${TENSORBOARD_PATH} + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --timing-log-level: 2 + --log-interval: 1 + --save-interval: 10000 + --eval-interval: 1000 + --eval-iters: 10 + --distributed-backend: nccl + --data-cache-path: 
${DATA_CACHE_PATH} + --encoder-pipeline-model-parallel-size: 0 + --deterministic-mode: true + --ckpt-format: torch_dist +TEST_TYPE: regular diff --git a/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp4_pp1_resume_torch_dist_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp4_pp1_resume_torch_dist_dgx_a100_1N8G/model_config.yaml new file mode 100644 index 0000000000..6e9731d4ce --- /dev/null +++ b/tests/functional_tests/test_cases/t5/t5_220m_mr_mcore_tp4_pp1_resume_torch_dist_dgx_a100_1N8G/model_config.yaml @@ -0,0 +1,55 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: ^NVLS + CUBLAS_WORKSPACE_CONFIG: :4096:8 + N_REPEATS: 5 +MODEL_ARGS: + --encoder-num-layers: 12 + --decoder-num-layers: 12 + --hidden-size: 768 + --num-attention-heads: 12 + --kv-channels: 64 + --ffn-hidden-size: 3072 + --encoder-seq-length: 512 + --decoder-seq-length: 128 + --max-position-embeddings: 512 + --tensor-model-parallel-size: 4 + --pipeline-model-parallel-size: 1 + --micro-batch-size: 4 + --global-batch-size: 32 + --lr: 0.0001 + --train-iters: 100 + --lr-decay-iters: 100 + --lr-decay-style: linear + --min-lr: 0.00001 + --weight-decay: 1e-2 + --lr-warmup-fraction: .01 + --clip-grad: 1.0 + --bf16: true + --vocab-extra-ids: 100 + --init-method-std: 0.015 + --transformer-impl: local + --data-path: ${DATA_PATH}/my-t5_00_text_document + --vocab-file: ${DATA_PATH}/bert-large-cased-vocab.txt + --tokenizer-type: BertWordPieceCase + --calculate-per-token-loss: true + --split: 99982,9,9 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --tensorboard-dir: ${TENSORBOARD_PATH} + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --timing-log-level: 2 + --log-interval: 1 + --save-interval: 50 + --eval-interval: 1000 + --eval-iters: 10 + --distributed-backend: nccl + --data-cache-path: ${DATA_CACHE_PATH} + --encoder-pipeline-model-parallel-size: 0 + --deterministic-mode: true + --ckpt-format: torch_dist +TEST_TYPE: ckpt-resume diff --git a/tests/functional_tests/test_cases/t5/t5_220m_weekly_dgx_a100_1N8G_mcore_te_tp1_pp1_vp1_resume_torch/model_config.yaml b/tests/functional_tests/test_cases/t5/t5_220m_weekly_dgx_a100_1N8G_mcore_te_tp1_pp1_vp1_resume_torch/model_config.yaml new file mode 100644 index 0000000000..6556baeb59 --- /dev/null +++ b/tests/functional_tests/test_cases/t5/t5_220m_weekly_dgx_a100_1N8G_mcore_te_tp1_pp1_vp1_resume_torch/model_config.yaml @@ -0,0 +1,54 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: ^NVLS + CUBLAS_WORKSPACE_CONFIG: :4096:8 +MODEL_ARGS: + --encoder-num-layers: 12 + --decoder-num-layers: 12 + --hidden-size: 768 + --num-attention-heads: 12 + --kv-channels: 64 + --ffn-hidden-size: 3072 + --encoder-seq-length: 512 + --decoder-seq-length: 128 + --max-position-embeddings: 512 + --tensor-model-parallel-size: 1 + --pipeline-model-parallel-size: 1 + --micro-batch-size: 4 + --global-batch-size: 32 + --lr: 0.0001 + --train-iters: 100 + --lr-decay-iters: 100 + --lr-decay-style: linear + --min-lr: 0.00001 + --weight-decay: 1e-2 + --lr-warmup-fraction: .01 + --clip-grad: 1.0 + --bf16: true + --vocab-extra-ids: 100 + --init-method-std: 0.015 + --transformer-impl: transformer_engine + --data-path: ${DATA_PATH}/my-t5_00_text_document + --vocab-file: ${DATA_PATH}/bert-large-cased-vocab.txt + --tokenizer-type: BertWordPieceCase + --calculate-per-token-loss: true + --split: 
99982,9,9 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --tensorboard-dir: ${TENSORBOARD_PATH} + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --timing-log-level: 2 + --log-interval: 1 + --save-interval: 50 + --eval-interval: 1000 + --eval-iters: 10 + --distributed-backend: nccl + --data-cache-path: ${DATA_CACHE_PATH} + --deterministic-mode: true + --attention-softmax-in-fp32: true + --ckpt-format: torch +TEST_TYPE: ckpt-resume \ No newline at end of file diff --git a/tests/functional_tests/test_cases/t5/t5_220m_weekly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1/golden_values_dev.json b/tests/functional_tests/test_cases/t5/t5_220m_weekly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1/golden_values_dev.json new file mode 100644 index 0000000000..cb39f6cc38 --- /dev/null +++ b/tests/functional_tests/test_cases/t5/t5_220m_weekly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1/golden_values_dev.json @@ -0,0 +1,83 @@ +{ + "lm loss": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 10.39855, + 9.41112, + 8.88304, + 8.56269, + 8.28765, + 8.10224, + 7.83813, + 7.53409, + 7.39411, + 7.28757, + 7.3679, + 7.22194, + 7.10575, + 7.0526, + 6.91422, + 6.96483, + 6.97306, + 7.03511, + 6.70374, + 6.97038 + ] + }, + "num-zeros": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 43312.0, + 40958.0, + 43972.0, + 41597.0, + 44750.0, + 43923.0, + 41262.0, + 42494.0, + 44656.0, + 43889.0, + 41161.0, + 43247.0, + 39676.0, + 45397.0, + 43316.0, + 43882.0, + 45349.0, + 45684.0, + 46190.0, + 44647.0 + ] + }, + "iteration-time": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 16.16815, + 0.59042, + 0.4284, + 0.43391, + 0.42668, + 0.42919, + 0.42816, + 0.43087, + 0.4328, + 0.42988, + 0.42869, + 0.42651, + 0.42621, + 0.43082, + 0.43114, + 0.42943, + 0.42758, + 0.43083, + 0.43032, + 0.43533 + ] + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/t5/t5_220m_weekly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1/golden_values_lts.json b/tests/functional_tests/test_cases/t5/t5_220m_weekly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1/golden_values_lts.json new file mode 100644 index 0000000000..cb39f6cc38 --- /dev/null +++ b/tests/functional_tests/test_cases/t5/t5_220m_weekly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1/golden_values_lts.json @@ -0,0 +1,83 @@ +{ + "lm loss": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 10.39855, + 9.41112, + 8.88304, + 8.56269, + 8.28765, + 8.10224, + 7.83813, + 7.53409, + 7.39411, + 7.28757, + 7.3679, + 7.22194, + 7.10575, + 7.0526, + 6.91422, + 6.96483, + 6.97306, + 7.03511, + 6.70374, + 6.97038 + ] + }, + "num-zeros": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 43312.0, + 40958.0, + 43972.0, + 41597.0, + 44750.0, + 43923.0, + 41262.0, + 42494.0, + 44656.0, + 43889.0, + 41161.0, + 43247.0, + 39676.0, + 45397.0, + 43316.0, + 43882.0, + 45349.0, + 45684.0, + 46190.0, + 44647.0 + ] + }, + "iteration-time": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 16.16815, + 0.59042, + 0.4284, + 0.43391, + 0.42668, + 0.42919, + 0.42816, + 0.43087, + 0.4328, + 0.42988, + 0.42869, + 0.42651, + 0.42621, + 0.43082, + 0.43114, + 0.42943, + 0.42758, + 0.43083, + 0.43032, + 0.43533 + ] + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/t5/t5_220m_weekly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1/model_config.yaml 
b/tests/functional_tests/test_cases/t5/t5_220m_weekly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1/model_config.yaml new file mode 100644 index 0000000000..70077b84a9 --- /dev/null +++ b/tests/functional_tests/test_cases/t5/t5_220m_weekly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1/model_config.yaml @@ -0,0 +1,54 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: ^NVLS + CUBLAS_WORKSPACE_CONFIG: :4096:8 +MODEL_ARGS: + --encoder-num-layers: 12 + --decoder-num-layers: 12 + --hidden-size: 768 + --num-attention-heads: 12 + --kv-channels: 64 + --ffn-hidden-size: 3072 + --encoder-seq-length: 512 + --decoder-seq-length: 128 + --max-position-embeddings: 512 + --tensor-model-parallel-size: 2 + --pipeline-model-parallel-size: 1 + --micro-batch-size: 4 + --global-batch-size: 32 + --lr: 0.0001 + --train-iters: 100 + --lr-decay-iters: 100 + --lr-decay-style: linear + --min-lr: 0.00001 + --weight-decay: 1e-2 + --lr-warmup-fraction: .01 + --clip-grad: 1.0 + --bf16: true + --vocab-extra-ids: 100 + --init-method-std: 0.015 + --transformer-impl: transformer_engine + --data-path: ${DATA_PATH}/my-t5_00_text_document + --vocab-file: ${DATA_PATH}/bert-large-cased-vocab.txt + --tokenizer-type: BertWordPieceCase + --calculate-per-token-loss: true + --split: 99982,9,9 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --tensorboard-dir: ${TENSORBOARD_PATH} + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --timing-log-level: 2 + --log-interval: 1 + --save-interval: 10000 + --eval-interval: 1000 + --eval-iters: 10 + --distributed-backend: nccl + --data-cache-path: ${DATA_CACHE_PATH} + --deterministic-mode: true + --attention-softmax-in-fp32: true + --ckpt-format: torch +TEST_TYPE: regular \ No newline at end of file diff --git a/tests/functional_tests/test_cases/t5/t5_220m_weekly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1_sequence_parallel/golden_values_dev.json b/tests/functional_tests/test_cases/t5/t5_220m_weekly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1_sequence_parallel/golden_values_dev.json new file mode 100644 index 0000000000..021c054969 --- /dev/null +++ b/tests/functional_tests/test_cases/t5/t5_220m_weekly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1_sequence_parallel/golden_values_dev.json @@ -0,0 +1,83 @@ +{ + "lm loss": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 10.39236, + 9.4128, + 8.88319, + 8.56427, + 8.29039, + 8.10532, + 7.84044, + 7.53655, + 7.39743, + 7.28828, + 7.36794, + 7.22149, + 7.10817, + 7.05287, + 6.92212, + 6.96976, + 6.98418, + 7.04401, + 6.71005, + 6.97246 + ] + }, + "num-zeros": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 43310.0, + 40945.0, + 43941.0, + 41610.0, + 44749.0, + 43933.0, + 41233.0, + 42463.0, + 44633.0, + 43892.0, + 41120.0, + 43253.0, + 39705.0, + 45385.0, + 43275.0, + 43884.0, + 45347.0, + 45687.0, + 46131.0, + 44708.0 + ] + }, + "iteration-time": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 13.97669, + 0.63681, + 0.47949, + 0.48069, + 0.46755, + 0.4765, + 0.47458, + 0.46609, + 0.48646, + 0.47931, + 0.46563, + 0.47271, + 0.49037, + 0.46898, + 0.47713, + 0.472, + 0.46796, + 0.47359, + 0.47799, + 0.46934 + ] + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/t5/t5_220m_weekly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1_sequence_parallel/golden_values_lts.json 
b/tests/functional_tests/test_cases/t5/t5_220m_weekly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1_sequence_parallel/golden_values_lts.json new file mode 100644 index 0000000000..021c054969 --- /dev/null +++ b/tests/functional_tests/test_cases/t5/t5_220m_weekly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1_sequence_parallel/golden_values_lts.json @@ -0,0 +1,83 @@ +{ + "lm loss": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 10.39236, + 9.4128, + 8.88319, + 8.56427, + 8.29039, + 8.10532, + 7.84044, + 7.53655, + 7.39743, + 7.28828, + 7.36794, + 7.22149, + 7.10817, + 7.05287, + 6.92212, + 6.96976, + 6.98418, + 7.04401, + 6.71005, + 6.97246 + ] + }, + "num-zeros": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 43310.0, + 40945.0, + 43941.0, + 41610.0, + 44749.0, + 43933.0, + 41233.0, + 42463.0, + 44633.0, + 43892.0, + 41120.0, + 43253.0, + 39705.0, + 45385.0, + 43275.0, + 43884.0, + 45347.0, + 45687.0, + 46131.0, + 44708.0 + ] + }, + "iteration-time": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 13.97669, + 0.63681, + 0.47949, + 0.48069, + 0.46755, + 0.4765, + 0.47458, + 0.46609, + 0.48646, + 0.47931, + 0.46563, + 0.47271, + 0.49037, + 0.46898, + 0.47713, + 0.472, + 0.46796, + 0.47359, + 0.47799, + 0.46934 + ] + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/t5/t5_220m_weekly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1_sequence_parallel/model_config.yaml b/tests/functional_tests/test_cases/t5/t5_220m_weekly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1_sequence_parallel/model_config.yaml new file mode 100644 index 0000000000..3a1793957b --- /dev/null +++ b/tests/functional_tests/test_cases/t5/t5_220m_weekly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1_sequence_parallel/model_config.yaml @@ -0,0 +1,55 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: ^NVLS + CUBLAS_WORKSPACE_CONFIG: :4096:8 +MODEL_ARGS: + --encoder-num-layers: 12 + --decoder-num-layers: 12 + --hidden-size: 768 + --num-attention-heads: 12 + --kv-channels: 64 + --ffn-hidden-size: 3072 + --encoder-seq-length: 512 + --decoder-seq-length: 128 + --max-position-embeddings: 512 + --tensor-model-parallel-size: 2 + --pipeline-model-parallel-size: 1 + --micro-batch-size: 4 + --global-batch-size: 32 + --lr: 0.0001 + --train-iters: 100 + --lr-decay-iters: 100 + --lr-decay-style: linear + --min-lr: 0.00001 + --weight-decay: 1e-2 + --lr-warmup-fraction: .01 + --clip-grad: 1.0 + --bf16: true + --vocab-extra-ids: 100 + --init-method-std: 0.015 + --transformer-impl: transformer_engine + --data-path: ${DATA_PATH}/my-t5_00_text_document + --vocab-file: ${DATA_PATH}/bert-large-cased-vocab.txt + --tokenizer-type: BertWordPieceCase + --calculate-per-token-loss: true + --split: 99982,9,9 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --tensorboard-dir: ${TENSORBOARD_PATH} + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --timing-log-level: 2 + --log-interval: 1 + --save-interval: 10000 + --eval-interval: 1000 + --eval-iters: 10 + --distributed-backend: nccl + --data-cache-path: ${DATA_CACHE_PATH} + --sequence-parallel: true + --deterministic-mode: true + --attention-softmax-in-fp32: true + --ckpt-format: torch +TEST_TYPE: regular \ No newline at end of file diff --git a/tests/functional_tests/test_cases/t5/t5_220m_weekly_dgx_a100_1N8G_mcore_tp1_pp1_vp1/golden_values_dev.json 
b/tests/functional_tests/test_cases/t5/t5_220m_weekly_dgx_a100_1N8G_mcore_tp1_pp1_vp1/golden_values_dev.json new file mode 100644 index 0000000000..bd1e72366c --- /dev/null +++ b/tests/functional_tests/test_cases/t5/t5_220m_weekly_dgx_a100_1N8G_mcore_tp1_pp1_vp1/golden_values_dev.json @@ -0,0 +1,83 @@ +{ + "lm loss": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 10.33709, + 9.42687, + 8.8634, + 8.56213, + 8.28406, + 8.10594, + 7.84882, + 7.53542, + 7.41068, + 7.29571, + 7.39283, + 7.2191, + 7.10262, + 7.04837, + 6.90357, + 6.96014, + 6.96438, + 7.03513, + 6.70023, + 6.96639 + ] + }, + "num-zeros": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 43334.0, + 41023.0, + 44021.0, + 41733.0, + 44803.0, + 43935.0, + 41268.0, + 42516.0, + 44710.0, + 43908.0, + 41143.0, + 43285.0, + 39763.0, + 45410.0, + 43315.0, + 43919.0, + 45394.0, + 45708.0, + 46319.0, + 44709.0 + ] + }, + "iteration-time": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 14.36472, + 0.24447, + 0.24436, + 0.23998, + 0.23902, + 0.38149, + 0.25367, + 0.23963, + 0.23768, + 0.23812, + 0.24016, + 0.23918, + 0.239, + 0.23853, + 0.23868, + 0.23858, + 0.23757, + 0.2428, + 0.24091, + 0.2352 + ] + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/t5/t5_220m_weekly_dgx_a100_1N8G_mcore_tp1_pp1_vp1/golden_values_lts.json b/tests/functional_tests/test_cases/t5/t5_220m_weekly_dgx_a100_1N8G_mcore_tp1_pp1_vp1/golden_values_lts.json new file mode 100644 index 0000000000..bd1e72366c --- /dev/null +++ b/tests/functional_tests/test_cases/t5/t5_220m_weekly_dgx_a100_1N8G_mcore_tp1_pp1_vp1/golden_values_lts.json @@ -0,0 +1,83 @@ +{ + "lm loss": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 10.33709, + 9.42687, + 8.8634, + 8.56213, + 8.28406, + 8.10594, + 7.84882, + 7.53542, + 7.41068, + 7.29571, + 7.39283, + 7.2191, + 7.10262, + 7.04837, + 6.90357, + 6.96014, + 6.96438, + 7.03513, + 6.70023, + 6.96639 + ] + }, + "num-zeros": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 43334.0, + 41023.0, + 44021.0, + 41733.0, + 44803.0, + 43935.0, + 41268.0, + 42516.0, + 44710.0, + 43908.0, + 41143.0, + 43285.0, + 39763.0, + 45410.0, + 43315.0, + 43919.0, + 45394.0, + 45708.0, + 46319.0, + 44709.0 + ] + }, + "iteration-time": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 14.36472, + 0.24447, + 0.24436, + 0.23998, + 0.23902, + 0.38149, + 0.25367, + 0.23963, + 0.23768, + 0.23812, + 0.24016, + 0.23918, + 0.239, + 0.23853, + 0.23868, + 0.23858, + 0.23757, + 0.2428, + 0.24091, + 0.2352 + ] + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/t5/t5_220m_weekly_dgx_a100_1N8G_mcore_tp1_pp1_vp1/model_config.yaml b/tests/functional_tests/test_cases/t5/t5_220m_weekly_dgx_a100_1N8G_mcore_tp1_pp1_vp1/model_config.yaml new file mode 100644 index 0000000000..233023af31 --- /dev/null +++ b/tests/functional_tests/test_cases/t5/t5_220m_weekly_dgx_a100_1N8G_mcore_tp1_pp1_vp1/model_config.yaml @@ -0,0 +1,53 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: ^NVLS + CUBLAS_WORKSPACE_CONFIG: :4096:8 +MODEL_ARGS: + --encoder-num-layers: 12 + --decoder-num-layers: 12 + --hidden-size: 768 + --num-attention-heads: 12 + --kv-channels: 64 + --ffn-hidden-size: 3072 + --encoder-seq-length: 512 + --decoder-seq-length: 128 + --max-position-embeddings: 512 + --tensor-model-parallel-size: 1 + 
--pipeline-model-parallel-size: 1 + --micro-batch-size: 4 + --global-batch-size: 32 + --lr: 0.0001 + --train-iters: 100 + --lr-decay-iters: 100 + --lr-decay-style: linear + --min-lr: 0.00001 + --weight-decay: 1e-2 + --lr-warmup-fraction: .01 + --clip-grad: 1.0 + --bf16: true + --vocab-extra-ids: 100 + --init-method-std: 0.015 + --transformer-impl: local + --data-path: ${DATA_PATH}/my-t5_00_text_document + --vocab-file: ${DATA_PATH}/bert-large-cased-vocab.txt + --tokenizer-type: BertWordPieceCase + --calculate-per-token-loss: true + --split: 99982,9,9 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --tensorboard-dir: ${TENSORBOARD_PATH} + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --timing-log-level: 2 + --log-interval: 1 + --save-interval: 10000 + --eval-interval: 1000 + --eval-iters: 10 + --distributed-backend: nccl + --data-cache-path: ${DATA_CACHE_PATH} + --deterministic-mode: true + --ckpt-format: torch +TEST_TYPE: regular \ No newline at end of file diff --git a/tests/functional_tests/test_cases/t5/t5_220m_weekly_dgx_a100_1N8G_mcore_tp1_pp1_vp1_resume_torch/model_config.yaml b/tests/functional_tests/test_cases/t5/t5_220m_weekly_dgx_a100_1N8G_mcore_tp1_pp1_vp1_resume_torch/model_config.yaml new file mode 100644 index 0000000000..43afd73364 --- /dev/null +++ b/tests/functional_tests/test_cases/t5/t5_220m_weekly_dgx_a100_1N8G_mcore_tp1_pp1_vp1_resume_torch/model_config.yaml @@ -0,0 +1,53 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: ^NVLS + CUBLAS_WORKSPACE_CONFIG: :4096:8 +MODEL_ARGS: + --encoder-num-layers: 12 + --decoder-num-layers: 12 + --hidden-size: 768 + --num-attention-heads: 12 + --kv-channels: 64 + --ffn-hidden-size: 3072 + --encoder-seq-length: 512 + --decoder-seq-length: 128 + --max-position-embeddings: 512 + --tensor-model-parallel-size: 1 + --pipeline-model-parallel-size: 1 + --micro-batch-size: 4 + --global-batch-size: 32 + --lr: 0.0001 + --train-iters: 100 + --lr-decay-iters: 100 + --lr-decay-style: linear + --min-lr: 0.00001 + --weight-decay: 1e-2 + --lr-warmup-fraction: .01 + --clip-grad: 1.0 + --bf16: true + --vocab-extra-ids: 100 + --init-method-std: 0.015 + --transformer-impl: local + --data-path: ${DATA_PATH}/my-t5_00_text_document + --vocab-file: ${DATA_PATH}/bert-large-cased-vocab.txt + --tokenizer-type: BertWordPieceCase + --calculate-per-token-loss: true + --split: 99982,9,9 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --tensorboard-dir: ${TENSORBOARD_PATH} + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --timing-log-level: 2 + --log-interval: 1 + --save-interval: 50 + --eval-interval: 1000 + --eval-iters: 10 + --distributed-backend: nccl + --data-cache-path: ${DATA_CACHE_PATH} + --deterministic-mode: true + --ckpt-format: torch +TEST_TYPE: ckpt-resume \ No newline at end of file diff --git a/tests/functional_tests/test_cases/t5/t5_220m_weekly_dgx_a100_1N8G_mcore_tp2_pp1_vp1/golden_values_dev.json b/tests/functional_tests/test_cases/t5/t5_220m_weekly_dgx_a100_1N8G_mcore_tp2_pp1_vp1/golden_values_dev.json new file mode 100644 index 0000000000..3215a21156 --- /dev/null +++ b/tests/functional_tests/test_cases/t5/t5_220m_weekly_dgx_a100_1N8G_mcore_tp2_pp1_vp1/golden_values_dev.json @@ -0,0 +1,83 @@ +{ + "lm loss": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 10.39854, + 
9.41109, + 8.8833, + 8.56279, + 8.28765, + 8.10226, + 7.83824, + 7.53414, + 7.39426, + 7.28765, + 7.36798, + 7.22207, + 7.10595, + 7.05273, + 6.91414, + 6.96485, + 6.97279, + 7.03525, + 6.70355, + 6.97029 + ] + }, + "num-zeros": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 43320.0, + 40948.0, + 43971.0, + 41622.0, + 44740.0, + 43919.0, + 41231.0, + 42497.0, + 44664.0, + 43894.0, + 41149.0, + 43254.0, + 39687.0, + 45400.0, + 43313.0, + 43891.0, + 45351.0, + 45692.0, + 46187.0, + 44657.0 + ] + }, + "iteration-time": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 14.46368, + 0.41717, + 0.42344, + 0.4102, + 0.40332, + 0.40531, + 0.40418, + 0.40386, + 0.40711, + 0.4048, + 0.40536, + 0.40331, + 0.40175, + 0.4047, + 0.40982, + 0.40834, + 0.40594, + 0.40872, + 0.40896, + 0.41014 + ] + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/t5/t5_220m_weekly_dgx_a100_1N8G_mcore_tp2_pp1_vp1/golden_values_lts.json b/tests/functional_tests/test_cases/t5/t5_220m_weekly_dgx_a100_1N8G_mcore_tp2_pp1_vp1/golden_values_lts.json new file mode 100644 index 0000000000..3215a21156 --- /dev/null +++ b/tests/functional_tests/test_cases/t5/t5_220m_weekly_dgx_a100_1N8G_mcore_tp2_pp1_vp1/golden_values_lts.json @@ -0,0 +1,83 @@ +{ + "lm loss": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 10.39854, + 9.41109, + 8.8833, + 8.56279, + 8.28765, + 8.10226, + 7.83824, + 7.53414, + 7.39426, + 7.28765, + 7.36798, + 7.22207, + 7.10595, + 7.05273, + 6.91414, + 6.96485, + 6.97279, + 7.03525, + 6.70355, + 6.97029 + ] + }, + "num-zeros": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 43320.0, + 40948.0, + 43971.0, + 41622.0, + 44740.0, + 43919.0, + 41231.0, + 42497.0, + 44664.0, + 43894.0, + 41149.0, + 43254.0, + 39687.0, + 45400.0, + 43313.0, + 43891.0, + 45351.0, + 45692.0, + 46187.0, + 44657.0 + ] + }, + "iteration-time": { + "start_step": 0, + "end_step": 100, + "step_interval": 5, + "values": [ + 14.46368, + 0.41717, + 0.42344, + 0.4102, + 0.40332, + 0.40531, + 0.40418, + 0.40386, + 0.40711, + 0.4048, + 0.40536, + 0.40331, + 0.40175, + 0.4047, + 0.40982, + 0.40834, + 0.40594, + 0.40872, + 0.40896, + 0.41014 + ] + } +} \ No newline at end of file diff --git a/tests/functional_tests/test_cases/t5/t5_220m_weekly_dgx_a100_1N8G_mcore_tp2_pp1_vp1/model_config.yaml b/tests/functional_tests/test_cases/t5/t5_220m_weekly_dgx_a100_1N8G_mcore_tp2_pp1_vp1/model_config.yaml new file mode 100644 index 0000000000..47ff5b038b --- /dev/null +++ b/tests/functional_tests/test_cases/t5/t5_220m_weekly_dgx_a100_1N8G_mcore_tp2_pp1_vp1/model_config.yaml @@ -0,0 +1,53 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + NCCL_ALGO: ^NVLS + CUBLAS_WORKSPACE_CONFIG: :4096:8 +MODEL_ARGS: + --encoder-num-layers: 12 + --decoder-num-layers: 12 + --hidden-size: 768 + --num-attention-heads: 12 + --kv-channels: 64 + --ffn-hidden-size: 3072 + --encoder-seq-length: 512 + --decoder-seq-length: 128 + --max-position-embeddings: 512 + --tensor-model-parallel-size: 2 + --pipeline-model-parallel-size: 1 + --micro-batch-size: 4 + --global-batch-size: 32 + --lr: 0.0001 + --train-iters: 100 + --lr-decay-iters: 100 + --lr-decay-style: linear + --min-lr: 0.00001 + --weight-decay: 1e-2 + --lr-warmup-fraction: .01 + --clip-grad: 1.0 + --bf16: true + --vocab-extra-ids: 100 + --init-method-std: 0.015 + --transformer-impl: local + --data-path: ${DATA_PATH}/my-t5_00_text_document + --vocab-file: 
${DATA_PATH}/bert-large-cased-vocab.txt + --tokenizer-type: BertWordPieceCase + --calculate-per-token-loss: true + --split: 99982,9,9 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --tensorboard-dir: ${TENSORBOARD_PATH} + --log-params-norm: true + --log-num-zeros-in-grad: true + --log-validation-ppl-to-tensorboard: true + --log-timers-to-tensorboard: true + --timing-log-level: 2 + --log-interval: 1 + --save-interval: 10000 + --eval-interval: 1000 + --eval-iters: 10 + --distributed-backend: nccl + --data-cache-path: ${DATA_CACHE_PATH} + --deterministic-mode: true + --ckpt-format: torch +TEST_TYPE: regular \ No newline at end of file diff --git a/tests/functional_tests/test_cases/t5/t5_release/model_config.yaml b/tests/functional_tests/test_cases/t5/t5_release/model_config.yaml new file mode 100644 index 0000000000..64784c36a6 --- /dev/null +++ b/tests/functional_tests/test_cases/t5/t5_release/model_config.yaml @@ -0,0 +1,67 @@ +ENV_VARS: + CUDA_DEVICE_MAX_CONNECTIONS: '1' + NVTE_ALLOW_NONDETERMINISTIC_ALGO: '1' + NVTE_FLASH_ATTN: '0' + NVTE_FUSED_ATTN: '0' + +TEST_TYPE: 'release' + +MODEL_ARGS: + # T5 model args + --encoder-num-layers: 12 + --decoder-num-layers: 12 + --hidden-size: 768 + --num-attention-heads: 12 + --kv-channels: 64 + --ffn-hidden-size: 3072 + --encoder-seq-length: 512 + --decoder-seq-length: 128 + --max-position-embeddings: 512 + --init-method-std: 0.015 + + # Training args + --micro-batch-size: 32 + --global-batch-size: 512 + --train-iters: 100000 + --weight-decay: 1e-2 + --clip-grad: 1.0 + --bf16: true + --lr: 0.0001 + --lr-decay-style: linear + --min-lr: 1.0e-5 + --lr-warmup-fraction: .01 + --distributed-backend: nccl + + # Transformer Engine args + --use-mcore-models: true + --transformer-impl: transformer_engine + + # Model parallel + --tensor-model-parallel-size: 4 + --pipeline-model-parallel-size: 1 + --encoder-pipeline-model-parallel-size: 0 + + # Data args + --data-path: ${DATA_BLEND} + --vocab-file: ${DATA_PATH}/bert-large-cased-vocab.txt + --tokenizer-type: BertWordPieceCase + --split: 99982,9,9 + --data-cache-path: ${DATA_CACHE_PATH} + --vocab-extra-ids: 100 + + # EVAL_AND_LOGGING_ARGS + --log-interval: 100 + --save-interval: 2000 + --eval-interval: 1000 + --save: ${CHECKPOINT_PATH} + --load: ${CHECKPOINT_PATH} + --eval-iters: 10 + --tensorboard-dir: ${TENSORBOARD_PATH} + --log-timers-to-tensorboard: true + --log-memory-to-tensorboard: true + --log-num-zeros-in-grad: true + --log-params-norm: true + --log-validation-ppl-to-tensorboard: true + --timing-log-level: 2 + --wandb-project: megatron-core-release-runs + --wandb-exp-name: ${WANDB_EXPERIMENT} \ No newline at end of file diff --git a/tests/functional_tests/test_results/bert/bert_tp1_pp2_1nodes_50steps.json b/tests/functional_tests/test_results/bert/bert_tp1_pp2_1nodes_50steps.json deleted file mode 100644 index 760aa31f4c..0000000000 --- a/tests/functional_tests/test_results/bert/bert_tp1_pp2_1nodes_50steps.json +++ /dev/null @@ -1 +0,0 @@ -{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.50444, 10.49325, 10.4863, 10.48386, 10.49892, 10.46644, 10.41921, 10.30106, 10.16285, 9.97939]}, "num-zeros": {"start_step": 0, "end_step": 34, "step_interval": 5, "values": [17438.0, 18815.0, 22912.0, 18568.0, 19900.0, 23810.0, 22918.0]}, "iteration_timing_avg": 0.35970588235294115} diff --git a/tests/functional_tests/test_results/bert/bert_tp1_pp4_1nodes_50steps.json b/tests/functional_tests/test_results/bert/bert_tp1_pp4_1nodes_50steps.json deleted file mode 100644 index 
2b5a223e7d..0000000000 --- a/tests/functional_tests/test_results/bert/bert_tp1_pp4_1nodes_50steps.json +++ /dev/null @@ -1 +0,0 @@ -{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.54369, 10.5383, 10.55953, 10.54011, 10.51908, 10.49118, 10.46612, 10.31901, 10.15649, 9.96702]}, "num-zeros": {"start_step": 0, "end_step": 34, "step_interval": 5, "values": [21736.0, 20433.0, 27243.0, 23240.0, 22459.0, 20724.0, 23451.0]}, "iteration_timing_avg": 0.8657461764705884} diff --git a/tests/functional_tests/test_results/bert/bert_tp2_pp2_1nodes_50steps.json b/tests/functional_tests/test_results/bert/bert_tp2_pp2_1nodes_50steps.json deleted file mode 100644 index e90891762f..0000000000 --- a/tests/functional_tests/test_results/bert/bert_tp2_pp2_1nodes_50steps.json +++ /dev/null @@ -1 +0,0 @@ -{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.44729, 10.44093, 10.45375, 10.44445, 10.44305, 10.44595, 10.39163, 10.25898, 10.13498, 9.95692]}, "num-zeros": {"start_step": 0, "end_step": 34, "step_interval": 5, "values": [27334.0, 20551.0, 28114.0, 24328.0, 24070.0, 20653.0, 21346.0]}, "iteration_timing_avg": 0.6318655882352939} diff --git a/tests/functional_tests/test_results/bert/bert_tp4_pp1_1nodes_50steps.json b/tests/functional_tests/test_results/bert/bert_tp4_pp1_1nodes_50steps.json deleted file mode 100644 index 2c4bafd5f2..0000000000 --- a/tests/functional_tests/test_results/bert/bert_tp4_pp1_1nodes_50steps.json +++ /dev/null @@ -1 +0,0 @@ -{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.4978, 10.49775, 10.48021, 10.50638, 10.49624, 10.47018, 10.34494, 10.25536, 10.10244, 9.91938]}, "num-zeros": {"start_step": 0, "end_step": 35, "step_interval": 5, "values": [26168.0, 19042.0, 28718.0, 22408.0, 26377.0, 34320.0, 21873.0]}, "iteration_timing_avg": 1.1249785294117647} diff --git a/tests/functional_tests/test_results/gpt3/gpt3_tp1_pp2_1nodes_50steps.json b/tests/functional_tests/test_results/gpt3/gpt3_tp1_pp2_1nodes_50steps.json deleted file mode 100644 index cb07592a1b..0000000000 --- a/tests/functional_tests/test_results/gpt3/gpt3_tp1_pp2_1nodes_50steps.json +++ /dev/null @@ -1 +0,0 @@ -{"lm loss": {"start_step": 0, "end_step": 37, "step_interval": 5, "values": [10.84266, 10.89696, 10.90542, 10.87498, 10.86279, 10.83628, 10.64437, 10.62386]}, "num-zeros": {"start_step": 0, "end_step": 20, "step_interval": 5, "values": [2093.0, 2474.0, 2327.0, 2213.0]}, "iteration_timing_avg": 0.080846} diff --git a/tests/functional_tests/test_results/gpt3/gpt3_tp1_pp4_1nodes_50steps.json b/tests/functional_tests/test_results/gpt3/gpt3_tp1_pp4_1nodes_50steps.json deleted file mode 100644 index 0cf9359fb9..0000000000 --- a/tests/functional_tests/test_results/gpt3/gpt3_tp1_pp4_1nodes_50steps.json +++ /dev/null @@ -1 +0,0 @@ -{"lm loss": {"start_step": 0, "end_step": 49, "step_interval": 5, "values": [10.7947, 10.85294, 10.87058, 10.83388, 10.83025, 10.78755, 10.56419, 10.57339, 10.48735, 10.19553]}, "num-zeros": {"start_step": 0, "end_step": 33, "step_interval": 5, "values": [2452.0, 2744.0, 2176.0, 2722.0, 2636.0, 2535.0, 2996.0]}, "iteration_timing_avg": 0.1158709090909091} diff --git a/tests/functional_tests/test_results/gpt3/gpt3_tp2_pp2_1nodes_50steps.json b/tests/functional_tests/test_results/gpt3/gpt3_tp2_pp2_1nodes_50steps.json deleted file mode 100644 index 2347dfdf9c..0000000000 --- a/tests/functional_tests/test_results/gpt3/gpt3_tp2_pp2_1nodes_50steps.json +++ /dev/null @@ -1 +0,0 @@ -{"lm loss": {"start_step": 0, 
"end_step": 48, "step_interval": 5, "values": [10.85716, 10.88973, 10.879, 10.87014, 10.87978, 10.84463, 10.67266, 10.62932, 10.52767, 10.25362]}, "num-zeros": {"start_step": 0, "end_step": 31, "step_interval": 5, "values": [2450.0, 2396.0, 2523.0, 2242.0, 2225.0, 2478.0, 2536.0]}, "iteration_timing_avg": 0.11416968750000002} diff --git a/tests/functional_tests/test_results/gpt3/gpt3_tp4_pp1_1nodes_50steps.json b/tests/functional_tests/test_results/gpt3/gpt3_tp4_pp1_1nodes_50steps.json deleted file mode 100644 index 5adc692b5d..0000000000 --- a/tests/functional_tests/test_results/gpt3/gpt3_tp4_pp1_1nodes_50steps.json +++ /dev/null @@ -1 +0,0 @@ -{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.86276, 10.88058, 10.87527, 10.88402, 10.89173, 10.84724, 10.6886, 10.62864, 10.53925, 10.26646]}, "num-zeros": {"start_step": 0, "end_step": 33, "step_interval": 5, "values": [2199.0, 2306.0, 2412.0, 2032.0, 2077.0, 2475.0, 2347.0]}, "iteration_timing_avg": 0.15481029411764707} diff --git a/tests/functional_tests/test_scripts/bert/pretrain_bert_distributed_resume_checkpoint_test.sh b/tests/functional_tests/test_scripts/bert/pretrain_bert_distributed_resume_checkpoint_test.sh deleted file mode 100755 index d5c2f83e06..0000000000 --- a/tests/functional_tests/test_scripts/bert/pretrain_bert_distributed_resume_checkpoint_test.sh +++ /dev/null @@ -1,100 +0,0 @@ -#! /bin/bash - -DATA_PATH=$1 -CHECKPOINT_PATH=$2 -TENSORBOARD_DIR=$3 -TP_SIZE=$4 -PP_SIZE=$5 -NNODES=$6 - -GPUS_PER_NODE=8 -# Change for multinode config -MASTER_ADDR=localhost -MASTER_PORT=6000 -NODE_RANK=0 -WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) -export CUDA_DEVICE_MAX_CONNECTIONS=1 - - -# Runs the "345M" parameter model -DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT" - -# Run for 100 iterations -python -m torch.distributed.launch $DISTRIBUTED_ARGS \ - pretrain_bert.py \ - --use-checkpoint-args \ - --use-checkpoint-opt_param-scheduler \ - --num-layers 24 \ - --hidden-size 1024 \ - --num-attention-heads 16 \ - --log-params-norm \ - --log-num-zeros-in-grad \ - --log-validation-ppl-to-tensorboard \ - --log-timers-to-tensorboard \ - --tensorboard-dir ${TENSORBOARD_DIR} \ - --micro-batch-size 4 \ - --global-batch-size 128 \ - --seq-length 512 \ - --max-position-embeddings 512 \ - --train-iters 100 \ - --timing-log-level 2 \ - --lr-decay-iters 990000 \ - --save $CHECKPOINT_PATH \ - --load $CHECKPOINT_PATH \ - --data-path $DATA_PATH \ - --vocab-file /workspace/data/bert_data/vocab.txt \ - --data-impl mmap \ - --split 949,50,1 \ - --distributed-backend nccl \ - --lr 0.0001 \ - --min-lr 0.00001 \ - --lr-warmup-fraction 0.01 \ - --log-interval 1 \ - --save-interval 50 \ - --eval-interval 1000 \ - --eval-iters 10 \ - --tensor-model-parallel-size $TP_SIZE \ - --pipeline-model-parallel-size $PP_SIZE \ - --no-gradient-accumulation-fusion \ - --fp16 - -echo 50 > $CHECKPOINT_PATH/latest_checkpointed_iteration.txt - -# Resume from 50th iteration ckpt and continue to 100 iterations -python -m torch.distributed.launch $DISTRIBUTED_ARGS \ - pretrain_bert.py \ - --use-checkpoint-args \ - --use-checkpoint-opt_param-scheduler \ - --num-layers 24 \ - --hidden-size 1024 \ - --num-attention-heads 16 \ - --log-params-norm \ - --log-num-zeros-in-grad \ - --log-validation-ppl-to-tensorboard \ - --log-timers-to-tensorboard \ - --tensorboard-dir ${TENSORBOARD_DIR} \ - --micro-batch-size 4 \ - --global-batch-size 128 \ - --seq-length 512 \ - 
--max-position-embeddings 512 \ - --train-iters 100 \ - --timing-log-level 2 \ - --lr-decay-iters 990000 \ - --save $CHECKPOINT_PATH \ - --load $CHECKPOINT_PATH \ - --data-path $DATA_PATH \ - --vocab-file /workspace/data/bert_data/vocab.txt \ - --data-impl mmap \ - --split 949,50,1 \ - --distributed-backend nccl \ - --lr 0.0001 \ - --min-lr 0.00001 \ - --lr-warmup-fraction 0.01 \ - --log-interval 1 \ - --save-interval 10000 \ - --eval-interval 1000 \ - --eval-iters 10 \ - --tensor-model-parallel-size $TP_SIZE \ - --pipeline-model-parallel-size $PP_SIZE \ - --no-gradient-accumulation-fusion \ - --fp16 \ No newline at end of file diff --git a/tests/functional_tests/test_scripts/bert/pretrain_bert_distributed_test.sh b/tests/functional_tests/test_scripts/bert/pretrain_bert_distributed_test.sh deleted file mode 100755 index af24b473da..0000000000 --- a/tests/functional_tests/test_scripts/bert/pretrain_bert_distributed_test.sh +++ /dev/null @@ -1,59 +0,0 @@ -#! /bin/bash -set -o xtrace - -DATA_PATH=$1 -CHECKPOINT_PATH=$2 -TENSORBOARD_DIR=$3 -TP_SIZE=$4 -PP_SIZE=$5 -NNODES=$6 -MAX_STEPS=$7 -VP_SIZE=$8 -GPUS_PER_NODE=8 -# Change for multinode config -MASTER_ADDR=localhost -MASTER_PORT=6000 -NODE_RANK=0 -WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) -export CUDA_DEVICE_MAX_CONNECTIONS=1 - - -# Runs the "345M" parameter model -DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT" - -python -m torch.distributed.launch $DISTRIBUTED_ARGS \ - pretrain_bert.py \ - --num-layers 24 \ - --hidden-size 1024 \ - --num-attention-heads 16 \ - --log-params-norm \ - --log-num-zeros-in-grad \ - --log-validation-ppl-to-tensorboard \ - --log-timers-to-tensorboard \ - --tensorboard-dir ${TENSORBOARD_DIR} \ - --micro-batch-size 4 \ - --global-batch-size 128 \ - --seq-length 512 \ - --max-position-embeddings 512 \ - --train-iters $MAX_STEPS \ - --timing-log-level 2 \ - --lr-decay-iters 990000 \ - --save $CHECKPOINT_PATH \ - --load $CHECKPOINT_PATH \ - --data-path $DATA_PATH \ - --vocab-file /workspace/data/bert_data/vocab.txt \ - --data-impl mmap \ - --split 949,50,1 \ - --distributed-backend nccl \ - --lr 0.0001 \ - --min-lr 0.00001 \ - --lr-warmup-fraction 0.01 \ - --log-interval 1 \ - --save-interval 10000 \ - --eval-interval 1000 \ - --eval-iters 10 \ - --tensor-model-parallel-size $TP_SIZE \ - --pipeline-model-parallel-size $PP_SIZE \ - ${VP_SIZE:+--num-layers-per-virtual-pipeline-stage "$VP_SIZE"} \ - --no-gradient-accumulation-fusion \ - --fp16 \ No newline at end of file diff --git a/tests/functional_tests/test_scripts/bert/sbatch_bert_distributed_resume_checkpoint_test.sh b/tests/functional_tests/test_scripts/bert/sbatch_bert_distributed_resume_checkpoint_test.sh deleted file mode 100644 index 31b3ff9937..0000000000 --- a/tests/functional_tests/test_scripts/bert/sbatch_bert_distributed_resume_checkpoint_test.sh +++ /dev/null @@ -1,16 +0,0 @@ -#!/bin/bash - -# Parameters -#SBATCH --account=adlr -#SBATCH --job-name=adlr-ci:megatron-job -#SBATCH --nodes=1 -#SBATCH --partition=luna - -DATA_PATH=/workspace/data/bert_data/my-bert_00_text_sentence -CHECKPOINT_PATH=/workspace/checkpoints -TENSORBOARD_DIR=/workspace/logs - -srun --output $BASE_DIR/results/slurm-%j.out --error $BASE_DIR/results/slurm-%j.out --container-image gitlab-master.nvidia.com/dl/dgx/pytorch:21.12-py3-devel --container-mounts $BASE_DIR/logs:/workspace/logs,$BASE_DIR/checkpoints:/workspace/checkpoints,$BUILD_DIR:/workspace/megatron-lm,$DATA_DIR:/workspace/data 
--no-container-mount-home bash -c " - ls - cd /workspace/megatron-lm - ./tests/functional_tests/test_scripts/bert/pretrain_bert_distributed_resume_checkpoint_test.sh $DATA_PATH $CHECKPOINT_PATH $TENSORBOARD_DIR $TP_SIZE $PP_SIZE $NUM_NODES" \ No newline at end of file diff --git a/tests/functional_tests/test_scripts/bert/sbatch_bert_distributed_test.sh b/tests/functional_tests/test_scripts/bert/sbatch_bert_distributed_test.sh deleted file mode 100755 index 45a441b27e..0000000000 --- a/tests/functional_tests/test_scripts/bert/sbatch_bert_distributed_test.sh +++ /dev/null @@ -1,16 +0,0 @@ -#!/bin/bash - -# Parameters -#SBATCH --account=adlr -#SBATCH --job-name=adlr-ci:megatron-job -#SBATCH --nodes=1 -#SBATCH --partition=luna - -DATA_PATH=/workspace/data/bert_data/my-bert_00_text_sentence -CHECKPOINT_PATH=/workspace/checkpoints -TENSORBOARD_DIR=/workspace/logs - -srun --output $BASE_DIR/results/slurm-%j.out --error $BASE_DIR/results/slurm-%j.out --container-image gitlab-master.nvidia.com/dl/dgx/pytorch:21.12-py3-devel --container-mounts $BASE_DIR/logs:/workspace/logs,$BASE_DIR/checkpoints:/workspace/checkpoints,$BUILD_DIR:/workspace/megatron-lm,$DATA_DIR:/workspace/data --no-container-mount-home bash -c " - ls - cd /workspace/megatron-lm - ./tests/functional_tests/test_scripts/bert/pretrain_bert_distributed_test.sh $DATA_PATH $CHECKPOINT_PATH $TENSORBOARD_DIR $TP_SIZE $PP_SIZE $NUM_NODES $MAX_STEPS $VP_SIZE" \ No newline at end of file diff --git a/tests/functional_tests/test_scripts/gpt3/pretrain_gpt3_distributed_resume_checkpoint_test.sh b/tests/functional_tests/test_scripts/gpt3/pretrain_gpt3_distributed_resume_checkpoint_test.sh deleted file mode 100755 index 7a91a13c54..0000000000 --- a/tests/functional_tests/test_scripts/gpt3/pretrain_gpt3_distributed_resume_checkpoint_test.sh +++ /dev/null @@ -1,108 +0,0 @@ -#! 
/bin/bash - -DATA_PATH=$1 -CHECKPOINT_PATH=$2 -TENSORBOARD_DIR=$3 -TP_SIZE=$4 -PP_SIZE=$5 -NNODES=$6 - -GPUS_PER_NODE=8 -# Change for multinode config -MASTER_ADDR=localhost -MASTER_PORT=6000 -NODE_RANK=0 -WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) -export CUDA_DEVICE_MAX_CONNECTIONS=1 - - -# Runs the "345M" parameter model -DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT" - -# Run for 100 iterations and save checkpoint at 50 -python -m torch.distributed.launch $DISTRIBUTED_ARGS \ - pretrain_gpt.py \ - --use-checkpoint-args \ - --use-checkpoint-opt_param-scheduler \ - --num-layers 12 \ - --hidden-size 512 \ - --num-attention-heads 8 \ - --log-params-norm \ - --log-num-zeros-in-grad \ - --log-validation-ppl-to-tensorboard \ - --log-timers-to-tensorboard \ - --tensorboard-dir ${TENSORBOARD_DIR} \ - --micro-batch-size 4 \ - --global-batch-size 32 \ - --seq-length 1024 \ - --max-position-embeddings 1024 \ - --train-iters 100 \ - --timing-log-level 2 \ - --lr-decay-iters 320000 \ - --save $CHECKPOINT_PATH \ - --load $CHECKPOINT_PATH \ - --data-path $DATA_PATH \ - --vocab-file /workspace/data/gpt3_data/gpt2-vocab.json \ - --merge-file /workspace/data/gpt3_data/gpt2-merges.txt \ - --data-impl mmap \ - --split 949,50,1 \ - --distributed-backend nccl \ - --lr 0.00015 \ - --lr-decay-style cosine \ - --min-lr 1.0e-5 \ - --weight-decay 1e-2 \ - --clip-grad 1.0 \ - --lr-warmup-fraction .01 \ - --log-interval 1 \ - --save-interval 50 \ - --eval-interval 1000 \ - --eval-iters 10 \ - --tensor-model-parallel-size $TP_SIZE \ - --pipeline-model-parallel-size $PP_SIZE \ - --no-gradient-accumulation-fusion \ - --fp16 - -echo 50 > $CHECKPOINT_PATH/latest_checkpointed_iteration.txt - -# Resume from 50th iteration ckpt and continue to 100 iterations -python -m torch.distributed.launch $DISTRIBUTED_ARGS \ - pretrain_gpt.py \ - --use-checkpoint-args \ - --use-checkpoint-opt_param-scheduler \ - --num-layers 12 \ - --hidden-size 512 \ - --num-attention-heads 8 \ - --log-params-norm \ - --log-num-zeros-in-grad \ - --log-validation-ppl-to-tensorboard \ - --log-timers-to-tensorboard \ - --tensorboard-dir ${TENSORBOARD_DIR} \ - --micro-batch-size 4 \ - --global-batch-size 32 \ - --seq-length 1024 \ - --max-position-embeddings 1024 \ - --train-iters 100 \ - --timing-log-level 2 \ - --lr-decay-iters 320000 \ - --save $CHECKPOINT_PATH \ - --load $CHECKPOINT_PATH \ - --data-path $DATA_PATH \ - --vocab-file /workspace/data/gpt3_data/gpt2-vocab.json \ - --merge-file /workspace/data/gpt3_data/gpt2-merges.txt \ - --data-impl mmap \ - --split 949,50,1 \ - --distributed-backend nccl \ - --lr 0.00015 \ - --lr-decay-style cosine \ - --min-lr 1.0e-5 \ - --weight-decay 1e-2 \ - --clip-grad 1.0 \ - --lr-warmup-fraction .01 \ - --log-interval 1 \ - --save-interval 10000 \ - --eval-interval 1000 \ - --eval-iters 10 \ - --tensor-model-parallel-size $TP_SIZE \ - --pipeline-model-parallel-size $PP_SIZE \ - --no-gradient-accumulation-fusion \ - --fp16 \ No newline at end of file diff --git a/tests/functional_tests/test_scripts/gpt3/pretrain_gpt3_distributed_test.sh b/tests/functional_tests/test_scripts/gpt3/pretrain_gpt3_distributed_test.sh deleted file mode 100755 index 5ab3b76c42..0000000000 --- a/tests/functional_tests/test_scripts/gpt3/pretrain_gpt3_distributed_test.sh +++ /dev/null @@ -1,76 +0,0 @@ -#! 
/bin/bash - -DATA_PATH=$1 -CHECKPOINT_PATH=$2 -TENSORBOARD_DIR=$3 -USE_TE=$4 -TP_SIZE=$5 -PP_SIZE=$6 -NNODES=$7 -MAX_STEPS=$8 -VP_SIZE=$9 -MBS=${10} -GBS=${11} -GPUS_PER_NODE=8 -# Change for multinode config -MASTER_ADDR=localhost -MASTER_PORT=6000 -NODE_RANK=0 -WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) -export CUDA_DEVICE_MAX_CONNECTIONS=1 - -TRANSFORMER_IMPL=local -TRAINING_DTYPE=fp16 - -if [[ $USE_TE -eq 1 ]]; then - echo "Running with TransformerEngine ..." - TRANSFORMER_IMPL=transformer_engine - TRAINING_DTYPE=bf16 -else - echo "Running with local transformer implementation ..." -fi - -# Runs the "345M" parameter model -DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES" - -torchrun $DISTRIBUTED_ARGS \ - pretrain_gpt.py \ - --num-layers 12 \ - --hidden-size 512 \ - --num-attention-heads 8 \ - --log-params-norm \ - --log-num-zeros-in-grad \ - --log-validation-ppl-to-tensorboard \ - --log-timers-to-tensorboard \ - --tensorboard-dir ${TENSORBOARD_DIR} \ - --micro-batch-size ${MBS:-4} \ - --global-batch-size ${GBS:-32} \ - --seq-length 1024 \ - --max-position-embeddings 1024 \ - --train-iters $MAX_STEPS \ - --timing-log-level 2 \ - --lr-decay-iters 320000 \ - --save $CHECKPOINT_PATH \ - --load $CHECKPOINT_PATH \ - --data-path $DATA_PATH \ - --vocab-file /workspace/data/gpt3_data/gpt2-vocab.json \ - --merge-file /workspace/data/gpt3_data/gpt2-merges.txt \ - --data-impl mmap \ - --split 949,50,1 \ - --distributed-backend nccl \ - --lr 0.00015 \ - --lr-decay-style cosine \ - --min-lr 1.0e-5 \ - --weight-decay 1e-2 \ - --clip-grad 1.0 \ - --lr-warmup-fraction .01 \ - --log-interval 1 \ - --save-interval 10000 \ - --eval-interval 1000 \ - --eval-iters 10 \ - --transformer-impl $TRANSFORMER_IMPL \ - --tensor-model-parallel-size $TP_SIZE \ - --pipeline-model-parallel-size $PP_SIZE \ - ${VP_SIZE:+--num-layers-per-virtual-pipeline-stage "$VP_SIZE"} \ - --no-gradient-accumulation-fusion \ - --${TRAINING_DTYPE} diff --git a/tests/functional_tests/test_scripts/gpt3/sbatch_gpt3_distributed_resume_checkpoint_test.sh b/tests/functional_tests/test_scripts/gpt3/sbatch_gpt3_distributed_resume_checkpoint_test.sh deleted file mode 100644 index f9761a1346..0000000000 --- a/tests/functional_tests/test_scripts/gpt3/sbatch_gpt3_distributed_resume_checkpoint_test.sh +++ /dev/null @@ -1,16 +0,0 @@ -#!/bin/bash - -# Parameters -#SBATCH --account=adlr -#SBATCH --job-name=adlr-ci:megatron-job -#SBATCH --nodes=1 -#SBATCH --partition=luna - -DATA_PATH=/workspace/data/gpt3_data/my-gpt3_00_text_document -CHECKPOINT_PATH=/workspace/checkpoints -TENSORBOARD_DIR=/workspace/logs - -srun --output $BASE_DIR/results/slurm-%j.out --error $BASE_DIR/results/slurm-%j.out --container-image gitlab-master.nvidia.com/dl/dgx/pytorch:21.12-py3-devel --container-mounts $BASE_DIR/logs:/workspace/logs,$BASE_DIR/checkpoints:/workspace/checkpoints,$BUILD_DIR:/workspace/megatron-lm,$DATA_DIR:/workspace/data --no-container-mount-home bash -c " - ls - cd /workspace/megatron-lm - ./tests/functional_tests/test_scripts/gpt3/pretrain_gpt3_distributed_resume_checkpoint_test.sh $DATA_PATH $CHECKPOINT_PATH $TENSORBOARD_DIR $TP_SIZE $PP_SIZE $NUM_NODES" \ No newline at end of file diff --git a/tests/functional_tests/test_scripts/gpt3/sbatch_gpt3_distributed_test.sh b/tests/functional_tests/test_scripts/gpt3/sbatch_gpt3_distributed_test.sh deleted file mode 100755 index cab43bc156..0000000000 --- a/tests/functional_tests/test_scripts/gpt3/sbatch_gpt3_distributed_test.sh +++ /dev/null @@ -1,22 +0,0 @@ -#!/bin/bash - -# Parameters -#SBATCH 
--account=adlr -#SBATCH --job-name=adlr-ci:megatron-job -#SBATCH --nodes=1 -#SBATCH --partition=luna - -DATA_PATH=/workspace/data/gpt3_data/my-gpt3_00_text_document -CHECKPOINT_PATH=/workspace/checkpoints -TENSORBOARD_DIR=/workspace/logs -IMAGE=gitlab-master.nvidia.com/dl/dgx/pytorch:21.12-py3-devel - -if [[ $USE_TE -eq 1 ]]; then - echo "Using container nvcr.io/nvidia/pytorch:23.04-py3 for running with TE ..." - IMAGE=nvcr.io/nvidia/pytorch:23.04-py3 -fi - -srun --output $BASE_DIR/results/slurm-%j.out --error $BASE_DIR/results/slurm-%j.out --container-image $IMAGE --container-mounts $BASE_DIR/logs:/workspace/logs,$BASE_DIR/checkpoints:/workspace/checkpoints,$BUILD_DIR:/workspace/megatron-lm,$DATA_DIR:/workspace/data --no-container-mount-home bash -c " - ls - cd /workspace/megatron-lm - ./tests/functional_tests/test_scripts/gpt3/pretrain_gpt3_distributed_test.sh $DATA_PATH $CHECKPOINT_PATH $TENSORBOARD_DIR $USE_TE $TP_SIZE $PP_SIZE $NUM_NODES $MAX_STEPS $VP_SIZE $MBS $GBS" diff --git a/tests/pipeline_parallel/test_schedules.py b/tests/pipeline_parallel/test_schedules.py deleted file mode 100644 index b74822ec22..0000000000 --- a/tests/pipeline_parallel/test_schedules.py +++ /dev/null @@ -1,189 +0,0 @@ -import torch -from tests.test_utilities import Utils -import megatron.core.pipeline_parallel.schedules as schedule -from pytest_mock import mocker -import pytest - -rank = Utils.rank - -def test_get_forward_backward_func(): - Utils.initialize_model_parallel(tensor_model_parallel_size=2, pipeline_model_parallel_size=1) - assert(schedule.get_forward_backward_func() == schedule.forward_backward_no_pipelining) - Utils.destroy_model_parallel() - Utils.initialize_model_parallel(tensor_model_parallel_size=2, pipeline_model_parallel_size=4) - assert(schedule.get_forward_backward_func() == schedule.forward_backward_pipelining_without_interleaving) - Utils.destroy_model_parallel() - Utils.initialize_model_parallel(tensor_model_parallel_size=2, pipeline_model_parallel_size=4, virtual_pipeline_model_parallel_size=2) - assert(schedule.get_forward_backward_func() == schedule.forward_backward_pipelining_with_interleaving) - Utils.destroy_model_parallel() - -def test_deallocate_output_tensor(): - out = torch.tensor([[1, 2, 3], [4, 5, 6]]) - schedule.deallocate_output_tensor(out) - assert(out.nelement() == 1) - -def test_forward_backward_func_without_pipeline_parallel(mocker): - from megatron.core.pipeline_parallel import get_forward_backward_func - - Utils.initialize_model_parallel(tensor_model_parallel_size=2, pipeline_model_parallel_size=1) - - def forward_step_func(data_iterator, model): - import os - rank = int(os.environ['LOCAL_RANK']) - dummy_data = torch.ones(1,4) - def loss_func(output_tensor): - return rank, {'loss_reduced':rank} - return model(dummy_data), loss_func - - model = torch.nn.Linear(4,1) - model.model_type = 'unit-test' - def set_input_tensor(input_tensor): - return None - model.set_input_tensor = set_input_tensor - - forward_backward_func = get_forward_backward_func() - assert(schedule.get_forward_backward_func() == schedule.forward_backward_no_pipelining) - - mocker.patch("megatron.core.pipeline_parallel.schedules.custom_backward", return_value=2) - - losses_reduced = forward_backward_func( - forward_step_func=forward_step_func, - data_iterator=None, - model=[model], - num_microbatches=4, - forward_only=False) - - loss_reduced_expected = [{'loss_reduced': rank}, {'loss_reduced': rank}, {'loss_reduced': rank}, {'loss_reduced': rank}] - for i,j in zip(losses_reduced, 
loss_reduced_expected): - print(losses_reduced) - assert(i['loss_reduced'] == j['loss_reduced']) - Utils.destroy_model_parallel() - -def test_forward_backward_func_with_pipeline_parallel(mocker): - from megatron.core.pipeline_parallel import get_forward_backward_func - - Utils.initialize_model_parallel(tensor_model_parallel_size=1, pipeline_model_parallel_size=4) - - def forward_step_func(data_iterator, model): - import os - rank = int(os.environ['LOCAL_RANK']) - def loss_func(output_tensor): - return rank, {'loss_reduced':rank} - return torch.rand(512,8,256).cuda(), loss_func - - model = torch.nn.Linear(4,1) - model.model_type = 'unit-test' - def set_input_tensor(input_tensor): - return None - model.set_input_tensor = set_input_tensor - - forward_backward_func = get_forward_backward_func() - assert(schedule.get_forward_backward_func() == schedule.forward_backward_pipelining_without_interleaving) - - sequence_length = 512 - micro_batch_size = 8 - hidden_size = 256 - - losses_reduced = forward_backward_func( - forward_step_func=forward_step_func, - data_iterator=None, - dtype=torch.float32, - model=[model], - num_microbatches= micro_batch_size, - tensor_shape=[sequence_length, micro_batch_size, hidden_size], - decoder_seq_length=sequence_length, - sequence_parallel=False, - forward_only=True) - - loss_reduced_expected = [{'loss_reduced': rank}, {'loss_reduced': rank}, {'loss_reduced': rank}, {'loss_reduced': rank}] - for i,j in zip(losses_reduced, loss_reduced_expected): - print(losses_reduced) - assert(i['loss_reduced'] == j['loss_reduced']) - Utils.destroy_model_parallel() - -""" -def test_forward_backward_func_with_interleaving(mocker): - from megatron.core.pipeline_parallel import get_forward_backward_func - from megatron.core.enums import ModelType - - Utils.initialize_model_parallel(tensor_model_parallel_size=1, pipeline_model_parallel_size=4, virtual_pipeline_model_parallel_size=2) - - def forward_step_func(data_iterator, model): - import os - rank = int(os.environ['LOCAL_RANK']) - def loss_func(output_tensor): - return rank, {'loss_reduced':rank} - return torch.rand(512,8,256).cuda(), loss_func - - model = torch.nn.Linear(4,1) - def set_input_tensor(input_tensor): - return None - model.set_input_tensor = set_input_tensor - - forward_backward_func = get_forward_backward_func() - assert(schedule.get_forward_backward_func() == schedule.forward_backward_pipelining_with_interleaving) - - sequence_length = 512 - micro_batch_size = 8 - hidden_size = 256 - - mocker.patch("megatron.core.pipeline_parallel.schedules.custom_backward", return_value=2) - - with pytest.raises(RuntimeError): - model.model_type = ModelType.encoder_and_decoder - forward_backward_func( - forward_step_func=forward_step_func, - data_iterator=range(0,100), - dtype=torch.float32, - model=[model, model], - num_microbatches= micro_batch_size, - tensor_shape=[sequence_length, micro_batch_size, hidden_size], - decoder_seq_length=sequence_length, - sequence_parallel=False, - forward_only=True) - - with pytest.raises(RuntimeError): - model.model_type = ModelType.encoder_or_decoder - forward_backward_func( - forward_step_func=forward_step_func, - data_iterator=range(0,100), - dtype=torch.float32, - model=[model, model], - num_microbatches= micro_batch_size, - tensor_shape=[sequence_length, micro_batch_size, hidden_size], - decoder_seq_length=256, - sequence_parallel=False, - forward_only=True) - - with pytest.raises(RuntimeError): - model.model_type = ModelType.encoder_or_decoder - forward_backward_func( - 
forward_step_func=forward_step_func, - data_iterator=range(0,100), - dtype=torch.float32, - model=[model, model], - num_microbatches= 7, - tensor_shape=[sequence_length, micro_batch_size, hidden_size], - decoder_seq_length=512, - sequence_parallel=False, - forward_only=True) - - model.model_type = ModelType.encoder_or_decoder - losses_reduced = forward_backward_func( - forward_step_func=forward_step_func, - data_iterator=range(0,100), - dtype=torch.float32, - model=[model, model], - num_microbatches= micro_batch_size, - tensor_shape=[sequence_length, micro_batch_size, hidden_size], - decoder_seq_length=sequence_length, - sequence_parallel=True, - forward_only=True) - - loss_reduced_expected = [{'loss_reduced': rank}, {'loss_reduced': rank}, {'loss_reduced': rank}, {'loss_reduced': rank}] - for i,j in zip(losses_reduced, loss_reduced_expected): - print(losses_reduced) - assert(i['loss_reduced'] == j['loss_reduced']) - - Utils.destroy_model_parallel() -""" \ No newline at end of file diff --git a/tests/unit_tests/__init__.py b/tests/unit_tests/__init__.py index e69de29bb2..38a9977640 100644 --- a/tests/unit_tests/__init__.py +++ b/tests/unit_tests/__init__.py @@ -0,0 +1,3 @@ +import torch._dynamo + +torch._dynamo.config.suppress_errors = True diff --git a/tests/unit_tests/conftest.py b/tests/unit_tests/conftest.py new file mode 100644 index 0000000000..01b5bcb30b --- /dev/null +++ b/tests/unit_tests/conftest.py @@ -0,0 +1,38 @@ +import gc +import os +import sys +from pathlib import Path +from unittest import mock + +import pytest +import torch + +from megatron.core.dist_checkpointing.strategies.base import StrategyAction, get_default_strategy +from megatron.core.utils import is_te_min_version +from tests.unit_tests.dist_checkpointing import TempNamedDir +from tests.unit_tests.test_utilities import Utils + + +@pytest.fixture(scope="session", autouse=True) +def set_env(): + if is_te_min_version("1.3"): + os.environ['NVTE_FLASH_ATTN'] = '0' + os.environ['NVTE_FUSED_ATTN'] = '0' + + +@pytest.fixture(scope="session") +def tmp_path_dist_ckpt(tmp_path_factory) -> Path: + """Common directory for saving the checkpoint. + + Can't use pytest `tmp_path_factory` directly because directory must be shared between processes. 
+ """ + + tmp_dir = tmp_path_factory.mktemp('ignored', numbered=False) + tmp_dir = tmp_dir.parent.parent / 'tmp_dist_ckpt' + + if Utils.rank == 0: + with TempNamedDir(tmp_dir, sync=False): + yield tmp_dir + + else: + yield tmp_dir diff --git a/tests/unit_tests/data/__init__.py b/tests/unit_tests/data/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/unit_tests/data/test_bin_reader.py b/tests/unit_tests/data/test_bin_reader.py new file mode 100644 index 0000000000..b8b6ec5dd7 --- /dev/null +++ b/tests/unit_tests/data/test_bin_reader.py @@ -0,0 +1,164 @@ +import os +import random +import sys +import tempfile +from types import ModuleType, SimpleNamespace +from typing import Any, Dict + +import nltk +import pytest + +try: + import boto3 + import botocore.exceptions as exceptions +except ModuleNotFoundError: + boto3 = ModuleType("boto3") + sys.modules[boto3.__name__] = boto3 + exceptions = ModuleType("botocore.exceptions") + sys.modules[exceptions.__name__] = exceptions + +from megatron.core.datasets.indexed_dataset import ( + IndexedDataset, + S3Config, + _FileBinReader, + _MMapBinReader, + _S3BinReader, +) +from megatron.core.datasets.utils_s3 import S3_PREFIX, S3Client +from tests.unit_tests.data.test_preprocess_data import ( + build_datasets, + dummy_jsonl, + gpt2_merge, + gpt2_vocab, +) + +## +# Overload client from boto3 +## + + +class _LocalClient(S3Client): + """Local test client""" + + def __init__(self, *args: Any) -> None: + pass + + def download_file(self, Bucket: str, Key: str, Filename: str) -> None: + os.system(f"cp {os.path.join('/', Bucket, Key)} {Filename}") + assert os.path.exists(Filename) + + def upload_file(self, Filename: str, Bucket: str, Key: str) -> None: + raise NotImplementedError + + def head_object(self, Bucket: str, Key: str) -> Dict[str, Any]: + assert os.path.exists(os.path.join("/", Bucket, Key)) + return {} + + def get_object(self, Bucket: str, Key: str, Range: str) -> Dict[str, Any]: + _, _range = Range.split("=") + _range_beg, _range_end = tuple(map(int, _range.split("-"))) + + filename = os.path.join("/", Bucket, Key) + + with open(filename, mode='rb', buffering=0) as bin_buffer_file: + bin_buffer_file.seek(_range_beg) + _bytes = bin_buffer_file.read(_range_end - _range_beg) + + response = {"Body": SimpleNamespace(read=lambda: _bytes)} + + return response + + def close(self) -> None: + pass + + +setattr(boto3, "client", _LocalClient) + + +## +# Overload ClientError from botocore.exceptions +## + + +class _LocalClientError(Exception): + """ "Local test client error""" + + pass + + +setattr(exceptions, "ClientError", _LocalClientError) + + +@pytest.mark.flaky +def test_bin_reader(): + with tempfile.TemporaryDirectory() as temp_dir: + # set the default nltk data path + os.environ["NLTK_DATA"] = os.path.join(temp_dir, "nltk_data") + nltk.data.path.append(os.environ["NLTK_DATA"]) + + path_to_raws = os.path.join(temp_dir, "sample_raws") + path_to_data = os.path.join(temp_dir, "sample_data") + path_to_s3_cache = os.path.join(temp_dir, "s3_cache") + os.mkdir(path_to_raws) + os.mkdir(path_to_data) + os.mkdir(path_to_s3_cache) + + # create the dummy resources + dummy_jsonl(path_to_raws) + + # build the datasets + build_datasets( + path_to_raws, + path_to_data, + extra_args=[ + "--tokenizer-type", + "GPT2BPETokenizer", + "--vocab-file", + gpt2_vocab(temp_dir), + "--merge-file", + gpt2_merge(temp_dir), + "--append-eod", + "--workers", + "10", + "--log-interval", + "1", + ], + ) + + prefixes = set( + [ + os.path.join(temp_dir, 
"sample_data", path.split(".")[0]) + for path in os.listdir(path_to_data) + if path.endswith(".bin") or path.endswith(".idx") + ] + ) + + for prefix in prefixes: + indexed_dataset_file = IndexedDataset(prefix, multimodal=False, mmap=False) + assert isinstance(indexed_dataset_file.bin_reader, _FileBinReader) + + indexed_dataset_mmap = IndexedDataset(prefix, multimodal=False, mmap=True) + assert isinstance(indexed_dataset_mmap.bin_reader, _MMapBinReader) + + indexed_dataset_s3 = IndexedDataset( + S3_PREFIX + prefix, + multimodal=False, + mmap=False, + s3_config=S3Config(path_to_idx_cache=path_to_s3_cache), + ) + assert isinstance(indexed_dataset_s3.bin_reader, _S3BinReader) + + assert len(indexed_dataset_s3) == len(indexed_dataset_file) + assert len(indexed_dataset_s3) == len(indexed_dataset_mmap) + + indices = random.sample( + list(range(len(indexed_dataset_s3))), min(100, len(indexed_dataset_s3)) + ) + + for idx in indices: + assert (indexed_dataset_s3[idx] == indexed_dataset_file[idx]).all() + assert (indexed_dataset_s3[idx] == indexed_dataset_mmap[idx]).all() + + +if __name__ == "__main__": + test_bin_reader() diff --git a/tests/unit_tests/data/test_builder.py b/tests/unit_tests/data/test_builder.py new file mode 100644 index 0000000000..7f4caaa0f6 --- /dev/null +++ b/tests/unit_tests/data/test_builder.py @@ -0,0 +1,393 @@ +## +# Compile megatron.core.datasets.helpers dependencies before BlendedDataset import +## + +import os +import tempfile +from collections import defaultdict +from typing import Dict, Optional + +import numpy +import pytest +import torch + +from megatron.core.datasets.blended_megatron_dataset_builder import BlendedMegatronDatasetBuilder +from megatron.core.datasets.blended_megatron_dataset_config import BlendedMegatronDatasetConfig +from megatron.core.datasets.megatron_dataset import LowLevelDataset, MegatronDataset +from megatron.core.datasets.utils import Split, compile_helpers, get_blend_from_list +from tests.unit_tests.test_utilities import Utils + +_NUM_DATASETS = 10 + +_SEQUENCE_LENGTH = 10 + +_SIZES = {} +for split in Split: + _SIZES[split] = [] + for i in range(_NUM_DATASETS): + _SIZES[split].append({Split.train: 1000, Split.valid: 100, Split.test: 10}[split] * (i + 1)) + +_MARGIN = 0.005 + + +def do_setup(odir): + paths = defaultdict(list) + + for i in range(_NUM_DATASETS): + path_to_data = os.path.join(odir, str(i)) + os.mkdir(path_to_data) + + for split in _SIZES: + data = numpy.zeros((_SIZES[split][i], _SEQUENCE_LENGTH)) + path = os.path.join(path_to_data, f"{split.name}.npy") + numpy.save(path, data) + paths[split].append(path) + + return paths + + +def test_builder(): + if torch.distributed.is_available(): + Utils.initialize_distributed() + if torch.distributed.get_rank() == 0: + compile_helpers() + torch.distributed.barrier() + else: + compile_helpers() + + # Define the class here to avoid pytest warnings + + class TestDataset(MegatronDataset): + def __init__( + self, + dataset: LowLevelDataset, + dataset_path: Optional[str], + indices: numpy.ndarray, + num_samples: Optional[int], + index_split: Split, + config: BlendedMegatronDatasetConfig, + ) -> None: + super().__init__(dataset, dataset_path, indices, num_samples, index_split, config) + + if self.num_samples is None: + self.num_samples = len(self.indices) + + self.sample_index = numpy.random.choice(self.indices, size=self.num_samples) + + @staticmethod + def numel_low_level_dataset(low_level_dataset: LowLevelDataset) -> int: + return len(low_level_dataset) + + @staticmethod + def 
build_low_level_dataset( + dataset_path: str, config: BlendedMegatronDatasetConfig + ) -> LowLevelDataset: + return numpy.load(dataset_path) + + def __len__(self) -> int: + return len(self.sample_index) + + def __getitem__(self, idx: int) -> Dict[str, numpy.ndarray]: + return {"text": self.dataset[self.sample_index[idx]]} + + with tempfile.TemporaryDirectory() as temp_dir: + + paths = do_setup(temp_dir) + + blends = { + split: get_blend_from_list( + [ + weight_or_path + for pair in zip(list(range(1, len(paths[split]) + 1, 1)), paths[split]) + for weight_or_path in pair + ] + ) + for split in Split + } + + blends_unweighted = {split: (blends[split][0], None) for split in blends} + + config = BlendedMegatronDatasetConfig( + random_seed=1234, + sequence_length=_SEQUENCE_LENGTH, + blend_per_split=[blends[Split.train], None, None], + ) + try: + datasets = BlendedMegatronDatasetBuilder( + TestDataset, [None, None, None], lambda: True, config + ).build() + raise RuntimeError + except AssertionError: + pass + + config = BlendedMegatronDatasetConfig( + random_seed=1234, + sequence_length=_SEQUENCE_LENGTH, + blend_per_split=[get_blend_from_list([paths[Split.train][0]]), None, None], + ) + datasets = BlendedMegatronDatasetBuilder( + TestDataset, [1000, None, None], lambda: True, config + ).build() + assert len(datasets[0]) == 1000 and isinstance(datasets[0], TestDataset) + assert datasets[1] is None + assert datasets[2] is None + + config = BlendedMegatronDatasetConfig( + random_seed=1234, + sequence_length=_SEQUENCE_LENGTH, + blend_per_split=[ + blends_unweighted[Split.train], + blends_unweighted[Split.valid], + blends_unweighted[Split.test], + ], + ) + datasets = BlendedMegatronDatasetBuilder( + TestDataset, [1000, 1000, 1000], lambda: True, config + ).build() + assert len(datasets[0]) == 1000 + assert len(datasets[1]) == 1000 + assert len(datasets[2]) == sum(_SIZES[Split.test]) + + config = BlendedMegatronDatasetConfig( + random_seed=1234, + sequence_length=_SEQUENCE_LENGTH, + blend_per_split=[ + blends_unweighted[Split.train], + blends_unweighted[Split.valid], + blends_unweighted[Split.test], + ], + ) + datasets = BlendedMegatronDatasetBuilder( + TestDataset, [None, None, None], lambda: True, config + ).build() + assert len(datasets[0]) == sum(_SIZES[Split.train]) + assert numpy.all( + numpy.array(datasets[0].weights) + == numpy.unique(datasets[0].dataset_index, return_counts=True)[1] + ) + assert len(datasets[1]) == sum(_SIZES[Split.valid]) + assert numpy.all( + numpy.array(datasets[1].weights) + == numpy.unique(datasets[1].dataset_index, return_counts=True)[1] + ) + assert len(datasets[2]) == sum(_SIZES[Split.test]) + assert numpy.all( + numpy.array(datasets[2].weights) + == numpy.unique(datasets[2].dataset_index, return_counts=True)[1] + ) + + config = BlendedMegatronDatasetConfig( + random_seed=1234, + sequence_length=_SEQUENCE_LENGTH, + blend_per_split=[blends_unweighted[Split.train], None, None], + ) + datasets = BlendedMegatronDatasetBuilder( + TestDataset, [1000, None, None], lambda: True, config + ).build() + assert len(datasets[0]) == 1000 + for i in range(_NUM_DATASETS): + assert len(datasets[0].datasets[i]) == _SIZES[Split.train][i] + assert datasets[1] is None + assert datasets[2] is None + + config = BlendedMegatronDatasetConfig( + random_seed=1234, + sequence_length=_SEQUENCE_LENGTH, + blend_per_split=[blends[Split.train], None, None], + ) + try: + datasets = BlendedMegatronDatasetBuilder( + TestDataset, [1000, None, None], lambda: True, config + ).build() + raise RuntimeError + 
except IndexError: + ## + # + # The size per dataset is a function of the requested size, the weight per dataset, + # and a constant coefficient. The sizes, and consequently the total size to request, + # are modified such that the weights may or may not be sufficiently representative. + # To fix this, the weights should be reset according to the new sizes: + # + # S := size + # W := weights + # + # S = func(S, W) + # + # W = S / sum(S) + # + ## + config = BlendedMegatronDatasetConfig( + random_seed=1234, + sequence_length=_SEQUENCE_LENGTH, + blend_per_split=[blends[Split.train], None, None], + renormalize_blend_weights=True, + ) + datasets = BlendedMegatronDatasetBuilder( + TestDataset, [1000, None, None], lambda: True, config + ).build() + assert ( + len(datasets[0]) >= 1000 + and len(datasets[0]) <= 1000 * (1 + _MARGIN) + _NUM_DATASETS + ) + + config = BlendedMegatronDatasetConfig( + random_seed=1234, + sequence_length=_SEQUENCE_LENGTH, + blend_per_split=[blends[Split.train], blends[Split.valid], blends[Split.test]], + ) + datasets = BlendedMegatronDatasetBuilder( + TestDataset, [100, 100, 100], lambda: True, config + ).build() + assert ( + len(datasets[0]) >= 100 and len(datasets[0]) <= 100 * (1 + _MARGIN) + _NUM_DATASETS + ) + assert ( + len(datasets[1]) >= 100 and len(datasets[1]) <= 100 * (1 + _MARGIN) + _NUM_DATASETS + ) + assert ( + len(datasets[2]) >= 100 and len(datasets[2]) <= 100 * (1 + _MARGIN) + _NUM_DATASETS + ) + + config = BlendedMegatronDatasetConfig( + random_seed=1234, + sequence_length=_SEQUENCE_LENGTH, + blend=blends_unweighted[Split.train], + split="100,0,0", + ) + datasets = BlendedMegatronDatasetBuilder( + TestDataset, [None, None, None], lambda: True, config + ).build() + assert len(datasets[0]) == sum(_SIZES[Split.train]) + assert numpy.all( + numpy.array(datasets[0].weights) + == numpy.unique(datasets[0].dataset_index, return_counts=True)[1] + ) + assert datasets[1] is None + assert datasets[2] is None + + if torch.distributed.is_initialized(): + config = BlendedMegatronDatasetConfig( + random_seed=1234, + sequence_length=_SEQUENCE_LENGTH, + blend=blends_unweighted[Split.train], + split="100,0,0", + ) + datasets = BlendedMegatronDatasetBuilder( + TestDataset, + [None, None, None], + lambda: torch.distributed.get_rank() % 2 == 0, + config, + ).build() + if torch.distributed.get_rank() % 2 == 0: + assert len(datasets[0]) == sum(_SIZES[Split.train]) + assert numpy.all( + numpy.array(datasets[0].weights) + == numpy.unique(datasets[0].dataset_index, return_counts=True)[1] + ) + else: + assert datasets[0] is None + assert datasets[1] is None + assert datasets[2] is None + + config = BlendedMegatronDatasetConfig( + random_seed=1234, + sequence_length=_SEQUENCE_LENGTH, + blend=blends_unweighted[Split.train], + split="50,50,0", + ) + datasets = BlendedMegatronDatasetBuilder( + TestDataset, [1000, 0, None], lambda: True, config + ).build() + assert len(datasets[0]) == 1000 + assert sum(map(len, datasets[0].datasets)) == sum(_SIZES[Split.train]) / 2 + assert sum(map(len, datasets[1].datasets)) == sum(_SIZES[Split.train]) / 2 + assert datasets[1] is not None and len(datasets[1]) == 0 + assert datasets[2] is None + + config = BlendedMegatronDatasetConfig( + random_seed=1234, + sequence_length=_SEQUENCE_LENGTH, + blend=blends_unweighted[Split.train], + split="50,50,0", + ) + datasets = BlendedMegatronDatasetBuilder( + TestDataset, + [int(sum(_SIZES[Split.train]) / 4), int(sum(_SIZES[Split.train])), None], + lambda: True, + config, + ).build() + assert len(datasets[0]) == 
sum(_SIZES[Split.train]) / 4 + assert len(datasets[1]) == sum(_SIZES[Split.train]) / 2 + assert datasets[2] is None + + # 990 9 1 + # 100000 1000 1 + # [] + config = BlendedMegatronDatasetConfig( + random_seed=1234, + sequence_length=_SEQUENCE_LENGTH, + blend=blends[Split.train], + split="990,9,1", + ) + try: + # All three of 100000, 1000, and 1 result in error, yet 10000 and 100 do not + datasets = BlendedMegatronDatasetBuilder( + TestDataset, [100000, 1000, 1], lambda: True, config + ).build() + except IndexError: + ## + # + # The size per dataset is a function of the requested size, the weight per dataset, + # and a constant coefficient. The sizes, and consequently the total size to request, + # are modified such that the weights may or may not be sufficiently representative. + # To fix this, the weights should be reset according to the new sizes: + # + # S := size + # W := weights + # + # S = func(S, W) + # + # W = S / sum(S) + # + ## + config = BlendedMegatronDatasetConfig( + random_seed=1234, + sequence_length=_SEQUENCE_LENGTH, + blend=blends[Split.train], + split="990,9,1", + renormalize_blend_weights=True, + ) + datasets = BlendedMegatronDatasetBuilder( + TestDataset, [100000, 1000, 1], lambda: True, config + ).build() + assert ( + len(datasets[0]) >= 100000 + and len(datasets[0]) <= 100000 * (1 + _MARGIN) + _NUM_DATASETS + ) + assert ( + len(datasets[1]) >= 1000 + and len(datasets[1]) <= 1000 * (1 + _MARGIN) + _NUM_DATASETS + ) + assert len(datasets[2]) >= 1 and len(datasets[2]) <= 1 * (1 + _MARGIN) + _NUM_DATASETS + + config = BlendedMegatronDatasetConfig( + random_seed=1234, + sequence_length=_SEQUENCE_LENGTH, + blend=blends[Split.train], + split="990,9,1", + ) + datasets = BlendedMegatronDatasetBuilder( + TestDataset, [10000, 100, 0], lambda: True, config + ).build() + assert ( + len(datasets[0]) >= 10000 + and len(datasets[0]) <= 10000 * (1 + _MARGIN) + _NUM_DATASETS + ) + assert ( + len(datasets[1]) >= 100 and len(datasets[1]) <= 100 * (1 + _MARGIN) + _NUM_DATASETS + ) + assert len(datasets[2]) == 0 + + +if __name__ == "__main__": + test_builder() diff --git a/tests/unit_tests/data/test_gpt_dataset.py b/tests/unit_tests/data/test_gpt_dataset.py new file mode 100644 index 0000000000..817ea227f1 --- /dev/null +++ b/tests/unit_tests/data/test_gpt_dataset.py @@ -0,0 +1,112 @@ +## +# Compile megatron.core.datasets.helpers dependencies before BlendedDataset import +## + +import random + +import numpy +import pytest +import torch + +from megatron.core.datasets.blended_megatron_dataset_builder import BlendedMegatronDatasetBuilder +from megatron.core.datasets.gpt_dataset import GPTDatasetConfig, MockGPTDataset +from megatron.core.datasets.utils import compile_helpers +from megatron.training.tokenizer.tokenizer import _NullTokenizer +from tests.unit_tests.test_utilities import Utils + +_MOCK_VOCAB_SIZE = 8192 + + +def sample_N(dataset, N, randomize): + if randomize: + indices = [random.randint(0, len(dataset) - 1) for _ in range(N)] + else: + indices = list(range(N)) + samples = [dataset[index]["tokens"].numpy() for index in indices] + return samples + + +@pytest.mark.flaky +def test_mock_gpt_dataset(): + if torch.distributed.is_available(): + Utils.initialize_distributed() + if torch.distributed.get_rank() == 0: + compile_helpers() + torch.distributed.barrier() + else: + compile_helpers() + + tokenizer = _NullTokenizer(vocab_size=_MOCK_VOCAB_SIZE) + + config = GPTDatasetConfig( + random_seed=1234, + sequence_length=1024, + split="990,9,1", + reset_position_ids=True, + 
reset_attention_mask=True, + eod_mask_loss=True, + tokenizer=tokenizer, + ) + + datasets = BlendedMegatronDatasetBuilder( + MockGPTDataset, [100, 100, 100], lambda: True, config + ).build() + + N = 10 + + # Check iso-index variance by split + subsets = [sample_N(dataset, N, randomize=False) for dataset in datasets] + assert not numpy.allclose(subsets[0], subsets[1]) + assert not numpy.allclose(subsets[0], subsets[2]) + assert not numpy.allclose(subsets[1], subsets[2]) + + # Check iso-split / iso-index identity + subset_1A = sample_N(datasets[0], N, randomize=False) + subset_1B = sample_N(datasets[0], N, randomize=False) + assert numpy.allclose(subset_1A, subset_1B) + + # Check iso-split variance by index + subset_1A = sample_N(datasets[0], N, randomize=True) + subset_1B = sample_N(datasets[0], N, randomize=True) + assert not numpy.allclose(subset_1A, subset_1B) + + config = GPTDatasetConfig( + random_seed=1234, + sequence_length=1024, + split="990,10,0", + reset_position_ids=True, + reset_attention_mask=True, + eod_mask_loss=True, + drop_last_partial_validation_sequence=False, + add_extra_token_to_sequence=False, + tokenizer=tokenizer, + ) + + datasets = BlendedMegatronDatasetBuilder( + MockGPTDataset, [0, None, 0], lambda: True, config + ).build() + + sample = datasets[1][datasets[1].shuffle_index.argmax()] + argmax = sample['labels'].shape[0] - torch.flip(sample['labels'], [0]).argmax() - 1 + + # Test add_extra_token_to_sequence + assert sample['tokens'][argmax] != tokenizer.eod + assert sample['labels'][argmax] == tokenizer.eod + + # Test eod_mask_loss, drop_last_partial_validation_sequence + assert argmax < sample['labels'].shape[0] - 1 + assert torch.all(sample['labels'][argmax + 1 :] == 0) + assert not torch.any( + sample['loss_mask'][ + torch.logical_and(sample['labels'] == tokenizer.eod, sample['labels'] == 0) + ] + ) + + sample = datasets[1][None] + + # Check handling of None index + assert not torch.any(sample['loss_mask']) + + +if __name__ == "__main__": + test_mock_gpt_dataset() diff --git a/tests/unit_tests/data/test_multimodal_dataset.py b/tests/unit_tests/data/test_multimodal_dataset.py new file mode 100644 index 0000000000..a9a30c02ec --- /dev/null +++ b/tests/unit_tests/data/test_multimodal_dataset.py @@ -0,0 +1,53 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+ +## +# Compile megatron.core.datasets.helpers dependencies before BlendedDataset import +## + +from types import SimpleNamespace + +import torch + +from megatron.core.datasets.blended_megatron_dataset_builder import BlendedMegatronDatasetBuilder +from megatron.core.datasets.multimodal_dataset import MockMultimodalDataset, MultimodalDatasetConfig +from megatron.core.datasets.utils import compile_helpers +from megatron.training.tokenizer.tokenizer import _NullTokenizer +from tests.unit_tests.test_utilities import Utils + +_MOCK_VOCAB_SIZE = 8192 + + +def test_mock_multimodal_dataset(): + if torch.distributed.is_available(): + Utils.initialize_distributed() + if torch.distributed.get_rank() == 0: + compile_helpers() + torch.distributed.barrier() + else: + compile_helpers() + + config = MultimodalDatasetConfig( + random_seed=1234, + sequence_length=1024, + reset_position_ids=False, + reset_attention_mask=False, + eod_mask_loss=True, + image_h=336, + image_w=336, + split="990,9,1", + tokenizer=_NullTokenizer(vocab_size=_MOCK_VOCAB_SIZE), + ) + + datasets = BlendedMegatronDatasetBuilder( + MockMultimodalDataset, [100, 100, 100], lambda: True, config + ).build() + + for ds in datasets: + sample = ds[0] + assert "image" in sample + assert sample["image"].shape == torch.Size([3, 336, 336]) + assert "tokens" in sample + + +if __name__ == "__main__": + test_mock_multimodal_dataset() diff --git a/tests/unit_tests/data/test_preprocess_data.py b/tests/unit_tests/data/test_preprocess_data.py new file mode 100644 index 0000000000..4eca14e588 --- /dev/null +++ b/tests/unit_tests/data/test_preprocess_data.py @@ -0,0 +1,242 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +import json +import os +import sys +import tempfile + +import nltk +import pytest +import requests + +from megatron.core.datasets.indexed_dataset import IndexedDataset +from megatron.training.tokenizer.gpt2_tokenization import ( + PRETRAINED_MERGES_ARCHIVE_MAP, + PRETRAINED_VOCAB_ARCHIVE_MAP, +) +from tools.merge_datasets import main as merge_main +from tools.preprocess_data import Encoder +from tools.preprocess_data import get_args as build_args +from tools.preprocess_data import main as build_main + +__HUGGINGFACE_BERT_BASE_UNCASED_VOCAB = ( + "https://huggingface.co/bert-base-uncased/raw/main/vocab.txt" +) + +__LOCAL_BERT_VOCAB = "/home/gitlab-runner/data/bert_data/vocab.txt" + +__LOCAL_GPT2_MERGE = "/home/gitlab-runner/data/gpt3_data/gpt2-merges.txt" + +__LOCAL_GPT2_VOCAB = "/home/gitlab-runner/data/gpt3_data/gpt2-vocab.json" + + +def dummy_jsonl(odir): + # numbers + list_numbers = [json.dumps({"text": str(i + 1)}) + "\n" for i in range(100)] + with open(os.path.join(odir, "numbers.jsonl"), "w") as writer: + writer.writelines(list_numbers) + # numbers ascending + list_numbers_ascending = [ + json.dumps({"text": " ".join([str(j + 1) for j in range(i + 1)])}) + "\n" + for i in range(100) + ] + with open(os.path.join(odir, "numbers_ascending.jsonl"), "w") as writer: + writer.writelines(list_numbers_ascending) + # test + list_test = [] + with open(__file__) as reader: + for line in reader: + list_test.append(json.dumps({"text": line}) + "\n") + with open(os.path.join(odir, "test.jsonl"), "w") as writer: + writer.writelines(list_test) + + +def build_datasets(idir, odir, extra_args=[]): + for name in os.listdir(idir): + sys.argv = [ + sys.argv[0], + "--input", + os.path.join(idir, name), + "--output-prefix", + os.path.join(odir, os.path.splitext(name)[0]), + ] + extra_args + build_main() + + +def merge_datasets(idir): + 
sys.argv = [sys.argv[0], "--input", idir, "--output-prefix", os.path.join(idir, "merge")] + merge_main() + + +def do_test_preprocess_data(temp_dir, extra_args=[]): + # set the default nltk data path + os.environ["NLTK_DATA"] = os.path.join(temp_dir, "nltk_data") + nltk.data.path.append(os.environ["NLTK_DATA"]) + + path_to_raws = os.path.join(temp_dir, "sample_raws") + path_to_data = os.path.join(temp_dir, "sample_data") + os.mkdir(path_to_raws) + os.mkdir(path_to_data) + + # create the dummy resources + dummy_jsonl(path_to_raws) + + # build the datasets + build_datasets(path_to_raws, path_to_data, extra_args=extra_args) + + # merge the datasets + merge_datasets(path_to_data) + + sys.argv = [sys.argv[0], "--input", None, "--output-prefix", None] + extra_args + encoder = Encoder(build_args()) + encoder.initializer() + + def tokens_to_string(toks): + for option in ["decode", "detokenize"]: + try: + return getattr(encoder.tokenizer, option)(toks) + except: + continue + raise RuntimeError(f"{type(encoder.tokenizer)} tokenizer cannot decode or detokenize") + + merged_index = 0 + merged_dataset = IndexedDataset(os.path.join(path_to_data, "merge")) + + # sorted to ensure ordering matches merged dataset + basenames = sorted( + [ + name + for name in os.listdir(path_to_data) + if name.endswith(".idx") and not name.startswith("merge") + ] + ) + + # index into the merged document index + merged_doc_index_index = 0 + + for basename in basenames: + realpath_raw = f"{os.path.join(path_to_raws, '_'.join(basename.split('_')[:-2]))}.jsonl" + realpath_doc = os.path.join(path_to_data, basename.split(".")[-2]) + + dataset_index = 0 + dataset = IndexedDataset(realpath_doc) + + merged_doc_idx = merged_dataset.document_indices[ + merged_doc_index_index : merged_doc_index_index + len(dataset.document_indices) + ] + merged_doc_idx = merged_doc_idx - merged_doc_idx[0] + + assert ( + dataset.document_indices == merged_doc_idx + ).all(), f"ERROR: {basename.split('_')[:-2]}: merged dataset document indices mismatch" + + merged_doc_index_index += len(dataset.document_indices) - 1 + + with open(realpath_raw, "rt") as reader: + for json_line in reader: + toks = encoder.encode(json_line)[0]["text"] + + raw = tokens_to_string(toks) + + processed_toks = [] + while len(processed_toks) < len(toks): + processed_toks.extend(dataset[dataset_index]) + dataset_index += 1 + processed = tokens_to_string(processed_toks) + + assert ( + raw == processed + ), f"ERROR: {basename.split('_')[:-2]}: raw and processed documents do not match" + + merged_toks = [] + while len(merged_toks) < len(toks): + merged_toks.extend(merged_dataset[merged_index]) + merged_index += 1 + merged = tokens_to_string(merged_toks) + + assert ( + raw == merged + ), f"ERROR: {basename.split('_')[:-2]}: raw and merged documents do not match" + + print( + f"INFO: {''.join(basename.split('_')[:-2])}: raw, processed, and merged documents match!" 
+ ) + + print("INFO: Success!") + + +def gpt2_vocab(odir): + if os.path.exists(__LOCAL_GPT2_VOCAB): + return __LOCAL_GPT2_VOCAB + path = os.path.join(odir, "vocab.json") + with open(path, "wb") as writer: + writer.write(requests.get(PRETRAINED_VOCAB_ARCHIVE_MAP['gpt2']).content) + return path + + +def gpt2_merge(odir): + if os.path.exists(__LOCAL_GPT2_MERGE): + return __LOCAL_GPT2_MERGE + path = os.path.join(odir, "merge.txt") + with open(path, "wb") as writer: + writer.write(requests.get(PRETRAINED_MERGES_ARCHIVE_MAP['gpt2']).content) + return path + + +@pytest.mark.flaky +def test_preprocess_data_gpt(): + with tempfile.TemporaryDirectory() as temp_dir: + + # gpt specific args + gpt_args = [ + "--tokenizer-type", + "GPT2BPETokenizer", + "--vocab-file", + gpt2_vocab(temp_dir), + "--merge-file", + gpt2_merge(temp_dir), + "--append-eod", + "--workers", + "10", + "--log-interval", + "1", + ] + + do_test_preprocess_data(temp_dir, extra_args=gpt_args) + + +def bert_vocab(odir): + if os.path.exists(__LOCAL_BERT_VOCAB): + return __LOCAL_BERT_VOCAB + path = os.path.join(odir, "vocab.txt") + with open(path, "wb") as writer: + writer.write(requests.get(__HUGGINGFACE_BERT_BASE_UNCASED_VOCAB).content) + return path + + +@pytest.mark.flaky +def test_preprocess_data_bert(): + with tempfile.TemporaryDirectory() as temp_dir: + + # bert specific args + bert_args = [ + "--tokenizer-type", + "BertWordPieceLowerCase", + "--vocab-file", + bert_vocab(temp_dir), + "--split-sentences", + "--workers", + "10", + "--log-interval", + "1", + "--partitions", + "2", + "--keep-sequential-samples", + ] + + do_test_preprocess_data(temp_dir, extra_args=bert_args) + + +if __name__ == "__main__": + test_preprocess_data_gpt() + test_preprocess_data_bert() diff --git a/tests/unit_tests/data/test_preprocess_mmdata.py b/tests/unit_tests/data/test_preprocess_mmdata.py new file mode 100644 index 0000000000..d6ad4eddc7 --- /dev/null +++ b/tests/unit_tests/data/test_preprocess_mmdata.py @@ -0,0 +1,219 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
+ +import os +import random +import sys +import tempfile + +import nltk +import numpy + +from megatron.core.datasets.indexed_dataset import IndexedDataset +from tests.unit_tests.data.test_preprocess_data import dummy_jsonl, gpt2_merge, gpt2_vocab +from tools.merge_datasets import main as merge_main +from tools.preprocess_mmdata import Encoder +from tools.preprocess_mmdata import get_args as build_args +from tools.preprocess_mmdata import main as build_main + + +def dummy_img(odir_txt, odir_img): + for name in os.listdir(odir_txt): + with open(os.path.join(odir_txt, name), "rt") as reader_txt: + length = sum(1 for _ in reader_txt) + os.makedirs(os.path.join(odir_img, os.path.splitext(name)[0]), exist_ok=False) + for i in range(length): + with open( + os.path.join(odir_img, os.path.splitext(name)[0], f"{str(i).zfill(4)}.img"), "wb" + ) as writer_img: + # 32 * 32 - 1 to induce preprocessing 0-index padding + writer_img.write(bytes([random.randint(0, 255) for _ in range(32 * 32 - 1)])) + + +def build_datasets(idir_txt, idir_img, odir, extra_args=[]): + for name in os.listdir(idir_txt): + sys.argv = [ + sys.argv[0], + "--input", + os.path.join(idir_txt, name), + "--input-image", + os.path.join(idir_img, os.path.splitext(name)[0]), + "--output-prefix", + os.path.join(odir, os.path.splitext(name)[0]), + ] + extra_args + build_main() + + +def merge_datasets(idir): + sys.argv = [ + sys.argv[0], + "--input", + idir, + "--output-prefix", + os.path.join(idir, "merge"), + "--multimodal", + ] + merge_main() + + +def do_test_preprocess_mmdata(temp_dir, extra_args=[]): + # set the default nltk data path + os.environ["NLTK_DATA"] = os.path.join(temp_dir, "nltk_data") + nltk.data.path.append(os.environ["NLTK_DATA"]) + + path_to_raws_txt = os.path.join(temp_dir, "sample_raws_txt") + path_to_raws_img = os.path.join(temp_dir, "sample_raws_img") + path_to_data = os.path.join(temp_dir, "sample_data") + os.mkdir(path_to_raws_txt) + os.mkdir(path_to_raws_img) + os.mkdir(path_to_data) + + # create the dummy text resources + dummy_jsonl(path_to_raws_txt) + + # create the dummy image resources + dummy_img(path_to_raws_txt, path_to_raws_img) + + # build the datasets + build_datasets(path_to_raws_txt, path_to_raws_img, path_to_data, extra_args=extra_args) + + # merge the datasets + merge_datasets(path_to_data) + + sys.argv = [ + sys.argv[0], + "--input", + None, + "--input-image", + None, + "--output-prefix", + None, + ] + extra_args + encoder = Encoder(build_args()) + encoder.initializer() + + def tokens_to_string(toks): + for option in ["decode", "detokenize"]: + try: + return getattr(encoder.tokenizer, option)(toks) + except AttributeError: + continue + raise RuntimeError(f"{type(encoder.tokenizer)} tokenizer cannot `decode` or `detokenize`.") + + merged_index = 0 + merged_dataset = IndexedDataset(os.path.join(path_to_data, "merge"), multimodal=True) + + # sorted to ensure ordering matches merged dataset + basenames = sorted( + [ + name + for name in os.listdir(path_to_data) + if name.endswith(".idx") and not name.startswith("merge") + ] + ) + + # index into the merged document index + merged_doc_index_index = 0 + + for basename in basenames: + realpath_raw_txt = os.path.join(path_to_raws_txt, f"{os.path.splitext(basename)[0]}.jsonl") + realpath_raw_img = os.path.join(path_to_raws_img, os.path.splitext(basename)[0]) + realpath_doc = os.path.join(path_to_data, os.path.splitext(basename)[0]) + + dataset_index = 0 + dataset = IndexedDataset(realpath_doc, multimodal=True) + + merged_doc_idx = 
merged_dataset.document_indices[ + merged_doc_index_index : merged_doc_index_index + len(dataset.document_indices) + ] + merged_doc_idx = merged_doc_idx - merged_doc_idx[0] + + assert ( + dataset.document_indices == merged_doc_idx + ).all(), f"ERROR: {basename.split('_')[:-2]}: merged dataset document indices mismatch" + + merged_doc_index_index += len(dataset.document_indices) - 1 + + with open(realpath_raw_txt, "rt") as reader: + for json_line, image_path in zip( + reader, + [ + os.path.join(realpath_raw_img, basename) + for basename in os.listdir(realpath_raw_img) + ], + ): + toks, image, length = encoder.encode((json_line, image_path)) + + raw_text = tokens_to_string(toks) + # reverse to account for preprocessing 0-index padding + raw_image = image[::-1] + + processed_toks = dataset[dataset_index][0] + assert dataset[dataset_index][1] == 0 + processed_text = tokens_to_string(processed_toks) + + processed_image = dataset[dataset_index + 1][0] + assert dataset[dataset_index + 1][1] == 1 + # reverse to account for preprocessing 0-index padding + processed_image = processed_image[::-1][0 : raw_image.size] + + assert ( + raw_text == processed_text + ), f"ERROR: {basename.split('_')[:-2]}: raw and processed documents (text) do not match" + + assert numpy.allclose( + raw_image, processed_image + ), f"ERROR: {basename.split('_')[:-2]}: raw and processed documents (image) do not match" + + dataset_index += 2 + + merged_toks = merged_dataset[merged_index][0] + assert merged_dataset[merged_index][1] == 0 + merged_text = tokens_to_string(merged_toks) + + merged_image = merged_dataset[merged_index + 1][0] + assert merged_dataset[merged_index + 1][1] == 1 + # reverse to account for preprocessing 0-index padding + merged_image = merged_image[::-1][0 : raw_image.size] + + assert ( + raw_text == merged_text + ), f"ERROR: {basename.split('_')[:-2]}: raw and merged documents (text) do not match" + + assert numpy.allclose( + raw_image, merged_image + ), f"ERROR: {basename.split('_')[:-2]}: raw and merged documents (image) do not match" + + merged_index += 2 + + print( + f"INFO: {''.join(basename.split('_')[:-2])}: raw, processed, and merged documents match!" 
+ ) + + print("INFO: Success!") + + +def test_preprocess_mmdata(): + with tempfile.TemporaryDirectory() as temp_dir: + + # gpt specific args + gpt_args = [ + "--pad-length", + "1024", + "--tokenizer-type", + "GPT2BPETokenizer", + "--vocab-file", + gpt2_vocab(temp_dir), + "--merge-file", + gpt2_merge(temp_dir), + "--append-eod", + "--workers", + "10", + "--log-interval", + "1", + ] + + do_test_preprocess_mmdata(temp_dir, extra_args=gpt_args) + + +if __name__ == "__main__": + test_preprocess_mmdata() diff --git a/tests/unit_tests/dist_checkpointing/__init__.py b/tests/unit_tests/dist_checkpointing/__init__.py new file mode 100644 index 0000000000..ae16372586 --- /dev/null +++ b/tests/unit_tests/dist_checkpointing/__init__.py @@ -0,0 +1,71 @@ +import os +import weakref +from pathlib import Path +from shutil import rmtree +from tempfile import TemporaryDirectory +from typing import Optional, Union + +from tests.unit_tests.dist_checkpointing.utils import ( + init_basic_mock_args, + init_checkpointing_mock_args, + initialize_gpt_model, + setup_model_and_optimizer, + setup_moe_model_and_optimizer, +) +from tests.unit_tests.test_utilities import Utils + + +def empty_dir(path: Path): + if Utils.rank > 0: + return + for p in path.iterdir(): + if p.is_dir(): + rmtree(p) + else: + p.unlink() + + +class TempNamedDir(TemporaryDirectory): + """TemporaryDirectory with a fully named directory. Empties the dir if not empty.""" + + def __init__(self, name: Union[str, Path], sync=True, ignore_cleanup_errors=False) -> None: + self.name = str(name) + if Utils.rank == 0: + os.makedirs(name, exist_ok=True) + empty_dir(Path(name)) + if sync: + import torch + + torch.distributed.barrier() + else: + os.makedirs(name, exist_ok=True) + + self._ignore_cleanup_errors = ignore_cleanup_errors + self._finalizer = weakref.finalize( + self, self._cleanup, self.name, warn_message="Implicitly cleaning up {!r}".format(self) + ) + self.sync = sync + + def cleanup(self, override_sync: Optional[bool] = None) -> None: + sync = self.sync if override_sync is None else override_sync + if sync: + import torch + + if torch.distributed.is_available() and torch.distributed.is_initialized(): + torch.distributed.barrier() + if Utils.rank == 0: + super().cleanup() + + def __enter__(self): + path = Path(super().__enter__()) + if self.sync: + import torch + + if torch.distributed.is_available() and torch.distributed.is_initialized(): + torch.distributed.barrier() + return path + + def __exit__(self, exc_type, exc_val, exc_tb): + raised = exc_type is not None + if not raised: + self.cleanup() diff --git a/tests/unit_tests/dist_checkpointing/conftest.py b/tests/unit_tests/dist_checkpointing/conftest.py new file mode 100644 index 0000000000..83cbc684fd --- /dev/null +++ b/tests/unit_tests/dist_checkpointing/conftest.py @@ -0,0 +1,17 @@ +from unittest import mock + +import pytest + +from megatron.core.dist_checkpointing.strategies.base import StrategyAction, get_default_strategy + + +@pytest.fixture(scope='session', autouse=True) +def set_default_dist_ckpt_strategy(): + def get_pyt_dist_save_sharded_strategy(): + return get_default_strategy(StrategyAction.SAVE_SHARDED, 'torch_dist', 1) + + with mock.patch( + 'megatron.core.dist_checkpointing.serialization.get_default_save_sharded_strategy', + new=get_pyt_dist_save_sharded_strategy, + ) as _fixture: + yield _fixture diff --git a/tests/unit_tests/dist_checkpointing/models/__init__.py b/tests/unit_tests/dist_checkpointing/models/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git 
a/tests/unit_tests/dist_checkpointing/models/common.py b/tests/unit_tests/dist_checkpointing/models/common.py new file mode 100644 index 0000000000..4b908ba3fc --- /dev/null +++ b/tests/unit_tests/dist_checkpointing/models/common.py @@ -0,0 +1,228 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +import math + +import torch + +from megatron.core import parallel_state +from megatron.core.dist_checkpointing import load, load_plain_tensors, save +from megatron.core.dist_checkpointing.dict_utils import diff +from megatron.core.dist_checkpointing.serialization import ( + get_default_load_sharded_strategy, + get_default_save_sharded_strategy, +) +from megatron.core.dist_checkpointing.strategies.fully_parallel import ( + FullyParallelLoadStrategyWrapper, + FullyParallelSaveStrategyWrapper, +) +from megatron.core.dist_checkpointing.validation import StrictHandling +from tests.unit_tests.dist_checkpointing import TempNamedDir +from tests.unit_tests.test_utilities import Utils + + +def common_test_simple_sharded_state_dict_save_load( + initialize_model_fn, tmp_path_dist_ckpt, src_layer_spec_fn, dst_layer_spec_fn +): + """Simple save and load sanity check, without any equality tests.""" + tp = 2 + pp = 4 + Utils.initialize_model_parallel(tp, pp) + gpt_model = initialize_model_fn( + 1, src_layer_spec_fn, tensor_model_parallel_size=tp, pipeline_model_parallel_size=pp + ) + with TempNamedDir(tmp_path_dist_ckpt / 'test_gpt_model') as ckpt_dir: + # Save + sharded_state_dict = gpt_model.sharded_state_dict() + save(sharded_state_dict, ckpt_dir) + + # Load + gpt_model = initialize_model_fn( + 2, dst_layer_spec_fn, tensor_model_parallel_size=tp, pipeline_model_parallel_size=pp + ) + sharded_state_dict = gpt_model.sharded_state_dict() + state_dict, missing_keys, unexpected_keys = load( + sharded_state_dict, ckpt_dir, strict=StrictHandling.RETURN_ALL + ) + # Potential mismatch is because of extra states which is ok + assert all('_extra_state' in k for k in missing_keys) + assert all('_extra_state' in k for k in unexpected_keys) + gpt_model.load_state_dict(state_dict) + Utils.destroy_model_parallel() + + +def common_test_parallel_reconfiguration_e2e( + initialize_model_fn, + tmp_path_dist_ckpt, + src_tp_pp, + dest_tp_pp, + src_layer_spec_fn, + dst_layer_spec_fn, + use_fpsl, + load_order="tp-dp-pp", + store_order="tp-dp-pp", +): + """Test model saving and loading with different TP/PP""" + with TempNamedDir( + tmp_path_dist_ckpt / 'test_gpt_model_reconfiguration_model_A' + ) as ckpt_dir_A, TempNamedDir( + tmp_path_dist_ckpt / 'test_gpt_model_reconfiguration_model_B' + ) as ckpt_dir_B: + # Save checkpoint A + Utils.initialize_model_parallel(*src_tp_pp, order=load_order) + gpt_model_A = initialize_model_fn( + 1, + src_layer_spec_fn, + tensor_model_parallel_size=src_tp_pp[0], + pipeline_model_parallel_size=src_tp_pp[1], + ) + save_strategy = get_default_save_sharded_strategy() + if use_fpsl: + save_strategy = FullyParallelSaveStrategyWrapper( + save_strategy, + parallel_state.get_data_parallel_group(with_context_parallel=True), + True, + ) + save(gpt_model_A.sharded_state_dict(), ckpt_dir_A, save_strategy) + regular_state_dict_A = gpt_model_A.state_dict() + Utils.destroy_model_parallel() + + # Load checkpoint A with different TP/PP and save as checkpoint B + # No FPS this time, only FPL + Utils.initialize_model_parallel(*dest_tp_pp, order=store_order) + gpt_model_B = initialize_model_fn( + 2, + dst_layer_spec_fn, + tensor_model_parallel_size=dest_tp_pp[0], + 
pipeline_model_parallel_size=dest_tp_pp[1], + ) + if use_fpsl: + load_strategy = get_default_load_sharded_strategy(ckpt_dir_A) + load_strategy = FullyParallelLoadStrategyWrapper(load_strategy) + else: + load_strategy = None + state_dict, missing_keys, unexpected_keys = load( + gpt_model_B.sharded_state_dict(), + ckpt_dir_A, + load_strategy, + strict=StrictHandling.RETURN_ALL, + ) + # Potential mismatch is because of extra states which is ok + assert all('_extra_state' in k for k in missing_keys) + assert all('_extra_state' in k for k in unexpected_keys) + gpt_model_B.load_state_dict(state_dict) + save(gpt_model_B.sharded_state_dict(), ckpt_dir_B) + regular_state_dict_B = gpt_model_A.state_dict() + Utils.destroy_model_parallel() + + # Test both checkpoints are equal + Utils.initialize_model_parallel(1, 1) + plain_state_dict_A = load_plain_tensors(ckpt_dir_A) + plain_state_dict_B = load_plain_tensors(ckpt_dir_B) + diffs = diff(plain_state_dict_A, plain_state_dict_B) + assert not any(map(bool, diffs)), diffs + + # Test both regular state dicts are equal, turning FP8 states to bytes first + regular_state_dict_A = { + k: v for k, v in regular_state_dict_A.items() if not k.endswith('_extra_state') + } + regular_state_dict_B = { + k: v for k, v in regular_state_dict_B.items() if not k.endswith('_extra_state') + } + diffs = diff(regular_state_dict_A, regular_state_dict_B) + assert not any(map(bool, diffs)), diffs + Utils.destroy_model_parallel() + + +def common_test_state_dict_comparison(initialize_model_fn, tmp_path_dist_ckpt): + tp = 2 + pp = 4 + Utils.initialize_model_parallel(tp, pp) + with TempNamedDir( + tmp_path_dist_ckpt / 'test_state_dict_comparison_A' + ) as ckpt_dir_A, TempNamedDir( + tmp_path_dist_ckpt / 'test_state_dict_comparison_B' + ) as ckpt_dir_B: + gpt_model_A = initialize_model_fn( + 1, tensor_model_parallel_size=tp, pipeline_model_parallel_size=pp + ) + save(gpt_model_A.sharded_state_dict(), ckpt_dir_A) + gpt_model_B = initialize_model_fn( + 2, tensor_model_parallel_size=tp, pipeline_model_parallel_size=pp + ) + save(gpt_model_B.sharded_state_dict(), ckpt_dir_B) + + state_dict_A = load_plain_tensors(ckpt_dir_A) + state_dict_A_dup = load_plain_tensors(ckpt_dir_A) + state_dict_B = load_plain_tensors(ckpt_dir_B) + + # Test that A matches A + diffs = diff(state_dict_A, state_dict_A_dup) + assert not any(map(bool, diffs)), diffs + + # Test that A *keys* match B *keys*, but the tensors content is different + only_left, only_right, mismatch = diff(state_dict_A, state_dict_B) + assert not only_left and not only_right, (only_left, only_right) + assert len(mismatch) == len(state_dict_A), (len(mismatch), (len(state_dict_A))) + Utils.destroy_model_parallel() + + +def common_test_vocab_size_padding_change( + initialize_model_fn, tmp_path_dist_ckpt, vocab_size_base, src_tp_pp, dest_tp_pp +): + """Test model loading with different vocab size (caused by TP padding).""" + + def get_test_vocab_size(make_divisible_by=128): + divisor = make_divisible_by * parallel_state.get_tensor_model_parallel_world_size() + return int(math.ceil(vocab_size_base / divisor)) * divisor + + vocab_size_dependent_keys = { + 'output_layer.weight', + 'output_layer.bias', + 'embedding.word_embeddings.weight', + } + + with TempNamedDir( + tmp_path_dist_ckpt / 'test_vocab_size_padding_change_A' + ) as ckpt_dir_A, TempNamedDir( + tmp_path_dist_ckpt / 'test_vocab_size_padding_change_B' + ) as ckpt_dir_B: + # Save checkpoint A + Utils.initialize_model_parallel(*src_tp_pp) + gpt_model_A = initialize_model_fn( + 1, + 
tensor_model_parallel_size=src_tp_pp[0], + pipeline_model_parallel_size=src_tp_pp[1], + vocab_size=get_test_vocab_size(), + ) + save(gpt_model_A.sharded_state_dict(), ckpt_dir_A) + Utils.destroy_model_parallel() + + # Load checkpoint A with different TP/PP and save as checkpoint B + Utils.initialize_model_parallel(*dest_tp_pp) + gpt_model_B = initialize_model_fn( + 2, + tensor_model_parallel_size=dest_tp_pp[0], + pipeline_model_parallel_size=dest_tp_pp[1], + vocab_size=get_test_vocab_size(), + ) + state_dict = load(gpt_model_B.sharded_state_dict(), ckpt_dir_A) + gpt_model_B.load_state_dict(state_dict) + save(gpt_model_B.sharded_state_dict(), ckpt_dir_B) + Utils.destroy_model_parallel() + + # Test equality + Utils.initialize_model_parallel(1, 1) + plain_state_dict_A = load_plain_tensors(ckpt_dir_A) + plain_state_dict_B = load_plain_tensors(ckpt_dir_B) + # Test vocab size dependent keys are equal up to `vocab_size_base` + for vocab_layer_key in vocab_size_dependent_keys: + if vocab_layer_key in plain_state_dict_A: + ten_A = plain_state_dict_A.pop(vocab_layer_key) + ten_B = plain_state_dict_B.pop(vocab_layer_key) + assert torch.all( + ten_A[:vocab_size_base] == ten_B[:vocab_size_base] + ), vocab_layer_key + + # Test other tensors are equal + diffs = diff(plain_state_dict_A, plain_state_dict_B) + assert not any(map(bool, diffs)), diffs + Utils.destroy_model_parallel() diff --git a/tests/unit_tests/dist_checkpointing/models/test_bert_model.py b/tests/unit_tests/dist_checkpointing/models/test_bert_model.py new file mode 100644 index 0000000000..a84553eaa0 --- /dev/null +++ b/tests/unit_tests/dist_checkpointing/models/test_bert_model.py @@ -0,0 +1,157 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +import os + +import pytest +import torch + +from megatron.core import parallel_state as ps +from megatron.core.models.bert.bert_layer_specs import ( + bert_layer_local_spec, + bert_layer_with_transformer_engine_spec, +) +from megatron.core.models.bert.bert_model import BertModel +from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed +from megatron.core.transformer.transformer_config import TransformerConfig +from tests.unit_tests.dist_checkpointing.models.common import ( + common_test_parallel_reconfiguration_e2e, + common_test_simple_sharded_state_dict_save_load, + common_test_state_dict_comparison, + common_test_vocab_size_padding_change, +) +from tests.unit_tests.test_utilities import Utils + + +def initialize_bert_model( + seed, layer_spec_fn=bert_layer_with_transformer_engine_spec, vocab_size=128, **config_kwargs +): + os.environ['NVTE_FLASH_ATTN'] = '0' + os.environ['NVTE_FUSED_ATTN'] = '0' + torch.manual_seed(seed) + model_parallel_cuda_manual_seed(seed) + + layer_spec = layer_spec_fn() if callable(layer_spec_fn) else layer_spec_fn + + default_config_kwargs = dict( + num_layers=8, + hidden_size=16, + num_attention_heads=8, + use_cpu_initialization=True, + pipeline_dtype=torch.bfloat16, + ) + default_config_kwargs.update(**config_kwargs) + transformer_config = TransformerConfig(**default_config_kwargs) + pre_process = ps.is_pipeline_first_stage() + post_process = ps.is_pipeline_last_stage() + model = BertModel( + config=transformer_config, + transformer_layer_spec=layer_spec, + vocab_size=vocab_size, + max_sequence_length=4, + pre_process=pre_process, + post_process=post_process, + num_tokentypes=0, + ) + + with torch.no_grad(): + for p in model.parameters(): + p.random_() + return model + + +class TestBertModel: + @pytest.mark.parametrize( + 
'src_layer_spec', [bert_layer_with_transformer_engine_spec, bert_layer_local_spec] + ) + @pytest.mark.parametrize( + 'dst_layer_spec', [bert_layer_with_transformer_engine_spec, bert_layer_local_spec] + ) + def test_sharded_state_dict_save_load(self, tmp_path_dist_ckpt, src_layer_spec, dst_layer_spec): + common_test_simple_sharded_state_dict_save_load( + initialize_bert_model, tmp_path_dist_ckpt, src_layer_spec, dst_layer_spec + ) + + +class TestBERTModelReconfiguration: + def setup_method(self, method): + pass + + def teardown_method(self, method): + Utils.destroy_model_parallel() + + @pytest.mark.parametrize( + ('use_fpsl', 'src_tp_pp', 'dest_tp_pp', 'src_layer_spec', 'dst_layer_spec'), + [ + ( + False, + (2, 4), + (4, 2), + bert_layer_with_transformer_engine_spec, + bert_layer_with_transformer_engine_spec, + ), + ( + False, + (1, 8), + (8, 1), + bert_layer_with_transformer_engine_spec, + bert_layer_with_transformer_engine_spec, + ), + ( + True, + (2, 1), + (1, 8), + bert_layer_with_transformer_engine_spec, + bert_layer_with_transformer_engine_spec, + ), + ( + False, + (1, 1), + (2, 2), + bert_layer_with_transformer_engine_spec, + bert_layer_with_transformer_engine_spec, + ), + (True, (2, 1), (1, 8), bert_layer_local_spec, bert_layer_local_spec), + (True, (1, 1), (2, 4), bert_layer_with_transformer_engine_spec, bert_layer_local_spec), + (False, (1, 8), (2, 1), bert_layer_local_spec, bert_layer_with_transformer_engine_spec), + ], + ) + @pytest.mark.internal + def test_parallel_reconfiguration_e2e( + self, tmp_path_dist_ckpt, src_tp_pp, dest_tp_pp, src_layer_spec, dst_layer_spec, use_fpsl + ): + """Test model saving and loading with different TP/PP""" + Utils.initialize_model_parallel(src_tp_pp[0], src_tp_pp[1]) + + common_test_parallel_reconfiguration_e2e( + initialize_bert_model, + tmp_path_dist_ckpt, + src_tp_pp, + dest_tp_pp, + src_layer_spec, + dst_layer_spec, + use_fpsl, + ) + + @pytest.mark.internal + def test_state_dict_comparison(self, tmp_path_dist_ckpt): + common_test_state_dict_comparison(initialize_bert_model, tmp_path_dist_ckpt) + + @pytest.mark.parametrize( + "vocab_size_base,src_tp_pp,dest_tp_pp", + [ + (128, (2, 4), (4, 2)), + (17, (1, 8), (8, 1)), + (127, (1, 8), (8, 1)), + (31123, (1, 1), (1, 8)), + (17, (1, 1), (1, 8)), + ], + ) + @pytest.mark.internal + def test_vocab_size_padding_change( + self, tmp_path_dist_ckpt, vocab_size_base, src_tp_pp, dest_tp_pp + ): + """Test model loading with different vocab size (caused by TP padding).""" + Utils.initialize_model_parallel(src_tp_pp[0], src_tp_pp[1]) + common_test_vocab_size_padding_change( + initialize_bert_model, tmp_path_dist_ckpt, vocab_size_base, src_tp_pp, dest_tp_pp + ) diff --git a/tests/unit_tests/dist_checkpointing/models/test_gpt_model.py b/tests/unit_tests/dist_checkpointing/models/test_gpt_model.py new file mode 100644 index 0000000000..20699d4500 --- /dev/null +++ b/tests/unit_tests/dist_checkpointing/models/test_gpt_model.py @@ -0,0 +1,137 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
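+# Exercises GPTModel distributed checkpointing: save/load across TE/local layer specs, TP/PP reconfiguration with different process-group orderings, state-dict comparison, and padded vocab-size changes.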
+import pytest +import torch + +from megatron.core import parallel_state as ps +from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_local_spec as gpt_local_spec +from megatron.core.models.gpt.gpt_layer_specs import ( + get_gpt_layer_with_transformer_engine_spec as gpt_te_spec, +) +from megatron.core.models.gpt.gpt_model import GPTModel +from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed +from megatron.core.transformer.transformer_config import TransformerConfig +from tests.unit_tests.dist_checkpointing.models.common import ( + common_test_parallel_reconfiguration_e2e, + common_test_simple_sharded_state_dict_save_load, + common_test_state_dict_comparison, + common_test_vocab_size_padding_change, +) +from tests.unit_tests.test_utilities import Utils + + +def initialize_gpt_model(seed, layer_spec_fn=gpt_te_spec, vocab_size=128, **config_kwargs): + torch.manual_seed(seed) + model_parallel_cuda_manual_seed(seed) + + default_config_kwargs = dict( + num_layers=8, + hidden_size=16, + num_attention_heads=8, + use_cpu_initialization=True, + pipeline_dtype=torch.bfloat16, + ) + default_config_kwargs.update(**config_kwargs) + transformer_config = TransformerConfig(**default_config_kwargs) + pre_process = ps.is_pipeline_first_stage() + post_process = ps.is_pipeline_last_stage() + model = GPTModel( + config=transformer_config, + transformer_layer_spec=layer_spec_fn(), + vocab_size=vocab_size, + max_sequence_length=4, + pre_process=pre_process, + post_process=post_process, + ) + + with torch.no_grad(): + for p in model.parameters(): + p.random_() + return model + + +class TestGPTModel: + @pytest.mark.parametrize('src_layer_spec_fn', [gpt_te_spec, gpt_local_spec]) + @pytest.mark.parametrize('dst_layer_spec_fn', [gpt_te_spec, gpt_local_spec]) + def test_sharded_state_dict_save_load( + self, tmp_path_dist_ckpt, src_layer_spec_fn, dst_layer_spec_fn + ): + common_test_simple_sharded_state_dict_save_load( + initialize_gpt_model, tmp_path_dist_ckpt, src_layer_spec_fn, dst_layer_spec_fn + ) + + +class TestGPTModelReconfiguration: + def setup_method(self, method): + pass + + def teardown_method(self, method): + Utils.destroy_model_parallel() + + @pytest.mark.parametrize( + ( + 'use_fpsl', + 'load_order', + 'store_order', + 'src_tp_pp', + 'dest_tp_pp', + 'src_layer_spec_fn', + 'dst_layer_spec_fn', + ), + [ + (False, 'tp-dp-pp', 'tp-dp-pp', (2, 4), (4, 2), gpt_te_spec, gpt_te_spec), + (False, 'tp-pp-dp', 'tp-pp-dp', (1, 8), (8, 1), gpt_te_spec, gpt_te_spec), + (True, 'tp-dp-pp', 'tp-pp-dp', (2, 1), (1, 8), gpt_te_spec, gpt_te_spec), + (False, 'tp-dp-pp', 'tp-dp-pp', (1, 1), (2, 2), gpt_te_spec, gpt_te_spec), + (True, 'tp-pp-dp', 'tp-pp-dp', (2, 1), (1, 8), gpt_local_spec, gpt_local_spec), + (False, 'tp-dp-pp', 'tp-pp-dp', (1, 1), (2, 4), gpt_te_spec, gpt_local_spec), + (True, 'tp-dp-pp', 'tp-dp-pp', (2, 4), (4, 2), gpt_local_spec, gpt_te_spec), + (False, 'tp-pp-dp', 'tp-pp-dp', (2, 1), (1, 8), gpt_te_spec, gpt_local_spec), + (False, 'tp-dp-pp', 'tp-pp-dp', (2, 4), (2, 4), gpt_local_spec, gpt_local_spec), + ], + ) + def test_parallel_reconfiguration_e2e( + self, + tmp_path_dist_ckpt, + src_tp_pp, + dest_tp_pp, + src_layer_spec_fn, + dst_layer_spec_fn, + use_fpsl, + load_order, + store_order, + ): + """Test model saving and loading with different TP/PP""" + Utils.initialize_model_parallel(src_tp_pp[0], src_tp_pp[1]) + common_test_parallel_reconfiguration_e2e( + initialize_gpt_model, + tmp_path_dist_ckpt, + src_tp_pp, + dest_tp_pp, + src_layer_spec_fn, + dst_layer_spec_fn, 
+ use_fpsl, + load_order, + store_order, + ) + + def test_state_dict_comparison(self, tmp_path_dist_ckpt): + common_test_state_dict_comparison(initialize_gpt_model, tmp_path_dist_ckpt) + + @pytest.mark.parametrize( + "vocab_size_base,src_tp_pp,dest_tp_pp", + [ + (128, (2, 4), (4, 2)), + (17, (1, 8), (8, 1)), + (127, (1, 8), (8, 1)), + (31123, (1, 1), (1, 8)), + (17, (1, 1), (1, 8)), + ], + ) + def test_vocab_size_padding_change( + self, tmp_path_dist_ckpt, vocab_size_base, src_tp_pp, dest_tp_pp + ): + """Test model loading with different vocab size (caused by TP padding).""" + Utils.initialize_model_parallel(src_tp_pp[0], src_tp_pp[1]) + common_test_vocab_size_padding_change( + initialize_gpt_model, tmp_path_dist_ckpt, vocab_size_base, src_tp_pp, dest_tp_pp + ) diff --git a/tests/unit_tests/dist_checkpointing/models/test_mamba.py b/tests/unit_tests/dist_checkpointing/models/test_mamba.py new file mode 100644 index 0000000000..6bdcd9b827 --- /dev/null +++ b/tests/unit_tests/dist_checkpointing/models/test_mamba.py @@ -0,0 +1,131 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +import pytest +import torch + +from megatron.core import parallel_state +from megatron.core.dist_checkpointing import load, load_plain_tensors, save +from megatron.core.dist_checkpointing.dict_utils import diff +from megatron.core.dist_checkpointing.serialization import ( + get_default_load_sharded_strategy, + get_default_save_sharded_strategy, +) +from megatron.core.dist_checkpointing.strategies.fully_parallel import ( + FullyParallelLoadStrategyWrapper, + FullyParallelSaveStrategyWrapper, +) +from megatron.core.ssm.mamba_mixer import MambaMixer, MambaMixerSubmodules +from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed +from megatron.core.transformer.custom_layers.transformer_engine import ( + TELayerNormColumnParallelLinear, + TERowParallelLinear, +) +from megatron.core.transformer.transformer_config import TransformerConfig +from tests.unit_tests.dist_checkpointing import TempNamedDir +from tests.unit_tests.test_utilities import Utils + + +def initialize_mamba(seed, glu=True, **config_kwargs): + torch.manual_seed(seed) + model_parallel_cuda_manual_seed(seed) + + pp_size = parallel_state.get_pipeline_model_parallel_world_size() + num_moe_experts = 8 + default_config_kwargs = dict( + num_layers=pp_size, + hidden_size=128, + num_attention_heads=4, + num_moe_experts=num_moe_experts, + use_cpu_initialization=True, + gated_linear_unit=glu, + ) + default_config_kwargs.update(**config_kwargs) + transformer_config = TransformerConfig(**default_config_kwargs) + submodules = MambaMixerSubmodules( + in_proj=TELayerNormColumnParallelLinear, out_proj=TERowParallelLinear + ) + model = MambaMixer(transformer_config, submodules, transformer_config.hidden_size, rmsnorm=True) + return model + + +def get_pp_offsets(): + pp_rank = parallel_state.get_pipeline_model_parallel_rank() + pp_size = parallel_state.get_pipeline_model_parallel_world_size() + return ((0, pp_rank, pp_size),) + + +class TestMambaReconfiguration: + @pytest.mark.parametrize( + "use_fpsl,src_tp_pp_exp,dest_tp_pp_exp,use_glu", + [ + # changing PP is impossible because the number of layers must be the same + (False, (2, 4, 1), (2, 4, 1), False), + (True, (2, 4, 1), (2, 4, 1), False), + (False, (1, 1, 1), (1, 1, 1), False), + (True, (1, 1, 1), (1, 1, 4), False), + (False, (1, 1, 8), (1, 1, 2), False), + (False, (2, 2, 2), (4, 2, 1), False), + # (True, (1, 1, 4), (8, 1, 1), False), + (False, (1, 8, 1), (1, 8, 1), False), + 
(False, (1, 1, 4), (2, 1, 1), False), + (False, (1, 1, 1), (1, 1, 1), True), + (False, (1, 1, 1), (1, 1, 4), True), + (True, (1, 1, 1), (2, 1, 1), True), + # (False, (1, 1, 4), (8, 1, 1), True), + ], + ) + @pytest.mark.flaky + def test_parallel_reconfiguration_e2e( + self, tmp_path_dist_ckpt, src_tp_pp_exp, dest_tp_pp_exp, use_glu, use_fpsl + ): + """Test model saving and loading with different TP/PP/expert parallelism""" + src_tp, src_pp, src_exp = src_tp_pp_exp + Utils.initialize_model_parallel(src_tp, src_pp, expert_model_parallel_size=src_exp) + dest_tp, dest_pp, dest_exp = dest_tp_pp_exp + with TempNamedDir( + tmp_path_dist_ckpt / 'test_sequential_mlp_reconfiguration_model_A' + ) as ckpt_dir_A, TempNamedDir( + tmp_path_dist_ckpt / 'test_sequential_mlp_reconfiguration_model_B' + ) as ckpt_dir_B: + # Save checkpoint A + model_A = initialize_mamba(1, use_glu) + sharded_state_dict = model_A.sharded_state_dict(sharded_offsets=get_pp_offsets()) + + save_strategy = get_default_save_sharded_strategy() + if use_fpsl: + save_strategy = FullyParallelSaveStrategyWrapper( + save_strategy, + parallel_state.get_data_parallel_group(with_context_parallel=True), + True, + ) + save(sharded_state_dict, ckpt_dir_A, save_strategy) + Utils.destroy_model_parallel() + + # Load checkpoint A with different TP/PP/expert and save as checkpoint B + # No FPS this time, only FPL + Utils.initialize_model_parallel(dest_tp, dest_pp, expert_model_parallel_size=dest_exp) + model_B = initialize_mamba(2, use_glu) + if use_fpsl: + load_strategy = get_default_load_sharded_strategy(ckpt_dir_A) + load_strategy = FullyParallelLoadStrategyWrapper( + load_strategy, + parallel_state.get_data_parallel_group(with_context_parallel=True), + ) + else: + load_strategy = None + state_dict = load( + model_B.sharded_state_dict(sharded_offsets=get_pp_offsets()), + ckpt_dir_A, + load_strategy, + ) + model_B.load_state_dict(state_dict) + save(model_B.sharded_state_dict(sharded_offsets=get_pp_offsets()), ckpt_dir_B) + Utils.destroy_model_parallel() + + # Test both checkpoints are equal + Utils.initialize_model_parallel(1, 1) + state_dict_A = load_plain_tensors(ckpt_dir_A) + state_dict_B = load_plain_tensors(ckpt_dir_B) + diffs = diff(state_dict_A, state_dict_B) + assert not any(map(bool, diffs)), diffs + Utils.destroy_model_parallel() diff --git a/tests/unit_tests/dist_checkpointing/models/test_mlp_glu.py b/tests/unit_tests/dist_checkpointing/models/test_mlp_glu.py new file mode 100644 index 0000000000..1a0851039a --- /dev/null +++ b/tests/unit_tests/dist_checkpointing/models/test_mlp_glu.py @@ -0,0 +1,89 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
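+# Exercises MLP (gated linear unit) checkpointing: save under one TP/PP layout, reload under another, and compare the plain tensors of both checkpoints.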
+ +import pytest +import torch +from torch.optim import Adam + +from megatron.core import parallel_state +from megatron.core.dist_checkpointing import ShardedTensor, load, load_plain_tensors, save +from megatron.core.dist_checkpointing.dict_utils import diff, nested_values +from megatron.core.dist_checkpointing.optimizer import ( + get_param_id_to_sharded_param_map, + optim_state_to_sharding_state, +) +from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_with_transformer_engine_spec +from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed +from megatron.core.transformer.mlp import MLP +from megatron.core.transformer.transformer_config import TransformerConfig +from tests.unit_tests.dist_checkpointing import TempNamedDir +from tests.unit_tests.test_utilities import Utils + + +def initialize_mlp(glu=True): + model_parallel_cuda_manual_seed(123) + pp_size = parallel_state.get_pipeline_model_parallel_world_size() + transformer_config = TransformerConfig( + num_layers=pp_size, + hidden_size=12, + num_attention_heads=4, + use_cpu_initialization=True, + gated_linear_unit=glu, + ) + return MLP( + transformer_config, get_gpt_layer_with_transformer_engine_spec().submodules.mlp.submodules + ) + + +def get_pp_offsets(): + pp_rank = parallel_state.get_pipeline_model_parallel_rank() + pp_size = parallel_state.get_pipeline_model_parallel_world_size() + return ((0, pp_rank, pp_size),) + + +class TestParallelMLPWithGLU: + def setup_method(self, method): + pass + + def teardown_method(self, method): + Utils.destroy_model_parallel() + + @pytest.mark.parametrize( + "src_tp_pp,dest_tp_pp", + [ + # changing PP is impossible because the number of layers must be the same + ((2, 2), (4, 2)), + ((1, 1), (8, 1)), + ((1, 8), (1, 8)), + ((1, 1), (2, 1)), + ], + ) + def test_parallel_reconfiguration_e2e(self, tmp_path_dist_ckpt, src_tp_pp, dest_tp_pp): + """Test module saving and loading with different TP/PP""" + Utils.initialize_model_parallel(*src_tp_pp) + + with TempNamedDir( + tmp_path_dist_ckpt / 'test_mlp_glu_reconfiguration_model_A' + ) as ckpt_dir_A, TempNamedDir( + tmp_path_dist_ckpt / 'test_mlp_glu_reconfiguration_model_B' + ) as ckpt_dir_B: + # Save checkpoint A + mlp_A = initialize_mlp() + save(mlp_A.sharded_state_dict(sharded_offsets=get_pp_offsets()), ckpt_dir_A) + Utils.destroy_model_parallel() + + # Load checkpoint A with different TP/PP and save as checkpoint B + Utils.initialize_model_parallel(*dest_tp_pp) + mlp_B = initialize_mlp() + state_dict = load( + mlp_B.sharded_state_dict(sharded_offsets=get_pp_offsets()), ckpt_dir_A + ) + mlp_B.load_state_dict(state_dict) + save(mlp_B.sharded_state_dict(sharded_offsets=get_pp_offsets()), ckpt_dir_B) + Utils.destroy_model_parallel() + + # Test both checkpoints are equal + Utils.initialize_model_parallel(1, 1) + state_dict_A = load_plain_tensors(ckpt_dir_A) + state_dict_B = load_plain_tensors(ckpt_dir_B) + diffs = diff(state_dict_A, state_dict_B) + assert not any(map(bool, diffs)), diffs diff --git a/tests/unit_tests/dist_checkpointing/models/test_moe_experts.py b/tests/unit_tests/dist_checkpointing/models/test_moe_experts.py new file mode 100644 index 0000000000..4a8f153ed4 --- /dev/null +++ b/tests/unit_tests/dist_checkpointing/models/test_moe_experts.py @@ -0,0 +1,232 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
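+# Exercises MoE expert-layer checkpointing: Sequential/Grouped/TEGrouped MLP checkpoints are saved and reloaded under different TP/PP/EP layouts, including cross-format (sequential <-> grouped) loads.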
+ +import pytest +import torch + +from megatron.core import parallel_state +from megatron.core.dist_checkpointing import load, load_plain_tensors, save +from megatron.core.dist_checkpointing.dict_utils import diff +from megatron.core.dist_checkpointing.serialization import ( + get_default_load_sharded_strategy, + get_default_save_sharded_strategy, +) +from megatron.core.dist_checkpointing.strategies.fully_parallel import ( + FullyParallelLoadStrategyWrapper, + FullyParallelSaveStrategyWrapper, +) +from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_with_transformer_engine_spec +from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed +from megatron.core.transformer.moe.experts import GroupedMLP, SequentialMLP, TEGroupedMLP +from megatron.core.transformer.transformer_config import TransformerConfig +from megatron.core.utils import is_te_min_version +from tests.unit_tests.dist_checkpointing import TempNamedDir +from tests.unit_tests.test_utilities import Utils + + +def initialize_expert_layer(seed, glu=True, expert_type='sequential', **config_kwargs): + torch.manual_seed(seed) + model_parallel_cuda_manual_seed(seed) + + pp_size = parallel_state.get_pipeline_model_parallel_world_size() + num_moe_experts = 8 + num_local_experts = num_moe_experts // parallel_state.get_expert_model_parallel_world_size() + default_config_kwargs = dict( + num_layers=pp_size, + hidden_size=12, + num_attention_heads=4, + num_moe_experts=num_moe_experts, + use_cpu_initialization=True, + gated_linear_unit=glu, + ) + default_config_kwargs.update(**config_kwargs) + transformer_config = TransformerConfig(**default_config_kwargs) + transformer_layer_spec = get_gpt_layer_with_transformer_engine_spec( + num_experts=num_moe_experts, moe_grouped_gemm=(expert_type != 'sequential') + ) + if expert_type == 'grouped': + model = GroupedMLP(num_local_experts, transformer_config) + elif expert_type == 'te_grouped': + model = TEGroupedMLP( + num_local_experts, + transformer_config, + transformer_layer_spec.submodules.mlp.submodules.experts, + ) + elif expert_type == 'sequential': + model = SequentialMLP( + num_local_experts, + transformer_config, + transformer_layer_spec.submodules.mlp.submodules.experts, + ) + else: + raise ValueError('expert_type can only be one of ["sequential", "grouped", "te_grouped"]') + return model + + +def get_pp_offsets(): + pp_rank = parallel_state.get_pipeline_model_parallel_rank() + pp_size = parallel_state.get_pipeline_model_parallel_world_size() + return ((0, pp_rank, pp_size),) + + +expert_type = ['sequential', 'grouped'] +src_dest_expert_type = [('sequential', 'grouped'), ('grouped', 'sequential')] +if is_te_min_version("1.9.0.dev0"): + expert_type.append('te_grouped') + src_dest_expert_type.append(('sequential', 'te_grouped')) + src_dest_expert_type.append(('te_grouped', 'sequential')) + + +class TestExpertLayerReconfiguration: + def setup_method(self, method): + pass + + def teardown_method(self, method): + Utils.destroy_model_parallel() + + @pytest.mark.parametrize( + "use_fpsl,src_tp_pp_exp,dest_tp_pp_exp,use_glu", + [ + # changing PP is impossible because the number of layers must be the same + (False, (2, 4, 1), (2, 4, 1), False), + (True, (2, 4, 1), (2, 4, 1), False), + (False, (1, 1, 1), (1, 1, 1), False), + (True, (1, 1, 1), (1, 1, 4), False), + (False, (1, 1, 8), (1, 1, 2), False), + (False, (2, 2, 2), (4, 2, 1), False), + (True, (1, 1, 4), (8, 1, 1), False), + (False, (1, 8, 1), (1, 8, 1), False), + (False, (1, 1, 4), (2, 1, 1), False), + (False, (1, 
1, 1), (1, 1, 1), True), + (False, (1, 1, 1), (1, 1, 4), True), + (True, (1, 1, 1), (2, 1, 1), True), + (False, (1, 1, 4), (8, 1, 1), True), + ], + ) + @pytest.mark.parametrize("expert_type", expert_type) + def test_parallel_reconfiguration_e2e( + self, tmp_path_dist_ckpt, src_tp_pp_exp, dest_tp_pp_exp, use_glu, use_fpsl, expert_type + ): + """Test model saving and loading with different TP/PP/expert parallelism""" + src_tp, src_pp, src_exp = src_tp_pp_exp + dest_tp, dest_pp, dest_exp = dest_tp_pp_exp + if expert_type == 'grouped': + add_bias_linear = False + else: + add_bias_linear = True + # Save checkpoint A + Utils.initialize_model_parallel(src_tp, src_pp, expert_model_parallel_size=src_exp) + with TempNamedDir( + tmp_path_dist_ckpt / 'test_expert_layer_reconfiguration_model_A' + ) as ckpt_dir_A, TempNamedDir( + tmp_path_dist_ckpt / 'test_expert_layer_reconfiguration_model_B' + ) as ckpt_dir_B: + model_A = initialize_expert_layer( + 1, use_glu, expert_type, add_bias_linear=add_bias_linear + ) + sharded_state_dict = model_A.sharded_state_dict(sharded_offsets=get_pp_offsets()) + + save_strategy = get_default_save_sharded_strategy() + if use_fpsl: + save_strategy = FullyParallelSaveStrategyWrapper( + save_strategy, + parallel_state.get_data_parallel_group(with_context_parallel=True), + True, + ) + save(sharded_state_dict, ckpt_dir_A, save_strategy) + Utils.destroy_model_parallel() + + # Load checkpoint A with different TP/PP/expert and save as checkpoint B + # No FPS this time, only FPL + Utils.initialize_model_parallel(dest_tp, dest_pp, expert_model_parallel_size=dest_exp) + model_B = initialize_expert_layer( + 1, use_glu, expert_type, add_bias_linear=add_bias_linear + ) + if use_fpsl: + load_strategy = get_default_load_sharded_strategy(ckpt_dir_A) + load_strategy = FullyParallelLoadStrategyWrapper( + load_strategy, + parallel_state.get_data_parallel_group(with_context_parallel=True), + ) + else: + load_strategy = None + state_dict = load( + model_B.sharded_state_dict(sharded_offsets=get_pp_offsets()), + ckpt_dir_A, + load_strategy, + ) + model_B.load_state_dict(state_dict) + save(model_B.sharded_state_dict(sharded_offsets=get_pp_offsets()), ckpt_dir_B) + Utils.destroy_model_parallel() + + # Test both checkpoints are equal + Utils.initialize_model_parallel(1, 1) + state_dict_A = load_plain_tensors(ckpt_dir_A) + state_dict_B = load_plain_tensors(ckpt_dir_B) + diffs = diff(state_dict_A, state_dict_B) + assert not any(map(bool, diffs)), diffs + + @pytest.mark.parametrize( + "src_tp_pp_exp,dest_tp_pp_exp,use_glu", + [ + # changing PP is impossible because the number of layers must be the same + ((2, 4, 1), (2, 4, 1), False), + ((1, 1, 1), (1, 1, 4), False), + ((2, 2, 2), (4, 2, 1), False), + ((1, 1, 4), (8, 1, 1), False), + ((2, 1, 4), (1, 1, 8), False), + ((2, 4, 1), (2, 4, 1), True), + ((1, 1, 1), (1, 1, 4), True), + ((2, 2, 2), (4, 2, 1), True), + ((1, 1, 4), (8, 1, 1), True), + ((2, 1, 4), (1, 1, 8), True), + ], + ) + @pytest.mark.parametrize("src_module,dest_module", src_dest_expert_type) + def test_sequential_grouped_mlp_interchangeable( + self, tmp_path_dist_ckpt, src_tp_pp_exp, dest_tp_pp_exp, use_glu, src_module, dest_module + ): + """Test model saving and loading with different TP/PP/expert parallelism""" + src_tp, src_pp, src_exp = src_tp_pp_exp + dest_tp, dest_pp, dest_exp = dest_tp_pp_exp + if src_module == 'grouped' or dest_module == 'grouped': + add_bias_linear = False + else: + add_bias_linear = True + # Save checkpoint A + Utils.initialize_model_parallel(src_tp, src_pp, 
expert_model_parallel_size=src_exp) + with TempNamedDir( + tmp_path_dist_ckpt / 'test_sequential_grouped_mlp_interchangeable_model_A' + ) as ckpt_dir_A, TempNamedDir( + tmp_path_dist_ckpt / 'test_sequential_grouped_mlp_interchangeable_model_B' + ) as ckpt_dir_B: + + model_A = initialize_expert_layer( + 1, use_glu, expert_type=src_module, add_bias_linear=add_bias_linear + ) + sharded_state_dict = model_A.sharded_state_dict(sharded_offsets=get_pp_offsets()) + + save_strategy = get_default_save_sharded_strategy() + save(sharded_state_dict, ckpt_dir_A, save_strategy) + Utils.destroy_model_parallel() + + Utils.initialize_model_parallel(dest_tp, dest_pp, expert_model_parallel_size=dest_exp) + model_B = initialize_expert_layer( + 1, use_glu, expert_type=dest_module, add_bias_linear=add_bias_linear + ) + load_strategy = None + state_dict = load( + model_B.sharded_state_dict(sharded_offsets=get_pp_offsets()), + ckpt_dir_A, + load_strategy, + ) + model_B.load_state_dict(state_dict) + save(model_B.sharded_state_dict(sharded_offsets=get_pp_offsets()), ckpt_dir_B) + Utils.destroy_model_parallel() + + # Test both checkpoints are equal + Utils.initialize_model_parallel(1, 1) + state_dict_A = load_plain_tensors(ckpt_dir_A) + state_dict_B = load_plain_tensors(ckpt_dir_B) + diffs = diff(state_dict_A, state_dict_B) + assert not any(map(bool, diffs)), diffs + Utils.destroy_model_parallel() diff --git a/tests/unit_tests/dist_checkpointing/models/test_retro_model.py b/tests/unit_tests/dist_checkpointing/models/test_retro_model.py new file mode 100644 index 0000000000..3f570920aa --- /dev/null +++ b/tests/unit_tests/dist_checkpointing/models/test_retro_model.py @@ -0,0 +1,92 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +import types + +import pytest +import torch + +from megatron.core import parallel_state as ps +from megatron.core.dist_checkpointing import load, load_plain_tensors, save +from megatron.core.dist_checkpointing.validation import StrictHandling +from megatron.core.models.retro import RetroConfig, RetroModel, get_retro_decoder_block_spec +from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed +from megatron.core.transformer.transformer_config import TransformerConfig +from tests.unit_tests.dist_checkpointing import TempNamedDir +from tests.unit_tests.test_utilities import Utils + + +def initialize_retro_model(seed, decoder_spec_fn, spec_type, num_layers=9, **config_kwargs): + torch.manual_seed(seed) + model_parallel_cuda_manual_seed(seed) + + default_config_kwargs = dict( + num_layers=num_layers, + hidden_size=16, + num_attention_heads=12, + kv_channels=64, + ffn_hidden_size=64, + use_cpu_initialization=True, + retro_num_neighbors=2, + retro_chunk_length=4, + retro_retrieved_length=8, + retro_split_preprocessing="98,2,0", + ) + default_config_kwargs.update(**config_kwargs) + retro_config = RetroConfig(**default_config_kwargs) + pre_process = ps.is_pipeline_first_stage() + post_process = ps.is_pipeline_last_stage() + + de_block_spec = decoder_spec_fn( + retro_config, use_transformer_engine=True if spec_type == "te" else False + ) + model = RetroModel( + config=retro_config, + transformer_layer_spec=de_block_spec, + pre_process=pre_process, + post_process=post_process, + vocab_size=29184, + max_sequence_length=4, + ) + + with torch.no_grad(): + for p in model.parameters(): + p.random_() + return model + + +class TestRetroModel: + def setup_method(self, method): + pass + + def teardown_method(self, method): + Utils.destroy_model_parallel() + + 
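+    # Saves a Retro model built from the source spec ('te' or 'local') and reloads it with the destination spec; only '_extra_state' keys may differ between the two.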
@pytest.mark.parametrize('src_spec_type', ['te', 'local']) + @pytest.mark.parametrize('dst_spec_type', ['te', 'local']) + @pytest.mark.parametrize('model_type', ['retro']) + @pytest.mark.flaky_in_dev + def test_sharded_state_dict_save_load( + self, tmp_path_dist_ckpt, src_spec_type, dst_spec_type, model_type + ): + decoder_spec_fn = get_retro_decoder_block_spec + + Utils.initialize_model_parallel(1, 1) + gpt_model = initialize_retro_model(2, decoder_spec_fn, src_spec_type) + with TempNamedDir(tmp_path_dist_ckpt / 'test_gpt_model') as ckpt_dir: + # Save + sharded_state_dict = gpt_model.sharded_state_dict() + save(sharded_state_dict, ckpt_dir) + + # Load + gpt_model = initialize_retro_model(2, decoder_spec_fn, dst_spec_type) + sharded_state_dict = gpt_model.sharded_state_dict() + + state_dict, missing_keys, unexpected_keys = load( + sharded_state_dict, ckpt_dir, strict=StrictHandling.RETURN_ALL + ) + # Potential mismatch is because of extra states which is ok + assert all('_extra_state' in k for k in missing_keys) + assert all('_extra_state' in k for k in unexpected_keys) + gpt_model.load_state_dict(state_dict) + gpt_model.load_state_dict(state_dict) + + Utils.destroy_model_parallel() diff --git a/tests/unit_tests/dist_checkpointing/models/test_t5_model.py b/tests/unit_tests/dist_checkpointing/models/test_t5_model.py new file mode 100644 index 0000000000..07c9f8676a --- /dev/null +++ b/tests/unit_tests/dist_checkpointing/models/test_t5_model.py @@ -0,0 +1,115 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +import pytest +import torch + +from megatron.core import parallel_state as ps +from megatron.core.dist_checkpointing import load, load_plain_tensors, save +from megatron.core.dist_checkpointing.validation import StrictHandling +from megatron.core.models.retro.decoder_spec import ( + get_retro_decoder_layer_local_spec, + get_retro_decoder_layer_te_spec, +) +from megatron.core.models.retro.encoder_spec import ( + get_retro_encoder_layer_local_spec, + get_retro_encoder_layer_te_spec, +) +from megatron.core.models.T5 import T5Model +from megatron.core.models.T5.t5_spec import decoder_model_with_local_spec as t5_decoder_local_spec +from megatron.core.models.T5.t5_spec import ( + decoder_model_with_transformer_engine_default_spec as t5_decoder_te_spec, +) +from megatron.core.models.T5.t5_spec import encoder_model_with_local_spec as t5_encoder_local_spec +from megatron.core.models.T5.t5_spec import ( + encoder_model_with_transformer_engine_default_spec as t5_encoder_te_spec, +) +from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed +from megatron.core.transformer.transformer_block import TransformerBlockSubmodules +from megatron.core.transformer.transformer_config import TransformerConfig +from tests.unit_tests.dist_checkpointing import TempNamedDir +from tests.unit_tests.test_utilities import Utils + + +def initialize_t5_model(seed, encoder_spec_fn, decoder_spec_fn, num_layers=2, **config_kwargs): + torch.manual_seed(seed) + model_parallel_cuda_manual_seed(seed) + + default_config_kwargs = dict( + num_layers=num_layers, + hidden_size=16, + num_attention_heads=12, + kv_channels=64, + ffn_hidden_size=64, + use_cpu_initialization=True, + pipeline_dtype=torch.bfloat16, + ) + default_config_kwargs.update(**config_kwargs) + transformer_config = TransformerConfig(**default_config_kwargs) + pre_process = ps.is_pipeline_first_stage() + post_process = ps.is_pipeline_last_stage() + + en_block_spec = TransformerBlockSubmodules([encoder_spec_fn()] * 
num_layers) + de_block_spec = TransformerBlockSubmodules([decoder_spec_fn()] * num_layers) + model = T5Model( + encoder_config=transformer_config, + config=transformer_config, + transformer_encoder_layer_spec=en_block_spec, + transformer_decoder_layer_spec=de_block_spec, + pre_process=False, + post_process=False, + vocab_size=29184, + max_sequence_length=4, + ) + + with torch.no_grad(): + for p in model.parameters(): + p.random_() + return model + + +class TestT5Model: + def setup_method(self, method): + pass + + def teardown_method(self, method): + Utils.destroy_model_parallel() + + @pytest.mark.parametrize('src_spec_type', ['te', 'local']) + @pytest.mark.parametrize('dst_spec_type', ['te', 'local']) + @pytest.mark.parametrize('model_type', ['t5']) + def test_sharded_state_dict_save_load( + self, tmp_path_dist_ckpt, src_spec_type, dst_spec_type, model_type + ): + enc_dec_spec_fn = { + 'te': { + 't5': (t5_encoder_te_spec, t5_decoder_te_spec), + 'retro': (get_retro_encoder_layer_te_spec, get_retro_decoder_layer_te_spec), + }, + 'local': { + 't5': (t5_encoder_local_spec, t5_decoder_local_spec), + 'retro': (get_retro_encoder_layer_local_spec, get_retro_decoder_layer_local_spec), + }, + } + src_encoder_spec_fn, src_decoder_spec_fn = enc_dec_spec_fn[src_spec_type][model_type] + dst_encoder_spec_fn, dst_decoder_spec_fn = enc_dec_spec_fn[dst_spec_type][model_type] + + Utils.initialize_model_parallel(1, 1) + gpt_model = initialize_t5_model(1, src_encoder_spec_fn, src_decoder_spec_fn) + with TempNamedDir(tmp_path_dist_ckpt / 'test_gpt_model') as ckpt_dir: + # Save + sharded_state_dict = gpt_model.sharded_state_dict() + save(sharded_state_dict, ckpt_dir) + + # Load + gpt_model = initialize_t5_model(2, dst_encoder_spec_fn, dst_decoder_spec_fn) + sharded_state_dict = gpt_model.sharded_state_dict() + + state_dict, missing_keys, unexpected_keys = load( + sharded_state_dict, ckpt_dir, strict=StrictHandling.RETURN_ALL + ) + # Potential mismatch is because of extra states which is ok + assert all('_extra_state' in k for k in missing_keys) + assert all('_extra_state' in k for k in unexpected_keys) + gpt_model.load_state_dict(state_dict) + + Utils.destroy_model_parallel() diff --git a/tests/unit_tests/dist_checkpointing/test_async_save.py b/tests/unit_tests/dist_checkpointing/test_async_save.py new file mode 100644 index 0000000000..d6aa879982 --- /dev/null +++ b/tests/unit_tests/dist_checkpointing/test_async_save.py @@ -0,0 +1,102 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
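+# Exercises async checkpoint saving: async and sync saves must produce equivalent checkpoints, and worker failures during async writes must surface as errors.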
+from unittest import mock + +import pytest +import torch + +from megatron.core.dist_checkpointing import ShardedTensor, load, save +from megatron.core.dist_checkpointing.dict_utils import diff +from megatron.core.dist_checkpointing.strategies.async_utils import AsyncCallsQueue +from megatron.core.dist_checkpointing.strategies.filesystem_async import FileSystemWriterAsync +from megatron.core.dist_checkpointing.strategies.torch import TorchDistSaveShardedStrategy +from tests.unit_tests.dist_checkpointing import TempNamedDir +from tests.unit_tests.test_utilities import Utils + + +def write_data_os_err_mock_fn(local_proc_idx, write_bucket, results_queue, count_queue, use_fsync): + """Raises an error on worker #2 during storage save""" + try: + if local_proc_idx == 2: + raise OSError('worker #2 critical failure') + output = (local_proc_idx, []) + except Exception as e: + output = (local_proc_idx, e) + results_queue.put(output) + count_queue.get() + count_queue.task_done() + + +class TestAsyncSave: + def setup_method(self, method): + pass + + def teardown_method(self, method): + Utils.destroy_model_parallel() + + def test_async_is_equivalent_to_sync(self, tmp_path_dist_ckpt): + Utils.initialize_model_parallel(2, 4) + + sharded_state_dict = { + 'sd_keyA': ShardedTensor.from_rank_offsets( + 'keyA', torch.ones(2, 4), replica_id=Utils.rank + ), + 'sd_keyB': ShardedTensor.from_rank_offsets( + 'keyB', torch.ones(3, 5, 7), replica_id=Utils.world_size - Utils.rank - 1 + ), + } + + with TempNamedDir( + tmp_path_dist_ckpt / 'test_equivalence_async' + ) as async_ckpt_dir, TempNamedDir( + tmp_path_dist_ckpt / 'test_equivalence_sync' + ) as sync_ckpt_dir: + # async + async_calls = AsyncCallsQueue() + async_request = save(sharded_state_dict, async_ckpt_dir, async_sharded_save=True) + async_calls.schedule_async_request(async_request) + + # sync + save(sharded_state_dict, sync_ckpt_dir, async_sharded_save=False) + + # finalize async + async_calls.maybe_finalize_async_calls(blocking=True) + + # load and compare + loaded_async_state_dict = load(sharded_state_dict, async_ckpt_dir) + loaded_sync_state_dict = load(sharded_state_dict, sync_ckpt_dir) + diffs = diff(loaded_async_state_dict, loaded_sync_state_dict) + assert not any(map(bool, diffs)), diffs + + Utils.destroy_model_parallel() + + @pytest.mark.parametrize('async_save', [False, True]) + @pytest.mark.parametrize('worker_fn', [write_data_os_err_mock_fn]) + def test_errors_are_reported(self, tmp_path_dist_ckpt, async_save, worker_fn): + Utils.initialize_model_parallel(2, 4) + sharded_state_dict = { + f'key{i}': ShardedTensor.from_rank_offsets(f'key{i}_rank{Utils.rank}', torch.ones(2, 4)) + for i in range(4) # make sure there is enough non-empty saving workers + } + + with TempNamedDir(tmp_path_dist_ckpt / 'test_errors_are_reported') as ckpt_dir: + async_calls = AsyncCallsQueue() + save_strategy = TorchDistSaveShardedStrategy('torch_dist', 1, thread_count=8) + + try: + orig_fn = FileSystemWriterAsync.write_preloaded_data + FileSystemWriterAsync.write_preloaded_data = worker_fn + with pytest.raises(RuntimeError) as exc_info: + if async_save: + async_request = save( + sharded_state_dict, ckpt_dir, save_strategy, async_sharded_save=True + ) + async_calls.schedule_async_request(async_request) + async_calls.maybe_finalize_async_calls(blocking=True) + else: + save(sharded_state_dict, ckpt_dir, save_strategy) + assert 'Worker failure' in str(exc_info.value) + + finally: + FileSystemWriterAsync.write_preloaded_data = orig_fn + + Utils.destroy_model_parallel() diff 
--git a/tests/unit_tests/dist_checkpointing/test_cached_metadata.py b/tests/unit_tests/dist_checkpointing/test_cached_metadata.py new file mode 100644 index 0000000000..2733ea7a1b --- /dev/null +++ b/tests/unit_tests/dist_checkpointing/test_cached_metadata.py @@ -0,0 +1,90 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +import pickle +from copy import deepcopy +from dataclasses import fields + +import torch + +from megatron.core.dist_checkpointing import ShardedTensor, load, save +from megatron.core.dist_checkpointing.dict_utils import diff +from megatron.core.dist_checkpointing.serialization import get_default_save_sharded_strategy +from megatron.core.dist_checkpointing.strategies.async_utils import AsyncCallsQueue +from tests.unit_tests.dist_checkpointing import TempNamedDir +from tests.unit_tests.test_utilities import Utils + + +class TestCachedMetadata: + def setup_method(self, method): + pass + + def teardown_method(self, method): + Utils.destroy_model_parallel() + + def test_cached_metadata(self, tmp_path_dist_ckpt): + Utils.initialize_model_parallel(2, 4) + + sharded_state_dict_non_cached = { + 'sd_keyA': ShardedTensor.from_rank_offsets( + 'keyA', torch.ones(2, 4), replica_id=Utils.rank + ), + 'sd_keyB': ShardedTensor.from_rank_offsets( + 'keyB', torch.ones(3, 5, 7), replica_id=Utils.world_size - Utils.rank - 1 + ), + } + + sharded_state_dict_cached = { + 'sd_keyA': ShardedTensor.from_rank_offsets( + 'keyA', torch.ones(2, 4), replica_id=Utils.rank + ), + 'sd_keyB': ShardedTensor.from_rank_offsets( + 'keyB', torch.ones(3, 5, 7), replica_id=Utils.world_size - Utils.rank - 1 + ), + } + + loaded_non_cached, loaded_cached = None, None + md_non_cached, md_cached = None, None + with TempNamedDir(tmp_path_dist_ckpt / 'ckpt_dir') as ckpt_dir: + save(sharded_state_dict_non_cached, ckpt_dir, async_sharded_save=False) + loaded_non_cached = load(sharded_state_dict_non_cached, ckpt_dir) + md_path = ckpt_dir / '.metadata' + with md_path.open('rb') as f: + md_non_cached = pickle.load(f) + + save_strategy = deepcopy(get_default_save_sharded_strategy()) + save_strategy.use_cached_ckpt_structure = True + # Run over 3 iterations with cached metadata enabled + # The 3rd iteration will run with cached metadata + # `ckpt_dir` at the 3rd iteration 2 will be maintained for comparison + ckpt_dir = None + for i in range(3): + ckpt_dir = TempNamedDir(tmp_path_dist_ckpt / f'ckpt_dir_${i}_cached') + save( + sharded_state_dict_cached, + ckpt_dir.__enter__(), + save_strategy, + async_sharded_save=False, + ) + if i < 2: + ckpt_dir.cleanup() + loaded_cached = load(sharded_state_dict_cached, ckpt_dir.__enter__()) + md_path = ckpt_dir.__enter__() / '.metadata' + + with md_path.open('rb') as f: + md_cached = pickle.load(f) + + # Check loaded state dict + diffs = diff(loaded_non_cached, loaded_cached) + + assert not any( + len(x) for x in diffs + ), 'Cached metadata doesn\'t produce the same state_dict in loading' + # Check metadata recorded in .metadata, torch.distributed.metadata.Metadata + for field in fields(md_non_cached): + if field.name not in ['storage_data', 'storage_meta']: + diffs = diff(getattr(md_non_cached, field.name), getattr(md_cached, field.name)) + assert not any( + len(x) for x in diffs + ), f'{field.name} is different in metadata from non-cached, cached metadata impls' + ckpt_dir.cleanup() + Utils.destroy_model_parallel() diff --git a/tests/unit_tests/dist_checkpointing/test_flattened_resharding.py b/tests/unit_tests/dist_checkpointing/test_flattened_resharding.py new file mode 
100644 index 0000000000..fa00a20cad --- /dev/null +++ b/tests/unit_tests/dist_checkpointing/test_flattened_resharding.py @@ -0,0 +1,200 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +import io + +import numpy as np +import pytest +import torch +from torch.distributed.checkpoint import CheckpointException + +from megatron.core import parallel_state +from megatron.core.dist_checkpointing import ShardedTensor, load, save +from megatron.core.dist_checkpointing.core import CheckpointingException, maybe_load_config +from megatron.core.dist_checkpointing.dict_utils import diff +from megatron.core.dist_checkpointing.mapping import ShardedObject, ShardedTensorFactory +from megatron.core.dist_checkpointing.serialization import load_tensors_metadata +from megatron.core.dist_checkpointing.strategies.resharding import ( + apply_nd_flattened_tensors_reformulation, + restore_nd_flattened_tensors_formulation, +) +from megatron.core.dist_checkpointing.strategies.torch import get_reformulation_metadata +from tests.unit_tests.dist_checkpointing import TempNamedDir +from tests.unit_tests.test_utilities import Utils + + +class TestFlattenedResharding: + def setup_method(self, method): + pass + + def teardown_method(self, method): + Utils.destroy_model_parallel() + + @pytest.mark.parametrize( + ('src_tp_pp', 'dest_tp_pp'), + [((2, 4), (2, 4)), ((2, 4), (2, 2)), ((2, 4), (4, 2)), ((8, 1), (1, 2))], + ) + def test_partition_change_save_load(self, tmp_path_dist_ckpt, src_tp_pp, dest_tp_pp): + Utils.initialize_model_parallel(*src_tp_pp) + with TempNamedDir( + tmp_path_dist_ckpt / 'test_flattened_partition_change_save_load' + ) as ckpt_dir: + + state_dict = self._build_state_dict() + + save(state_dict, ckpt_dir) + + # change TPxPP + Utils.destroy_model_parallel() + Utils.initialize_model_parallel(*dest_tp_pp) + loaded_state_dict = load(self._build_state_dict(random=True), ckpt_dir) + expected_state_dict = {k: v.data for k, v in self._build_state_dict().items()} + + diffs = diff(expected_state_dict, loaded_state_dict) + assert not any(diffs), diffs + + Utils.destroy_model_parallel() + + @pytest.mark.parametrize( + ('src_tp_pp', 'dest_tp_pp', 'expected_ckpt_offsets_by_rank'), + [ + ( + (2, 4), + (2, 2), + { + 0: [(0, 0, 0), (0, 0, 10)], # TP 0, DP 0, PP 0 + 1: [(4, 0, 0), (4, 0, 10)], # TP 1, DP 0, PP 0 + 2: [(0, 0, 0), (0, 0, 10)], # TP 0, DP 1, PP 0 + 3: [(4, 0, 0), (4, 0, 10)], # TP 1, DP 1, PP 0 + 4: [(0, 0, 20), (0, 0, 30)], # TP 0, DP 0, PP 1 + 5: [(4, 0, 20), (4, 0, 30)], # TP 1, DP 0, PP 1 + 6: [(0, 0, 20), (0, 0, 30)], # TP 0, DP 1, PP 1 + 7: [(4, 0, 20), (4, 0, 30)], # TP 1, DP 1, PP 1 + }, + ), + ((8, 1), (1, 2), {rank: [(tp, 0, 0) for tp in range(8)] for rank in range(8)}), + ], + ) + def test_reformulate_nd_flattened_tensors( + self, tmp_path_dist_ckpt, src_tp_pp, dest_tp_pp, expected_ckpt_offsets_by_rank + ): + Utils.initialize_model_parallel(*src_tp_pp, order='tp-dp-pp') + with TempNamedDir(tmp_path_dist_ckpt / 'test_reformulate_nd_flattened_tensors') as ckpt_dir: + + state_dict = self._build_state_dict() + + ckpt_local_shape = state_dict['sd_key_flat'].local_shape + + save(state_dict, ckpt_dir) + + # change TPxPP + Utils.destroy_model_parallel() + Utils.initialize_model_parallel(*dest_tp_pp, order='tp-dp-pp') + load_state_dict = self._build_state_dict(random=True) + + reformulation_metadata = get_reformulation_metadata(load_state_dict, ckpt_dir) + reformulated_state_dict, formulation_restore_data = ( + apply_nd_flattened_tensors_reformulation(load_state_dict, 
reformulation_metadata) + ) + assert isinstance(reformulated_state_dict['sd_key_unflat'], ShardedTensor) + assert isinstance(reformulated_state_dict['sd_key_flat'], dict) + + assert reformulated_state_dict['sd_key_flat'].keys() == set( + (offset, ckpt_local_shape) for offset in expected_ckpt_offsets_by_rank[Utils.rank] + ), ( + reformulated_state_dict['sd_key_flat'].keys(), + ckpt_local_shape, + expected_ckpt_offsets_by_rank[Utils.rank], + ) + + # We can even load the reformulated state dict with a high-level API + loaded_state_dict = load( + reformulated_state_dict, ckpt_dir, validate_access_integrity=False + ) + loaded_state_dict = restore_nd_flattened_tensors_formulation( + loaded_state_dict, formulation_restore_data + ) + expected_state_dict = {k: v.data for k, v in self._build_state_dict().items()} + diffs = diff(expected_state_dict, loaded_state_dict) + assert not any(diffs), diffs + + Utils.destroy_model_parallel() + + @pytest.mark.parametrize(('src_tp_pp',), [((2, 4),), ((8, 1),), ((1, 1),), ((1, 4),)]) + def test_load_tensor_metadata(self, tmp_path_dist_ckpt, src_tp_pp): + Utils.initialize_model_parallel(*src_tp_pp, order='tp-dp-pp') + with TempNamedDir(tmp_path_dist_ckpt / 'test_reformulate_nd_flattened_tensors') as ckpt_dir: + + state_dict = self._build_state_dict() + + save(state_dict, ckpt_dir) + + # change TPxPP + Utils.destroy_model_parallel() + Utils.initialize_model_parallel(1, 1) + + sharded_metadata = load_tensors_metadata(ckpt_dir) + + for attr_name in ('local_shape', 'global_shape'): + flat_val = getattr(sharded_metadata['flat'], attr_name) + unflat_val = getattr(sharded_metadata['unflat'], attr_name) + assert flat_val == unflat_val, (attr_name, flat_val, unflat_val) + + for sh_ten in sharded_metadata.values(): + sh_ten.replica_id = Utils.rank + loaded_state_dict = load(sharded_metadata, ckpt_dir) + assert torch.all( + loaded_state_dict['unflat'] == torch.arange(8 * 5 * 40).reshape(8, 5, 40) + ) + assert torch.all(loaded_state_dict['flat'] == torch.arange(8 * 5 * 40)) + + Utils.destroy_model_parallel() + + def _build_state_dict(self, random=False): + tp_rank = parallel_state.get_tensor_model_parallel_rank() + tp_size = parallel_state.get_tensor_model_parallel_world_size() + pp_rank = parallel_state.get_pipeline_model_parallel_rank() + pp_size = parallel_state.get_pipeline_model_parallel_world_size() + dp_rank = parallel_state.get_data_parallel_rank() + dp_size = parallel_state.get_data_parallel_world_size() + + init_fn = torch.rand if random else torch.arange + global_ten = init_fn(8 * 5 * 40).reshape(8, 5, 40) + local_ten = global_ten + local_ten = local_ten.chunk(tp_size, dim=0)[tp_rank] + local_ten = local_ten.chunk(pp_size, dim=2)[pp_rank] + assert local_ten.shape == (8 // tp_size, 5, 40 // pp_size) + + local_ten_size_by_dp = local_ten.numel() + assert local_ten_size_by_dp % dp_size == 0, (local_ten_size_by_dp, dp_size) + local_ten_size_by_dp = local_ten_size_by_dp // dp_size + # make a bit shifted DP slices so that they are not equal + start_jitter = dp_rank + end_jitter = dp_rank + 1 if dp_rank + 1 < dp_size else 0 + local_dp_slice = slice( + local_ten_size_by_dp * dp_rank + start_jitter, + local_ten_size_by_dp * (dp_rank + 1) + end_jitter, + ) + local_flat_ten = local_ten.flatten()[local_dp_slice] + if dp_rank == dp_size - 1: + assert local_flat_ten.numel() == local_ten_size_by_dp - dp_rank + else: + assert local_flat_ten.numel() == local_ten_size_by_dp + 1 + + state_dict = { + 'sd_key_unflat': ShardedTensor.from_rank_offsets( + 'unflat', + local_ten, + (0, 
tp_rank, tp_size), + (2, pp_rank, pp_size), + replica_id=dp_rank, + ), + 'sd_key_flat': ShardedTensor.from_rank_offsets_flat( + 'flat', + local_flat_ten, + local_ten.shape, + (0, tp_rank, tp_size), + (2, pp_rank, pp_size), + flattened_range=local_dp_slice, + ), + } + return state_dict diff --git a/tests/unit_tests/dist_checkpointing/test_fp8.py b/tests/unit_tests/dist_checkpointing/test_fp8.py new file mode 100644 index 0000000000..d2dcb367c7 --- /dev/null +++ b/tests/unit_tests/dist_checkpointing/test_fp8.py @@ -0,0 +1,98 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +import pytest +import torch +from transformer_engine.pytorch.float8_tensor import Float8Tensor + +from megatron.core.dist_checkpointing import ShardedTensor, load, save +from megatron.core.dist_checkpointing.serialization import ( + get_default_load_sharded_strategy, + get_default_save_sharded_strategy, +) +from megatron.core.dist_checkpointing.strategies.fully_parallel import ( + FullyParallelLoadStrategyWrapper, + FullyParallelSaveStrategyWrapper, +) +from tests.unit_tests.dist_checkpointing import TempNamedDir +from tests.unit_tests.test_utilities import Utils + + +class TestFP8: + @pytest.mark.parametrize('dtype', ['bf16', 'fp16', 'fp8']) + @pytest.mark.parametrize('src_rank', [0, 6]) + def test_simple_broadcast(self, dtype, src_rank): + Utils.initialize_model_parallel() + + def get_ten(dtype: str = 'fp8'): + if dtype == 'fp8': + return Float8Tensor.to_float8( + torch.full((3,), Utils.rank, dtype=torch.bfloat16, device='cuda') + ) + elif dtype == 'bf16': + return torch.full((3,), Utils.rank, dtype=torch.bfloat16, device='cuda') + elif dtype == 'fp16': + return torch.full((3,), Utils.rank, dtype=torch.float16, device='cuda') + else: + raise NotImplementedError(dtype) + + ten = get_ten(dtype) + + # because of a bug in TE, with the cast broadcast fails + if isinstance(ten, Float8Tensor): + ten = ten.from_float8() + torch.distributed.broadcast(ten, src=src_rank) + assert torch.all(ten == src_rank) + + @pytest.mark.parametrize( + ('use_fpsl', 'src_tp_pp', 'dest_tp_pp', 'load_exchange_algo'), + [ + (True, (2, 4), (2, 4), 'broadcast'), + (True, (2, 4), (2, 4), 'gather_rounds'), + (False, (2, 4), (2, 4), None), + ], + ) + @pytest.mark.flaky + def test_fp8_save_load( + self, tmp_path_dist_ckpt, use_fpsl, src_tp_pp, dest_tp_pp, load_exchange_algo + ): + Utils.initialize_model_parallel(*src_tp_pp) + + def get_fp8_tensor(fill_val=1): + return Float8Tensor.to_float8( + torch.full((3,), fill_val, dtype=torch.bfloat16, device='cuda') + ) + + def get_state_dict(fill_val=1): + return { + 'a': ShardedTensor.from_rank_offsets( + 'a', get_fp8_tensor(fill_val), (0, Utils.rank, Utils.world_size), replica_id=0 + ), + 'b': ShardedTensor.from_rank_offsets( + 'b', get_fp8_tensor(fill_val), replica_id=Utils.rank + ), + 'c': ShardedTensor.from_rank_offsets( + 'c', get_fp8_tensor(fill_val), replica_id=Utils.rank + ), + } + + with TempNamedDir(tmp_path_dist_ckpt / 'test_fp8_save_load', sync=True) as ckpt_dir: + save_strategy = get_default_save_sharded_strategy() + if use_fpsl: + save_strategy = FullyParallelSaveStrategyWrapper(save_strategy, None, True) + save(get_state_dict(4), ckpt_dir, save_strategy) + + Utils.destroy_model_parallel() + Utils.initialize_model_parallel(*dest_tp_pp) + + if use_fpsl: + load_strategy = get_default_load_sharded_strategy(ckpt_dir) + load_strategy = FullyParallelLoadStrategyWrapper( + load_strategy, None, False, load_exchange_algo + ) + else: + load_strategy = None + + loaded_state_dict = 
load(get_state_dict(8), ckpt_dir, load_strategy) + assert torch.all(loaded_state_dict['a'] == 4) + assert torch.all(loaded_state_dict['b'] == 4) + Utils.destroy_model_parallel() diff --git a/tests/unit_tests/dist_checkpointing/test_fully_parallel.py b/tests/unit_tests/dist_checkpointing/test_fully_parallel.py new file mode 100644 index 0000000000..623e37d6b8 --- /dev/null +++ b/tests/unit_tests/dist_checkpointing/test_fully_parallel.py @@ -0,0 +1,378 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +from pathlib import Path +from typing import List, Tuple +from unittest import mock + +import pytest +import torch + +from megatron.core import parallel_state +from megatron.core.dist_checkpointing import ShardedTensor +from megatron.core.dist_checkpointing.dict_utils import ( + dict_list_map_outplace, + map_reduce, + nested_values, +) +from megatron.core.dist_checkpointing.exchange_utils import _get_empty_tensor_for_exchange +from megatron.core.dist_checkpointing.mapping import ShardedStateDict, is_main_replica +from megatron.core.dist_checkpointing.strategies.base import ( + LoadShardedStrategy, + SaveShardedStrategy, +) +from megatron.core.dist_checkpointing.strategies.fully_parallel import ( + FullyParallelLoadStrategyWrapper, + FullyParallelSaveStrategyWrapper, + _sharded_tensor_shard_id, +) +from tests.unit_tests.dist_checkpointing import TempNamedDir +from tests.unit_tests.test_utilities import Utils + + +class MockSaveStrategy(SaveShardedStrategy): + def __init__(self): + super().__init__('mock', 1) + self.save_keys = set() + + def save(self, sharded_state_dict, ckpt_dir): + self.save_keys = { + sh_ten.key + for sh_ten in nested_values(sharded_state_dict) + if is_main_replica(sh_ten.replica_id) + } + + +class MockLoadStrategy(LoadShardedStrategy): + def __init__(self, device='cpu'): + super().__init__() + self.device = device + self.load_keys = set() + + def load(self, sharded_state_dict, ckpt_dir): + self.load_keys = { + sh_ten.key + for sh_ten in nested_values(sharded_state_dict) + if is_main_replica(sh_ten.replica_id) + } + + def load_rand(x): + assert isinstance(x, ShardedTensor) + x.init_data(self.device) + x.data.fill_(Utils.rank) + return x.data + + return dict_list_map_outplace(load_rand, sharded_state_dict) + + def load_tensors_metadata(self, checkpoint_dir: Path): + pass + + def check_backend_compatibility(self, loaded_version): + pass + + def check_version_compatibility(self, loaded_version): + pass + + +class TestFullyParallelSaveAndLoad: + def setup_method(self, method): + pass + + def teardown_method(self, method): + Utils.destroy_model_parallel() + + @staticmethod + def get_sharded_state_dict(): + return { + 'sd_key_tp_repl1': ShardedTensor.from_rank_offsets( + 'key_TP_repl1', + torch.ones(10), + ( + 0, + parallel_state.get_tensor_model_parallel_rank(), + parallel_state.get_tensor_model_parallel_world_size(), + ), + replica_id=parallel_state.get_data_parallel_rank(with_context_parallel=True), + ), + 'sd_key_tp_repl2': ShardedTensor.from_rank_offsets( + 'key_TP_repl2', + torch.ones(10), + ( + 0, + parallel_state.get_tensor_model_parallel_rank(), + parallel_state.get_tensor_model_parallel_world_size(), + ), + replica_id=parallel_state.get_data_parallel_rank(with_context_parallel=True), + ), + 'sd_keyB': ShardedTensor.from_rank_offsets( + 'keyB', torch.ones(20), (0, Utils.rank, Utils.world_size) + ), + 'sd_keyE_no_C': ShardedTensor.from_rank_offsets( + 'keyC', torch.ones(100), replica_id=Utils.rank + ), + 'sd_keyX_no_D': ShardedTensor.from_rank_offsets( + 
'keyD', torch.ones(1000), replica_id=Utils.rank + ), + 'sd_keyC_no_E': ShardedTensor.from_rank_offsets( + 'keyE', torch.ones(100), replica_id=Utils.rank + ), + } + + @pytest.mark.parametrize("parallelization_along_dp", [False, True]) + def test_save_distribution(self, parallelization_along_dp, tmp_path_dist_ckpt): + Utils.initialize_model_parallel(2, 1) + state_dict = self.get_sharded_state_dict() + + # Ranks assignment: + # 1. Lowest coverage + # 2. Largest tensor + # 3. Shard id (key) + if not parallelization_along_dp: + expected_key_to_saving_ranks = { + 'keyB': list( + range(Utils.world_size) + ), # everyone must save (disjoint shards, coverage == 1) + 'key_TP_repl1': [0, 1], # lowest coverage (4), first TP domain + 'key_TP_repl2': [2, 3], # lowest coverage (4), second TP domain + 'keyD': [4], # largest tensor + 'keyC': [5], # second largest tensor + 'keyE': [6], # second largest tensor + } + else: + if parallel_state.get_tensor_model_parallel_rank() == 0: + expected_key_to_saving_ranks = { + # everyone must save (disjoint shards, coverage == 1): + 'keyB': list( + range( + parallel_state.get_data_parallel_world_size(with_context_parallel=True) + ) + ), + # this time, TP sharded tensors have the same coverage as fully replicated! + 'keyD': [0], # largest tensor + 'keyC': [1], # second largest tensor + 'keyE': [2], # second largest tensor + 'key_TP_repl1': [3], # smallest tensor + 'key_TP_repl2': [3], # smallest tensor, last rank is the least occupied + } + else: + expected_key_to_saving_ranks = { + # everyone must save (disjoint shards, coverage == 1): + 'keyB': list( + range( + parallel_state.get_data_parallel_world_size(with_context_parallel=True) + ) + ), + # tensors C, D, E are absent in this DP group + 'key_TP_repl1': [0], # smallest tensor + 'key_TP_repl2': [1], # smallest tensor, last rank is the least occupied + } + + parallelization_group = ( + parallel_state.get_data_parallel_group(with_context_parallel=True) + if parallelization_along_dp + else None + ) + dp_rank = torch.distributed.get_rank(parallelization_group) + expected_keys_saved_by_current_rank = { + k for k, v in expected_key_to_saving_ranks.items() if dp_rank in v + } + + # Run save and tests + mock_strategy = MockSaveStrategy() + save_strategy = FullyParallelSaveStrategyWrapper( + mock_strategy, parallelization_group, do_cache_distribution=True + ) + with TempNamedDir(tmp_path_dist_ckpt / 'mock_dir') as ckpt_dir_A: + save_strategy.save(state_dict, ckpt_dir_A) + key_to_saving_rank = dict( + map_reduce( + save_strategy.cached_distribution.main_rank_for_shard.items(), + lambda shard_rank: shard_rank[0][0], + lambda shard_rank: shard_rank[1], + ) + ) + assert expected_key_to_saving_ranks == key_to_saving_rank + + for _, sh_ten in state_dict.items(): + if ( + _sharded_tensor_shard_id(sh_ten) + in save_strategy.cached_distribution.shards_in_this_group + ): + is_expected_to_be_saved_by_this_rank = dp_rank in expected_key_to_saving_ranks.get( + sh_ten.key, [] + ) + assert sh_ten.replica_id == int( + not is_expected_to_be_saved_by_this_rank + ), expected_key_to_saving_ranks + + assert mock_strategy.save_keys == expected_keys_saved_by_current_rank, ( + Utils.rank, + mock_strategy.save_keys, + expected_keys_saved_by_current_rank, + ) + + @pytest.mark.parametrize("parallelization_along_dp", [False, True]) + def test_load_distribution(self, parallelization_along_dp, tmp_path_dist_ckpt): + Utils.initialize_model_parallel(2, 1) + + state_dict = self.get_sharded_state_dict() + + # Ranks assignment: + # 1. Lowest coverage + # 2. 
Largest tensor + # 3. Shard id (key) + if not parallelization_along_dp: + expected_key_to_saving_ranks = { + 'keyB': list( + range(Utils.world_size) + ), # everyone must save (disjoint shards, coverage == 1) + 'key_TP_repl1': [0, 1], # lowest coverage (4), first TP domain + 'key_TP_repl2': [2, 3], # lowest coverage (4), second TP domain + 'keyD': [4], # largest tensor + 'keyC': [5], # second largest tensor + 'keyE': [6], # second largest tensor + } + else: + # When loading, expected key distribution is the same across TP, because every replica + # needs to be loaded + expected_key_to_saving_ranks = { + # everyone must load (disjoint shards, coverage == 1): + 'keyB': list( + range(parallel_state.get_data_parallel_world_size(with_context_parallel=True)) + ), + # this time, TP sharded tensors have the same coverage as fully replicated! + 'keyD': [0], # largest tensor + 'keyC': [1], # second largest tensor + 'keyE': [2], # second largest tensor + 'key_TP_repl1': [3], # smallest tensor + 'key_TP_repl2': [3], # smallest tensor, last rank is the least occupied + } + + parallelization_group = ( + parallel_state.get_data_parallel_group(with_context_parallel=True) + if parallelization_along_dp + else None + ) + dp_rank = torch.distributed.get_rank(parallelization_group) + expected_keys_saved_by_current_rank = { + k for k, v in expected_key_to_saving_ranks.items() if dp_rank in v + } + + # Run save and tests + mock_strategy = MockLoadStrategy() + load_strategy = FullyParallelLoadStrategyWrapper( + mock_strategy, parallelization_group, do_cache_distribution=True + ) + with TempNamedDir(tmp_path_dist_ckpt / 'mock_dir') as ckpt_dir_A: + loaded_state_dict = load_strategy.load(state_dict, ckpt_dir_A) + key_to_saving_rank = dict( + map_reduce( + load_strategy.cached_distribution.main_rank_for_shard.items(), + lambda shard_rank: shard_rank[0][0], + lambda shard_rank: shard_rank[1], + ) + ) + assert expected_key_to_saving_ranks == key_to_saving_rank + + assert mock_strategy.load_keys == expected_keys_saved_by_current_rank, ( + Utils.rank, + mock_strategy.load_keys, + expected_keys_saved_by_current_rank, + ) + + assert loaded_state_dict.keys() == state_dict.keys() + + @pytest.mark.parametrize('state_dict_device', ['cpu', 'cuda']) + @pytest.mark.flaky + def test_memory_usage(self, state_dict_device, tmp_path_dist_ckpt): + Utils.initialize_model_parallel(2, 1) + + megabytes = 1024 * 1024 + mock_strategy = MockLoadStrategy(state_dict_device) + + mem_alloc = [] + + real_get_empty_tensor_for_exchange = _get_empty_tensor_for_exchange + + def mock_get_empty_tensor_for_exchange(*args, **kwargs) -> torch.Tensor: + ret = real_get_empty_tensor_for_exchange(*args, **kwargs) + mem_alloc.append(torch.cuda.memory_allocated()) + return ret + + load_strategy = FullyParallelLoadStrategyWrapper(mock_strategy) + torch.distributed.barrier() + + # Each tensor is 4MB, 40MB in total. 
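+ # (each tensor holds 1024 * 1024 fp32 elements, i.e. 4 MiB; ten of them give 40 MiB).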
+ # We expect extra memory usage peak at ~32MB, not 1GB + sharded_state_dict = { + f'ten_{i}': ShardedTensor.from_rank_offsets( + f'ten_{i}', + torch.rand(megabytes, dtype=torch.float, device=state_dict_device), + (0, Utils.rank, Utils.world_size), + ) + for i in range(10) + } + + mem_alloc_start = torch.cuda.memory_allocated() + + with mock.patch( + 'megatron.core.dist_checkpointing.exchange_utils._get_empty_tensor_for_exchange', + new=mock_get_empty_tensor_for_exchange, + ), TempNamedDir(tmp_path_dist_ckpt / 'mock_dir') as ckpt_dir_A: + _ = load_strategy.load(sharded_state_dict, ckpt_dir_A) + + # Each rank is expected to do 7 * 10 empty allocations + assert len(mem_alloc) == 7 * 10 + # Peak mem usage should be within 4MB (single tensor) + assert max(mem_alloc) - mem_alloc_start < 4.01 * megabytes, ( + max(mem_alloc), + mem_alloc_start, + ) + + Utils.destroy_model_parallel() + + def test_only_necessary_exchanges_performed_during_load(self, tmp_path_dist_ckpt): + Utils.initialize_model_parallel(2, 1) + + # State dict with 2 expected exchanges + sharded_state_dict_baseline_two_exchanges = { + 'needed_by_all_A': ShardedTensor.from_rank_offsets( + 'needed_by_all_A', + torch.ones(4, dtype=torch.float, device='cuda'), + replica_id=Utils.rank, + ), + 'needed_by_all_B': ShardedTensor.from_rank_offsets( + 'needed_by_all_B', + torch.ones(4, dtype=torch.float, device='cuda'), + replica_id=Utils.rank, + ), + } + # State dict with 1 expected exchange + sharded_state_dict_baseline_one_exchange = { + 'needed_by_all': sharded_state_dict_baseline_two_exchanges['needed_by_all_A'] + } + # State dict with 1 expected exchanges even though there are 2 tensors to load (1 is unique for each rank) + sharded_state_dict_test_one_exchange = sharded_state_dict_baseline_one_exchange.copy() + sharded_state_dict_test_one_exchange['unique'] = ShardedTensor.from_rank_offsets( + 'unique', + torch.ones(4, dtype=torch.float, device='cuda'), + (0, Utils.rank, Utils.world_size), + ) + + expected_call_counts: List[Tuple[ShardedStateDict, int]] = [ + (sharded_state_dict_baseline_one_exchange, 1), + (sharded_state_dict_baseline_two_exchanges, 2), + (sharded_state_dict_test_one_exchange, 1), + ] + + mock_strategy = MockLoadStrategy() + with TempNamedDir(tmp_path_dist_ckpt / 'mock_dir') as ckpt_dir: + for sharded_state_dict, expected_count in expected_call_counts: + load_strategy = FullyParallelLoadStrategyWrapper( + mock_strategy, None, do_cache_distribution=True, exchange_algo='broadcast' + ) + with mock.patch( + 'megatron.core.dist_checkpointing.strategies.fully_parallel.torch.distributed.broadcast' + ) as broadcast_mock: + _ = load_strategy.load(sharded_state_dict, ckpt_dir) + assert broadcast_mock.call_count == expected_count + + Utils.destroy_model_parallel() diff --git a/tests/unit_tests/dist_checkpointing/test_local.py b/tests/unit_tests/dist_checkpointing/test_local.py new file mode 100644 index 0000000000..e4dfc6f8e8 --- /dev/null +++ b/tests/unit_tests/dist_checkpointing/test_local.py @@ -0,0 +1,217 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
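+# Unit tests for local (non-persistent) checkpointing: the prepare_state_dict_for_save /
+# recreate_state_dict_after_load round-trip and basic save/load scenarios driven through a
+# local checkpoint manager.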
+import filecmp +import shutil +from pathlib import Path +from types import SimpleNamespace +from typing import Any, Callable, Tuple, Union +from unittest import mock + +import pytest +import torch + +from megatron.core.dist_checkpointing import ShardedTensor +from megatron.core.dist_checkpointing.dict_utils import diff +from megatron.core.dist_checkpointing.mapping import ShardedBase, ShardedTensorFactory +from megatron.core.dist_checkpointing.state_dict_transformation import ( + prepare_state_dict_for_save, + recreate_state_dict_after_load, +) +from megatron.core.dist_checkpointing.utils import extract_nonpersistent +from megatron.training.async_utils import maybe_finalize_async_save +from megatron.training.checkpointing import generate_state_dict, load_checkpoint, save_checkpoint +from tests.unit_tests.dist_checkpointing import ( + TempNamedDir, + init_basic_mock_args, + init_checkpointing_mock_args, + setup_model_and_optimizer, +) +from tests.unit_tests.test_utilities import Utils + + +def find_matching_values( + x: Union[dict, list], predicate: Callable[[Any], bool] +) -> Tuple[Union[dict, list], Union[dict, list]]: + """Return matching values in a single list + + Args: + x (Union[dict, list]) : state dict to process. Top-level argument must be a dict or list + predicate (object -> bool): determines matching values + """ + + matching_vals = [] + if isinstance(x, dict): + values = x.values() + elif isinstance(x, list): + values = x + else: + raise ValueError(f'Unexpected top-level object type: {type(x)}') + for v in values: + if isinstance(v, (list, dict)): + matching_vals += find_matching_values(v, predicate) + elif predicate(v): + matching_vals.append(v) + return matching_vals + + +class TestLocalCheckpointing: + def setup_method(self, method): + pass + + def teardown_method(self, method): + Utils.destroy_model_parallel() + + @pytest.mark.parametrize(('tp,pp'), [(2, 4)]) + def test_sharded_tensors(self, tp, pp): + Utils.initialize_model_parallel(tp, pp) + num_floating_point_operations_so_far = 0 + model, optimizer = setup_model_and_optimizer(1, tp, pp) + opt_param_scheduler = None + rng_state = None + use_dist_ckpt = True + iteration = None + optim_sd_kwargs = dict(sharding_type='fully_sharded_model_space') + mock_args = SimpleNamespace() + mock_args.no_save_optim = False + mock_args.no_save_rng = True + # Test save_local + state_dict = generate_state_dict( + mock_args, + model, + optimizer, + opt_param_scheduler, + rng_state, + use_dist_ckpt, + iteration, + optim_sd_kwargs=optim_sd_kwargs, + ) + sharded_tensor_factories = find_matching_values( + state_dict, lambda x: isinstance(x, ShardedTensorFactory) + ) + sharded_tensors = find_matching_values(state_dict, lambda x: isinstance(x, ShardedTensor)) + for ten in sharded_tensors: + assert ten.data != None + saved_state_dict = prepare_state_dict_for_save(state_dict) + saved_sharded_tensors = find_matching_values( + saved_state_dict, lambda x: isinstance(x, ShardedTensor) + ) + for ten in saved_sharded_tensors: + assert ten.data == None + assert ( + len(saved_sharded_tensors) + == len(sharded_tensors) + 2 * len(sharded_tensor_factories) + == len(saved_state_dict['raw_tensors']) + ) + common_sharded_tensors = find_matching_values( + saved_state_dict["common"], lambda x: isinstance(x, ShardedTensor) + ) + assert common_sharded_tensors == [] + # Test load_local + state_dict = generate_state_dict( + mock_args, + model, + optimizer, + opt_param_scheduler, + rng_state, + True, + iteration, + optim_sd_kwargs=optim_sd_kwargs, + ) + 
nonpersistent_state_dict, _ = extract_nonpersistent(state_dict) + # For a given use case + assert not nonpersistent_state_dict + loaded_state_dict = recreate_state_dict_after_load(state_dict, saved_state_dict) + only_left, only_right, mismatch = diff(loaded_state_dict, state_dict) + assert not only_left + assert not only_right + for i in mismatch: + # ShardedObjects and ShardedTensors should be replaced + assert issubclass(i[-1], ShardedBase) + + @pytest.mark.parametrize(('tp,pp'), [(2, 4), (1, 1)]) + @pytest.mark.parametrize(('use_ramdisk'), [True, False]) + @pytest.mark.parametrize(('async_save'), [True, False]) + @pytest.mark.parametrize(('algo'), ['atomic', 'fully_parallel']) + @pytest.mark.skip(reason="BasicLocalCheckpointManager is not yet integrated") + def test_basic_save_load_scenarios( + self, tmp_path_dist_ckpt, tp, pp, use_ramdisk, async_save, algo + ): + Utils.initialize_model_parallel(tp, pp) + num_floating_point_operations_so_far = 0 + model, optimizer = setup_model_and_optimizer(1, tp, pp) + opt_param_scheduler = None + + mock_args = SimpleNamespace() + if use_ramdisk: + tmp_path_dist_ckpt = Path("/dev/shm") + with TempNamedDir(tmp_path_dist_ckpt / "test_local") as local_ckpt_dir, mock.patch( + 'megatron.training.checkpointing.get_args', new=lambda: mock_args + ), mock.patch('megatron.training.async_utils.get_args', new=lambda: mock_args), mock.patch( + "megatron.training.checkpointing.update_num_microbatches" + ): + local_ckpt_dir = local_ckpt_dir / "subdir" # Test handling of non-existent directories + init_basic_mock_args(mock_args, tp, pp) + init_checkpointing_mock_args(mock_args, None) + mock_args.non_persistent_ckpt_type = 'local' + mock_args.non_persistent_local_ckpt_algo = algo + mock_args.async_save = async_save + checkpointing_context = { + 'local_checkpoint_manager': BasicLocalCheckpointManager(local_ckpt_dir) + } + + save_checkpoint( + 1, + model, + optimizer, + opt_param_scheduler, + num_floating_point_operations_so_far, + checkpointing_context=checkpointing_context, + non_persistent_ckpt=True, + ) + if async_save: + maybe_finalize_async_save(True) + iteration, _ = load_checkpoint( + model, optimizer, opt_param_scheduler, checkpointing_context=checkpointing_context + ) + assert iteration == 1 + ckpt_path = checkpointing_context['local_checkpoint_manager'].local_ckpt_path + backup_path = ckpt_path.with_name('backup_' + ckpt_path.name) + checkpointing_context['local_checkpoint_manager'].latest_iteration = -1 + iteration, _ = load_checkpoint( + model, optimizer, opt_param_scheduler, checkpointing_context=checkpointing_context + ) + assert iteration == 1 + shutil.move(ckpt_path, backup_path) + checkpointing_context['local_checkpoint_manager'].latest_iteration = -1 + torch.distributed.barrier() + iteration, _ = load_checkpoint( + model, optimizer, opt_param_scheduler, checkpointing_context=checkpointing_context + ) + assert iteration == 0 + save_checkpoint( + 1, + model, + optimizer, + opt_param_scheduler, + num_floating_point_operations_so_far, + checkpointing_context=checkpointing_context, + non_persistent_ckpt=True, + ) + if async_save: + maybe_finalize_async_save(True) + assert filecmp.cmp(ckpt_path, backup_path, shallow=False), [ckpt_path, backup_path] + save_checkpoint( + 2, + model, + optimizer, + opt_param_scheduler, + num_floating_point_operations_so_far, + checkpointing_context=checkpointing_context, + non_persistent_ckpt=True, + ) + if async_save: + maybe_finalize_async_save(True) + assert not ckpt_path.exists() + ckpt_path = 
checkpointing_context['local_checkpoint_manager'].local_ckpt_path + assert ckpt_path.exists() + + Utils.destroy_model_parallel() diff --git a/tests/unit_tests/dist_checkpointing/test_mapping.py b/tests/unit_tests/dist_checkpointing/test_mapping.py new file mode 100644 index 0000000000..38582d7524 --- /dev/null +++ b/tests/unit_tests/dist_checkpointing/test_mapping.py @@ -0,0 +1,178 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +import pytest +import torch + +from megatron.core.dist_checkpointing import ShardedTensor +from megatron.core.dist_checkpointing.core import CheckpointingException +from megatron.core.dist_checkpointing.mapping import ( + ShardedObject, + ShardedTensorFactory, + apply_factories, + apply_factory_merges, + is_main_replica, +) +from megatron.core.transformer.transformer_config import TransformerConfig +from tests.unit_tests.test_utilities import Utils + + +class TestShardedTensor: + + # def setup_method(self, method): + # Utils.initialize_model_parallel(1,1) + # transformer_config = TransformerConfig(num_layers=2, hidden_size=12, num_attention_heads=4, use_cpu_initialization=True) + # self.gpt_embedding = GPTEmbedding(config=transformer_config, vocab_size=100, max_sequence_length=4, add_position_embedding=True) + # + # def teardown_method(self, method): + # Utils.destroy_model_parallel() + + def test_from_rank_offsets_constructor(self, dtype=torch.float, device='cuda'): + data = torch.ones((1, 3, 7, 9), dtype=dtype, device=device) + shape = data.shape + rank_offsets = [(0, 0, 10), (2, 3, 6)] + sh_ten = ShardedTensor.from_rank_offsets('keyA', data, *rank_offsets) + + assert isinstance(sh_ten, ShardedTensor) + assert sh_ten.dtype is dtype + assert sh_ten.local_shape == shape + assert sh_ten.global_shape == (shape[0] * 10, shape[1], shape[2] * 6, shape[3]) + assert sh_ten.global_offset == (0, 0, shape[2] * 3, 0) + assert sh_ten.axis_fragmentations == (10, 1, 6, 1) + + def test_from_rank_offsets_flat_constructor(self, dtype=torch.float, device='cuda'): + data = torch.arange(28, dtype=dtype, device=device).reshape((1, 4, 7)) + shape = data.shape + rank_offsets = [(1, 0, 2), (2, 3, 5)] + flattened_range = slice(4, 9) + flat_data = data.flatten()[flattened_range] + sh_ten = ShardedTensor.from_rank_offsets_flat( + 'keyA', flat_data, data.shape, *rank_offsets, flattened_range=flattened_range + ) + + # The main attributes properties are unchanged + assert isinstance(sh_ten, ShardedTensor) + assert sh_ten.dtype is dtype + assert sh_ten.local_shape == shape + assert sh_ten.global_shape == (shape[0], shape[1] * 2, shape[2] * 5) + assert sh_ten.global_offset == (0, 0, shape[2] * 3) + assert sh_ten.axis_fragmentations == (1, 2, 5) + + assert torch.all(sh_ten.data == torch.arange(4, 9, device=device)) + + def test_metadata_integrity_violation(self): + data = torch.ones((1, 3, 7, 9), device='meta') + rank_offsets = [(0, 0, 10), (2, 3, 6)] + sh_ten = ShardedTensor.from_rank_offsets('keyA', data, *rank_offsets) + sh_ten.validate_metadata_integrity() + with pytest.raises(CheckpointingException): + sh_ten.local_shape = (1, 2, 7, 9) + sh_ten.validate_metadata_integrity() + + sh_ten = ShardedTensor.from_rank_offsets('keyA', data, *rank_offsets) + with pytest.raises(CheckpointingException): + sh_ten.global_offset = (0, 1, 0) + sh_ten.validate_metadata_integrity() + + with pytest.raises(CheckpointingException): + sh_ten = ShardedTensor.from_rank_offsets_flat( + 'keyA', data, data.shape, *rank_offsets, flattened_range=slice(4, 9) + ) + + sh_ten = 
ShardedTensor.from_rank_offsets_flat( + 'keyA', data.flatten()[4:9], data.shape, *rank_offsets, flattened_range=slice(4, 9) + ) + assert sh_ten.local_shape == (1, 3, 7, 9) + with pytest.raises(CheckpointingException): + sh_ten.local_shape = (5,) + sh_ten.validate_metadata_integrity() + + def test_narrowing(self): + data = torch.ones((1, 3, 7, 9)) + rank_offsets = [(0, 0, 10), (2, 3, 6)] + sh_ten = ShardedTensor.from_rank_offsets('keyA', data, *rank_offsets) + (narr_sh_ten,) = sh_ten.narrow(1, 1, 2) + assert narr_sh_ten.local_shape == (1, 2, 7, 9) + assert narr_sh_ten.global_shape == (10, 2, 42, 9) + assert narr_sh_ten.global_offset == (0, 0, 21, 0) + + (narr_sh_ten,) = sh_ten.narrow(2, 3, 2) + assert narr_sh_ten.local_shape == (1, 3, 2, 9) + assert narr_sh_ten.global_shape == (10, 3, 12, 9) + assert narr_sh_ten.global_offset == (0, 0, 6, 0) + + def test_flat_narrow(self): + data = torch.arange(28).reshape((4, 7)) + rank_offsets = [(0, 1, 2), (1, 3, 5)] + flattened_range = slice(4, 9) + flat_data = data.flatten()[flattened_range] + sh_ten = ShardedTensor.from_rank_offsets_flat( + 'keyA', flat_data, data.shape, *rank_offsets, flattened_range=flattened_range + ) + + # The main attributes properties are unchanged + assert isinstance(sh_ten, ShardedTensor) + assert torch.all(sh_ten.data == torch.arange(4, 9)) + + (narrow_sh_ten,) = sh_ten.narrow( + 0, 0, 1 + ) # First seven elements of unflat, intersection has 3 elements + assert torch.all(narrow_sh_ten.data == torch.arange(4, 7)) + assert narrow_sh_ten.local_shape == (1, 7) + assert narrow_sh_ten.global_shape == (2, 35) + assert narrow_sh_ten.global_offset == (1, 21) + + (narrow_sh_ten,) = sh_ten.narrow( + 0, 0, 3 + ) # First 21 elements of unflat, intersection has all 5 elements + assert torch.all(narrow_sh_ten.data == torch.arange(4, 9)) + assert narrow_sh_ten.local_shape == (3, 7) + assert narrow_sh_ten.global_shape == (6, 35) + assert narrow_sh_ten.global_offset == (3, 21) + + narrow_sh_ten = sh_ten.narrow(0, 2, 1) # empty intersection + assert not narrow_sh_ten, narrow_sh_ten + + +class TestShardedTensorFactory: + def test_build_and_merge(self): + def build_fn(key, tensor, replica_id, flattened_range): + assert flattened_range is None + return { + 'level2_a': ShardedTensor.from_rank_offsets( + key + 'part1', tensor + 1, replica_id=replica_id + ), + 'level2_b': ShardedTensor.from_rank_offsets( + key + 'part2', tensor + 2, replica_id=replica_id + ), + } + + # state_dict will be modified in-place + def get_state_dict(): + return { + 'level1': ShardedTensorFactory( + 'a', torch.arange(3), build_fn, lambda x: x['level2_b'] + ) + } + + state_dict = get_state_dict() + apply_factories(state_dict) + assert torch.allclose(state_dict['level1']['level2_a'].data, torch.tensor([1, 2, 3])) + assert torch.allclose(state_dict['level1']['level2_b'].data, torch.tensor([2, 3, 4])) + + # Simulate loading + state_dict['level1']['level2_a'] = state_dict['level1']['level2_a'].data + state_dict['level1']['level2_b'] = state_dict['level1']['level2_b'].data + + loaded_state_dict = apply_factory_merges(state_dict, get_state_dict()) + assert torch.allclose(loaded_state_dict['level1'], torch.tensor([2, 3, 4])) + + +def test_is_main_replica(): + assert is_main_replica(0) + assert is_main_replica((0,)) + assert is_main_replica((0, 0)) + assert not is_main_replica(1) + assert not is_main_replica(2) + assert not is_main_replica((1,)) + assert not is_main_replica((1, 0)) + assert not is_main_replica((1, 1, 1)) diff --git 
a/tests/unit_tests/dist_checkpointing/test_nonpersistent.py b/tests/unit_tests/dist_checkpointing/test_nonpersistent.py new file mode 100644 index 0000000000..346751e264 --- /dev/null +++ b/tests/unit_tests/dist_checkpointing/test_nonpersistent.py @@ -0,0 +1,142 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +import filecmp +import os +from types import SimpleNamespace +from unittest import mock + +import pytest + +from megatron.training.checkpointing import ( + _NON_PERSISTENT_CKPT_SUBDIR, + load_checkpoint, + save_checkpoint, +) +from tests.unit_tests.dist_checkpointing import ( + TempNamedDir, + init_basic_mock_args, + init_checkpointing_mock_args, + setup_model_and_optimizer, +) +from tests.unit_tests.test_utilities import Utils + + +class TestNonPersistentSaveAndLoad: + def setup_method(self, method): + pass + + def teardown_method(self, method): + Utils.destroy_model_parallel() + + @pytest.mark.parametrize(('tp,pp'), [(2, 4)]) + @pytest.mark.flaky + def test_basic_save_load_scenarios(self, tmp_path_dist_ckpt, tp, pp): + Utils.initialize_model_parallel(tp, pp) + num_floating_point_operations_so_far = 0 + model, optimizer = setup_model_and_optimizer(1, tp, pp) + opt_param_scheduler = None + + mock_args = SimpleNamespace() + with TempNamedDir( + tmp_path_dist_ckpt / "test_non_persistent" + ) as non_persistent_ckpt_dir, mock.patch( + 'megatron.training.checkpointing.get_args', new=lambda: mock_args + ), mock.patch( + "megatron.training.checkpointing.update_num_microbatches" + ): + init_basic_mock_args(mock_args, tp, pp) + init_checkpointing_mock_args(mock_args, non_persistent_ckpt_dir) + mock_args.non_persistent_ckpt_type = "global" + + save_checkpoint( + 2, + model, + optimizer, + opt_param_scheduler, + num_floating_point_operations_so_far, + {}, + non_persistent_ckpt=True, + ) + save_checkpoint( + 3, model, optimizer, opt_param_scheduler, num_floating_point_operations_so_far, {} + ) + save_checkpoint( + 4, + model, + optimizer, + opt_param_scheduler, + num_floating_point_operations_so_far, + {}, + non_persistent_ckpt=True, + ) + iteration, _ = load_checkpoint(model, optimizer, opt_param_scheduler) + assert iteration == 4 + save_checkpoint( + 6, model, optimizer, opt_param_scheduler, num_floating_point_operations_so_far, {} + ) + iteration, _ = load_checkpoint(model, optimizer, opt_param_scheduler) + assert iteration == 6 + save_checkpoint( + 8, + model, + optimizer, + opt_param_scheduler, + num_floating_point_operations_so_far, + {}, + non_persistent_ckpt=True, + ) + iteration, _ = load_checkpoint(model, optimizer, opt_param_scheduler) + assert iteration == 8 + assert "iter_0000003" in os.listdir(non_persistent_ckpt_dir) + assert "iter_0000006" in os.listdir(non_persistent_ckpt_dir) + assert "iter_0000002" not in os.listdir( + os.path.join(non_persistent_ckpt_dir, _NON_PERSISTENT_CKPT_SUBDIR) + ) + assert "iter_0000004" in os.listdir( + os.path.join(non_persistent_ckpt_dir, _NON_PERSISTENT_CKPT_SUBDIR) + ) + assert "iter_0000008" in os.listdir( + os.path.join(non_persistent_ckpt_dir, _NON_PERSISTENT_CKPT_SUBDIR) + ) + ckpt_dirs = [ + "iter_0000003", + "iter_0000006", + _NON_PERSISTENT_CKPT_SUBDIR + "/iter_0000004", + _NON_PERSISTENT_CKPT_SUBDIR + "/iter_0000008", + ] + for ckpt_a in ckpt_dirs: + for ckpt_b in ckpt_dirs: + for filename in os.listdir(os.path.join(non_persistent_ckpt_dir, ckpt_a)): + if filename != "common.pt" and filename != ".metadata": + assert filecmp.cmp( + os.path.join(non_persistent_ckpt_dir, ckpt_a, filename), + 
os.path.join(non_persistent_ckpt_dir, ckpt_b, filename), + shallow=False, + ), [filename, ckpt_a, ckpt_b] + Utils.destroy_model_parallel() + + +class TestLegacySaveAndLoad: + @pytest.mark.parametrize(('tp,pp'), [(2, 4)]) + @pytest.mark.flaky + def test_basic_save_load_scenario(self, tmp_path_dist_ckpt, tp, pp): + Utils.initialize_model_parallel(tp, pp) + num_floating_point_operations_so_far = 0 + model, optimizer = setup_model_and_optimizer(1, tp, pp) + opt_param_scheduler = None + + mock_args = SimpleNamespace() + with TempNamedDir(tmp_path_dist_ckpt / "test_legacy") as legacy_ckpt_dir, mock.patch( + 'megatron.training.checkpointing.get_args', new=lambda: mock_args + ), mock.patch("megatron.training.checkpointing.update_num_microbatches"): + init_basic_mock_args(mock_args, tp, pp) + init_checkpointing_mock_args(mock_args, legacy_ckpt_dir) + + save_checkpoint( + 2, model, optimizer, opt_param_scheduler, num_floating_point_operations_so_far, {} + ) + iteration, _ = load_checkpoint(model, optimizer, opt_param_scheduler) + assert iteration == 2 + assert "iter_0000002" in os.listdir(legacy_ckpt_dir) + + Utils.destroy_model_parallel() diff --git a/tests/unit_tests/dist_checkpointing/test_optimizer.py b/tests/unit_tests/dist_checkpointing/test_optimizer.py new file mode 100644 index 0000000000..a3ec2c3c4c --- /dev/null +++ b/tests/unit_tests/dist_checkpointing/test_optimizer.py @@ -0,0 +1,586 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +from copy import deepcopy +from functools import partial +from time import sleep +from types import MethodType, SimpleNamespace +from unittest import mock + +import pytest +import torch +from torch.optim import Adam + +from megatron.core import parallel_state +from megatron.core.dist_checkpointing import ( + ShardedTensor, + load, + load_plain_tensors, + load_tensors_metadata, + save, +) +from megatron.core.dist_checkpointing.dict_utils import diff, nested_values +from megatron.core.dist_checkpointing.optimizer import ( + get_param_id_to_sharded_param_map, + optim_state_to_sharding_state, +) +from megatron.core.dist_checkpointing.serialization import get_default_save_sharded_strategy +from megatron.core.dist_checkpointing.strategies.fully_parallel import ( + FullyParallelSaveStrategyWrapper, +) +from megatron.core.dist_checkpointing.utils import extract_sharded_tensors +from megatron.core.tensor_parallel import model_parallel_cuda_manual_seed +from megatron.core.transformer import TransformerConfig +from megatron.core.transformer.mlp import apply_swiglu_sharded_factory +from megatron.training.checkpointing import load_checkpoint, save_checkpoint +from tests.unit_tests.dist_checkpointing import ( + TempNamedDir, + init_basic_mock_args, + init_checkpointing_mock_args, + initialize_gpt_model, + setup_model_and_optimizer, + setup_moe_model_and_optimizer, +) +from tests.unit_tests.test_utilities import Utils + + +class Model(torch.nn.Module): + def __init__(self): + super().__init__() + self.conv = torch.nn.Conv1d(8, 16, 3) + self.proj = torch.nn.Linear(8, 5) + self.config = TransformerConfig(hidden_size=8, num_attention_heads=1, num_layers=1) + + def sharded_state_dict(self): + sharded_state_dict = self.state_dict(keep_vars=True) + # conv + sharded_state_dict['conv.weight'] = ShardedTensor.from_rank_offsets( + 'conv.weight', + sharded_state_dict['conv.weight'], + ( + 1, + parallel_state.get_tensor_model_parallel_rank(), + parallel_state.get_tensor_model_parallel_world_size(), + ), + ) + # bias is non-sharded + sharded_state_dict['conv.bias'] = 
ShardedTensor.from_rank_offsets( + 'conv.bias', sharded_state_dict['conv.bias'] + ) + + # proj + sharded_state_dict['proj.weight'] = ShardedTensor.from_rank_offsets( + 'proj.weight', sharded_state_dict['proj.weight'], (0, Utils.rank, Utils.world_size) + ) + sharded_state_dict['proj.bias'] = ShardedTensor.from_rank_offsets( + 'proj.bias', sharded_state_dict['proj.bias'], (0, Utils.rank, Utils.world_size) + ) + return sharded_state_dict + + +class SwigluFactoryModel(torch.nn.Module): + def __init__(self): + super().__init__() + self.linear = torch.nn.Linear( + 5, 64 // parallel_state.get_tensor_model_parallel_world_size(), bias=False + ) + self.config = TransformerConfig(hidden_size=8, num_attention_heads=1, num_layers=1) + + def sharded_state_dict(self): + sharded_state_dict = self.state_dict(keep_vars=True) + sharded_state_dict['linear.weight'] = ShardedTensor.from_rank_offsets( + 'linear.weight', + sharded_state_dict['linear.weight'], + ( + ( + 0, + parallel_state.get_tensor_model_parallel_rank(), + parallel_state.get_tensor_model_parallel_world_size(), + ) + ), + replica_id=( + ( + parallel_state.get_pipeline_model_parallel_rank(), + 0, + parallel_state.get_data_parallel_rank(with_context_parallel=True), + ) + ), + ) + sharded_state_dict['linear.weight'] = apply_swiglu_sharded_factory( + sharded_state_dict['linear.weight'], () + ) + return sharded_state_dict + + +class TestOptimizer: + def setup_method(self, method): + pass + + def teardown_method(self, method): + Utils.destroy_model_parallel() + + def test_optimizer_params(self, tmp_path_dist_ckpt): + Utils.initialize_model_parallel(1, 1) + model = Model() + # Force optimizer state initialization + for p in model.parameters(): + p.grad = torch.ones_like(p.data) + optim = Adam(model.parameters()) + optim.step() + + model_state_dict = model.sharded_state_dict() + param_map = get_param_id_to_sharded_param_map( + model_state_dict, optim.param_groups[0]['params'] + ) + optim_state_dict = optim.state_dict() + optim_state_to_sharding_state(optim_state_dict, param_map, exclude_keys=('step',)) + + optim_sharded_tensors = nested_values(extract_sharded_tensors(optim_state_dict)[0]) + optim_sharded_keys = {sh_ten.key for sh_ten in optim_sharded_tensors} + assert len(optim_sharded_keys) == 2 * len(model_state_dict) + assert optim_sharded_keys == set( + [ + f'optimizer.state.{state_key}.{layer_name}' + for state_key in ['exp_avg', 'exp_avg_sq'] + for layer_name in model_state_dict + ] + ) + + +def initialize_small_model(pre_process=True, post_process=True, seed=0, **config_kwargs): + torch.manual_seed(seed) + model_parallel_cuda_manual_seed(seed) + + return SwigluFactoryModel() + + +def load_checkpoint_no_arg_checks(*args, **kwargs): + with mock.patch('megatron.training.checkpointing.check_checkpoint_args'): + with mock.patch('megatron.training.checkpointing.update_num_microbatches'): + return load_checkpoint(*args, **kwargs) + + +class TestDistributedOptimizer: + def setup_method(self, method): + pass + + def teardown_method(self, method): + Utils.destroy_model_parallel() + + @pytest.mark.parametrize("initialize_fn", [initialize_small_model, initialize_gpt_model]) + @pytest.mark.parametrize("use_fpsl", [False, True]) + # TODO: changing DP doesn't work in unit tests because of NCCL crashes + @pytest.mark.parametrize( + "tp_pp,src_dp,dest_dp", + [ + ((4, 1), 2, 2), + # ((1, 1), 8, 1), + # ((1, 1), 1, 8), + # ((2, 1), 2, 1), + # ((2, 1), 2, 2), + ], + ) + @pytest.mark.flaky + def test_dp_sharding(self, tmp_path_dist_ckpt, tp_pp, src_dp, dest_dp, 
use_fpsl, initialize_fn): + src_world_size = tp_pp[0] * tp_pp[1] * src_dp + dest_world_size = tp_pp[0] * tp_pp[1] * dest_dp + assert src_world_size <= Utils.world_size, (tp_pp, src_dp) + assert dest_world_size <= Utils.world_size, (tp_pp, dest_dp) + + sharding_type = 'fully_sharded_model_space' if use_fpsl else 'dp_zero_gather_scatter' + + Utils.initialize_model_parallel(*tp_pp) + + # sync=True to make sure other ranks wait for rank 0 to finish creating directory. + with TempNamedDir(tmp_path_dist_ckpt / 'test_dp_sharding', sync=True) as ckpt_dir: + try: + Utils.set_world_size(src_world_size) + if Utils.rank >= 0: + # Save checkpoint A + model, optimizer_A = setup_model_and_optimizer( + seed=2, tp=tp_pp[0], pp=tp_pp[1], initialize_fn=initialize_fn + ) + + save_strategy = get_default_save_sharded_strategy() + if use_fpsl: + save_strategy = FullyParallelSaveStrategyWrapper( + save_strategy, + parallel_state.get_data_parallel_group(with_context_parallel=True), + True, + ) + save( + optimizer_A.sharded_state_dict( + model[0].sharded_state_dict(), sharding_type=sharding_type + ), + ckpt_dir, + save_strategy, + ) + optim_param_state_A = optimizer_A.get_parameter_state_dp_zero() + Utils.destroy_model_parallel() + else: + # this prevents NCCL errors when changing DP. TODO: fix it properly + sleep(20) + + # Load checkpoint A with different TP/PP and save as checkpoint B + Utils.set_world_size(dest_world_size) + if Utils.rank == 0: + print('_____________________') + if Utils.rank >= 0: + Utils.initialize_model_parallel(*tp_pp) + + model, optimizer_B = setup_model_and_optimizer( + seed=3, tp=tp_pp[0], pp=tp_pp[1], initialize_fn=initialize_fn + ) + optim_param_state_B = optimizer_B.get_parameter_state_dp_zero() + diffs = diff(optim_param_state_A, optim_param_state_B) + # Expect a mismatch in values - diffs[2] nonempty + if parallel_state.get_data_parallel_rank(with_context_parallel=True) == 0: + assert not diffs[0] and not diffs[1] and diffs[2], diffs + + sharded_state_dict = optimizer_B.sharded_state_dict( + model[0].sharded_state_dict(), is_loading=True, sharding_type=sharding_type + ) + optim_state_dict = load(sharded_state_dict, ckpt_dir) + optimizer_B.load_state_dict(optim_state_dict) + optim_param_state_B = optimizer_B.get_parameter_state_dp_zero() + + # Test both param state dicts are equal + diffs = diff(optim_param_state_A, optim_param_state_B) + assert not any(map(bool, diffs)), diffs + + else: + # this prevents NCCL errors when changing DP. TODO: fix it properly + sleep(20) + finally: + Utils.set_world_size() + + @pytest.mark.parametrize( + ('src_tp_pp', 'dest_tp_pp', 'use_glu'), + [((2, 2), (2, 4), False), ((1, 8), (4, 1), True), ((2, 4), (4, 2), False)], + ) + @pytest.mark.flaky + def test_finetune_doesnt_load_optimizer( + self, tmp_path_dist_ckpt, src_tp_pp, dest_tp_pp, use_glu + ): + # sync=True to make sure other ranks wait for rank 0 to finish creating directory. 
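+ # Scenario: save a checkpoint with the source TPxPP, then reload it with a different TPxPP.
+ # A plain load should fail with a "(TP, PP) mismatch", while the `finetune` / `no_load_optim`
+ # paths should load model weights only and leave the optimizer state untouched.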
+ Utils.initialize_model_parallel(*src_tp_pp) + with TempNamedDir( + tmp_path_dist_ckpt / 'test_finetune_doesnt_load_optimizer', sync=True + ) as ckpt_dir: + mock_args = SimpleNamespace() + with mock.patch('megatron.training.checkpointing.get_args', new=lambda: mock_args): + init_basic_mock_args(mock_args, tp=src_tp_pp[0], pp=src_tp_pp[1]) + init_checkpointing_mock_args(mock_args, ckpt_dir, False) + + model, optimizer = setup_model_and_optimizer( + seed=2, + tp=src_tp_pp[0], + pp=src_tp_pp[1], + initialize_fn=partial(initialize_gpt_model, use_glu=use_glu), + ) + + save_checkpoint(10, model, optimizer, None, 0) + Utils.destroy_model_parallel() + + Utils.initialize_model_parallel(*dest_tp_pp) + model, optimizer = setup_model_and_optimizer( + seed=3, + tp=dest_tp_pp[0], + pp=dest_tp_pp[1], + initialize_fn=partial(initialize_gpt_model, use_glu=use_glu), + ) + model_unloaded_state_dict = deepcopy(model[0].state_dict()) + optim_unloaded_state_dict = deepcopy(optimizer.state_dict()) + + # Load with different TPxPP should raise DistributeOptimizer error + with pytest.raises(RuntimeError) as exc_info: + load_checkpoint_no_arg_checks(model, optimizer, None) + assert "(TP, PP) mismatch" in str(exc_info.value) + + # Check that the state didn't change + assert not any(diff(model[0].state_dict(), model_unloaded_state_dict)) + assert not any(diff(optimizer.state_dict(), optim_unloaded_state_dict)) + + # Now test the same with a `finetune` flag + mock_args.finetune = True + load_checkpoint_no_arg_checks(model, optimizer, None) + + # Model weights should be different, but optimizer state is unchanged + diffs = diff(model[0].state_dict(), model_unloaded_state_dict) + # diffs[0] and diffs[1] is structural diff, diffs[2] is values diff - + # we expect only values diff + assert not diffs[0] and not diffs[1] and diffs[2] + assert not any(diff(optimizer.state_dict(), optim_unloaded_state_dict)) + + # ... or `no_load_optim` flag + model, optimizer = setup_model_and_optimizer( + seed=3, + tp=dest_tp_pp[0], + pp=dest_tp_pp[1], + initialize_fn=partial(initialize_gpt_model, use_glu=use_glu), + ) + mock_args.finetune = False + mock_args.no_load_optim = True + mock_args.no_load_rng = True + load_checkpoint_no_arg_checks(model, optimizer, None) + + # Model weights should be different, but optimizer state is unchanged + diffs = diff(model[0].state_dict(), model_unloaded_state_dict) + # diffs[0] and diffs[1] is structural diff, diffs[2] is values diff - + # we expect only values diff + assert not diffs[0] and not diffs[1] and diffs[2] + assert not any(diff(optimizer.state_dict(), optim_unloaded_state_dict)) + + @pytest.mark.flaky + def test_can_load_deprecated_bucket_space_format(self, tmp_path_dist_ckpt): + # sync=True to make sure other ranks wait for rank 0 to finish creating directory. 
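+ # Scenario: monkey-patch the optimizer to save its state in the older
+ # 'fully_sharded_bucket_space' layout, check that the checkpoint metadata actually uses that
+ # layout, then make sure the current load path can still read it.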
+ tp = 4 + pp = 2 + + Utils.initialize_model_parallel(tp, pp) + with TempNamedDir( + tmp_path_dist_ckpt / 'test_can_load_deprecated_bucket_space_format', sync=True + ) as ckpt_dir: + mock_args = SimpleNamespace() + with mock.patch('megatron.training.checkpointing.get_args', new=lambda: mock_args): + + init_basic_mock_args(mock_args, tp=tp, pp=pp) + init_checkpointing_mock_args(mock_args, ckpt_dir, True) + + model, optimizer = setup_model_and_optimizer( + seed=2, tp=tp, pp=pp, initialize_fn=initialize_gpt_model + ) + + # Mock optimizer sharded_state_dict so that it ignores the externally + # passed sharding_type and uses 'fully_sharded_bucket_space' instead + orig_optim_sharded_state_dict_fn = optimizer.sharded_state_dict + + def sharded_state_dict_bucket_space( + self, *args, sharding_type: str = 'fully_sharded_model_space', **kwargs + ): + return orig_optim_sharded_state_dict_fn( + *args, sharding_type='fully_sharded_bucket_space', **kwargs + ) + + optimizer.sharded_state_dict = MethodType( + sharded_state_dict_bucket_space, optimizer + ) + save_checkpoint(10, model, optimizer, None, 0) + + flag = 0 + key_list = [] + torch.distributed.barrier() + if Utils.rank == 0: + sharded_metadata = load_tensors_metadata(ckpt_dir / 'iter_0000010') + key_list = list(sharded_metadata.keys()) + # Check if actually using `fully_parallel_bucket_space` format. + key = ( + "optimizer.distributed.dp_group_idx_0.gbuf_idx_0.dtype_" + "(torch.bfloat16, torch.bfloat16).bucket_idx_0.exp_avg_sq" + ) + if key in key_list: + flag = 1 + + tensor = torch.tensor([flag], dtype=torch.long, device='cuda') + torch.distributed.broadcast(tensor, 0) + flag = tensor[0].item() + assert flag == 1, key_list + + optimizer.sharded_state_dict = orig_optim_sharded_state_dict_fn + load_checkpoint_no_arg_checks(model, optimizer, None) + + +class TestFP32Optimizer: + def setup_method(self, method): + pass + + def teardown_method(self, method): + Utils.destroy_model_parallel() + + @pytest.mark.parametrize( + ('src_tp_pp', 'dest_tp_pp'), [((2, 4), (2, 4)), ((2, 4), (4, 2)), ((8, 1), (1, 2))] + ) + @pytest.mark.flaky + def test_fp32_optimizer_resharding(self, tmp_path_dist_ckpt, src_tp_pp, dest_tp_pp): + # sync=True to make sure other ranks wait for rank 0 to finish creating directory. 
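+ # Scenario: save the FP32 optimizer state with the source TPxPP, reload and re-save it with
+ # the destination TPxPP, then compare the two checkpoints tensor-by-tensor via
+ # load_plain_tensors to confirm that resharding preserved all values.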
+ Utils.initialize_model_parallel(*src_tp_pp) + with TempNamedDir( + tmp_path_dist_ckpt / 'test_fp32_optimizer_state_dict_A', sync=True + ) as ckpt_dir_A: + with TempNamedDir( + tmp_path_dist_ckpt / 'test_fp32_optimizer_state_dict_B', sync=True + ) as ckpt_dir_B: + + model_A, optimizer_A = setup_model_and_optimizer( + seed=2, + tp=src_tp_pp[0], + pp=src_tp_pp[1], + initialize_fn=initialize_small_model, + bf16=False, + ) + + save(optimizer_A.sharded_state_dict(model_A[0].sharded_state_dict()), ckpt_dir_A) + Utils.destroy_model_parallel() + + # Load checkpoint A with different TP/PP and save as checkpoint B + Utils.initialize_model_parallel(*dest_tp_pp) + model_B, optimizer_B = setup_model_and_optimizer( + seed=3, + tp=dest_tp_pp[0], + pp=dest_tp_pp[1], + initialize_fn=initialize_small_model, + bf16=False, + ) + load_sharded_state_dict = optimizer_B.sharded_state_dict( + model_B[0].sharded_state_dict() + ) + state_dict = load(load_sharded_state_dict, ckpt_dir_A) + + optimizer_B.load_state_dict(state_dict) + save(optimizer_B.sharded_state_dict(model_B[0].sharded_state_dict()), ckpt_dir_B) + Utils.destroy_model_parallel() + + # Test both checkpoints are equal + Utils.initialize_model_parallel(1, 1) + plain_state_dict_A = load_plain_tensors(ckpt_dir_A) + plain_state_dict_B = load_plain_tensors(ckpt_dir_B) + diffs = diff(plain_state_dict_A, plain_state_dict_B) + assert not any(map(bool, diffs)), diffs + + +class TestOptimizerResharding: + def setup_method(self, method): + pass + + def teardown_method(self, method): + Utils.destroy_model_parallel() + + @pytest.mark.parametrize( + ('use_dist_opt', 'bf16'), + ( + (False, True), # regular BF16 + (True, True), # DistOpt BF16 + # (False, False), # FP32 + ), + ) + @pytest.mark.parametrize( + ('src_tp_pp', 'dest_tp_pp'), + [((2, 4), (2, 4)), ((2, 4), (2, 2)), ((2, 4), (4, 2)), ((8, 1), (1, 2))], + ) + @pytest.mark.flaky + def test_optimizer_resharding( + self, tmp_path_dist_ckpt, src_tp_pp, dest_tp_pp, use_dist_opt, bf16 + ): + Utils.initialize_model_parallel(*src_tp_pp) + with TempNamedDir( + tmp_path_dist_ckpt / 'test_fp32_optimizer_state_dict_A', sync=False + ) as ckpt_dir_A: + with TempNamedDir( + tmp_path_dist_ckpt / 'test_fp32_optimizer_state_dict_B', sync=False + ) as ckpt_dir_B: + + model_A, optimizer_A = setup_model_and_optimizer( + seed=2, tp=src_tp_pp[0], pp=src_tp_pp[1], bf16=bf16, dist_opt=use_dist_opt + ) + + save(optimizer_A.sharded_state_dict(model_A[0].sharded_state_dict()), ckpt_dir_A) + Utils.destroy_model_parallel() + + # Load checkpoint A with different TP/PP and save as checkpoint B + Utils.initialize_model_parallel(*dest_tp_pp) + model_B, optimizer_B = setup_model_and_optimizer( + seed=3, tp=dest_tp_pp[0], pp=dest_tp_pp[1], bf16=bf16, dist_opt=use_dist_opt + ) + load_sharded_state_dict = optimizer_B.sharded_state_dict( + model_B[0].sharded_state_dict() + ) + state_dict = load(load_sharded_state_dict, ckpt_dir_A) + + optimizer_B.load_state_dict(state_dict) + save(optimizer_B.sharded_state_dict(model_B[0].sharded_state_dict()), ckpt_dir_B) + Utils.destroy_model_parallel() + + # Test both checkpoints are equal + Utils.initialize_model_parallel(1, 1) + plain_state_dict_A = load_plain_tensors(ckpt_dir_A) + plain_state_dict_B = load_plain_tensors(ckpt_dir_B) + diffs = diff(plain_state_dict_A, plain_state_dict_B) + assert not any(map(bool, diffs)), diffs + + @pytest.mark.parametrize(('use_dist_opt', 'bf16'), ((True, True),)) # DistOpt BF16 + @pytest.mark.parametrize(('use_te', 'use_grouped_mlp'), ((False, False), (False, True))) + 
@pytest.mark.parametrize('use_glu', [False, True]) + @pytest.mark.parametrize( + ('src_tp_pp_exp', 'dest_tp_pp_exp'), + [ + ((2, 2, 2), (2, 2, 2)), + ((4, 1, 2), (1, 2, 2)), + ((1, 1, 2), (1, 1, 4)), + ((2, 1, 2), (1, 1, 8)), + ], + ) + @pytest.mark.flaky + def test_chained_optimizer_resharding( + self, + tmp_path_dist_ckpt, + src_tp_pp_exp, + dest_tp_pp_exp, + use_dist_opt, + bf16, + use_te, + use_grouped_mlp, + use_glu, + ): + src_tp, src_pp, src_exp = src_tp_pp_exp + dest_tp, dest_pp, dest_exp = dest_tp_pp_exp + with TempNamedDir( + tmp_path_dist_ckpt / 'test_fp32_optimizer_state_dict_A', sync=False + ) as ckpt_dir_A: + with TempNamedDir( + tmp_path_dist_ckpt / 'test_fp32_optimizer_state_dict_B', sync=False + ) as ckpt_dir_B: + Utils.initialize_model_parallel(src_tp, src_pp, expert_model_parallel_size=src_exp) + model_A, optimizer_A = setup_moe_model_and_optimizer( + seed=2, + tp=src_tp, + pp=src_pp, + ep=src_exp, + bf16=bf16, + dist_opt=use_dist_opt, + use_te=use_te, + use_grouped_mlp=use_grouped_mlp, + use_glu=use_glu, + ) + + save(optimizer_A.sharded_state_dict(model_A[0].sharded_state_dict()), ckpt_dir_A) + Utils.destroy_model_parallel() + + # Load checkpoint A with different TP/PP and save as checkpoint B + Utils.initialize_model_parallel( + dest_tp, dest_pp, expert_model_parallel_size=dest_exp + ) + model_B, optimizer_B = setup_moe_model_and_optimizer( + seed=3, + tp=dest_tp, + pp=dest_pp, + ep=dest_exp, + bf16=bf16, + dist_opt=use_dist_opt, + use_te=use_te, + use_grouped_mlp=use_grouped_mlp, + use_glu=use_glu, + ) + load_sharded_state_dict = optimizer_B.sharded_state_dict( + model_B[0].sharded_state_dict() + ) + state_dict = load(load_sharded_state_dict, ckpt_dir_A) + + optimizer_B.load_state_dict(state_dict) + save(optimizer_B.sharded_state_dict(model_B[0].sharded_state_dict()), ckpt_dir_B) + Utils.destroy_model_parallel() + + # Test both checkpoints are equal + Utils.initialize_model_parallel(1, 1) + plain_state_dict_A = load_plain_tensors(ckpt_dir_A) + plain_state_dict_B = load_plain_tensors(ckpt_dir_B) + diffs = diff(plain_state_dict_A, plain_state_dict_B) + assert not any(map(bool, diffs)), diffs + Utils.destroy_model_parallel() diff --git a/tests/unit_tests/dist_checkpointing/test_serialization.py b/tests/unit_tests/dist_checkpointing/test_serialization.py new file mode 100644 index 0000000000..19e99de553 --- /dev/null +++ b/tests/unit_tests/dist_checkpointing/test_serialization.py @@ -0,0 +1,693 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
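+# Unit tests for the core save/load serialization path: single- and multi-process saves,
+# resharding on load after a TPxPP change, tensor metadata loading, mixing ShardedTensors with
+# factories, load error reporting, ShardedObject round-trips, shape-mismatch handling and
+# non-strict loading.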
+ +import io +import logging + +import numpy as np +import pytest +import torch +from torch.distributed.checkpoint import CheckpointException as PyTCheckpointingException + +from megatron.core import parallel_state +from megatron.core.dist_checkpointing import ShardedTensor, load, save +from megatron.core.dist_checkpointing.core import CheckpointingException, maybe_load_config +from megatron.core.dist_checkpointing.dict_utils import diff +from megatron.core.dist_checkpointing.mapping import ShardedObject, ShardedTensorFactory +from megatron.core.dist_checkpointing.serialization import ( + load_sharded_metadata, + load_tensors_metadata, +) +from megatron.core.dist_checkpointing.strategies.base import StrategyAction, get_default_strategy +from megatron.core.dist_checkpointing.validation import StrictHandling +from tests.unit_tests.dist_checkpointing import TempNamedDir +from tests.unit_tests.test_utilities import Utils + + +class TestSerialization: + def setup_method(self, method): + pass + + def teardown_method(self, method): + Utils.destroy_model_parallel() + + def test_single_process_save_load(self, tmp_path_dist_ckpt): + Utils.initialize_model_parallel(1, 1) + + sharded_state_dict = { + 'sd_keyA': ShardedTensor.from_rank_offsets( + 'keyA', torch.ones(2, 4), replica_id=Utils.rank + ), + 'sd_keyB': ShardedTensor.from_rank_offsets( + 'keyB', torch.ones(3, 5, 7), replica_id=Utils.rank + ), + } + + # sync=True to make sure other ranks wait for rank 0 to finish creating directory. + with TempNamedDir( + tmp_path_dist_ckpt / 'test_single_process_save_load', sync=True + ) as ckpt_dir: + save(sharded_state_dict, ckpt_dir) + torch.distributed.barrier() + + saved_config = maybe_load_config(ckpt_dir) + if saved_config.sharded_backend == 'zarr': + assert (ckpt_dir / 'keyA').is_dir() + assert (ckpt_dir / 'keyB').is_dir() + assert not (ckpt_dir / 'keyC').exists() + assert not (ckpt_dir / 'sd_keyA').is_dir() + + load_ssd = { + 'load_sd_keyA': ShardedTensor.from_rank_offsets( + 'keyA', torch.ones(2, 4), replica_id=Utils.rank + ) + } + loaded_state_dict = load(load_ssd, ckpt_dir) + + assert set(loaded_state_dict.keys()) == {'load_sd_keyA'} + assert isinstance(loaded_state_dict['load_sd_keyA'], torch.Tensor) + assert loaded_state_dict['load_sd_keyA'].shape == (2, 4) + + Utils.destroy_model_parallel() + + def test_multi_process_save(self, tmp_path_dist_ckpt): + Utils.initialize_model_parallel(2, 4) + + state_dict = { + 'sd_keyA': ShardedTensor.from_rank_offsets( + 'keyA', torch.ones(2, 4), (0, Utils.rank, Utils.world_size) + ), + 'sd_keyB': ShardedTensor.from_rank_offsets( + 'keyB', torch.ones(3, 5, 7), (2, Utils.rank, Utils.world_size) + ), + } + + # sync=True to make sure other ranks wait for rank 0 to finish creating directory. 
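+ # Each rank contributes one disjoint shard: keyA is fragmented along dim 0 (global shape
+ # (2 * world_size, 4)) and keyB along dim 2 (global shape (3, 5, 7 * world_size)).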
+ with TempNamedDir(tmp_path_dist_ckpt / 'test_multi_process_save', sync=True) as ckpt_dir: + save(state_dict, ckpt_dir) + + saved_config = maybe_load_config(ckpt_dir) + if saved_config.sharded_backend == 'zarr': + assert (ckpt_dir / 'keyA').is_dir() + assert (ckpt_dir / 'keyB').is_dir() + assert not (ckpt_dir / 'keyC').exists() + assert not (ckpt_dir / 'sd_keyA').is_dir() + + Utils.destroy_model_parallel() + + def test_partition_change_save_load(self, tmp_path_dist_ckpt, strategy=None): + Utils.initialize_model_parallel(2, 4) + + # ten_a: global shape (2, 4): + ten_a_global = torch.tensor([[0, 1, 2, 3], [10, 11, 12, 13]]) + ten_a = ( + torch.zeros(1, 1) + + 10 * parallel_state.get_tensor_model_parallel_rank() + + parallel_state.get_pipeline_model_parallel_rank() + ) + assert ten_a.shape == (1, 1) + + # ten_b: global shape (4, 5, 80), where (x, y, z) is (100x + z) + ten_b = torch.zeros(4, 5, 10) + (torch.arange(10) + 10 * Utils.rank) + ten_b += torch.arange(4).unsqueeze(-1).unsqueeze(-1) * 100 + assert ten_b.shape == (4, 5, 10) + + state_dict = { + 'sd_keyA': ShardedTensor.from_rank_offsets( + 'keyA', + ten_a, + ( + 0, + parallel_state.get_tensor_model_parallel_rank(), + parallel_state.get_tensor_model_parallel_world_size(), + ), + ( + 1, + parallel_state.get_pipeline_model_parallel_rank(), + parallel_state.get_pipeline_model_parallel_world_size(), + ), + replica_id=0, + ), + 'sd_keyB': ShardedTensor.from_rank_offsets( + 'keyB', ten_b, (2, Utils.rank, Utils.world_size) + ), + } + + ten_a_global_shape = ten_a_global.shape + ten_b_global_shape = (4, 5, 10 * 8) + + assert state_dict['sd_keyA'].local_shape == (1, 1) + assert state_dict['sd_keyA'].global_shape == ten_a_global_shape + assert state_dict['sd_keyB'].global_shape == ten_b_global_shape + + # sync=True to make sure other ranks wait for rank 0 to finish creating directory. 
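+ # The checkpoint is written under the current TP=2, PP=4 sharding; it is then read back
+ # first as full (unsharded) tensors and, after switching to TP=1, PP=2, under a different
+ # sharding layout, verifying that tensors are resharded on load.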
+ with TempNamedDir( + tmp_path_dist_ckpt / 'test_partition_change_save_load', sync=True + ) as ckpt_dir: + save(state_dict, ckpt_dir, strategy) + + del ten_a, ten_b + + # without changing TPxPP, load tensors without any sharding + load_sd = { + 'sd_keyA': ShardedTensor.from_rank_offsets( + 'keyA', torch.empty(ten_a_global_shape), replica_id=Utils.rank + ), + 'sd_keyB': ShardedTensor.from_rank_offsets( + 'keyB', torch.empty(ten_b_global_shape), replica_id=Utils.rank + ), + } + loaded_state_dict = load(load_sd, ckpt_dir) + + ten_a = loaded_state_dict['sd_keyA'] + ten_b = loaded_state_dict['sd_keyB'] + assert isinstance(ten_a, torch.Tensor) + assert ten_a.shape == ten_a_global_shape + assert torch.all(ten_a == ten_a_global) + + assert isinstance(ten_b, torch.Tensor) + assert ten_b.shape == ten_b_global_shape + assert np.all( + [ + val == 100 * x + z + for x, x_row in enumerate(ten_b) + for y, y_row in enumerate(x_row) + for z, val in enumerate(y_row) + ] + ) + + del ten_a, ten_b + + # change TPxPP + Utils.destroy_model_parallel() + Utils.initialize_model_parallel(1, 2) + + load_sd = { + 'sd_keyA': ShardedTensor.from_rank_offsets( + 'keyA', + torch.empty(2, 1), + ( + 1, + parallel_state.get_data_parallel_rank(), + parallel_state.get_data_parallel_world_size(), + ), + replica_id=parallel_state.get_pipeline_model_parallel_rank(), + ), + 'sd_keyB': ShardedTensor.from_rank_offsets( + 'keyB', + torch.empty(5, 80), + (0, Utils.rank // 2, 4), + prepend_axis_num=1, + replica_id=Utils.rank % 2, + ), + } + + loaded_state_dict = load(load_sd, ckpt_dir) + ten_a = loaded_state_dict['sd_keyA'] + ten_b = loaded_state_dict['sd_keyB'] + + assert isinstance(ten_a, torch.Tensor) + assert ten_a.shape == (2, 1) + assert torch.all( + ten_a[:, 0] == ten_a_global[:, parallel_state.get_data_parallel_rank()] + ) + + assert isinstance(ten_b, torch.Tensor) + assert ten_b.shape == (5, 10 * 8) + assert torch.all( + ten_b == torch.arange(80).unsqueeze(0).expand(5, 80) + Utils.rank // 2 * 100 + ) + + def test_load_tensors_metadata(self, tmp_path_dist_ckpt): + Utils.initialize_model_parallel(2, 4) + + state_dict = { + 'sd_keyA': ShardedTensor.from_rank_offsets( + 'keyA', torch.arange(10) + Utils.rank * 10, (0, Utils.rank, Utils.world_size) + ), + 'sd_keyB': ShardedTensor.from_rank_offsets( + 'keyB', torch.ones(3, 5, 7), (2, Utils.rank, Utils.world_size) + ), + } + + # sync=True to make sure other ranks wait for rank 0 to finish creating directory. + with TempNamedDir(tmp_path_dist_ckpt / 'test_load_tensors_metadata', sync=True) as ckpt_dir: + save(state_dict, ckpt_dir) + + del state_dict + sharded_state_dict = load_tensors_metadata(ckpt_dir) + # loaded dict keys are ShardedTensor keys! + assert 'keyA' in sharded_state_dict + assert 'sd_keyA' not in sharded_state_dict + + # Check metadata + assert sharded_state_dict['keyA'].global_shape == (10 * Utils.world_size,) + assert sharded_state_dict['keyB'].global_shape == (3, 5, 7 * Utils.world_size) + assert sharded_state_dict['keyA'].local_shape == sharded_state_dict['keyA'].global_shape + assert sharded_state_dict['keyB'].local_shape == sharded_state_dict['keyB'].global_shape + assert sharded_state_dict['keyA'].global_offset == (0,) + assert sharded_state_dict['keyB'].global_offset == (0, 0, 0) + assert sharded_state_dict['keyA'].axis_fragmentations == (1,) + assert sharded_state_dict['keyB'].axis_fragmentations == (1, 1, 1) + assert sharded_state_dict['keyA'].replica_id == 0 + assert sharded_state_dict['keyB'].replica_id == 0 + + # metadata dict can be loaded. 
We don't validate access because there are multiple replica_id=0 + state_dict = load(sharded_state_dict, ckpt_dir, validate_access_integrity=False) + assert torch.all(state_dict['keyA'] == torch.arange(10 * Utils.world_size)) + + Utils.destroy_model_parallel() + + def test_can_mix_sharded_tensors_and_factories(self, tmp_path_dist_ckpt): + Utils.initialize_model_parallel(1, 1) + + def _build_fn(key, tensor, replica_id, flattened_range): + assert flattened_range is None + return [ + ShardedTensor.from_rank_offsets(key + 'part1', tensor, replica_id=replica_id), + ShardedTensor.from_rank_offsets(key + 'part2', tensor, replica_id=replica_id), + ShardedTensor.from_rank_offsets(key + 'part3', tensor, replica_id=replica_id), + ] + + # state dict can be modified by dist_checkpointing.save, so two copies + def get_sharded_state_dict(base=0): + return { + 'all': [ + ShardedTensor.from_rank_offsets( + 'A', torch.arange(2) + base, replica_id=Utils.rank + ), + ShardedTensor.from_rank_offsets( + 'B', torch.arange(3) + base, replica_id=Utils.rank + ), + ShardedTensor.from_rank_offsets( + 'C', torch.arange(4) + base, replica_id=Utils.rank + ), + ShardedTensorFactory( + 'D', torch.arange(5) + base, _build_fn, sum, replica_id=Utils.rank + ), + ] + } + + # sync=True to make sure other ranks wait for rank 0 to finish creating directory. + with TempNamedDir( + tmp_path_dist_ckpt / 'test_can_mix_sharded_tensors_and_factories', sync=True + ) as ckpt_dir: + save(get_sharded_state_dict(0), ckpt_dir) + loaded_state_dict = load(get_sharded_state_dict(10), ckpt_dir) + + expected_sd = { + 'all': [ + torch.arange(2), + torch.arange(3), + torch.arange(4), + torch.arange(5) * 3, # sum of three parts, as specified in merge_fn + ] + } + diffs = diff(loaded_state_dict, expected_sd) + assert not any(map(bool, diffs)), diffs + + Utils.destroy_model_parallel() + + def test_load_error_msg(self, tmp_path_dist_ckpt): + ckpt_dir_name = 'test_load_error_msg' + Utils.initialize_model_parallel(1, 1) + sh_ten = ShardedTensor.from_rank_offsets('keyA', torch.rand(10), replica_id=Utils.rank) + state_dict = {'some_key': sh_ten} + + # Non-existent directory + non_ex_path = f'/tmp/non-existent-path/{ckpt_dir_name}' + with pytest.raises(CheckpointingException) as exc_info: + load(state_dict, non_ex_path) + assert f'directory {non_ex_path} does not exist' in str(exc_info.value) + + # sync=True to make sure other ranks wait for rank 0 to finish creating directory. + with TempNamedDir(tmp_path_dist_ckpt / ckpt_dir_name, sync=True) as ckpt_dir: + # Empty directory - not a distributed checkpoint + with pytest.raises(CheckpointingException) as exc_info: + load(state_dict, ckpt_dir) + assert f'is not a distributed checkpoint' in str(exc_info.value) + + # Missing Zarr arrays + torch.distributed.barrier() + save(state_dict, ckpt_dir) + sh_ten.key = 'different_key' + with pytest.raises((CheckpointingException, PyTCheckpointingException)) as exc_info: + load(state_dict, ckpt_dir) + assert "different_key" in str(exc_info.value) + + def test_sharded_object_serialization(self, tmp_path_dist_ckpt): + Utils.initialize_model_parallel(1, 1) + # sync=True to make sure other ranks wait for rank 0 to finish creating directory. 
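+        # The test below wraps a torch.save'd BytesIO buffer in a ShardedObject under the sharded key
+        # 'sh_obj_A', saves it, and reloads it through a different state-dict key ('other_key') that
+        # maps to the same sharded key; the payload read back must equal the originally saved dict,
+        # i.e. matching happens on the ShardedObject key rather than on the state-dict key.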
+ with TempNamedDir(tmp_path_dist_ckpt / 'test_sh_obj', sync=True) as ckpt_dir: + state = {'some': 'dict'} + state_serialized = io.BytesIO() + torch.save(state, state_serialized) + state_dict = { + 'some_key': ShardedObject( + 'sh_obj_A', state_serialized, (1,), (0,), replica_id=Utils.rank + ) + } + + save(state_dict, ckpt_dir) + del state, state_serialized, state_dict + other_state = {'other': 'dictionary'} + other_serialized = io.BytesIO() + torch.save(other_state, other_serialized) + state_dict = { + 'other_key': ShardedObject( + 'sh_obj_A', other_serialized, (1,), (0,), replica_id=Utils.rank + ) + } + load_state_dict = load(state_dict, ckpt_dir) + assert 'other_key' in load_state_dict + load_state_dict['other_key'].seek(0) + loaded_state = torch.load(load_state_dict['other_key']) + + assert loaded_state == {'some': 'dict'} + + Utils.destroy_model_parallel() + + def test_tensor_shape_mismatch(self, tmp_path_dist_ckpt): + Utils.initialize_model_parallel(2, 4) + + # Global tensor is just a range(32) repeated twice over the first dimension + local_tensor = torch.arange(4).unsqueeze(0).expand(2, 4) + Utils.rank * 4 + + state_dict = { + 'rigid': ShardedTensor.from_rank_offsets( + 'keyA', local_tensor, (1, Utils.rank, Utils.world_size) + ), + 'flexible': ShardedTensor.from_rank_offsets( + 'keyB', local_tensor, (1, Utils.rank, Utils.world_size), allow_shape_mismatch=True + ), + } + assert state_dict['rigid'].global_shape == (2, 32) + assert state_dict['flexible'].global_shape == (2, 32) + + # sync=True to make sure other ranks wait for rank 0 to finish creating directory. + with TempNamedDir(tmp_path_dist_ckpt / 'test_tensor_shape_mismatch', sync=True) as ckpt_dir: + save(state_dict, ckpt_dir) + + pp_size = parallel_state.get_pipeline_model_parallel_world_size() + pp_rank = parallel_state.get_pipeline_model_parallel_rank() + tp_rank = parallel_state.get_tensor_model_parallel_rank() + + # Smaller coverage than expected (28 < 32) + state_dict = { + 'rigid': ShardedTensor.from_rank_offsets( + 'keyA', torch.ones(2, 7), (1, pp_rank, pp_size), replica_id=tp_rank + ) + } + with pytest.raises((CheckpointingException, PyTCheckpointingException)): + load(state_dict, ckpt_dir) + + state_dict = { + 'flexible': ShardedTensor.from_rank_offsets( + 'keyB', + torch.ones(2, 7), + (1, pp_rank, pp_size), + replica_id=tp_rank, + allow_shape_mismatch=True, + ) + } + loaded_state_dict = load(state_dict, ckpt_dir) + assert torch.all( + loaded_state_dict['flexible'] + == torch.arange(7).unsqueeze(0).expand(2, 7) + pp_rank * 7 + ) + + # Larger coverage than expected (36 > 32) + state_dict = { + 'rigid': ShardedTensor.from_rank_offsets( + 'keyA', torch.ones(2, 9), (1, pp_rank, pp_size), replica_id=tp_rank + ) + } + with pytest.raises((CheckpointingException, PyTCheckpointingException)): + load(state_dict, ckpt_dir) + + state_dict = { + 'flexible': ShardedTensor.from_rank_offsets( + 'keyB', + torch.ones(2, 9), + (1, pp_rank, pp_size), + replica_id=tp_rank, + allow_shape_mismatch=True, + ) + } + loaded_state_dict = load(state_dict, ckpt_dir) + expected_tensor = torch.arange(9).unsqueeze(0).expand(2, 9) + pp_rank * 9 + + if pp_rank >= (32 // 9): + assert pp_rank == 3, pp_rank + expected_tensor[:, 5:] = 0 # padding with 0s + assert torch.all(loaded_state_dict['flexible'] == expected_tensor) + + Utils.destroy_model_parallel() + + +class TestNonStrictLoad: + def setup_method(self, method): + Utils.initialize_model_parallel(2, 4) # doesn't matter for this test + + def teardown_method(self, method): + 
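+        # Tear down the parallel groups created in setup_method so state does not leak across tests.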
Utils.destroy_model_parallel() + + def _get_base_state_dict(self): + return { + 'TenA': ShardedTensor.from_rank_offsets('TenA', torch.arange(2), replica_id=Utils.rank), + 'TenB': ShardedTensor.from_rank_offsets( + 'TenB', torch.arange(3), (0, Utils.rank, Utils.world_size), replica_id=0 + ), + 'TenC': ShardedTensor.from_rank_offsets( + 'TenC', torch.arange(3), replica_id=Utils.world_size - Utils.rank - 1 + ), + 'ObjA': ShardedObject('ObjA', list(range(10)), (1,), (0,), replica_id=Utils.rank), + 'ObjB': ShardedObject( + 'ObjB', {Utils.rank + 7}, (1, Utils.world_size), (0, Utils.rank), replica_id=0 + ), + } + + @pytest.mark.parametrize('save_format', ['zarr', 'torch_dist']) + @pytest.mark.parametrize('validate_integrity', [True, False]) + def test_unexpected_keys_handling_during_validation( + self, caplog, tmp_path_dist_ckpt, validate_integrity, save_format + ): + sharded_state_dict = self._get_base_state_dict() + with TempNamedDir( + tmp_path_dist_ckpt / 'test_unexpected_keys_raises_error_during_validation' + ) as ckpt_dir: + save_strategy = get_default_strategy(StrategyAction.SAVE_SHARDED, save_format, 1) + save(sharded_state_dict, ckpt_dir, save_strategy) + + def load_with_flag(strict): + sharded_state_dict = self._get_base_state_dict() + sharded_state_dict['TenD'] = ShardedTensor.from_rank_offsets( + 'UnexpectedTenD', torch.arange(3), replica_id=Utils.rank + ) + sharded_state_dict['ObjD'] = ShardedObject( + 'UnexpectedObjD', None, (1,), (0,), replica_id=Utils.rank + ) + return load( + sharded_state_dict, + ckpt_dir, + validate_access_integrity=validate_integrity, + strict=strict, + ) + + def test_error(error_msg): + assert 'Unexpected keys' in error_msg + assert 'UnexpectedTenD' in error_msg + assert 'UnexpectedObjD' in error_msg + assert 'Missing keys' not in error_msg + + # ASSUME_OK_UNEXPECTED results in an exception raised by the underlying strategy + with pytest.raises( + PyTCheckpointingException if save_format == 'torch_dist' else CheckpointingException + ) as exc_info: + load_with_flag(StrictHandling.ASSUME_OK_UNEXPECTED) + # Informative exceptions with `RAISE_*` options: + with pytest.raises(CheckpointingException) as exc_info: + load_with_flag(StrictHandling.RAISE_UNEXPECTED) + test_error(str(exc_info.value)) + with pytest.raises(CheckpointingException) as exc_info: + load_with_flag(StrictHandling.RAISE_ALL) + test_error(str(exc_info.value)) + + # Logged mismatches: + with caplog.at_level(logging.WARNING): + loaded_state_dict = load_with_flag(StrictHandling.LOG_UNEXPECTED) + assert 'TenA' in loaded_state_dict + test_error(caplog.text) + with caplog.at_level(logging.WARNING): + loaded_state_dict = load_with_flag(StrictHandling.LOG_ALL) + assert 'TenA' in loaded_state_dict + test_error(caplog.text) + + # Returned mismatches + loaded_state_dict, missing_keys, unexpected_keys = load_with_flag( + StrictHandling.RETURN_UNEXPECTED + ) + assert 'TenA' in loaded_state_dict + assert unexpected_keys == {'UnexpectedTenD', 'UnexpectedObjD'} + assert missing_keys == set() + loaded_state_dict, missing_keys, unexpected_keys = load_with_flag( + StrictHandling.RETURN_ALL + ) + assert 'TenA' in loaded_state_dict + assert unexpected_keys == {'UnexpectedTenD', 'UnexpectedObjD'} + assert missing_keys == set() + + # Ignore mismatch + loaded_state_dict = load_with_flag(StrictHandling.IGNORE_ALL) + assert 'TenA' in loaded_state_dict + + @pytest.mark.parametrize('save_format', ['zarr', 'torch_dist']) + @pytest.mark.parametrize('validate_integrity', [True, False]) + def 
test_missing_keys_raises_error_during_validation( + self, caplog, tmp_path_dist_ckpt, validate_integrity, save_format + ): + sharded_state_dict = self._get_base_state_dict() + with TempNamedDir( + tmp_path_dist_ckpt / 'test_missing_keys_raises_error_during_validation' + ) as ckpt_dir: + save_strategy = get_default_strategy(StrategyAction.SAVE_SHARDED, save_format, 1) + save(sharded_state_dict, ckpt_dir, save_strategy) + + def load_with_flag(strict): + sharded_state_dict = self._get_base_state_dict() + del sharded_state_dict['TenA'] + del sharded_state_dict['ObjB'] + return load( + sharded_state_dict, + ckpt_dir, + validate_access_integrity=validate_integrity, + strict=strict, + ) + + def test_error(error_msg): + assert 'Unexpected keys' not in error_msg + assert 'TenA' in error_msg + assert 'ObjB' in error_msg + assert 'Missing keys' in error_msg + + # no mismatch for `*_UNEXPECTED` flag + loaded_state_dict = load_with_flag(StrictHandling.ASSUME_OK_UNEXPECTED) + assert 'TenB' in loaded_state_dict + + loaded_state_dict = load_with_flag(StrictHandling.RAISE_UNEXPECTED) + assert 'TenB' in loaded_state_dict + + with caplog.at_level(logging.WARNING): + loaded_state_dict = load_with_flag(StrictHandling.LOG_UNEXPECTED) + assert ( + caplog.text == '' + or '`zarr` distributed checkpoint backend is deprecated' in caplog.text + ) + assert 'TenB' in loaded_state_dict + + loaded_state_dict, missing_keys, unexpected_keys = load_with_flag( + StrictHandling.RETURN_UNEXPECTED + ) + assert 'TenB' in loaded_state_dict + assert missing_keys == set() + assert unexpected_keys == set() + + loaded_state_dict = load_with_flag(StrictHandling.IGNORE_ALL) + assert 'TenB' in loaded_state_dict + + # Informative exceptions with `RAISE_ALL` option: + with pytest.raises(CheckpointingException) as exc_info: + load_with_flag(StrictHandling.RAISE_ALL) + test_error(str(exc_info.value)) + + # Logged mismatches: + with caplog.at_level(logging.WARNING): + loaded_state_dict = load_with_flag(StrictHandling.LOG_ALL) + assert 'TenB' in loaded_state_dict + test_error(caplog.text) + + # Returned mismatches + loaded_state_dict, missing_keys, unexpected_keys = load_with_flag( + StrictHandling.RETURN_ALL + ) + assert 'TenB' in loaded_state_dict + assert unexpected_keys == set() + assert missing_keys == {'TenA', 'ObjB'} + + @pytest.mark.parametrize('save_format', ['zarr', 'torch_dist']) + @pytest.mark.parametrize('validate_integrity', [True, False]) + def test_exact_load_handling(self, caplog, tmp_path_dist_ckpt, validate_integrity, save_format): + sharded_state_dict = self._get_base_state_dict() + with TempNamedDir(tmp_path_dist_ckpt / 'test_exact_load_handling') as ckpt_dir: + save_strategy = get_default_strategy(StrategyAction.SAVE_SHARDED, save_format, 1) + save(sharded_state_dict, ckpt_dir, save_strategy) + + def load_with_flag(strict): + sharded_state_dict = self._get_base_state_dict() + return load( + sharded_state_dict, + ckpt_dir, + validate_access_integrity=validate_integrity, + strict=strict, + ) + + for strict in ( + StrictHandling.ASSUME_OK_UNEXPECTED, + StrictHandling.LOG_UNEXPECTED, + StrictHandling.LOG_ALL, + StrictHandling.RAISE_UNEXPECTED, + StrictHandling.RAISE_ALL, + StrictHandling.IGNORE_ALL, + ): + with caplog.at_level(logging.WARNING): + loaded_state_dict = load_with_flag(strict) + assert ( + caplog.text == '' + or '`zarr` distributed checkpoint backend is deprecated' in caplog.text + ) + assert 'TenB' in loaded_state_dict + assert 'ObjB' in loaded_state_dict + + for strict in (StrictHandling.RETURN_UNEXPECTED, 
StrictHandling.RETURN_ALL): + with caplog.at_level(logging.WARNING): + loaded_state_dict, missing_keys, unexpected_keys = load_with_flag(strict) + assert ( + caplog.text == '' + or '`zarr` distributed checkpoint backend is deprecated' in caplog.text + ) + assert 'TenB' in loaded_state_dict + assert 'ObjB' in loaded_state_dict + assert missing_keys == set() + assert unexpected_keys == set() + + @pytest.mark.parametrize('save_format', ['zarr', 'torch_dist']) + def test_sharded_metadata(self, tmp_path_dist_ckpt, save_format): + + sharded_state_dict = self._get_base_state_dict() + with TempNamedDir(tmp_path_dist_ckpt / 'test_exact_load_handling') as ckpt_dir: + save_strategy = get_default_strategy(StrategyAction.SAVE_SHARDED, save_format, 1) + save(sharded_state_dict, ckpt_dir, save_strategy) + torch.distributed.barrier() + sharded_metadata = load_sharded_metadata(ckpt_dir) + assert set(sh_base.key for sh_base in sharded_metadata.values()) == { + 'TenA', + 'TenB', + 'TenC', + 'ObjA', + 'ObjB', + } + assert set(sharded_metadata.keys()) == { + 'TenA', + 'TenB', + 'TenC', + 'ObjA/shard_0_1', + *(f'ObjB/shard_0.{i}_1.8' for i in range(8)), + } + + loaded_state_dict = load(sharded_metadata, ckpt_dir, validate_access_integrity=False) + + assert loaded_state_dict['ObjA/shard_0_1'] == list(range(10)) + for shard_idx in range(8): + assert loaded_state_dict[f'ObjB/shard_0.{shard_idx}_1.8'] == {shard_idx + 7} + assert torch.all(loaded_state_dict['TenA'] == torch.arange(2)) + assert torch.all(loaded_state_dict['TenB'] == torch.arange(3).repeat(8)) + assert torch.all(loaded_state_dict['TenC'] == torch.arange(3)) diff --git a/tests/unit_tests/dist_checkpointing/utils.py b/tests/unit_tests/dist_checkpointing/utils.py new file mode 100644 index 0000000000..5dcf60b472 --- /dev/null +++ b/tests/unit_tests/dist_checkpointing/utils.py @@ -0,0 +1,238 @@ +from functools import partial +from types import SimpleNamespace +from unittest import mock + +import torch + +from megatron.core.models.gpt import GPTModel +from megatron.core.models.gpt.gpt_layer_specs import ( + get_gpt_layer_local_spec, + get_gpt_layer_with_transformer_engine_spec, +) +from megatron.core.optimizer import OptimizerConfig, get_megatron_optimizer +from megatron.core.tensor_parallel import model_parallel_cuda_manual_seed +from megatron.core.transformer import TransformerConfig +from megatron.training.training import get_model +from megatron.training.utils import unwrap_model + +NUM_LAYERS = 8 +HIDDEN_SIZE = 16 +NUM_ATTENTION_HEADS = 8 + + +def initialize_gpt_model( + pre_process=True, post_process=True, seed=0, use_glu=True, **config_kwargs +): + torch.manual_seed(seed) + model_parallel_cuda_manual_seed(seed) + + default_config_kwargs = dict( + num_layers=NUM_LAYERS, + hidden_size=HIDDEN_SIZE, + num_attention_heads=NUM_ATTENTION_HEADS, + use_cpu_initialization=True, + ) + default_config_kwargs.update(**config_kwargs) + transformer_config = TransformerConfig(**default_config_kwargs, gated_linear_unit=use_glu) + model = GPTModel( + config=transformer_config, + transformer_layer_spec=get_gpt_layer_local_spec(), + vocab_size=128, + max_sequence_length=4, + pre_process=pre_process, + post_process=post_process, + ) + + model.bfloat16() + with torch.no_grad(): + for p in model.parameters(): + p.random_() + return model + + +def initialize_moe_model( + pre_process=True, + post_process=True, + seed=0, + use_glu=True, + use_sp=False, + use_te=False, + use_grouped_mlp=False, + **config_kwargs +): + torch.manual_seed(seed) + 
model_parallel_cuda_manual_seed(seed) + expert_num = 8 + + default_config_kwargs = dict( + num_layers=8, + hidden_size=16, + num_attention_heads=8, + use_cpu_initialization=True, + num_moe_experts=expert_num, + sequence_parallel=use_sp, + moe_grouped_gemm=use_grouped_mlp, + add_bias_linear=False, + ) + default_config_kwargs.update(**config_kwargs) + transformer_config = TransformerConfig(**default_config_kwargs, gated_linear_unit=use_glu) + if use_te: + spec = get_gpt_layer_with_transformer_engine_spec( + num_experts=expert_num, moe_grouped_gemm=use_grouped_mlp + ) + else: + spec = get_gpt_layer_local_spec(num_experts=expert_num, moe_grouped_gemm=use_grouped_mlp) + model = GPTModel( + config=transformer_config, + transformer_layer_spec=spec, + vocab_size=128, + max_sequence_length=4, + pre_process=pre_process, + post_process=post_process, + ) + + model.bfloat16() + with torch.no_grad(): + for p in model.parameters(): + p.random_() + return model + + +def init_basic_mock_args(args, tp, pp, bf16=True): + args.data_parallel_random_init = False + args.virtual_pipeline_model_parallel_size = None + args.fp16 = False + args.bf16 = bf16 + args.accumulate_allreduce_grads_in_fp32 = False + args.overlap_grad_reduce = False + args.overlap_param_gather_with_optimizer_step = False + args.fp8_param_gather = False + args.use_distributed_optimizer = True + args.ddp_bucket_size = None + args.check_for_nan_in_loss_and_grad = False + args.ddp_average_in_collective = False + args.tensor_model_parallel_size = tp + args.pipeline_model_parallel_size = pp + args.enable_ft_package = False + return args + + +def init_checkpointing_mock_args(args, ckpt_dir, fully_parallel=False): + args.non_persistent_global_ckpt_dir = None + args.non_persistent_ckpt_type = None + args.save = ckpt_dir + args.load = ckpt_dir + args.pretrained_checkpoint = None + args.ckpt_fully_parallel_save = fully_parallel + args.ckpt_fully_parallel_load = fully_parallel + args.async_save = False + args.use_dist_ckpt = True + args.ckpt_format = 'torch_dist' + args.no_save_optim = False + args.no_save_rng = False + args.ckpt_assume_constant_structure = False + args.log_progress = False + args.auto_detect_ckpt_format = False + args.exit_on_missing_checkpoint = False + args.finetune = False + args.consumed_train_samples = 0 + args.skipped_train_samples = 0 + args.consumed_valid_samples = 0 + args.retro_add_retriever = False + args.no_load_optim = False + args.no_load_rng = False + args.dist_ckpt_strictness = 'assume_ok_unexpected' + args.add_position_embedding = True + args.vocab_file = False + args.num_layers = NUM_LAYERS + args.hidden_size = HIDDEN_SIZE + args.num_attention_heads = NUM_ATTENTION_HEADS + + +def setup_model_and_optimizer( + seed, tp, pp, initialize_fn=initialize_gpt_model, bf16=True, dist_opt=True +): + mock_args = SimpleNamespace() + with mock.patch('megatron.training.training.get_args', new=lambda: mock_args): + init_basic_mock_args(mock_args, tp, pp, bf16=bf16) + model = get_model( + partial( + initialize_fn, + seed=seed, + tensor_model_parallel_size=tp, + pipeline_model_parallel_size=pp, + pipeline_dtype=torch.bfloat16, + ) + ) + + config = OptimizerConfig( + bf16=bf16, + params_dtype=torch.bfloat16 if bf16 else torch.float, + use_distributed_optimizer=dist_opt, + ) + optimizer = get_megatron_optimizer(config, model) + + torch.manual_seed(seed + 1) + model_parallel_cuda_manual_seed(seed + 1) + + for group in optimizer.optimizer.param_groups: + for p in group['params']: + if len(optimizer.optimizer.state[p]) == 0: + 
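+                    # The optimizer has not materialized Adam state for this param yet; seed
+                    # exp_avg / exp_avg_sq with random values (presumably so the checkpointing
+                    # tests have non-trivial optimizer state to round-trip and compare).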
optimizer.optimizer.state[p]['exp_avg'] = torch.rand_like(p.data) + optimizer.optimizer.state[p]['exp_avg_sq'] = torch.rand_like(p.data) + + optimizer.reload_model_params() + + return unwrap_model(model), optimizer + + +def setup_moe_model_and_optimizer( + seed, + tp, + pp, + ep, + initialize_fn=initialize_moe_model, + bf16=True, + dist_opt=True, + use_te=False, + use_grouped_mlp=False, + use_glu=False, +): + mock_args = SimpleNamespace() + with mock.patch('megatron.training.training.get_args', new=lambda: mock_args): + init_basic_mock_args(mock_args, tp, pp, bf16=bf16) + model = get_model( + partial( + initialize_fn, + seed=seed, + tensor_model_parallel_size=tp, + pipeline_model_parallel_size=pp, + pipeline_dtype=torch.bfloat16, + expert_model_parallel_size=ep, + use_sp=(tp > 1 and ep > 1), + use_te=use_te, + use_grouped_mlp=use_grouped_mlp, + use_glu=use_glu, + ) + ) + + config = OptimizerConfig( + bf16=bf16, + params_dtype=torch.bfloat16 if bf16 else torch.float, + use_distributed_optimizer=dist_opt, + ) + optimizer = get_megatron_optimizer(config, model) + + torch.manual_seed(seed + 1) + model_parallel_cuda_manual_seed(seed + 1) + + for opt in optimizer.chained_optimizers: + for group in opt.param_groups: + for p in group['params']: + if len(opt.state[p]) == 0: + opt.state[p]['exp_avg'] = torch.rand_like(p.data) + opt.state[p]['exp_avg_sq'] = torch.rand_like(p.data) + + optimizer.reload_model_params() + + return unwrap_model(model), optimizer diff --git a/tests/unit_tests/distributed/test_param_and_grad_buffer.py b/tests/unit_tests/distributed/test_param_and_grad_buffer.py new file mode 100644 index 0000000000..9174665eed --- /dev/null +++ b/tests/unit_tests/distributed/test_param_and_grad_buffer.py @@ -0,0 +1,223 @@ +import contextlib +import math +from typing import Optional + +import pytest +import torch + +from megatron.core import parallel_state +from megatron.core.distributed import DistributedDataParallelConfig +from megatron.core.distributed.param_and_grad_buffer import _ParamAndGradBuffer, partition_buckets +from tests.unit_tests.test_utilities import TestModel, Utils + + +def get_model_and_buffers( + input_dim: int, + output_dim: int, + num_layers: int, + bias: bool, + shared_embedding: bool, + bucket_size: int, + use_distributed_optimizer: bool, + overlap_grad_reduce: bool, +): + ddp_config = DistributedDataParallelConfig( + grad_reduce_in_fp32=True, + use_distributed_optimizer=use_distributed_optimizer, + overlap_grad_reduce=overlap_grad_reduce, + ) + model = TestModel( + input_dim=input_dim, + output_dim=output_dim, + num_layers=num_layers, + bias=bias, + shared_embedding=shared_embedding, + ) + params = list(model.parameters()) + param_to_name = {} + for name, param in model.named_parameters(): + param_to_name[param] = name + param_indices = list(range(len(params))) + + param_and_grad_buffer = _ParamAndGradBuffer( + ddp_config, + param_dtype=torch.bfloat16, + grad_dtype=torch.float32, + params=params, + data_parallel_group=parallel_state.get_data_parallel_group(), + bucket_size=bucket_size, + param_to_name=param_to_name, + gradient_scaling_factor=1.0, + param_indices=param_indices, + ) + + return model, param_and_grad_buffer + + +@pytest.mark.parametrize("bucket_size", [None, 9000, 9025, 9050, 18000, 18050, 20000]) +@pytest.mark.parametrize("use_distributed_optimizer", [False, True]) +@pytest.mark.parametrize("bias", [False, True]) +@pytest.mark.parametrize("shared_embedding", [False, True]) +@pytest.mark.flaky +def test_bucket_sizes( + bucket_size: Optional[int], 
use_distributed_optimizer: bool, bias: bool, shared_embedding: bool +): + Utils.initialize_model_parallel() + + if shared_embedding and bias: + # Don't bother running shared_embedding + bias since gold values are trickier to compute. + return + + input_dim = 95 + output_dim = 95 + num_layers = 10 + _, param_and_grad_buffer = get_model_and_buffers( + input_dim=input_dim, + output_dim=output_dim, + num_layers=num_layers, + bias=bias, + shared_embedding=shared_embedding, + bucket_size=bucket_size, + use_distributed_optimizer=use_distributed_optimizer, + overlap_grad_reduce=False, + ) + + actual_numel_in_each_bucket = [ + bucket.numel_unpadded for bucket in param_and_grad_buffer.buckets + ] + actual_numel_padded_in_each_bucket = [ + bucket.grad_data.numel() for bucket in param_and_grad_buffer.buckets + ] + + def _pad_if_needed(numel_unpadded, divisor): + if use_distributed_optimizer: + return math.ceil(numel_unpadded / divisor) * divisor + return numel_unpadded + + def _pad_bucket_if_needed(numel_unpadded): + # Want 128-byte alignment for distributed optimizer. + divisor = math.lcm(parallel_state.get_data_parallel_world_size(), 128) + return _pad_if_needed(numel_unpadded, divisor) + + def _pad_param_if_needed(numel_unpadded): + # Want 64-byte alignment for params. + return _pad_if_needed(numel_unpadded, 64) + + if bucket_size is None: + # If bucket_size is infinite (None), number of buckets should be 1. + if shared_embedding and use_distributed_optimizer: + assert len(param_and_grad_buffer.buckets) == 2 + else: + assert len(param_and_grad_buffer.buckets) == 1 + else: + # Else, compute number of buckets. + numel_in_each_bucket = [] + numel_padded_in_each_bucket = [] + numel_in_last_bucket = 0 + param_sizes = [] + for _ in range(num_layers): + param_sizes.append(input_dim * output_dim) + if bias: # Include bias term. + param_sizes.append(output_dim) + # Create separate bucket for first parameter from reverse direction. + if shared_embedding and use_distributed_optimizer: + numel_in_each_bucket.append(param_sizes[-1]) + numel_padded_in_each_bucket.append(_pad_bucket_if_needed(param_sizes[-1])) + param_sizes = param_sizes[:-1] + # Iterate through params in backward direction. 
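+        # Gold values mirror the expected bucketing rule: pad the running element count with
+        # _pad_param_if_needed before adding each param, close a bucket once it reaches bucket_size,
+        # and pad every closed bucket with _pad_bucket_if_needed (both paddings are no-ops unless
+        # use_distributed_optimizer is set).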
+ for param_size in param_sizes[::-1]: + numel_in_last_bucket = _pad_param_if_needed(numel_in_last_bucket) + numel_in_last_bucket += param_size + if numel_in_last_bucket >= bucket_size: + numel_in_each_bucket.append(numel_in_last_bucket) + numel_padded_in_each_bucket.append(_pad_bucket_if_needed(numel_in_last_bucket)) + numel_in_last_bucket = 0 + if numel_in_last_bucket > 0: + numel_in_each_bucket.append(numel_in_last_bucket) + numel_padded_in_each_bucket.append(_pad_bucket_if_needed(numel_in_last_bucket)) + + assert len(param_and_grad_buffer.buckets) == len( + numel_in_each_bucket + ), f"Buckets don't match (got {actual_numel_in_each_bucket} but should be {numel_in_each_bucket})" + assert actual_numel_in_each_bucket == numel_in_each_bucket, ( + f"Number of parameters in each bucket should be {numel_in_each_bucket}, " + f"but is {actual_numel_in_each_bucket}" + ) + if use_distributed_optimizer: + assert all( + [ + x % parallel_state.get_data_parallel_world_size() == 0 + for x in actual_numel_padded_in_each_bucket + ] + ), ( + f"Size of each padded bucket should be divisible by " + f"{parallel_state.get_data_parallel_world_size()}" + ) + assert actual_numel_padded_in_each_bucket == numel_padded_in_each_bucket, ( + f"Number of parameters in each padded bucket should be {numel_padded_in_each_bucket}, " + f"but is {actual_numel_padded_in_each_bucket}" + ) + + Utils.destroy_model_parallel() + + +@pytest.mark.parametrize("use_distributed_optimizer", [False, True]) +@pytest.mark.parametrize("overlap_grad_reduce", [False, True]) +def test_grad_sync(use_distributed_optimizer: bool, overlap_grad_reduce: bool): + Utils.initialize_model_parallel() + + input_dim = 100 + output_dim = 100 + num_layers = 10 + model, param_and_grad_buffer = get_model_and_buffers( + input_dim=input_dim, + output_dim=output_dim, + num_layers=num_layers, + bias=True, + shared_embedding=False, + bucket_size=None, # Group all params into single bucket. + use_distributed_optimizer=use_distributed_optimizer, + overlap_grad_reduce=overlap_grad_reduce, + ) + bucket_groups = partition_buckets([param_and_grad_buffer]) + param_to_bucket_group = {} + for bucket_group in bucket_groups: + for param in bucket_group.params: + assert param not in param_to_bucket_group + param_to_bucket_group[param] = bucket_group + + param_and_grad_buffer.grad_data.data.fill_(1.0) + expected_grad_data_value_after_collective = 1 + if torch.distributed.get_rank() == 0 or not use_distributed_optimizer: + expected_grad_data_value_after_collective = parallel_state.get_data_parallel_world_size() + + params = list(model.parameters()) + for i, param in enumerate(params): + assert param in param_to_bucket_group + bucket_group = param_to_bucket_group[param] + register_grad_sync_context = ( + contextlib.nullcontext() if overlap_grad_reduce else pytest.raises(AssertionError) + ) + finish_grad_sync_context = contextlib.nullcontext() + if i < (len(params) - 1) and overlap_grad_reduce: + # Can't finish grad sync until all params have been registered ready. + finish_grad_sync_context = pytest.raises(AssertionError) + + with register_grad_sync_context: + bucket_group.register_grad_ready(param) + with finish_grad_sync_context: + # When overlap_grad_reduce is True, this should throw an assertion error until all + # params in the model have registered their grad above. + # When overlap_grad_reduce is False, the collective is forced through. 
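+            # Once the collective has run, grad_data[0] holds the sum of 1.0 over the data-parallel
+            # group (i.e. the DP world size) on ranks that own that element; with the distributed
+            # optimizer the reduction is scattered across ranks, so only rank 0 observes the reduced
+            # value and the remaining ranks keep 1.0 (see expected_grad_data_value_after_collective above).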
+ bucket_group.finish_grad_sync() + + expected_grad_data_value = expected_grad_data_value_after_collective + if overlap_grad_reduce and i < (len(params) - 1): + expected_grad_data_value = 1 + assert int(param_and_grad_buffer.grad_data[0]) == expected_grad_data_value + + if not overlap_grad_reduce: + # Reset grad_data for subsequent collectives. + param_and_grad_buffer.grad_data.data.fill_(1.0) + + Utils.destroy_model_parallel() diff --git a/tests/unit_tests/export/trtllm/__init__.py b/tests/unit_tests/export/trtllm/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/unit_tests/export/trtllm/test_trtllm_distributed_gpu_converter.py b/tests/unit_tests/export/trtllm/test_trtllm_distributed_gpu_converter.py new file mode 100644 index 0000000000..5a0aa0e9c5 --- /dev/null +++ b/tests/unit_tests/export/trtllm/test_trtllm_distributed_gpu_converter.py @@ -0,0 +1,100 @@ +import pytest +import torch +from pytest_mock import mocker + +from megatron.core.export.data_type import DataType +from megatron.core.export.trtllm.model_to_trllm_mapping.gpt_model import GPT_DICT +from megatron.core.export.trtllm.trtllm_weights_converter.distributed_trtllm_model_weights_converter import ( + DistributedTRTLLMModelWeightsConverter, +) +from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_local_spec +from megatron.core.models.gpt.gpt_model import GPTModel +from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed +from megatron.core.transformer.transformer_config import TransformerConfig +from tests.unit_tests.test_utilities import Utils + +_SEQUENCE_LENGTH = 64 +_VOCAB_SIZE = 256 + + +class TestTRTLLMDistributedGPUConverter: + + def setup_method(self, method): + Utils.initialize_model_parallel(2, 1) + model_parallel_cuda_manual_seed(123) + + transformer_config = TransformerConfig( + num_layers=2, + hidden_size=64, + num_attention_heads=2, + use_cpu_initialization=True, + pipeline_dtype=torch.float32, + add_qkv_bias=False, + add_bias_linear=False, + ) + self.gpt_model = GPTModel( + config=transformer_config, + transformer_layer_spec=get_gpt_layer_local_spec(), + vocab_size=_VOCAB_SIZE, + max_sequence_length=_SEQUENCE_LENGTH, + ) + + def teardown_method(self, method): + Utils.destroy_model_parallel() + + def test_get_model_weights_converter(self, mocker): + device = torch.device("cuda") + self.gpt_model.to(device) + + transformer_config = self.gpt_model.config + + mocker.patch( + "megatron.core.export.trtllm.trtllm_weights_converter.distributed_trtllm_model_weights_converter.str_dtype_to_torch", + return_value=torch.float32, + ) + + dtype = DataType.bfloat16 + distributed_converter = DistributedTRTLLMModelWeightsConverter( + transformer_config, dtype, activation="gelu" + ) + + model_state_dict = {} + for key, val in self.gpt_model.state_dict().items(): + # val is non for _extra_state layers . 
We filter it out + if val is not None: + model_state_dict[key] = val + + distributed_converter.convert( + model_state_dict=model_state_dict, + trtllm_conversion_dict=GPT_DICT, + tokenizer_vocab_size=_VOCAB_SIZE, + ) + + expected_result = { + 'transformer.vocab_embedding.weight': torch.Size([128, 64]), + 'transformer.position_embedding.weight': torch.Size([32, 64]), + 'lm_head.weight': torch.Size([128, 64]), + 'transformer.ln_f.weight': torch.Size([64]), + 'transformer.ln_f.bias': torch.Size([64]), + 'transformer.layers.0.input_layernorm.weight': torch.Size([64]), + 'transformer.layers.0.input_layernorm.bias': torch.Size([64]), + 'transformer.layers.0.attention.dense.weight': torch.Size([64, 32]), + 'transformer.layers.0.attention.qkv.weight': torch.Size([96, 64]), + 'transformer.layers.0.post_layernorm.weight': torch.Size([64]), + 'transformer.layers.0.post_layernorm.bias': torch.Size([64]), + 'transformer.layers.0.mlp.fc.weight': torch.Size([128, 64]), + 'transformer.layers.0.mlp.proj.weight': torch.Size([64, 128]), + 'transformer.layers.1.input_layernorm.weight': torch.Size([64]), + 'transformer.layers.1.input_layernorm.bias': torch.Size([64]), + 'transformer.layers.1.attention.dense.weight': torch.Size([64, 32]), + 'transformer.layers.1.attention.qkv.weight': torch.Size([96, 64]), + 'transformer.layers.1.post_layernorm.weight': torch.Size([64]), + 'transformer.layers.1.post_layernorm.bias': torch.Size([64]), + 'transformer.layers.1.mlp.fc.weight': torch.Size([128, 64]), + 'transformer.layers.1.mlp.proj.weight': torch.Size([64, 128]), + } + + for key, value in distributed_converter.trtllm_model_weights.items(): + assert ( + expected_result[key] == value.shape + ), f"Shape mismatch for {key}. Expected {expected_result[key]} but got {value.shape}" diff --git a/tests/unit_tests/export/trtllm/test_trtllm_helper.py b/tests/unit_tests/export/trtllm/test_trtllm_helper.py new file mode 100644 index 0000000000..53c0a5ffea --- /dev/null +++ b/tests/unit_tests/export/trtllm/test_trtllm_helper.py @@ -0,0 +1,73 @@ +import pytest + +from megatron.core.export.export_config import ExportConfig +from megatron.core.export.model_type import ModelType + + +# TODO : Remove importorskip and handle with mocker +class TestTRTLLMHelper: + + def test_exceptions(self, mocker): + pytest.importorskip('tensorrt_llm') + + from megatron.core.export.trtllm.trtllm_helper import TRTLLMHelper + + trtllm_helper = TRTLLMHelper( + transformer_config=None, + model_type=ModelType.gpt, + share_embeddings_and_output_weights=True, + ) + + with pytest.raises(AssertionError): + trtllm_helper.get_trtllm_pretrained_config_and_model_weights( + model_state_dict=None, + dtype=None, + on_device_distributed_conversion=True, + vocab_size=None, + gpus_per_node=2, + ) + + with pytest.raises(AssertionError): + trtllm_helper.get_trtllm_pretrained_config_and_model_weights( + model_state_dict=None, + dtype=None, + on_device_distributed_conversion=True, + ModelType=ModelType.falcon, + vocab_size=100, + gpus_per_node=2, + ) + + with pytest.raises(AssertionError): + trtllm_helper.get_trtllm_pretrained_config_and_model_weights( + model_state_dict=None, + dtype=None, + export_config=ExportConfig(), + on_device_distributed_conversion=True, + vocab_size=100, + gpus_per_node=2, + ) + + with pytest.raises(AssertionError): + trtllm_helper.get_trtllm_pretrained_config_and_model_weights( + model_state_dict=None, + dtype=None, + vocab_size=100, + on_device_distributed_conversion=True, + gpus_per_node=None, + ) + + with pytest.raises(AssertionError): + 
trtllm_helper.get_trtllm_pretrained_config_and_model_weights( + model_state_dict=None, + dtype=None, + export_config=ExportConfig(use_embedding_sharing=False), + on_device_distributed_conversion=False, + ) + + with pytest.raises(AssertionError): + trtllm_helper.get_trtllm_pretrained_config_and_model_weights( + model_state_dict=None, + dtype=None, + export_config=ExportConfig(use_embedding_sharing=True), + vocab_size=100, + ) diff --git a/tests/unit_tests/export/trtllm/test_trtllm_layers.py b/tests/unit_tests/export/trtllm/test_trtllm_layers.py new file mode 100644 index 0000000000..b2e88852e5 --- /dev/null +++ b/tests/unit_tests/export/trtllm/test_trtllm_layers.py @@ -0,0 +1,111 @@ +import pytest + +from megatron.core.export.trtllm.trtllm_layers import TRTLLMLayers, get_layer_name_without_prefix + + +class TestTRTLLMLayers: + + def test_rename_input_layer_names_to_trtllm_layer_names_without_layer_numbers(self): + + conversion_dict = { + "transformer.layers.attn.dense.bias": TRTLLMLayers.attention_dense_bias, + "transformer.layers.mlp.fc1.weight": TRTLLMLayers.mlp_fc_weight, + } + sample_dict = { + "transformer.layers.attn.dense.bias": 0, + "transformer.layers.mlp.fc1.weight": 1, + } + + converted_dict = TRTLLMLayers.rename_input_layer_names_to_trtllm_layer_names( + model_state_dict=sample_dict, + trtllm_conversion_dict=conversion_dict, + state_dict_split_by_layer_numbers=False, + ) + assert ( + converted_dict[TRTLLMLayers.attention_dense_bias.value] == 0 + ), "Something wrong with conversion dict" + assert ( + converted_dict[TRTLLMLayers.mlp_fc_weight.value] == 1 + ), "Something wrong with conversion dict" + + def test_rename_input_layer_names_to_trtllm_layer_names_exception(self): + + with pytest.raises(AssertionError): + conversion_dict = { + "transformer.layers.attn.dense.bias": "randomValue", + "transformer.layers.mlp.fc1.weight": TRTLLMLayers.mlp_fc_weight, + } + sample_dict = { + "transformer.layers.attn.dense.bias": 0, + "transformer.layers.mlp.fc1.weight": 1, + } + TRTLLMLayers.rename_input_layer_names_to_trtllm_layer_names( + model_state_dict=sample_dict, + trtllm_conversion_dict=conversion_dict, + state_dict_split_by_layer_numbers=False, + ) + + with pytest.raises(Exception): + sample_dict = { + "transformer.layers.attn.dense.bias": 0, + "transformer.layers.mlp.fc1.weight": 1, + } + del conversion_dict["attn.dense.bias"] + TRTLLMLayers.rename_input_layer_names_to_trtllm_layer_names( + model_state_dict=sample_dict, + trtllm_conversion_dict=conversion_dict, + state_dict_split_by_layer_numbers=False, + ) + + with pytest.raises(Exception): + conversion_dict = { + "transformer.layers.attn.dense.bias": TRTLLMLayers.attention_dense_bias, + "transformer.layers.mlp.fc1.weight": TRTLLMLayers.mlp_fc_weight, + } + sample_dict = { + "transformer.layers.attn.dense.bias": 0, + "transformer.layers.mlp.fc1.weight": 1, + } + + TRTLLMLayers.rename_input_layer_names_to_trtllm_layer_names( + model_state_dict=sample_dict, + trtllm_conversion_dict=conversion_dict, + state_dict_split_by_layer_numbers=True, + ) + + def test_rename_input_layer_names_to_trtllm_layer_names_with_layer_numbers(self): + + conversion_dict = { + "decoder.lm_head.weight": TRTLLMLayers.lm_head, + "decoder.layers.attn.dense.bias": TRTLLMLayers.attention_dense_bias, + "deocder.layers.mlp.fc1.weight": TRTLLMLayers.mlp_fc_weight, + } + sample_dict = { + "decoder.lm_head.weight": 2, + "decoder.layers.0.attn.dense.bias": 0, + "deocder.layers.43.mlp.fc1.weight": 1, + } + + converted_dict = 
TRTLLMLayers.rename_input_layer_names_to_trtllm_layer_names( + model_state_dict=sample_dict, + trtllm_conversion_dict=conversion_dict, + state_dict_split_by_layer_numbers=False, + ) + + assert ( + converted_dict['transformer.layers.0.attention.dense.bias'] == 0 + ), "Something wrong with conversion of layer names" + assert ( + converted_dict['transformer.layers.43.mlp.fc.weight'] == 1 + ), "Something wrong with conversion of layer names" + assert ( + converted_dict['lm_head.weight'] == 2 + ), "Something wrong with conversion of layer names" + + def test_get_layer_name_without_prefix(self): + layer_name_without_prefix = get_layer_name_without_prefix( + TRTLLMLayers.attention_dense_weight + ) + assert ( + layer_name_without_prefix == "attention.dense.weight" + ), f"get_layer_name_without_prefix returned {layer_name_without_prefix}, expected attention.dense.weight" diff --git a/tests/unit_tests/export/trtllm/test_trtllm_single_device_converter.py b/tests/unit_tests/export/trtllm/test_trtllm_single_device_converter.py new file mode 100644 index 0000000000..e431326f0b --- /dev/null +++ b/tests/unit_tests/export/trtllm/test_trtllm_single_device_converter.py @@ -0,0 +1,169 @@ +import torch +from pytest_mock import mocker + +from megatron.core.export.data_type import DataType +from megatron.core.export.export_config import ExportConfig +from megatron.core.export.trtllm.trtllm_layers import TRTLLMLayers +from megatron.core.export.trtllm.trtllm_weights_converter.single_device_trtllm_model_weights_converter import ( + SingleDeviceTRTLLMModelWeightsConverter, +) +from megatron.core.transformer.transformer_config import TransformerConfig + + +class TestTRTLLMSingleDeviceConverter: + def test_get_model_weights_converter(self, mocker): + + export_config = ExportConfig(inference_tp_size=2) + + vocab_size = 10 + hidden_dim = 4 + seq_len = 8 + num_layers = 2 + num_attn_heads = 2 + + model_config = TransformerConfig( + num_layers=num_layers, + num_attention_heads=num_attn_heads, + num_query_groups=0, + hidden_size=hidden_dim, + ffn_hidden_size=hidden_dim * 4, + ) + + dtype = DataType.bfloat16 + + model_state_dict = { + "decoder.position_embedding.weight": torch.randn(seq_len, hidden_dim), + "decoder.word_embedding.weight": torch.randn(vocab_size, hidden_dim), + "decoder.lm_head.weight": torch.randn(vocab_size, hidden_dim), + "decoder.final_layernorm.weight": torch.randn(hidden_dim), + "decoder.layers.input_layernorm.weight": torch.randn(num_layers, hidden_dim), + "decoder.layers.attention.qkv.weight": torch.randn( + num_layers, hidden_dim * 3, hidden_dim + ), + "decoder.layers.attention.qkv.bias": torch.randn(num_layers, hidden_dim * 3), + "decoder.layers.attention.dense.weight": torch.randn( + num_layers, hidden_dim, hidden_dim + ), + "deocder.layers.mlp.fc.weight": torch.randn(num_layers, 4 * hidden_dim, hidden_dim), + "decoder.layers.mlp.fc.expert": torch.randn(num_layers, hidden_dim, hidden_dim * 4), + "decoder.layers.mlp.proj.expert": torch.randn(num_layers, hidden_dim * 4, hidden_dim), + } + + trtllm_conversion_dict = { + "decoder.position_embedding.weight": TRTLLMLayers.position_embedding, + "decoder.word_embedding.weight": TRTLLMLayers.vocab_embedding, + "decoder.final_layernorm.weight": TRTLLMLayers.final_layernorm_weight, + "decoder.lm_head.weight": TRTLLMLayers.lm_head, + "decoder.layers.input_layernorm.weight": TRTLLMLayers.input_layernorm_weight, + "decoder.layers.attention.qkv.weight": TRTLLMLayers.attention_qkv_weight, + "decoder.layers.attention.qkv.bias": TRTLLMLayers.attention_qkv_bias, + 
"decoder.layers.attention.dense.weight": TRTLLMLayers.attention_dense_weight, + "deocder.layers.mlp.fc.weight": TRTLLMLayers.mlp_fc_weight, + "decoder.layers.mlp.fc.expert": TRTLLMLayers.mlp_fc_weight_mixture_of_experts, + "decoder.layers.mlp.proj.expert": TRTLLMLayers.mlp_projection_weight_mixture_of_experts, + } + + mocker.patch( + "megatron.core.export.trtllm.trtllm_weights_converter.single_device_trtllm_model_weights_converter.str_dtype_to_torch", + return_value=torch.float32, + ) + + trtllm_model_weights_converter_cpu = SingleDeviceTRTLLMModelWeightsConverter( + export_config, model_config, dtype, activation="swiglu" + ) + + mocker.patch( + "megatron.core.export.trtllm.trtllm_weights_converter.single_device_trtllm_model_weights_converter.pad_vocab_size", + return_value=10, + ) + + trtllm_model_weights_converter_cpu.convert( + model_state_dict=model_state_dict, + trtllm_conversion_dict=trtllm_conversion_dict, + state_dict_split_by_layer_numbers=False, + ) + + expected_shapes = { + 'transformer.vocab_embedding.weight': (10, 4), + 'transformer.position_embedding.weight': (8, 4), + 'lm_head.weight': (10, 4), + 'transformer.ln_f.weight': (4,), + 'transformer.layers.0.input_layernorm.weight': (4,), + 'transformer.layers.1.input_layernorm.weight': (4,), + 'transformer.layers.0.attention.qkv.weight.0.bin': (6, 4), + 'transformer.layers.0.attention.qkv.weight.1.bin': (6, 4), + 'transformer.layers.1.attention.qkv.weight.0.bin': (6, 4), + 'transformer.layers.1.attention.qkv.weight.1.bin': (6, 4), + 'transformer.layers.0.attention.qkv.bias.0.bin': (6,), + 'transformer.layers.0.attention.qkv.bias.1.bin': (6,), + 'transformer.layers.1.attention.qkv.bias.0.bin': (6,), + 'transformer.layers.1.attention.qkv.bias.1.bin': (6,), + 'transformer.layers.0.attention.dense.weight.0.bin': (4, 2), + 'transformer.layers.0.attention.dense.weight.1.bin': (4, 2), + 'transformer.layers.1.attention.dense.weight.0.bin': (4, 2), + 'transformer.layers.1.attention.dense.weight.1.bin': (4, 2), + 'transformer.layers.0.mlp.gate.weight.0.bin': (4, 4), + 'transformer.layers.0.mlp.gate.weight.1.bin': (4, 4), + 'transformer.layers.0.mlp.fc.weight.0.bin': (16, 2), + 'transformer.layers.0.mlp.fc.weight.1.bin': (16, 2), + 'transformer.layers.1.mlp.gate.weight.0.bin': (4, 4), + 'transformer.layers.1.mlp.gate.weight.1.bin': (4, 4), + 'transformer.layers.1.mlp.fc.weight.0.bin': (16, 2), + 'transformer.layers.1.mlp.fc.weight.1.bin': (16, 2), + 'transformer.layers.0.mlp.proj.weight.0.bin': (4, 8), + 'transformer.layers.0.mlp.proj.weight.1.bin': (4, 8), + 'transformer.layers.1.mlp.proj.weight.0.bin': (4, 8), + 'transformer.layers.1.mlp.proj.weight.1.bin': (4, 8), + } + + for key, value in trtllm_model_weights_converter_cpu.trtllm_model_weights.items(): + assert ( + expected_shapes[key] == value.shape + ), f"Shape mismatch for {key}. 
Expected {expected_shapes[key]} but got {value.shape}" + + class SampleMapping: + + def __init__(self): + self.tp_size = 2 + self.tp_rank = 1 + + def pp_layers(self, num_layers): + return [0, 1] + + def is_first_pp_rank(self): + return True + + def is_last_pp_rank(self): + return True + + trtllm_model_weights_per_gpu = ( + trtllm_model_weights_converter_cpu.get_local_model_weights_per_gpu( + mapping=SampleMapping(), trtllm_model_config=None + ) + ) + + expected_result_per_gpu = { + 'transformer.layers.0.input_layernorm.weight': (4,), + 'transformer.layers.1.input_layernorm.weight': (4,), + 'transformer.layers.0.attention.qkv.weight': (6, 4), + 'transformer.layers.1.attention.qkv.weight': (6, 4), + 'transformer.layers.0.attention.qkv.bias': (6,), + 'transformer.layers.1.attention.qkv.bias': (6,), + 'transformer.layers.0.attention.dense.weight': (4, 2), + 'transformer.layers.1.attention.dense.weight': (4, 2), + 'transformer.layers.0.mlp.gate.weight': (4, 4), + 'transformer.layers.0.mlp.fc.weight': (16, 2), + 'transformer.layers.1.mlp.gate.weight': (4, 4), + 'transformer.layers.1.mlp.fc.weight': (16, 2), + 'transformer.layers.0.mlp.proj.weight': (4, 8), + 'transformer.layers.1.mlp.proj.weight': (4, 8), + 'transformer.vocab_embedding.weight': (10, 4), + 'transformer.position_embedding.weight': (8, 4), + 'lm_head.weight': (5, 4), + 'transformer.ln_f.weight': (4,), + } + + for key, value in trtllm_model_weights_per_gpu.items(): + assert ( + expected_result_per_gpu[key] == value.shape + ), f"Shape mismatch for {key}. Expected {expected_result_per_gpu[key]} but got {value.shape}" diff --git a/tests/unit_tests/fusions/test_torch_softmax.py b/tests/unit_tests/fusions/test_torch_softmax.py new file mode 100644 index 0000000000..63b0bc7b5d --- /dev/null +++ b/tests/unit_tests/fusions/test_torch_softmax.py @@ -0,0 +1,47 @@ +import pytest +import torch + +from megatron.core.fusions.fused_softmax import FusedScaleMaskSoftmax +from megatron.core.transformer.enums import AttnMaskType +from megatron.core.transformer.utils import attention_mask_func, get_default_causal_mask + + +class TestTorchSoftmax: + def setup_method(self, method): + # The important settings tested are forward_torch_softmax path + # with locally generated casual mask for attention_mask_func: + self.softmax = FusedScaleMaskSoftmax( + input_in_fp16=False, + input_in_bf16=False, + attn_mask_type=AttnMaskType.causal, + scaled_masked_softmax_fusion=False, + mask_func=attention_mask_func, + softmax_in_fp32=True, + scale=None, + ) + + def teardown_method(self): + get_default_causal_mask.cache_clear() + + def test_output_shape(self): + x = torch.randn(8, 2, 4, 4, device="cuda") + y = self.softmax(x, None) + assert x.shape == y.shape + + def test_causal_mask_input_shape_assert(self): + x = torch.randn(1, 1, 4, 16, device="cuda") + with pytest.raises(AssertionError): + self.softmax(x, None) + + def test_causal_mask_equal_scores(self): + # For equal input values (e.g. zero) correctly masked softmax should + # produce equal scores among non-masked elements. 
For example, in case + # sq == sk == 2 the expected output is (ignoring b and np dimensions): + # [[1.0, 0.0], + # [0.5, 0.5]] + b, np, sq, sk = 8, 2, 32, 32 + x = torch.zeros([b, np, sq, sk]).cuda() + y = self.softmax(x, None) + y_expected = torch.tril(torch.ones(b, np, sq, sk, device="cuda")) + y_expected /= torch.arange(1, sq + 1, device="cuda").reshape((-1, 1)) + assert torch.allclose(y, y_expected, rtol=1e-08, atol=1e-08) diff --git a/tests/unit_tests/inference/__init__.py b/tests/unit_tests/inference/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/unit_tests/inference/engines/__init__.py b/tests/unit_tests/inference/engines/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/unit_tests/inference/engines/test_mcore_engine.py b/tests/unit_tests/inference/engines/test_mcore_engine.py new file mode 100644 index 0000000000..835aeed22d --- /dev/null +++ b/tests/unit_tests/inference/engines/test_mcore_engine.py @@ -0,0 +1,122 @@ +import random +import string +from typing import List +from unittest import mock + +import torch + +from megatron.core.inference.common_inference_params import CommonInferenceParams +from megatron.core.inference.engines.mcore_engine import MCoreEngine +from megatron.core.inference.inference_request import InferenceRequest, Status +from megatron.core.inference.model_inference_wrappers.gpt.gpt_inference_wrapper import ( + GPTInferenceWrapper, +) +from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import ( + InferenceWrapperConfig, +) +from megatron.core.inference.text_generation_controllers.simple_text_generation_controller import ( + SimpleTextGenerationController, +) +from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_local_spec +from megatron.core.models.gpt.gpt_model import GPTModel +from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed +from megatron.core.transformer.transformer_config import TransformerConfig +from tests.unit_tests.test_utilities import Utils + + +class TestMCoreEngine: + def setup_method(self, method): + Utils.initialize_model_parallel( + tensor_model_parallel_size=1, pipeline_model_parallel_size=1 + ) + model_parallel_cuda_manual_seed(123) + self.batch_size = 4 + self.hidden_size = 12 + self.vocab_size = 100 + self.sequence_length = 64 + transformer_config = TransformerConfig( + num_layers=4, + hidden_size=self.hidden_size, + num_attention_heads=4, + use_cpu_initialization=True, + ) + + gpt_model = GPTModel( + config=transformer_config, + transformer_layer_spec=get_gpt_layer_local_spec(), + vocab_size=self.vocab_size, + max_sequence_length=self.sequence_length, + parallel_output=True, + ).cuda() + + inference_wrapper_config = InferenceWrapperConfig( + hidden_size=self.hidden_size, + inference_batch_times_seqlen_threshold=400, + fp32_residual_connection=False, + params_dtype=torch.float, + padded_vocab_size=self.vocab_size, + ) + + inference_wrapped_model = GPTInferenceWrapper(gpt_model, inference_wrapper_config) + self.mock_tokenizer = mock.Mock() + text_generation_controller = SimpleTextGenerationController( + inference_wrapped_model=inference_wrapped_model, tokenizer=self.mock_tokenizer + ) + + self.mcore_engine = MCoreEngine( + text_generation_controller=text_generation_controller, max_batch_size=4 + ) + + def teardown_method(self, method): + Utils.destroy_model_parallel() + + def test_generate(self): + self.mock_tokenizer.vocab_size = self.vocab_size + self.mock_tokenizer.eod = self.vocab_size - 1 + # 
Generating random length integer prompts + self.mock_tokenizer.tokenize.return_value = [ + random.randint(0, self.vocab_size - 1) for _ in range(random.randint(5, 10)) + ] + # Generates some random string + self.mock_tokenizer.detokenize.return_value = ''.join( + random.choices(string.ascii_letters, k=random.randint(4, 10)) + ) + + prompts = ["sample" * (i + 1) for i in range(self.batch_size)] + results: List[InferenceRequest] = self.mcore_engine.generate( + prompts, common_inference_params=CommonInferenceParams(num_tokens_to_generate=10) + ) + + for result in results: + assert ( + result.status == Status.COMPLETED + ), f"Status should be completed but its {result.status}" + assert result.generated_length > 0, f"Generated length should be greater than zero" + assert result.generated_text is not None, f'Generated text should not be None' + + def test_generate_empty_prompt(self): + self.mock_tokenizer.vocab_size = self.vocab_size + self.mock_tokenizer.eod = self.vocab_size - 1 + self.mock_tokenizer.bos = self.vocab_size - 2 + # Generating random length integer prompts + self.mock_tokenizer.tokenize.return_value = [ + random.randint(0, self.vocab_size - 1) for _ in range(random.randint(5, 10)) + ] + # Generates some random string + self.mock_tokenizer.detokenize.return_value = ''.join( + random.choices(string.ascii_letters, k=random.randint(4, 10)) + ) + + prompts = ["" for i in range(self.batch_size)] + results: List[InferenceRequest] = self.mcore_engine.generate( + prompts, + add_BOS=True, + common_inference_params=CommonInferenceParams(num_tokens_to_generate=10), + ) + + for result in results: + assert ( + result.status == Status.COMPLETED + ), f"Status should be completed but its {result.status}" + assert result.generated_length > 0, f"Generated length should be greater than zero" + assert result.generated_text is not None, f'Generated text should not be None' diff --git a/tests/unit_tests/inference/model_inference_wrappers/__init__.py b/tests/unit_tests/inference/model_inference_wrappers/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/unit_tests/inference/model_inference_wrappers/gpt/test_gpt_inference_wrapper.py b/tests/unit_tests/inference/model_inference_wrappers/gpt/test_gpt_inference_wrapper.py new file mode 100644 index 0000000000..e01c3f4d17 --- /dev/null +++ b/tests/unit_tests/inference/model_inference_wrappers/gpt/test_gpt_inference_wrapper.py @@ -0,0 +1,124 @@ +from argparse import Namespace + +import torch + +from megatron.core import parallel_state +from megatron.core.inference.model_inference_wrappers.gpt.gpt_inference_wrapper import ( + GPTInferenceWrapper, +) +from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import ( + InferenceWrapperConfig, +) +from megatron.core.models.gpt.gpt_layer_specs import ( + get_gpt_layer_local_spec, + get_gpt_layer_with_transformer_engine_spec, +) +from megatron.core.models.gpt.gpt_model import GPTModel +from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed +from megatron.core.transformer.transformer_config import TransformerConfig +from tests.unit_tests.test_utilities import Utils + + +class TestGPTInferenceWrapper: + + def setup_model(self, tensor_parallel_size, pipeline_parallel_size): + Utils.initialize_model_parallel( + tensor_model_parallel_size=tensor_parallel_size, + pipeline_model_parallel_size=pipeline_parallel_size, + ) + model_parallel_cuda_manual_seed(123) + self.vocab_size = 100 + self.batch_size = 4 + self.sequence_length = 32 + hidden_size 
= 12 + + transformer_config = TransformerConfig( + num_layers=4, + hidden_size=hidden_size, + num_attention_heads=4, + use_cpu_initialization=True, + ) + + gpt_model = GPTModel( + config=transformer_config, + transformer_layer_spec=get_gpt_layer_local_spec(), + vocab_size=self.vocab_size, + max_sequence_length=self.sequence_length, + parallel_output=True, + ).cuda() + + inference_wrapper_config = InferenceWrapperConfig( + hidden_size=hidden_size, + inference_batch_times_seqlen_threshold=20, + fp32_residual_connection=False, + params_dtype=torch.float, + padded_vocab_size=self.vocab_size, + ) + + self.inference_wrapped_model = GPTInferenceWrapper(gpt_model, inference_wrapper_config) + + def teardown_method(self, method): + Utils.destroy_model_parallel() + + # This will call the inference_wrapped_model.forward_pass_with_pipeline_parallel_small_input_batch() + def test_inference_pipeline_parallel_small_size(self): + self.setup_model(tensor_parallel_size=2, pipeline_parallel_size=2) + + batch_prompt_tokens = ( + torch.randint(low=0, high=self.vocab_size, size=(self.batch_size, self.sequence_length)) + .int() + .cuda() + ) + self.inference_wrapped_model.prep_model_for_inference(prompts_tokens=batch_prompt_tokens) + + inference_input = self.inference_wrapped_model.get_batch_for_context_window(0, 5) + + logits = self.inference_wrapped_model.run_one_forward_step(inference_input) + # Logits are not returned in all ranks in PP + if parallel_state.is_pipeline_last_stage(): + assert logits.shape == ( + self.batch_size, + 5, + self.vocab_size, + ), f"Shape mismatch . Expected {(self.batch_size, 5, self.vocab_size)}, but got {logits.shape}" + + # This will call the inference_wrapped_model.forward_pass_with_pipeline_parallel_large_input_batch() + def test_inference_pipeline_parallel_large__size(self): + self.setup_model(tensor_parallel_size=2, pipeline_parallel_size=2) + + batch_prompt_tokens = ( + torch.randint(low=0, high=self.vocab_size, size=(self.batch_size, self.sequence_length)) + .int() + .cuda() + ) + self.inference_wrapped_model.prep_model_for_inference(prompts_tokens=batch_prompt_tokens) + + inference_input = self.inference_wrapped_model.get_batch_for_context_window(0, 10) + + logits = self.inference_wrapped_model.run_one_forward_step(inference_input) + + if parallel_state.is_pipeline_last_stage(): + assert logits.shape == ( + self.batch_size, + 10, + self.vocab_size, + ), f"Shape mismatch . Expected {(self.batch_size,10, self.vocab_size)}, but got {logits.shape}" + + def test_inference_only_tensor_parallel(self): + self.setup_model(tensor_parallel_size=4, pipeline_parallel_size=1) + + batch_prompt_tokens = ( + torch.randint(low=0, high=self.vocab_size, size=(self.batch_size, self.sequence_length)) + .int() + .cuda() + ) + self.inference_wrapped_model.prep_model_for_inference(prompts_tokens=batch_prompt_tokens) + + inference_input = self.inference_wrapped_model.get_batch_for_context_window(0, 5) + logits = self.inference_wrapped_model.run_one_forward_step(inference_input) + + assert logits.shape == ( + self.batch_size, + 5, + self.vocab_size, + ), f"Shape mismatch . 
Expected {(self.batch_size, 5, self.vocab_size)}, but got {logits.shape}" diff --git a/tests/unit_tests/inference/model_inference_wrappers/t5/test_t5_inference_wrapper.py b/tests/unit_tests/inference/model_inference_wrappers/t5/test_t5_inference_wrapper.py new file mode 100644 index 0000000000..b9ece5c395 --- /dev/null +++ b/tests/unit_tests/inference/model_inference_wrappers/t5/test_t5_inference_wrapper.py @@ -0,0 +1,124 @@ +from argparse import Namespace +from copy import deepcopy +from unittest import mock + +import numpy as np +import torch + +from megatron.core import parallel_state +from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import ( + InferenceWrapperConfig, +) +from megatron.core.inference.model_inference_wrappers.t5.t5_inference_wrapper import ( + T5InferenceWrapper, +) +from megatron.core.models.T5.t5_model import T5Model +from megatron.core.models.T5.t5_spec import ( + get_t5_decoder_with_transformer_engine_block_spec, + get_t5_encoder_with_transformer_engine_block_spec, +) +from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed +from megatron.core.transformer.transformer_config import TransformerConfig +from tests.unit_tests.test_utilities import Utils + + +class TestT5InferenceWrapper: + + def setup_model(self, tensor_parallel_size, pipeline_parallel_size): + Utils.initialize_model_parallel( + tensor_model_parallel_size=tensor_parallel_size, + pipeline_model_parallel_size=pipeline_parallel_size, + ) + model_parallel_cuda_manual_seed(123) + self.vocab_size = 100 + self.batch_size = 8 + self.encoder_sequence_length = 32 + self.decoder_sequence_length = 16 + hidden_size = 768 + + transformer_config = TransformerConfig( + num_layers=12, + hidden_size=hidden_size, + num_attention_heads=12, + tensor_model_parallel_size=tensor_parallel_size, + pipeline_model_parallel_size=pipeline_parallel_size, + ) + + encoder_config = deepcopy(transformer_config) + encoder_config.num_layers = transformer_config.num_layers + + encoder_layers_per_pipeline = ( + encoder_config.num_layers // encoder_config.pipeline_model_parallel_size + ) + decoder_layers_per_pipeline = ( + transformer_config.num_layers // transformer_config.pipeline_model_parallel_size + ) + en_block_spec = get_t5_encoder_with_transformer_engine_block_spec( + encoder_layers_per_pipeline + ) + de_block_spec = get_t5_decoder_with_transformer_engine_block_spec( + decoder_layers_per_pipeline + ) + + t5_model = T5Model( + config=transformer_config, + encoder_config=encoder_config, + transformer_encoder_layer_spec=en_block_spec, + transformer_decoder_layer_spec=de_block_spec, + vocab_size=self.vocab_size, + max_sequence_length=self.encoder_sequence_length, + parallel_output=True, + pre_process=True, + post_process=True, + add_encoder=True, + add_decoder=True, + ).cuda() + + inference_wrapper_config = InferenceWrapperConfig( + hidden_size=hidden_size, + inference_batch_times_seqlen_threshold=20, + fp32_residual_connection=False, + params_dtype=torch.float, + padded_vocab_size=self.vocab_size, + ) + + self.inference_wrapped_model = T5InferenceWrapper(t5_model, inference_wrapper_config) + + def teardown_method(self, method): + Utils.destroy_model_parallel() + + def test_inference_only_tensor_parallel(self): + self.setup_model(tensor_parallel_size=4, pipeline_parallel_size=1) + + batch_prompt_tokens = ( + torch.randint( + low=0, high=self.vocab_size, size=(self.batch_size, self.decoder_sequence_length) + ) + .int() + .cuda() + ) + batch_encoder_prompts = ["sample prompt encoders"] * 
self.batch_size + mock_tokenizer = mock.Mock() + mock_tokenizer.pad = self.vocab_size - 1 + mock_tokenizer.additional_special_tokens_ids = list(range(100)) + mock_tokenizer.tokenize.return_value = np.random.randint( + self.vocab_size, size=self.encoder_sequence_length + ).tolist() + + self.inference_wrapped_model.prep_model_for_inference( + prompts_tokens=batch_prompt_tokens, + encoder_prompts=batch_encoder_prompts, + tokenizer=mock_tokenizer, + ) + + inference_input = self.inference_wrapped_model.get_batch_for_context_window( + 0, self.decoder_sequence_length + ) + + logits = self.inference_wrapped_model.run_one_forward_step(inference_input) + + assert logits.shape == ( + self.batch_size, + self.decoder_sequence_length, + self.vocab_size, + ), f"Shape mismatch. Expected {(self.batch_size, self.decoder_sequence_length, self.vocab_size)}, but got {logits.shape}" diff --git a/tests/unit_tests/inference/model_inference_wrappers/test_model_inference_wrapper_config.py b/tests/unit_tests/inference/model_inference_wrappers/test_model_inference_wrapper_config.py new file mode 100644 index 0000000000..e3da997cd4 --- /dev/null +++ b/tests/unit_tests/inference/model_inference_wrappers/test_model_inference_wrapper_config.py @@ -0,0 +1,21 @@ +import torch + +from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import ( + InferenceWrapperConfig, +) + + +class TestModelInferenceWrapperConfig: + + def test_inference_params(self): + inference_parameters = InferenceWrapperConfig( + hidden_size=10, + inference_batch_times_seqlen_threshold=10, + padded_vocab_size=10, + params_dtype=torch.float, + fp32_residual_connection=False, + ) + inference_parameters.add_attributes({"abc": 45}) + assert ( + inference_parameters.abc == 45 + ), f"Attribute 'abc' not set correctly. It is {inference_parameters.abc}" diff --git a/tests/unit_tests/inference/test_common_inference_params.py b/tests/unit_tests/inference/test_common_inference_params.py new file mode 100644 index 0000000000..af51e433df --- /dev/null +++ b/tests/unit_tests/inference/test_common_inference_params.py @@ -0,0 +1,11 @@ +from megatron.core.inference.common_inference_params import CommonInferenceParams + + +class TestCommonInferenceParams: + + def test_inference_params(self): + inference_parameters = CommonInferenceParams() + inference_parameters.add_attributes({"min_tokens": 45}) + assert ( + inference_parameters.min_tokens == 45 + ), f"Min tokens not set correctly. It is {inference_parameters.min_tokens}" diff --git a/tests/unit_tests/inference/test_inference_utils.py b/tests/unit_tests/inference/test_inference_utils.py new file mode 100644 index 0000000000..fc4e69018d --- /dev/null +++ b/tests/unit_tests/inference/test_inference_utils.py @@ -0,0 +1,12 @@ +from megatron.core.inference.utils import Counter + + +class TestInferenceUtils: + + def test_counter(self): + counter = Counter() + r = next(counter) + assert r == 0, f'Counter return value should be 0 but it is {r}' + assert counter.counter == 1, f'Counter should be 1 but it is {counter.counter}' + counter.reset() + assert counter.counter == 0, f'Counter should be 0 but it is {counter.counter}' diff --git a/tests/unit_tests/inference/test_modelopt_gpt_model.py b/tests/unit_tests/inference/test_modelopt_gpt_model.py new file mode 100644 index 0000000000..380ac7fa16 --- /dev/null +++ b/tests/unit_tests/inference/test_modelopt_gpt_model.py @@ -0,0 +1,43 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
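+# These tests build one GPTModel with the Transformer Engine layer spec and one with the ModelOpt layer spec, +# and check that the ModelOpt model can load the TE model's state dict through the conversion pre-hook below.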
+from megatron.core.inference.modelopt_support.gpt.model_specs import get_gpt_layer_modelopt_spec +from megatron.core.inference.modelopt_support.gpt.state_dict_hooks import ( + mcore_gpt_load_te_state_dict_pre_hook, +) +from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_with_transformer_engine_spec +from megatron.core.models.gpt.gpt_model import GPTModel +from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed +from megatron.core.transformer.transformer_config import TransformerConfig +from tests.unit_tests.test_utilities import Utils + + +class TestModelOptGPTModel: + + def setup_method(self, method): + Utils.initialize_model_parallel(1, 1) + model_parallel_cuda_manual_seed(123) + transformer_config = TransformerConfig( + num_layers=2, hidden_size=12, num_attention_heads=4, use_cpu_initialization=True + ) + self.gpt_model = GPTModel( + config=transformer_config, + transformer_layer_spec=get_gpt_layer_with_transformer_engine_spec(), + vocab_size=100, + max_sequence_length=4, + ) + # Ensure that a GPTModel can be built with the modelopt spec. + self.modelopt_gpt_model = GPTModel( + config=transformer_config, + transformer_layer_spec=get_gpt_layer_modelopt_spec(), + vocab_size=100, + max_sequence_length=4, + ) + + def test_load_te_state_dict_pre_hook(self): + handle = self.modelopt_gpt_model._register_load_state_dict_pre_hook( + mcore_gpt_load_te_state_dict_pre_hook + ) + self.modelopt_gpt_model.load_state_dict(self.gpt_model.state_dict()) + handle.remove() + + def teardown_method(self, method): + Utils.destroy_model_parallel() diff --git a/tests/unit_tests/inference/test_scheduler.py b/tests/unit_tests/inference/test_scheduler.py new file mode 100644 index 0000000000..b1f0ea184e --- /dev/null +++ b/tests/unit_tests/inference/test_scheduler.py @@ -0,0 +1,89 @@ +from typing import Dict + +import torch + +from megatron.core.inference.common_inference_params import CommonInferenceParams +from megatron.core.inference.inference_request import InferenceRequest, Status +from megatron.core.inference.scheduler import Scheduler + + +class TestScheduler: + + def setup_method(self, method): + self.max_batch_size = 4 + self.scheduler = Scheduler(max_batch_size=self.max_batch_size) + assert ( + len(self.scheduler.active_request_pool) == 0 + ), "Active request pool should be empty on initialization" + assert ( + len(self.scheduler.waiting_request_pool) == 0 + ), "Waiting request pool should be empty on initialization" + assert ( + len(self.scheduler.completed_request_pool) == 0 + ), "Completed request pool should be empty on initialization" + + def test_scheduler(self): + prompt = "sample prompt" + prompt_tokens = torch.randn(5) + inference_parameters = CommonInferenceParams() + + for i in range(self.max_batch_size): + self.scheduler.add_request(prompt, prompt_tokens, inference_parameters) + assert ( + len(self.scheduler.active_request_pool) == i + 1 + ), f"Active request pool should have {i+1} requests, but it has only {len(self.scheduler.active_request_pool)}" + + self.scheduler.add_request(prompt, prompt_tokens, inference_parameters) + assert ( + len(self.scheduler.waiting_request_pool) == 1 + ), f"Waiting request pool should have 1 request but it has {len(self.scheduler.waiting_request_pool)} requests" + + waiting_request: InferenceRequest = list(self.scheduler.waiting_request_pool.values())[0] + assert ( + waiting_request.status == Status.WAITING_IN_QUEUE + ), f"Status should be WAITING_IN_QUEUE, but it is {waiting_request.status} for the waiting request" + + assert
( + self.scheduler.have_requests_pending() + ), "Scheduler should have requests pending, but it has none" + + active_request_dict: Dict[int, InferenceRequest] = self.scheduler.active_request_pool + for request_id, request in active_request_dict.items(): + # Mark every even request completed + if int(request_id) % 2 == 0: + request.status = Status.COMPLETED + + self.scheduler.update_requests_pools(active_request_dict) + assert ( + len(self.scheduler.active_request_pool) == 3 + ), f"Active request pool should have 3 requests, but it has {len(self.scheduler.active_request_pool)}" + + assert ( + len(self.scheduler.waiting_request_pool) == 0 + ), f"Waiting request pool should be empty but it has {len(self.scheduler.waiting_request_pool)} requests" + + assert ( + len(self.scheduler.completed_request_pool) == 2 + ), f"Completed request pool should have 2 requests but it has {len(self.scheduler.completed_request_pool)} requests" + + active_request_dict: Dict[int, InferenceRequest] = self.scheduler.active_request_pool + for request_id, request in active_request_dict.items(): + # Mark all requests completed + request.status = Status.COMPLETED + + self.scheduler.update_requests_pools(active_request_dict) + assert ( + len(self.scheduler.active_request_pool) == 0 + ), f"Active request pool should be empty, but it has {len(self.scheduler.active_request_pool)}" + + assert ( + len(self.scheduler.waiting_request_pool) == 0 + ), f"Waiting request pool should be empty but it has {len(self.scheduler.waiting_request_pool)} requests" + + assert ( + len(self.scheduler.completed_request_pool) == 5 + ), f"Completed request pool should have 5 requests but it has {len(self.scheduler.completed_request_pool)} requests" + + assert ( + not self.scheduler.have_requests_pending() + ), "Scheduler should not have any requests pending" diff --git a/tests/unit_tests/inference/text_generation_controllers/__init__.py b/tests/unit_tests/inference/text_generation_controllers/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/unit_tests/inference/text_generation_controllers/test_encoder_decoder_text_generation_controller.py b/tests/unit_tests/inference/text_generation_controllers/test_encoder_decoder_text_generation_controller.py new file mode 100644 index 0000000000..14c9a88852 --- /dev/null +++ b/tests/unit_tests/inference/text_generation_controllers/test_encoder_decoder_text_generation_controller.py @@ -0,0 +1,143 @@ +import random +import string +import time +from collections import OrderedDict +from copy import deepcopy +from typing import Dict +from unittest import mock + +import numpy as np +import pytest +import torch + +from megatron.core.inference.common_inference_params import CommonInferenceParams +from megatron.core.inference.inference_request import InferenceRequest, Status +from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import ( + InferenceWrapperConfig, +) +from megatron.core.inference.model_inference_wrappers.t5.t5_inference_wrapper import ( + T5InferenceWrapper, +) +from megatron.core.inference.text_generation_controllers.encoder_decoder_text_generation_controller import ( + EncoderDecoderTextGenerationController, +) +from megatron.core.models.T5.t5_model import T5Model +from megatron.core.models.T5.t5_spec import ( + get_t5_decoder_with_transformer_engine_block_spec, + get_t5_encoder_with_transformer_engine_block_spec, +) +from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed +from
megatron.core.transformer.transformer_config import TransformerConfig +from tests.unit_tests.test_utilities import Utils + + +class TestEncoderDecoderTextGenerationController: + + def setup_method(self, method): + Utils.initialize_model_parallel( + tensor_model_parallel_size=4, pipeline_model_parallel_size=1 + ) + model_parallel_cuda_manual_seed(123) + self.vocab_size = 100 + self.batch_size = 8 + self.encoder_sequence_length = 32 + self.decoder_sequence_length = 16 + hidden_size = 768 + + transformer_config = TransformerConfig( + num_layers=12, + hidden_size=hidden_size, + num_attention_heads=12, + tensor_model_parallel_size=4, + pipeline_model_parallel_size=1, + ) + + encoder_config = deepcopy(transformer_config) + encoder_config.num_layers = transformer_config.num_layers + + encoder_layers_per_pipeline = ( + encoder_config.num_layers // encoder_config.pipeline_model_parallel_size + ) + decoder_layers_per_pipeline = ( + transformer_config.num_layers // transformer_config.pipeline_model_parallel_size + ) + en_block_spec = get_t5_encoder_with_transformer_engine_block_spec( + encoder_layers_per_pipeline + ) + de_block_spec = get_t5_decoder_with_transformer_engine_block_spec( + decoder_layers_per_pipeline + ) + + t5_model = T5Model( + config=transformer_config, + encoder_config=encoder_config, + transformer_encoder_layer_spec=en_block_spec, + transformer_decoder_layer_spec=de_block_spec, + vocab_size=self.vocab_size, + max_sequence_length=self.encoder_sequence_length, + parallel_output=True, + pre_process=True, + post_process=True, + add_encoder=True, + add_decoder=True, + ).cuda() + + inference_wrapper_config = InferenceWrapperConfig( + hidden_size=hidden_size, + inference_batch_times_seqlen_threshold=20, + fp32_residual_connection=False, + params_dtype=torch.float, + padded_vocab_size=self.vocab_size, + ) + + inference_wrapped_model = T5InferenceWrapper(t5_model, inference_wrapper_config) + + self.mock_tokenizer = mock.Mock() + + self.text_generation_controller = EncoderDecoderTextGenerationController( + inference_wrapped_model=inference_wrapped_model, tokenizer=self.mock_tokenizer + ) + + def teardown_method(self, method): + Utils.destroy_model_parallel() + + def test_generate_all_output_tokens_static_batch(self): + self.mock_tokenizer.vocab_size = self.vocab_size + self.mock_tokenizer.eod = self.vocab_size - 1 + self.mock_tokenizer.pad = self.vocab_size - 2 + self.mock_tokenizer.additional_special_tokens_ids = list(range(100)) + self.mock_tokenizer.detokenize.return_value = ''.join( + random.choices(string.ascii_letters, k=random.randint(4, 10)) + ) + self.mock_tokenizer.tokenize.return_value = np.random.randint( + self.vocab_size, size=(self.encoder_sequence_length - 5) + ).tolist() + + active_requests: Dict[int, InferenceRequest] = OrderedDict() + for i in range(self.batch_size): + prompt = "decoder_sample" + prompt_tokens = np.random.randint( + self.vocab_size, size=self.decoder_sequence_length + ).tolist() + encoder_prompt = "encoder_sample" + inference_request = InferenceRequest( + request_id=i, + prompt=prompt, + encoder_prompt=encoder_prompt, + inference_parameters=CommonInferenceParams(num_tokens_to_generate=10), + arrival_time=time.time(), + prompt_tokens=prompt_tokens, + status=Status.ACTIVE_BUT_NOT_GENERATING_TOKENS, + ) + active_requests[i] = inference_request + + requests = self.text_generation_controller.generate_all_output_tokens_static_batch( + active_requests + ) + + for request_id, request in requests.items(): + assert ( + request.status == Status.COMPLETED + ), 
f"Status should be completed but its {request.status}" + assert request.generated_length > 0, f"Generated length should be greater than zero" + assert request.generated_text is not None, "Generated text should not be None" diff --git a/tests/unit_tests/inference/text_generation_controllers/test_simple_text_generation_controller.py b/tests/unit_tests/inference/text_generation_controllers/test_simple_text_generation_controller.py new file mode 100644 index 0000000000..df7109e021 --- /dev/null +++ b/tests/unit_tests/inference/text_generation_controllers/test_simple_text_generation_controller.py @@ -0,0 +1,172 @@ +import random +import string +import time +from collections import OrderedDict +from typing import Dict +from unittest import mock + +import pytest +import torch + +from megatron.core.inference.common_inference_params import CommonInferenceParams +from megatron.core.inference.inference_request import InferenceRequest, Status +from megatron.core.inference.model_inference_wrappers.gpt.gpt_inference_wrapper import ( + GPTInferenceWrapper, +) +from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import ( + InferenceWrapperConfig, +) +from megatron.core.inference.text_generation_controllers.simple_text_generation_controller import ( + SimpleTextGenerationController, +) +from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_local_spec +from megatron.core.models.gpt.gpt_model import GPTModel +from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed +from megatron.core.transformer.transformer_config import TransformerConfig +from tests.unit_tests.test_utilities import Utils + + +class TestSimpleTextGenerationController: + + def setup_method(self, method): + Utils.initialize_model_parallel( + tensor_model_parallel_size=2, pipeline_model_parallel_size=2 + ) + model_parallel_cuda_manual_seed(123) + self.batch_size = 4 + self.hidden_size = 12 + self.vocab_size = 100 + self.sequence_length = 64 + transformer_config = TransformerConfig( + num_layers=4, + hidden_size=self.hidden_size, + num_attention_heads=4, + use_cpu_initialization=True, + ) + + gpt_model = GPTModel( + config=transformer_config, + transformer_layer_spec=get_gpt_layer_local_spec(), + vocab_size=self.vocab_size, + max_sequence_length=self.sequence_length, + parallel_output=True, + ).cuda() + + inference_wrapper_config = InferenceWrapperConfig( + hidden_size=self.hidden_size, + inference_batch_times_seqlen_threshold=20, + fp32_residual_connection=False, + params_dtype=torch.float, + padded_vocab_size=self.vocab_size, + ) + + inference_wrapped_model = GPTInferenceWrapper(gpt_model, inference_wrapper_config) + + self.mock_tokenizer = mock.Mock() + + self.text_generation_controller = SimpleTextGenerationController( + inference_wrapped_model=inference_wrapped_model, tokenizer=self.mock_tokenizer + ) + + def teardown_method(self, method): + Utils.destroy_model_parallel() + + def test_sample_from_logits(self): + with pytest.raises(AssertionError) as aerror: + self.text_generation_controller.sample_from_logits( + last_token_logits=None, + common_inference_params=CommonInferenceParams(top_k=2, top_p=0.4), + vocab_size=self.vocab_size, + ) + assert str(aerror.value) == 'Cannot have top-p and top-k both greater than zero' + + with pytest.raises(AssertionError) as aerror: + self.text_generation_controller.sample_from_logits( + last_token_logits=None, + common_inference_params=CommonInferenceParams(top_p=1.4, top_k=0), + vocab_size=self.vocab_size, + ) + assert str(aerror.value) == 
'top-p should be in (0,1]' + + with pytest.raises(AssertionError) as aerror: + self.text_generation_controller.sample_from_logits( + last_token_logits=torch.randn(self.batch_size, 1), + common_inference_params=CommonInferenceParams(top_k=self.vocab_size + 10), + vocab_size=self.vocab_size, + ) + assert str(aerror.value) == 'top-k is larger than logit size.' + + last_token_logits = ( + torch.arange(0, self.vocab_size).repeat(self.batch_size, 1).float().cuda() + ) + sampled_logits = self.text_generation_controller.sample_from_logits( + last_token_logits, CommonInferenceParams(top_k=1), self.vocab_size + ) + assert torch.all( + sampled_logits.cpu() == torch.ones(self.batch_size) * self.vocab_size - 1 + ), f"The sampled logits should all be {self.vocab_size - 1} but they are {sampled_logits}" + + sampled_logits = self.text_generation_controller.sample_from_logits( + last_token_logits, CommonInferenceParams(top_k=2), self.vocab_size + ) + assert torch.all( + sampled_logits >= self.vocab_size - 2 + ), f"The sampled logits should all be at least {self.vocab_size - 2} but they are {sampled_logits}" + + l = last_token_logits[0] + top_p = 0.3 + expected_min_value = l[l.softmax(dim=-1).cumsum(dim=-1) > top_p][0].item() + sampled_logits = self.text_generation_controller.sample_from_logits( + last_token_logits, CommonInferenceParams(top_p=top_p, top_k=0), self.vocab_size + ) + assert torch.all( + sampled_logits >= expected_min_value + ), f"The sampled logits should all be at least {expected_min_value} but they are {sampled_logits}" + + top_p = 0.95 + temperature = 2 + expected_min_value = l[l.div_(temperature).softmax(dim=-1).cumsum(dim=-1) > top_p][0].item() + sampled_logits = self.text_generation_controller.sample_from_logits( + last_token_logits, + CommonInferenceParams(top_p=top_p, temperature=temperature, top_k=0), + self.vocab_size, + ) + assert torch.all( + sampled_logits >= expected_min_value + ), f"The sampled logits should all be at least {expected_min_value} but they are {sampled_logits}" + + def test_generate_all_output_tokens_static_batch(self): + self.mock_tokenizer.vocab_size = self.vocab_size + self.mock_tokenizer.eod = self.vocab_size - 1 + self.mock_tokenizer.detokenize.return_value = ''.join( + random.choices(string.ascii_letters, k=random.randint(4, 10)) + ) + + active_requests: Dict[int, InferenceRequest] = OrderedDict() + for i in range(self.batch_size): + prompt = "sample" * (i + 1) + self.mock_tokenizer.tokenize.return_value = torch.randn( + self.batch_size, self.vocab_size + ).cuda() + inference_request = InferenceRequest( + request_id=i, + prompt=prompt, + inference_parameters=CommonInferenceParams(num_tokens_to_generate=10), + arrival_time=time.time(), + prompt_tokens=torch.randint( + low=0, high=self.vocab_size - 1, size=(len(prompt),) + ).tolist(), + status=Status.ACTIVE_BUT_NOT_GENERATING_TOKENS, + ) + active_requests[i] = inference_request + + requests = self.text_generation_controller.generate_all_output_tokens_static_batch( + active_requests + ) + + for request_id, request in requests.items(): + assert ( + request.status == Status.COMPLETED + ), f"Status should be completed but it is {request.status}" + assert request.generated_length > 0, "Generated length should be greater than zero" + assert request.generated_text is not None, "Generated text should not be None" diff --git a/tests/unit_tests/models/__init__.py b/tests/unit_tests/models/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/unit_tests/models/test_base_embedding.py
b/tests/unit_tests/models/test_base_embedding.py new file mode 100644 index 0000000000..0ce18b3843 --- /dev/null +++ b/tests/unit_tests/models/test_base_embedding.py @@ -0,0 +1,57 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +import pytest +import torch + +from megatron.core.models.common.embeddings.language_model_embedding import LanguageModelEmbedding +from megatron.core.transformer.transformer_config import TransformerConfig +from tests.unit_tests.test_utilities import Utils + + +class TestBaseEmbedding: + + def setup_method(self, method): + Utils.initialize_model_parallel(1, 1) + transformer_config = TransformerConfig( + num_layers=2, hidden_size=12, num_attention_heads=4, use_cpu_initialization=True + ) + self.base_embedding = LanguageModelEmbedding( + config=transformer_config, + vocab_size=100, + max_sequence_length=4, + position_embedding_type='learned_absolute', + ) + + def teardown_method(self, method): + Utils.destroy_model_parallel() + + def test_constructor(self): + assert isinstance(self.base_embedding, LanguageModelEmbedding) + num_weights = sum([p.numel() for p in self.base_embedding.parameters()]) + assert num_weights == 1248 + + def test_zero_parameters(self): + sum_weights = sum([p.sum() for p in self.base_embedding.parameters()]) + assert sum_weights != 0 + self.base_embedding.zero_parameters() + sum_weights = sum([p.sum() for p in self.base_embedding.parameters()]) + assert sum_weights == 0 + + def test_cpu_forward(self): + input_ids = torch.tensor([0, 1, 2, 3], dtype=torch.int64).repeat((2, 1)) + position_ids = torch.tensor([0, 1, 2, 3], dtype=torch.int64).repeat((2, 1)) + embeddings = self.base_embedding(input_ids, position_ids) + assert embeddings.device.type == 'cpu' + assert embeddings.shape[0] == self.base_embedding.max_sequence_length + assert embeddings.shape[1] == input_ids.shape[0] + assert embeddings.shape[2] == self.base_embedding.config.hidden_size + + def test_gpu_forward(self): + self.base_embedding.cuda() + input_ids = torch.tensor([0, 1, 2, 3], dtype=torch.int64).repeat((2, 1)).cuda() + position_ids = torch.tensor([0, 1, 2, 3], dtype=torch.int64).repeat((2, 1)).cuda() + embeddings = self.base_embedding(input_ids, position_ids) + assert embeddings.device.type == 'cuda' + assert embeddings.shape[0] == self.base_embedding.max_sequence_length + assert embeddings.shape[1] == input_ids.shape[0] + assert embeddings.shape[2] == self.base_embedding.config.hidden_size diff --git a/tests/unit_tests/models/test_bert_model.py b/tests/unit_tests/models/test_bert_model.py new file mode 100644 index 0000000000..b03a3e5969 --- /dev/null +++ b/tests/unit_tests/models/test_bert_model.py @@ -0,0 +1,225 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
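+# Tests for BertModel: constructor, input tensor plumbing, the forward pass, and the attention mask +# type and dimensions selected for different Transformer Engine versions and attention backends.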
+ +import os +from importlib.metadata import version + +import pytest +import torch +from packaging.version import Version as PkgVersion +from pytest_mock import mocker + +from megatron.core.models.bert.bert_layer_specs import ( + bert_layer_local_spec, + bert_layer_with_transformer_engine_spec, +) +from megatron.core.models.bert.bert_model import BertModel +from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed +from megatron.core.transformer.enums import AttnMaskType +from megatron.core.transformer.transformer_config import TransformerConfig +from megatron.core.utils import is_te_min_version +from tests.unit_tests.test_utilities import Utils + + +class TestBertModel: + + def setup_method(self, method): + os.environ['NVTE_FUSED_ATTN'] = '0' + os.environ['NVTE_FLASH_ATTN'] = '0' + tp = 1 + pp = 1 + Utils.initialize_model_parallel(tp, pp) + model_parallel_cuda_manual_seed(123) + transformer_config = TransformerConfig( + num_layers=2, + hidden_size=12, + num_attention_heads=4, + use_cpu_initialization=True, + perform_initialization=True, + tensor_model_parallel_size=tp, + pipeline_model_parallel_size=pp, + pipeline_dtype=torch.bfloat16, + ) + self.bert_model = BertModel( + config=transformer_config, + num_tokentypes=0, + transformer_layer_spec=bert_layer_with_transformer_engine_spec, + vocab_size=100, + max_sequence_length=4, + ) + + def teardown_method(self, method): + Utils.destroy_model_parallel() + + @pytest.mark.internal + def test_constructor(self): + assert isinstance(self.bert_model, BertModel) + + assert self.bert_model.max_sequence_length == 4 + + num_weights = sum([p.numel() for p in self.bert_model.parameters()]) + assert num_weights == 6702 + + @pytest.mark.internal + def test_set_input_tensor(self): + config: TransformerConfig = self.bert_model.config + sequence_length = self.bert_model.max_sequence_length + micro_batch_size = 2 + + # [sequence length, batch size, hidden size] + input_tensor = torch.ones((sequence_length, micro_batch_size, config.hidden_size)) + + self.bert_model.set_input_tensor(input_tensor) + + assert self.bert_model.encoder.input_tensor.shape[0] == sequence_length + assert self.bert_model.encoder.input_tensor.shape[1] == micro_batch_size + assert self.bert_model.encoder.input_tensor.shape[2] == config.hidden_size + + @pytest.mark.internal + def test_post_process_forward(self): + config: TransformerConfig = self.bert_model.config + sequence_length = self.bert_model.max_sequence_length + micro_batch_size = 2 + + self.bert_model.cuda() + + data = list(range(sequence_length)) + input_ids = torch.tensor(data, dtype=torch.int64).repeat((micro_batch_size, 1)).cuda() + position_ids = torch.tensor(data, dtype=torch.int64).repeat((micro_batch_size, 1)).cuda() + attention_mask = torch.ones((micro_batch_size, sequence_length), dtype=bool).cuda() + + logits = self.bert_model.forward(input_ids=input_ids, attention_mask=attention_mask) + + assert logits[0].shape[0] == micro_batch_size + assert logits[0].shape[1] == sequence_length + assert logits[0].shape[2] == self.bert_model.vocab_size + + +class TestBertModelAttentionDimensions: + + def teardown_method(self, method): + Utils.destroy_model_parallel() + os.environ.pop('NVTE_FUSED_ATTN', None) + os.environ.pop('NVTE_FLASH_ATTN', None) + os.environ.pop('NVTE_UNFUSED_ATTN', None) + + def setup_method(self, method): + Utils.initialize_model_parallel(1, 1) + model_parallel_cuda_manual_seed(123) + self.transformer_config = TransformerConfig( + num_layers=2, + hidden_size=12, + num_attention_heads=4, + 
use_cpu_initialization=True, + pipeline_dtype=torch.bfloat16, + ) + # This should convert an arbitrary mask to a padding mask + self.bert_model = BertModel( + config=self.transformer_config, + num_tokentypes=0, + transformer_layer_spec=bert_layer_with_transformer_engine_spec, + vocab_size=100, + max_sequence_length=4, + ) + + @pytest.mark.internal + def test_local_spec(self, mocker): + self.bert_model.transformer_layer_spec = bert_layer_local_spec + attn_mask_dimensions = self.bert_model._sanity_check_attention_and_get_attn_mask_dimension() + assert ( + attn_mask_dimensions == "b1ss" + ), f"Expected b1ss for attn_mask_dimensions but got {attn_mask_dimensions}" + + @pytest.mark.internal + def test_transformer_engine_version_1_10(self, mocker): + bert_layer_with_transformer_engine_spec.submodules.self_attention.params[ + 'attn_mask_type' + ] == AttnMaskType.arbitrary + + mocker.patch("megatron.core.utils.get_te_version", return_value=PkgVersion("1.10")) + self.bert_model.transformer_layer_spec = bert_layer_with_transformer_engine_spec + attn_mask_dimensions = self.bert_model._sanity_check_attention_and_get_attn_mask_dimension() + attn_mask_type = self.bert_model.transformer_layer_spec.submodules.self_attention.params[ + 'attn_mask_type' + ] + assert ( + attn_mask_type == AttnMaskType.padding + ), f"Expected attn mask type to be padding, but got {attn_mask_type}" + assert ( + attn_mask_dimensions == "b11s" + ), f"Expected b11s for attn_mask_dimensions but got {attn_mask_dimensions}" + + @pytest.mark.internal + def test_transformer_engine_version_1_7_to_1_10_flash_attn(self, mocker): + os.environ['NVTE_FLASH_ATTN'] = '1' + + mocker.patch("megatron.core.utils.get_te_version", return_value=PkgVersion("1.8")) + self.bert_model.transformer_layer_spec = bert_layer_with_transformer_engine_spec + attn_mask_dimensions = self.bert_model._sanity_check_attention_and_get_attn_mask_dimension() + assert ( + attn_mask_dimensions == "b11s" + ), f"Expected b11s for attn_mask_dimensions but got {attn_mask_dimensions}" + + @pytest.mark.internal + @pytest.mark.flaky_in_dev + def test_transformer_engine_version_1_7_to_1_10_rng_error(self, mocker): + os.environ['NVTE_FLASH_ATTN'] = '0' + os.environ['NVTE_FUSED_ATTN'] = '0' + + bert_layer_with_transformer_engine_spec.submodules.self_attention.params[ + 'attn_mask_type' + ] == AttnMaskType.padding + mocker.patch("megatron.core.utils.get_te_version", return_value=PkgVersion("1.8")) + with pytest.raises(Exception) as exc_info: + self.bert_model = BertModel( + config=self.transformer_config, + num_tokentypes=0, + transformer_layer_spec=bert_layer_with_transformer_engine_spec, + vocab_size=100, + max_sequence_length=4, + ) + assert str(exc_info.value) == ( + "Linear.__init__() got an unexpected keyword argument 'rng_tracker_name' when " + "instantiating TERowParallelLinear when instantiating SelfAttention when " + "instantiating TransformerLayer" + ) + + @pytest.mark.internal + def test_transformer_engine_version_1_7_to_1_10_unfused_attention(self, mocker): + os.environ['NVTE_FLASH_ATTN'] = '0' + os.environ['NVTE_FUSED_ATTN'] = '0' + bert_layer_with_transformer_engine_spec.submodules.self_attention.params[ + 'attn_mask_type' + ] == AttnMaskType.padding + mocker.patch("megatron.core.utils.get_te_version", return_value=PkgVersion("1.8")) + self.bert_model.transformer_layer_spec = bert_layer_with_transformer_engine_spec + attn_mask_dimensions = self.bert_model._sanity_check_attention_and_get_attn_mask_dimension() + attn_mask_type =
self.bert_model.transformer_layer_spec.submodules.self_attention.params[ + 'attn_mask_type' + ] + assert ( + attn_mask_type == AttnMaskType.arbitrary + ), f"Expected attn mask type to be arbitrary, but got {attn_mask_type}" + assert ( + attn_mask_dimensions == "b1ss" + ), f"Expected b1ss for attn_mask_dimensions but got {attn_mask_dimensions}" + + Utils.destroy_model_parallel() + + @pytest.mark.internal + def test_transformer_engine_version_less_than_1_7(self, mocker): + os.environ['NVTE_FLASH_ATTN'] = '1' + with pytest.raises(Exception) as exc_info: + mocker.patch("megatron.core.utils.get_te_version", return_value=PkgVersion("1.5")) + self.bert_model = BertModel( + config=self.transformer_config, + num_tokentypes=0, + transformer_layer_spec=bert_layer_with_transformer_engine_spec, + vocab_size=100, + max_sequence_length=4, + ) + + assert str(exc_info.value) == ( + "Flash and fused attention is not supported with transformer engine version " + "< 1.7. Set NVTE_FLASH_ATTN=0 and NVTE_FUSED_ATTN=0 or upgrade transformer " + "engine >= 1.7" + ) diff --git a/tests/unit_tests/models/test_clip_vit_model.py b/tests/unit_tests/models/test_clip_vit_model.py new file mode 100644 index 0000000000..fcbf2ad440 --- /dev/null +++ b/tests/unit_tests/models/test_clip_vit_model.py @@ -0,0 +1,56 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +import pytest +import torch + +from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_with_transformer_engine_spec +from megatron.core.models.vision.clip_vit_model import CLIPViTModel +from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed +from megatron.core.transformer.transformer_config import TransformerConfig +from tests.unit_tests.test_utilities import Utils + + +class TestCLIPViTModel: + """Test CLIP ViT model.""" + + def setup_method(self, method): + Utils.initialize_model_parallel(1, 1) + model_parallel_cuda_manual_seed(123) + transformer_config = TransformerConfig( + num_layers=2, hidden_size=64, num_attention_heads=4, use_cpu_initialization=True + ) + transformer_layer_spec = get_gpt_layer_with_transformer_engine_spec() + self.model = CLIPViTModel( + transformer_config, transformer_layer_spec, img_h=336, img_w=336, patch_dim=14 + ) + + def teardown_method(self, method): + Utils.destroy_model_parallel() + + def test_constructor(self): + assert isinstance(self.model, CLIPViTModel) + + num_weights = sum([p.numel() for p in self.model.parameters()]) + assert num_weights == 174720 + + def test_set_input_tensor(self): + # [s, b, h] expected by the transformer. + expected_shape = (577, 2, 64) + input_tensor = torch.zeros(expected_shape) + + self.model.set_input_tensor(input_tensor) + + assert self.model.decoder.input_tensor.shape == torch.Size(expected_shape) + + def test_forward(self): + self.model.cuda() + + img = torch.zeros((2, 3, 336, 336)).cuda() + + out = self.model.forward(img) + assert out.shape == torch.Size([2, 577, 64]) + + def test_save_load(self, tmp_path): + path = tmp_path / "model.pt" + torch.save(self.model.state_dict(), path) + + self.model.load_state_dict(torch.load(path)) diff --git a/tests/unit_tests/models/test_gpt_model.py b/tests/unit_tests/models/test_gpt_model.py new file mode 100644 index 0000000000..ce298c3b29 --- /dev/null +++ b/tests/unit_tests/models/test_gpt_model.py @@ -0,0 +1,85 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
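+# Basic GPTModel tests: constructor weight count, set_input_tensor, and the post-process forward pass.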
+ +import pytest +import torch + +from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_with_transformer_engine_spec +from megatron.core.models.gpt.gpt_model import GPTModel +from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed +from megatron.core.transformer.transformer_config import TransformerConfig +from tests.unit_tests.test_utilities import Utils + + +class TestGPTModel: + + def setup_method(self, method): + Utils.initialize_model_parallel(1, 1) + model_parallel_cuda_manual_seed(123) + transformer_config = TransformerConfig( + num_layers=2, hidden_size=12, num_attention_heads=4, use_cpu_initialization=True + ) + self.gpt_model = GPTModel( + config=transformer_config, + transformer_layer_spec=get_gpt_layer_with_transformer_engine_spec(), + vocab_size=100, + max_sequence_length=4, + ) + + def teardown_method(self, method): + Utils.destroy_model_parallel() + + def test_constructor(self): + assert isinstance(self.gpt_model, GPTModel) + + assert self.gpt_model.max_sequence_length == 4 + + num_weights = sum([p.numel() for p in self.gpt_model.parameters()]) + assert num_weights == 6240 + + def test_set_input_tensor(self): + config: TransformerConfig = self.gpt_model.config + sequence_length = self.gpt_model.max_sequence_length + micro_batch_size = 2 + + # [sequence length, batch size, hidden size] + input_tensor = torch.ones((sequence_length, micro_batch_size, config.hidden_size)) + + self.gpt_model.set_input_tensor(input_tensor) + + assert self.gpt_model.decoder.input_tensor.shape[0] == sequence_length + assert self.gpt_model.decoder.input_tensor.shape[1] == micro_batch_size + assert self.gpt_model.decoder.input_tensor.shape[2] == config.hidden_size + + def test_post_process_forward(self): + config: TransformerConfig = self.gpt_model.config + sequence_length = self.gpt_model.max_sequence_length + micro_batch_size = 2 + + self.gpt_model.cuda() + + data = list(range(sequence_length)) + input_ids = torch.tensor(data, dtype=torch.int64).repeat((micro_batch_size, 1)).cuda() + position_ids = torch.tensor(data, dtype=torch.int64).repeat((micro_batch_size, 1)).cuda() + attention_mask = torch.ones( + (micro_batch_size, 1, sequence_length, sequence_length), dtype=bool + ).cuda() + + logits = self.gpt_model.forward( + input_ids=input_ids, position_ids=position_ids, attention_mask=attention_mask + ) + + assert logits.shape[0] == micro_batch_size + assert logits.shape[1] == sequence_length + assert logits.shape[2] == self.gpt_model.vocab_size + + def test_no_post_process_forward(self): + pass + + def test_no_preprocess_forward(self): + pass + + def test_state_dict_for_save_checkpoint(self): + pass + + def test_load_state_dict(self): + pass diff --git a/tests/unit_tests/models/test_llava_model.py b/tests/unit_tests/models/test_llava_model.py new file mode 100644 index 0000000000..22167f82b5 --- /dev/null +++ b/tests/unit_tests/models/test_llava_model.py @@ -0,0 +1,441 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
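+# Tests for the LLaVA multimodal model: constructor, multimodal data preprocessing (interleaving image +# tile embeddings with text embeddings), forward passes with and without labels, KV cache population, +# checkpoint save/load, and module freezing.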
+from copy import deepcopy + +import pytest +import torch + +from megatron.core import InferenceParams +from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_with_transformer_engine_spec +from megatron.core.models.multimodal.llava_model import LLaVAModel +from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed +from megatron.core.transformer.transformer_config import TransformerConfig +from tests.unit_tests.test_utilities import Utils + + +class TestLLaVAModel: + @pytest.mark.internal # The model is under active development and its methods may change. + def setup_method(self, method): + Utils.initialize_model_parallel(1, 1) + model_parallel_cuda_manual_seed(123) + + self.language_hidden_size = 64 + self.language_num_attention_heads = 4 + + language_config = TransformerConfig( + num_layers=3, + hidden_size=self.language_hidden_size, + num_attention_heads=self.language_num_attention_heads, + use_cpu_initialization=False, + ) + vision_config = TransformerConfig( + num_layers=2, hidden_size=16, num_attention_heads=2, use_cpu_initialization=False + ) + vision_projection_config = TransformerConfig( + num_layers=2, + hidden_size=self.language_hidden_size, + ffn_hidden_size=32, + num_attention_heads=1, + use_cpu_initialization=False, + ) + + language_layer_spec = get_gpt_layer_with_transformer_engine_spec() + vision_layer_spec = deepcopy(language_layer_spec) + vision_projection_spec = deepcopy(language_layer_spec.submodules.mlp.submodules) + + vision_config.vision_model_type = "clip" + self.model = LLaVAModel( + language_transformer_config=language_config, + language_transformer_layer_spec=language_layer_spec, + language_vocab_size=8192, + language_max_sequence_length=4096, + vision_transformer_config=vision_config, + vision_transformer_layer_spec=vision_layer_spec, + drop_vision_class_token=False, + vision_projection_config=vision_projection_config, + vision_projection_layer_spec=vision_projection_spec, + img_h=336, + img_w=336, + patch_dim=14, + ) + + @pytest.mark.internal + def teardown_method(self, method): + Utils.destroy_model_parallel() + + @pytest.mark.internal + def test_constructor(self): + assert isinstance(self.model, LLaVAModel) + + num_weights = sum([p.numel() for p in self.model.parameters()]) + assert num_weights == 1488736 + + @pytest.mark.internal + def test_set_input_tensor(self): + expected_shape = (1, 2, 3, 4) + input_tensor = torch.zeros(expected_shape) + self.model.set_input_tensor(input_tensor) + assert self.model.vision_model.decoder.input_tensor.shape == expected_shape + + @pytest.mark.internal + def test_preprocess_data(self): + self.model.cuda() + + hidden_size = 72 + + # 3 images with 1 tile and 2 image with 2 tiles = 7 tiles. + image_embeddings = ( + 1e-5 + * torch.arange(577 * 7 * hidden_size, dtype=torch.float) + .reshape(577, 7, hidden_size) + .cuda() + ) + + image_token_index = -200 + input_ids = torch.arange(1024).expand(5, 1024).cuda() + input_ids[0, 0] = image_token_index # image before text + input_ids[1, 100] = image_token_index # image in between + input_ids[2, -1] = image_token_index # image at the end + # input_ids[3] - no image + input_ids[4, 50] = image_token_index # two images in between + input_ids[4, 150] = image_token_index + + # Offset by 1000 to distinguish from image embeddings. + language_embeddings = ( + 1000.0 + + 1e-5 + * torch.arange(5 * 1024 * hidden_size, dtype=torch.float) + .reshape(5, 1024, hidden_size) + .cuda() + ) + + # Labels are input_ids shifted to left by one. 
+ labels = torch.arange(1, 1025, dtype=torch.int).expand(5, 1024).cuda() + labels[1, 99] = image_token_index + labels[2, -2] = image_token_index + labels[4, 49] = image_token_index + labels[4, 149] = image_token_index + + loss_mask = torch.ones((5, 1024), dtype=torch.float).cuda() + # Mask some text inputs (the text mask should carry over) + loss_mask[:2, :10] = 0.0 + loss_mask[:2, 110:120] = 0.0 + + # Number of tiles for each image in the batch. + num_image_tiles = torch.tensor([1, 2, 1, 2, 1], dtype=torch.int).cuda() + + use_inference_kv_cache = False + attention_mask = None + + embeddings, labels, loss_mask, attention_mask = self.model._preprocess_data( + image_embeddings, + language_embeddings, + input_ids, + loss_mask, + labels, + use_inference_kv_cache, + image_token_index, + num_image_tiles, + attention_mask, + ) + + img_seq_len = 577 + # The fifth sample has 2 images with 3 tiles and 1024 text tokens. + max_seq_len = 3 * img_seq_len - 2 + 1024 + + assert embeddings.shape == torch.Size((max_seq_len, 5, hidden_size)) + assert labels.shape == torch.Size((5, max_seq_len)) + assert loss_mask.shape == labels.shape + + # First sample where image is before text (index 0). + expected_embeddings = torch.empty(max_seq_len, hidden_size).cuda() + expected_embeddings[:577] = image_embeddings[:, 0] + expected_embeddings[577:1600] = language_embeddings[0, 1:] + expected_embeddings[1600:] = 0 # padding + + expected_labels = torch.empty(max_seq_len, dtype=torch.int).cuda() + expected_labels[:576] = -100 # image + expected_labels[576:1600] = torch.arange(1, 1025, dtype=torch.int) + expected_labels[1600:] = -100 # padding + + expected_loss_mask = torch.empty(max_seq_len, dtype=torch.float).cuda() + expected_loss_mask[:577] = 0 + expected_loss_mask[577:586] = 0 + expected_loss_mask[586:686] = 1 + expected_loss_mask[686:696] = 0 + expected_loss_mask[696:1600] = 1 + expected_loss_mask[1600:] = 0 + + assert torch.allclose(embeddings[:, 0], expected_embeddings) + assert torch.allclose(labels[0], expected_labels) + assert torch.allclose(loss_mask[0], expected_loss_mask) + + # Second sample where image is in between (index 100). The image has 2 tiles. + expected_embeddings = torch.empty(max_seq_len, hidden_size).cuda() + expected_embeddings[:100] = language_embeddings[1, :100] + expected_embeddings[100:677] = image_embeddings[:, 1] + expected_embeddings[677:1254] = image_embeddings[:, 2] + expected_embeddings[1254:2177] = language_embeddings[1, 101:] + expected_embeddings[2177:] = 0 # padding + + expected_labels = torch.empty(max_seq_len, dtype=torch.int).cuda() + expected_labels[:99] = torch.arange(1, 100) + expected_labels[99:1253] = -100 # image + expected_labels[1253:2177] = torch.arange(101, 1025) + expected_labels[2177:] = -100 # padding + + expected_loss_mask = torch.empty(max_seq_len, dtype=torch.float).cuda() + expected_loss_mask[:10] = 0 + expected_loss_mask[10:99] = 1 + # Last text position before the image is not required to predict the first image embedding. + expected_loss_mask[99] = 0 + expected_loss_mask[100:1254] = 0 + expected_loss_mask[1254:1263] = 1 + expected_loss_mask[1263:1273] = 0 + expected_loss_mask[1273:2177] = 1 + expected_loss_mask[2177:] = 0 # padding + + assert torch.allclose(embeddings[:, 1], expected_embeddings) + assert torch.allclose(labels[1], expected_labels) + assert torch.allclose(loss_mask[1], expected_loss_mask) + + # Third sample where image is at the end. 
+ expected_embeddings = torch.empty(max_seq_len, hidden_size).cuda() + expected_embeddings[:1023] = language_embeddings[2, :1023] + expected_embeddings[1023:1600] = image_embeddings[:, 3] + expected_embeddings[1600:] = 0 # padding + + expected_labels = torch.empty(max_seq_len, dtype=torch.int).cuda() + expected_labels[:1022] = torch.arange(1, 1023) + expected_labels[1022:1599] = -100 + expected_labels[1599] = 1024 + expected_labels[1600:] = -100 # padding + + expected_loss_mask = torch.empty(max_seq_len, dtype=torch.float).cuda() + expected_loss_mask[:1022] = 1 + # Last text position before the image is not required to predict the first image embedding. + expected_loss_mask[1022] = 0 + expected_loss_mask[1023:1600] = 0 + expected_loss_mask[1600:] = 0 # padding + + assert torch.allclose(embeddings[:, 2], expected_embeddings) + assert torch.allclose(labels[2], expected_labels) + assert torch.allclose(loss_mask[2], expected_loss_mask) + + # Fourth sample where there is no image. + expected_embeddings = torch.empty(max_seq_len, hidden_size).cuda() + expected_embeddings[:1024] = language_embeddings[3] + expected_embeddings[1024:] = 0 # padding + + expected_labels = torch.empty(max_seq_len, dtype=torch.int).cuda() + expected_labels[:1024] = torch.arange(1, 1025) + expected_labels[1024:] = -100 # padding + + expected_loss_mask = torch.empty(max_seq_len, dtype=torch.float).cuda() + expected_loss_mask[:1024] = 1 + expected_loss_mask[1024:] = 0 # padding + + assert torch.allclose(embeddings[:, 3], expected_embeddings) + assert torch.allclose(labels[3], expected_labels) + assert torch.allclose(loss_mask[3], expected_loss_mask) + + # Fifth sample has two images in between (indices 50 and 150). The first image has two tiles. + expected_embeddings = torch.empty(max_seq_len, hidden_size).cuda() + expected_embeddings[:50] = language_embeddings[4, :50] + expected_embeddings[50:627] = image_embeddings[:, 4] # two tiles + expected_embeddings[627:1204] = image_embeddings[:, 5] + expected_embeddings[1204:1303] = language_embeddings[4, 51:150] + expected_embeddings[1303:1880] = image_embeddings[:, 6] + expected_embeddings[1880:] = language_embeddings[4, 151:] + + expected_labels = torch.empty(max_seq_len, dtype=torch.int).cuda() + expected_labels[:49] = torch.arange(1, 50) + expected_labels[49:1203] = -100 # image + expected_labels[1203:1302] = torch.arange(51, 150) + expected_labels[1302:1879] = -100 # image + expected_labels[1879:] = torch.arange(151, 1025) + + expected_loss_mask = torch.empty(max_seq_len, dtype=torch.float).cuda() + expected_loss_mask[:49] = 1 + expected_loss_mask[49:1204] = 0 + expected_loss_mask[1204:1302] = 1 + expected_loss_mask[1302:1880] = 0 + expected_loss_mask[1880:] = 1 + + assert torch.allclose(embeddings[:, 4], expected_embeddings) + assert torch.allclose(labels[4], expected_labels) + assert torch.allclose(loss_mask[4], expected_loss_mask) + + @pytest.mark.internal + def test_forward(self): + self.model.cuda() + + # 3 images with 1 tile and 2 images with 2 tiles. 
+ img = torch.randn((7, 3, 336, 336)).cuda() + + image_token_index = -200 + input_ids = torch.randint(0, 2048, (5, 1024)).cuda() + input_ids[0, 0] = image_token_index # image before text + input_ids[1, 100] = image_token_index # image in between + input_ids[2, -1] = image_token_index # image at the end + # input_ids[3] - no image + input_ids[4, 50] = image_token_index + input_ids[4, 150] = image_token_index + + position_ids = torch.arange(0, 1024, dtype=torch.int).expand(5, 1024).cuda() + + loss_mask = torch.ones((5, 1024)).cuda() + + attention_mask = None # Causal. + + labels = torch.randint(0, 2048, (5, 1024)).cuda() + labels[1, 99] = image_token_index + labels[2, -2] = image_token_index + + num_image_tiles = torch.tensor([1, 2, 1, 2, 1], dtype=torch.int).cuda() + + # Try with labels. + loss, new_loss_mask = self.model.forward( + img, + input_ids, + position_ids, + attention_mask, + labels, + loss_mask, + num_image_tiles=num_image_tiles, + ) + + # The maximum sequence length is given by the sample with 2 images in 3 tiles, minus two image token indices, plus other text tokens. + img_seq_len = 577 + max_seq_len = img_seq_len * 3 - 2 + 1024 + assert loss.shape == new_loss_mask.shape == torch.Size((5, max_seq_len)) + + # Try text-only input. + loss, new_loss_mask = self.model.forward( + torch.tensor([], dtype=torch.float).cuda(), + torch.randint(0, 2048, (5, 1024)).cuda(), + position_ids, + attention_mask, + torch.randint(0, 2048, (5, 1024)).cuda(), + loss_mask, + num_image_tiles=torch.tensor([], dtype=torch.int).cuda(), + ) + + assert loss.shape == new_loss_mask.shape == torch.Size((5, 1024)) + + # Try without labels and without inference params. + logits = self.model.forward( + img, + input_ids, + position_ids, + attention_mask, + labels=None, + loss_mask=None, + num_image_tiles=num_image_tiles, + ) + assert logits.shape == torch.Size((5, max_seq_len, 8192)) + + # Try without labels and with inference params. + inference_params = InferenceParams(5, max_seq_len) + logits = self.model.forward( + img, + input_ids, + position_ids, + attention_mask, + labels=None, + loss_mask=None, + num_image_tiles=num_image_tiles, + inference_params=inference_params, + ) + assert logits.shape == torch.Size((5, max_seq_len, 8192)) + + # Check KV cache got populated correctly. + kv_dict = inference_params.key_value_memory_dict + + assert kv_dict["image_tokens_count"] == 577 * 7 + for layer_no in range(1, 4): # 3 layers in the model. + layer_kv = kv_dict[layer_no] + # Expected shape is [sequence_len, batch_size, num_heads, hidden_size_per_head] + assert ( + layer_kv[0].shape + == layer_kv[1].shape + == torch.Size((max_seq_len, 5, self.language_num_attention_heads, 16)) + ) + + @pytest.mark.internal + def test_save_load(self, tmp_path): + path = tmp_path / "model.pt" + torch.save(self.model.state_dict(), path) + + self.model.load_state_dict(torch.load(path)) + + @pytest.mark.internal + def test_freeze(self): + self.model.freeze( + freeze_language_model=True, freeze_vision_model=True, freeze_vision_projection=False + ) + + for module in [self.model.language_model, self.model.vision_model]: + for param in module.parameters(): + assert not param.requires_grad + + for param in self.model.vision_projection.parameters(): + assert param.requires_grad + + +class TestLLaVAModelSigLIP: + @pytest.mark.internal # The model is under active development and its methods may change. 
+ def setup_method(self, method): + Utils.initialize_model_parallel(1, 1) + model_parallel_cuda_manual_seed(123) + + language_config = TransformerConfig( + num_layers=3, hidden_size=128, num_attention_heads=8, use_cpu_initialization=False + ) + vision_config = TransformerConfig( + num_layers=2, hidden_size=64, num_attention_heads=4, use_cpu_initialization=False + ) + vision_projection_config = TransformerConfig( + num_layers=2, + hidden_size=128, + ffn_hidden_size=72, + num_attention_heads=1, + use_cpu_initialization=False, + ) + + language_layer_spec = get_gpt_layer_with_transformer_engine_spec() + vision_layer_spec = deepcopy(language_layer_spec) + vision_projection_spec = deepcopy(language_layer_spec.submodules.mlp.submodules) + + vision_config.vision_model_type = "siglip" + self.model = LLaVAModel( + language_transformer_config=language_config, + language_transformer_layer_spec=language_layer_spec, + language_vocab_size=2048, + language_max_sequence_length=4096, + vision_transformer_config=vision_config, + vision_transformer_layer_spec=vision_layer_spec, + drop_vision_class_token=False, + vision_projection_config=vision_projection_config, + vision_projection_layer_spec=vision_projection_spec, + img_h=336, + img_w=336, + patch_dim=14, + ) + + @pytest.mark.internal + def teardown_method(self, method): + Utils.destroy_model_parallel() + + @pytest.mark.internal + def test_constructor(self): + assert isinstance(self.model, LLaVAModel) + + num_weights = sum([p.numel() for p in self.model.parameters()]) + assert num_weights == 1832456 + + @pytest.mark.internal + def test_set_input_tensor(self): + expected_shape = (1, 2, 3, 4) + input_tensor = torch.zeros(expected_shape) + self.model.set_input_tensor(input_tensor) + assert self.model.vision_model.decoder.input_tensor.shape == expected_shape diff --git a/tests/unit_tests/models/test_mamba_model.py b/tests/unit_tests/models/test_mamba_model.py new file mode 100644 index 0000000000..913adb538c --- /dev/null +++ b/tests/unit_tests/models/test_mamba_model.py @@ -0,0 +1,123 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
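+# Tests for the hybrid MambaModel (Mamba, attention, and MLP layers): constructor, set_input_tensor, +# forward pass, incremental inference with InferenceParams, and checkpoint save/load.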
+ +import pytest +import torch + +from megatron.core import InferenceParams +from megatron.core.models.mamba.mamba_layer_specs import mamba_stack_spec +from megatron.core.models.mamba.mamba_model import MambaModel +from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed +from megatron.core.transformer.transformer_config import TransformerConfig +from tests.unit_tests.test_utilities import Utils + + +class TestMambaModel: + + def setup_method(self, method): + Utils.initialize_model_parallel(1, 1) + model_parallel_cuda_manual_seed(123) + transformer_config = TransformerConfig( + num_layers=3, # 1 Mamba layer, 1 attention layer, 1 MLP layer + hidden_size=256, # The Mamba layer places several constraints on this + num_attention_heads=4, + use_cpu_initialization=True, + ) + self.model = MambaModel( + config=transformer_config, + mamba_stack_spec=mamba_stack_spec, + vocab_size=100, + max_sequence_length=4, + hybrid_attention_ratio=0.3, + hybrid_mlp_ratio=0.3, + ) + + def teardown_method(self, method): + Utils.destroy_model_parallel() + + def test_constructor(self): + assert isinstance(self.model, MambaModel) + + assert self.model.max_sequence_length == 4 + + num_weights = sum([p.numel() for p in self.model.parameters()]) + assert num_weights == 1774872 + + def test_set_input_tensor(self): + config: TransformerConfig = self.model.config + sequence_length = self.model.max_sequence_length + micro_batch_size = 2 + + # [sequence length, batch size, hidden size] + input_tensor = torch.ones((sequence_length, micro_batch_size, config.hidden_size)) + + self.model.set_input_tensor(input_tensor) + + assert self.model.decoder.input_tensor.shape[0] == sequence_length + assert self.model.decoder.input_tensor.shape[1] == micro_batch_size + assert self.model.decoder.input_tensor.shape[2] == config.hidden_size + + def test_forward(self): + config: TransformerConfig = self.model.config + sequence_length = self.model.max_sequence_length + micro_batch_size = 2 + + self.model.cuda() + + data = list(range(sequence_length)) + input_ids = torch.tensor(data, dtype=torch.int64).repeat((micro_batch_size, 1)).cuda() + position_ids = torch.tensor(data, dtype=torch.int64).repeat((micro_batch_size, 1)).cuda() + attention_mask = torch.ones( + (micro_batch_size, 1, sequence_length, sequence_length), dtype=bool + ).cuda() + + logits = self.model.forward( + input_ids=input_ids, position_ids=position_ids, attention_mask=attention_mask + ) + + assert logits.shape[0] == micro_batch_size + assert logits.shape[1] == sequence_length + assert logits.shape[2] == self.model.vocab_size + + def test_inference(self): + config: TransformerConfig = self.model.config + micro_batch_size = 2 + inference_params: InferenceParams = InferenceParams( + max_batch_size=micro_batch_size, max_sequence_length=self.model.max_sequence_length + ) + prompt_length = self.model.max_sequence_length - 1 + + self.model.cuda() + + # load-context/first-output-token, step/generate + for offset in (0, prompt_length): + if offset == 0: + sequence_length = prompt_length + else: + sequence_length = 1 + inference_params.sequence_len_offset = offset + + data = list(range(sequence_length)) + input_ids = torch.tensor(data, dtype=torch.int64).repeat((micro_batch_size, 1)).cuda() + position_ids = ( + torch.tensor(data, dtype=torch.int64).repeat((micro_batch_size, 1)).cuda() + ) + attention_mask = torch.ones( + (micro_batch_size, 1, sequence_length, sequence_length), dtype=bool + ).cuda() + + logits = self.model.forward( + input_ids=input_ids, + 
position_ids=position_ids, + attention_mask=attention_mask, + inference_params=inference_params, + ) + + assert logits.shape[0] == micro_batch_size + assert logits.shape[1] == sequence_length + assert logits.shape[2] == self.model.vocab_size + + def test_save_load(self, tmp_path): + path = tmp_path / "model.pt" + torch.save(self.model.state_dict(), path) + + self.model.load_state_dict(torch.load(path)) diff --git a/tests/unit_tests/models/test_multimodal_projector.py b/tests/unit_tests/models/test_multimodal_projector.py new file mode 100644 index 0000000000..976dc489da --- /dev/null +++ b/tests/unit_tests/models/test_multimodal_projector.py @@ -0,0 +1,75 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +import pytest +import torch + +from megatron.core.models.gpt.gpt_layer_specs import _get_mlp_module_spec +from megatron.core.models.vision.multimodal_projector import MultimodalProjector +from megatron.core.tensor_parallel.layers import ColumnParallelLinear +from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed +from megatron.core.transformer.mlp import MLPSubmodules +from megatron.core.transformer.transformer_config import TransformerConfig +from tests.unit_tests.test_utilities import Utils + + +class TestMultimodalProjector: + + def setup_method(self, method): + Utils.initialize_model_parallel(1, 1) + model_parallel_cuda_manual_seed(123) + transformer_config = TransformerConfig( + num_layers=1, hidden_size=64, num_attention_heads=4, use_cpu_initialization=True + ) + mlp_layer_spec = _get_mlp_module_spec().submodules + + affine_layer_spec = MLPSubmodules(linear_fc1=ColumnParallelLinear, linear_fc2=None) + self.mlp = MultimodalProjector( + config=transformer_config, + submodules=mlp_layer_spec, + projector_type="mlp", + input_size=1024, + ) + self.affine = MultimodalProjector( + config=transformer_config, + submodules=affine_layer_spec, + projector_type="affine", + input_size=1024, + ) + + def teardown_method(self, method): + Utils.destroy_model_parallel() + + def test_constructor(self): + assert isinstance(self.mlp, MultimodalProjector) + assert isinstance(self.affine, MultimodalProjector) + + num_weights = sum([p.numel() for p in self.mlp.parameters()]) + assert num_weights == 280896 + + num_weights = sum([p.numel() for p in self.affine.parameters()]) + assert num_weights == 65600 + + def test_forward(self): + self.mlp.cuda() + self.affine.cuda() + + image_projection = torch.zeros((2, 1024)).cuda() + + logits = self.mlp.forward(image_projection) + assert len(logits) == 2 + assert logits.shape == torch.Size([2, 64]) + + logits = self.affine.forward(image_projection) + assert len(logits) == 2 + assert logits.shape == torch.Size([2, 64]) + + def test_save_load(self, tmp_path): + path = tmp_path / "mlp.pt" + torch.save(self.mlp.state_dict(), path) + + self.mlp.load_state_dict(torch.load(path)) + + path = tmp_path / "affine.pt" + torch.save(self.affine.state_dict(), path) + + self.affine.load_state_dict(torch.load(path)) diff --git a/tests/unit_tests/models/test_t5_model.py b/tests/unit_tests/models/test_t5_model.py new file mode 100644 index 0000000000..efe12b78f4 --- /dev/null +++ b/tests/unit_tests/models/test_t5_model.py @@ -0,0 +1,245 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
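+# Unit tests for the T5 encoder-decoder model with tensor parallelism (TP=4) and the
+# encoder and decoder placed on separate pipeline ranks; covers construction,
+# set_input_tensor, and forward passes with and without precomputed encoder hidden states.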
+ +from copy import deepcopy + +import pytest +import torch + +import megatron.core.parallel_state as ps +from megatron.core.models.T5.t5_model import T5Model +from megatron.core.models.T5.t5_spec import ( + get_t5_decoder_with_local_block_spec, + get_t5_decoder_with_transformer_engine_block_spec, + get_t5_encoder_with_local_block_spec, + get_t5_encoder_with_transformer_engine_block_spec, +) +from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed +from megatron.core.transformer.transformer_config import TransformerConfig +from tests.unit_tests.test_utilities import Utils + + +class TestT5Model: + + def setup_method(self, method): + tp = 4 + pp = 1 + Utils.initialize_model_parallel( + tensor_model_parallel_size=tp, + pipeline_model_parallel_size=pp, + encoder_pipeline_model_parallel_size=pp, + ) + model_parallel_cuda_manual_seed(123) + transformer_config = TransformerConfig( + num_layers=12, + hidden_size=768, + num_attention_heads=12, + kv_channels=64, + ffn_hidden_size=3072, + use_cpu_initialization=True, + pipeline_dtype=torch.bfloat16, + tensor_model_parallel_size=tp, + pipeline_model_parallel_size=pp, + ) + rank = ps.get_pipeline_model_parallel_rank() + world_size = ps.get_pipeline_model_parallel_world_size() + en_block_spec = get_t5_encoder_with_transformer_engine_block_spec(12) + de_block_spec = get_t5_decoder_with_transformer_engine_block_spec(12) + + first_decoder_rank = pp + pre_process = rank == 0 or rank == first_decoder_rank + post_process = (rank == (first_decoder_rank - 1)) or (rank == (world_size - 1)) + add_encoder = ps.is_inside_encoder(rank) + add_decoder = ps.is_inside_decoder(rank) + + self.t5_model = T5Model( + encoder_config=transformer_config, + config=transformer_config, + transformer_encoder_layer_spec=en_block_spec, + transformer_decoder_layer_spec=de_block_spec, + vocab_size=29184, + max_sequence_length=4, + pre_process=pre_process, + post_process=post_process, + add_encoder=add_encoder, + add_decoder=add_decoder, + ) + + def teardown_method(self, method): + Utils.destroy_model_parallel() + + def test_constructor(self): + assert isinstance(self.t5_model, T5Model) + assert Utils.world_size == 8 + + assert self.t5_model.max_sequence_length == 4 + if self.t5_model.add_encoder: + assert not self.t5_model.add_decoder + assert self.t5_model.encoder.num_layers_per_pipeline_rank == 12 + assert self.t5_model.pre_process + assert self.t5_model.post_process + else: + assert self.t5_model.add_decoder + assert self.t5_model.decoder.num_layers_per_pipeline_rank == 12 + assert self.t5_model.pre_process + assert self.t5_model.post_process + + def test_set_input_tensor(self): + config: TransformerConfig = self.t5_model.config + sequence_length = self.t5_model.max_sequence_length + micro_batch_size = 2 + + # [sequence length, batch size, hidden size] + input_tensor = torch.ones((sequence_length, micro_batch_size, config.hidden_size)) + + self.t5_model.set_input_tensor(input_tensor) + + if self.t5_model.add_encoder: + assert self.t5_model.encoder.input_tensor.shape[0] == sequence_length + assert self.t5_model.encoder.input_tensor.shape[1] == micro_batch_size + assert self.t5_model.encoder.input_tensor.shape[2] == config.hidden_size + else: + assert self.t5_model.encoder is None + assert self.t5_model.encoder_hidden_state.shape[0] == sequence_length + assert self.t5_model.encoder_hidden_state.shape[1] == micro_batch_size + assert self.t5_model.encoder_hidden_state.shape[2] == config.hidden_size + + def test_post_process_forward(self): + config: 
TransformerConfig = self.t5_model.config + sequence_length = self.t5_model.max_sequence_length + micro_batch_size = 2 + + self.t5_model.cuda() + + data = list(range(sequence_length)) + encoder_input_ids = ( + torch.tensor(data, dtype=torch.int64).repeat((micro_batch_size, 1)).cuda() + ) + decoder_input_ids = ( + torch.tensor(data, dtype=torch.int64).repeat((micro_batch_size, 1)).cuda() + ) + encoder_attn_mask = torch.ones((1, sequence_length, sequence_length), dtype=bool).cuda() + decoder_attn_mask = torch.ones((1, sequence_length, sequence_length), dtype=bool).cuda() + encoder_decoder_attn_mask = torch.ones( + (1, sequence_length, sequence_length), dtype=bool + ).cuda() + + if self.t5_model.add_decoder: + encoder_hidden_states = torch.zeros( + (sequence_length, micro_batch_size, config.hidden_size), dtype=torch.float32 + ).cuda() + else: + encoder_hidden_states = None + + output = self.t5_model.forward( + encoder_input_ids=encoder_input_ids, + decoder_input_ids=decoder_input_ids, + encoder_attn_mask=encoder_attn_mask, + decoder_attn_mask=decoder_attn_mask, + encoder_decoder_attn_mask=encoder_decoder_attn_mask, + encoder_hidden_states=encoder_hidden_states, + ) + if self.t5_model.add_decoder: + logits = output + assert logits.shape[0] == micro_batch_size + assert logits.shape[1] == sequence_length + assert ( + logits.shape[2] + == self.t5_model.vocab_size // ps.get_tensor_model_parallel_world_size() + ) + else: + encoder_hidden_states = output + assert encoder_hidden_states.shape[0] == sequence_length + assert encoder_hidden_states.shape[1] == micro_batch_size + assert encoder_hidden_states.shape[2] == config.hidden_size + + def test_forward_output_encoder_hidden_only(self): + config: TransformerConfig = self.t5_model.config + sequence_length = self.t5_model.max_sequence_length + micro_batch_size = 2 + + self.t5_model.cuda() + + data = list(range(sequence_length)) + encoder_input_ids = ( + torch.tensor(data, dtype=torch.int64).repeat((micro_batch_size, 1)).cuda() + ) + decoder_input_ids = ( + torch.tensor(data, dtype=torch.int64).repeat((micro_batch_size, 1)).cuda() + ) + encoder_attn_mask = torch.ones((1, sequence_length, sequence_length), dtype=bool).cuda() + decoder_attn_mask = torch.ones((1, sequence_length, sequence_length), dtype=bool).cuda() + encoder_decoder_attn_mask = torch.ones( + (1, sequence_length, sequence_length), dtype=bool + ).cuda() + + encoder_hidden_states = self.t5_model.forward( + encoder_input_ids=encoder_input_ids, + decoder_input_ids=decoder_input_ids, + encoder_attn_mask=encoder_attn_mask, + decoder_attn_mask=decoder_attn_mask, + encoder_decoder_attn_mask=encoder_decoder_attn_mask, + output_encoder_hidden_only=True, + ) + if self.t5_model.add_decoder: + assert encoder_hidden_states is None + else: + assert encoder_hidden_states.shape[0] == sequence_length + assert encoder_hidden_states.shape[1] == micro_batch_size + assert encoder_hidden_states.shape[2] == config.hidden_size + + def test_forward_with_encoder_hidden_states(self): + config: TransformerConfig = self.t5_model.config + sequence_length = self.t5_model.max_sequence_length + micro_batch_size = 2 + + self.t5_model.cuda() + + data = list(range(sequence_length)) + encoder_input_ids = ( + torch.tensor(data, dtype=torch.int64).repeat((micro_batch_size, 1)).cuda() + ) + decoder_input_ids = ( + torch.tensor(data, dtype=torch.int64).repeat((micro_batch_size, 1)).cuda() + ) + encoder_attn_mask = torch.ones((1, sequence_length, sequence_length), dtype=bool).cuda() + decoder_attn_mask = torch.ones((1, 
sequence_length, sequence_length), dtype=bool).cuda() + encoder_decoder_attn_mask = torch.ones( + (1, sequence_length, sequence_length), dtype=bool + ).cuda() + encoder_hidden_states = torch.zeros( + (sequence_length, micro_batch_size, config.hidden_size), dtype=torch.float32 + ).cuda() + + output = self.t5_model.forward( + encoder_input_ids=None, + decoder_input_ids=decoder_input_ids, + encoder_attn_mask=encoder_attn_mask, + decoder_attn_mask=decoder_attn_mask, + encoder_decoder_attn_mask=encoder_decoder_attn_mask, + encoder_hidden_states=encoder_hidden_states, + ) + if self.t5_model.add_decoder: + logits = output + assert logits.shape[0] == micro_batch_size + assert logits.shape[1] == sequence_length + assert ( + logits.shape[2] + == self.t5_model.vocab_size // ps.get_tensor_model_parallel_world_size() + ) + else: + encoder_hidden_states = output + assert encoder_hidden_states.shape[0] == sequence_length + assert encoder_hidden_states.shape[1] == micro_batch_size + assert encoder_hidden_states.shape[2] == config.hidden_size + + def test_no_post_process_forward(self): + pass + + def test_no_preprocess_forward(self): + pass + + def test_state_dict_for_save_checkpoint(self): + pass + + def test_load_state_dict(self): + pass diff --git a/tests/unit_tests/pipeline_parallel/__init__.py b/tests/unit_tests/pipeline_parallel/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/unit_tests/pipeline_parallel/test_schedules.py b/tests/unit_tests/pipeline_parallel/test_schedules.py new file mode 100644 index 0000000000..06994094fc --- /dev/null +++ b/tests/unit_tests/pipeline_parallel/test_schedules.py @@ -0,0 +1,271 @@ +import pytest +import torch +from pytest_mock import mocker + +import megatron.core.pipeline_parallel.schedules as schedule +from megatron.core import ModelParallelConfig +from tests.unit_tests.test_utilities import Utils + +rank = Utils.rank + + +def test_get_forward_backward_func(): + Utils.initialize_model_parallel(tensor_model_parallel_size=2, pipeline_model_parallel_size=1) + assert schedule.get_forward_backward_func() == schedule.forward_backward_no_pipelining + Utils.destroy_model_parallel() + Utils.initialize_model_parallel(tensor_model_parallel_size=2, pipeline_model_parallel_size=4) + assert ( + schedule.get_forward_backward_func() + == schedule.forward_backward_pipelining_without_interleaving + ) + Utils.destroy_model_parallel() + Utils.initialize_model_parallel( + tensor_model_parallel_size=2, + pipeline_model_parallel_size=4, + virtual_pipeline_model_parallel_size=2, + ) + assert ( + schedule.get_forward_backward_func() + == schedule.forward_backward_pipelining_with_interleaving + ) + Utils.destroy_model_parallel() + Utils.initialize_model_parallel( + tensor_model_parallel_size=2, + pipeline_model_parallel_size=2, + virtual_pipeline_model_parallel_size=4, + ) + assert ( + schedule.get_forward_backward_func() + == schedule.forward_backward_pipelining_with_interleaving + ) + Utils.destroy_model_parallel() + + +def test_deallocate_output_tensor(): + out = torch.tensor([[1, 2, 3], [4, 5, 6]]) + schedule.deallocate_output_tensor(out) + assert out.nelement() == 6 + + +def test_forward_backward_func_without_pipeline_parallel(mocker): + from megatron.core.pipeline_parallel import get_forward_backward_func + + Utils.initialize_model_parallel(tensor_model_parallel_size=2, pipeline_model_parallel_size=1) + + def forward_step_func(data_iterator, model): + import os + + rank = int(os.environ['LOCAL_RANK']) + dummy_data = torch.ones(1, 4) + + def 
loss_func(output_tensor): + return rank, {'loss_reduced': rank} + + return model(dummy_data), loss_func + + model = torch.nn.Linear(4, 1) + model.model_type = 'unit-test' + + def set_input_tensor(input_tensor): + return None + + model.set_input_tensor = set_input_tensor + + forward_backward_func = get_forward_backward_func() + assert schedule.get_forward_backward_func() == schedule.forward_backward_no_pipelining + + mocker.patch("megatron.core.pipeline_parallel.schedules.custom_backward", return_value=2) + config = ModelParallelConfig(pipeline_model_parallel_size=1) + model.config = config + + losses_reduced = forward_backward_func( + forward_step_func=forward_step_func, + data_iterator=range(0, 100), + model=[model], + num_microbatches=4, + seq_length=None, + micro_batch_size=None, + forward_only=True, + ) + + loss_reduced_expected = [ + {'loss_reduced': rank}, + {'loss_reduced': rank}, + {'loss_reduced': rank}, + {'loss_reduced': rank}, + ] + + for i, j in zip(losses_reduced, loss_reduced_expected): + print(losses_reduced) + assert i['loss_reduced'] == j['loss_reduced'] + Utils.destroy_model_parallel() + + +def test_forward_backward_func_with_pipeline_parallel(mocker): + from megatron.core.pipeline_parallel import get_forward_backward_func + + Utils.initialize_model_parallel(tensor_model_parallel_size=1, pipeline_model_parallel_size=4) + + def forward_step_func(data_iterator, model): + import os + + rank = int(os.environ['LOCAL_RANK']) + + def loss_func(output_tensor): + return rank, {'loss_reduced': rank} + + return torch.rand(512, 8, 256).cuda(), loss_func + + model = torch.nn.Linear(4, 1) + model.model_type = 'unit-test' + + def set_input_tensor(input_tensor): + return None + + model.set_input_tensor = set_input_tensor + + forward_backward_func = get_forward_backward_func() + assert ( + schedule.get_forward_backward_func() + == schedule.forward_backward_pipelining_without_interleaving + ) + + sequence_length = 512 + micro_batch_size = 8 + hidden_size = 256 + + config = ModelParallelConfig( + pipeline_model_parallel_size=4, sequence_parallel=False, pipeline_dtype=torch.float + ) + config.hidden_size = hidden_size + model.config = config + + losses_reduced = forward_backward_func( + forward_step_func=forward_step_func, + data_iterator=None, + model=[model], + num_microbatches=micro_batch_size, + seq_length=sequence_length, + micro_batch_size=micro_batch_size, + forward_only=True, + ) + + loss_reduced_expected = [ + {'loss_reduced': rank}, + {'loss_reduced': rank}, + {'loss_reduced': rank}, + {'loss_reduced': rank}, + ] + for i, j in zip(losses_reduced, loss_reduced_expected): + print(losses_reduced) + assert i['loss_reduced'] == j['loss_reduced'] + Utils.destroy_model_parallel() + + +def test_forward_backward_func_with_interleaving(mocker): + from megatron.core.enums import ModelType + from megatron.core.pipeline_parallel import get_forward_backward_func + + Utils.initialize_model_parallel( + tensor_model_parallel_size=1, + pipeline_model_parallel_size=4, + virtual_pipeline_model_parallel_size=2, + ) + + def forward_step_func(data_iterator, model): + import os + + rank = int(os.environ['LOCAL_RANK']) + + def loss_func(output_tensor): + return rank, {'loss_reduced': rank} + + return torch.rand(512, 8, 256).cuda(), loss_func + + model = torch.nn.Linear(4, 1) + + def set_input_tensor(input_tensor): + return None + + model.set_input_tensor = set_input_tensor + + forward_backward_func = get_forward_backward_func() + assert ( + schedule.get_forward_backward_func() + == 
schedule.forward_backward_pipelining_with_interleaving + ) + + sequence_length = 512 + micro_batch_size = 8 + hidden_size = 256 + + config = ModelParallelConfig( + pipeline_model_parallel_size=4, sequence_parallel=False, pipeline_dtype=torch.float + ) + config.hidden_size = hidden_size + model.config = config + + mocker.patch("megatron.core.pipeline_parallel.schedules.custom_backward", return_value=2) + + with pytest.raises(RuntimeError): + model.model_type = ModelType.encoder_and_decoder + forward_backward_func( + forward_step_func=forward_step_func, + data_iterator=[range(0, 100)], + model=[model, model], + num_microbatches=micro_batch_size, + seq_length=sequence_length, + micro_batch_size=micro_batch_size, + decoder_seq_length=sequence_length, + forward_only=True, + ) + + with pytest.raises(RuntimeError): + model.model_type = ModelType.encoder_or_decoder + forward_backward_func( + forward_step_func=forward_step_func, + data_iterator=[range(0, 100)], + model=[model, model], + num_microbatches=micro_batch_size, + seq_length=sequence_length, + micro_batch_size=micro_batch_size, + decoder_seq_length=256, + forward_only=True, + ) + + with pytest.raises(RuntimeError): + model.model_type = ModelType.encoder_or_decoder + forward_backward_func( + forward_step_func=forward_step_func, + data_iterator=[range(0, 100)], + model=[model, model], + num_microbatches=7, + seq_length=sequence_length, + micro_batch_size=micro_batch_size, + decoder_seq_length=512, + forward_only=True, + ) + + model.model_type = ModelType.encoder_or_decoder + losses_reduced = forward_backward_func( + forward_step_func=forward_step_func, + data_iterator=[range(0, 100), range(0, 100)], + model=[model, model], + num_microbatches=micro_batch_size, + seq_length=sequence_length, + micro_batch_size=micro_batch_size, + decoder_seq_length=sequence_length, + forward_only=True, + ) + + loss_reduced_expected = [ + {'loss_reduced': rank}, + {'loss_reduced': rank}, + {'loss_reduced': rank}, + {'loss_reduced': rank}, + ] + for i, j in zip(losses_reduced, loss_reduced_expected): + print(losses_reduced) + assert i['loss_reduced'] == j['loss_reduced'] + + Utils.destroy_model_parallel() diff --git a/tests/unit_tests/tensor_parallel/__init__.py b/tests/unit_tests/tensor_parallel/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/unit_tests/tensor_parallel/test_cross_entropy.py b/tests/unit_tests/tensor_parallel/test_cross_entropy.py index a29365ee43..66982fd234 100644 --- a/tests/unit_tests/tensor_parallel/test_cross_entropy.py +++ b/tests/unit_tests/tensor_parallel/test_cross_entropy.py @@ -1,14 +1,34 @@ -from megatron.core.tensor_parallel.cross_entropy import vocab_parallel_cross_entropy +import numpy as np import torch + +from megatron.core.tensor_parallel.cross_entropy import vocab_parallel_cross_entropy from tests.unit_tests.test_utilities import Utils -import numpy as np + def test_vocab_parallel_cross_entropy(): - Utils.initialize_model_parallel(4,2) - vocab_parallel_logits = torch.range(0,7).repeat(16,4).cuda() - target = torch.arange(0,32,2).cuda() + Utils.initialize_model_parallel(4, 2) + vocab_parallel_logits = torch.range(0, 7).repeat(16, 4).cuda() + target = torch.arange(0, 32, 2).cuda() output = vocab_parallel_cross_entropy(vocab_parallel_logits, target) - expected_output = torch.tensor([10.2309, 8.2309, 6.2309, 4.2309, 10.2309, 8.2309, 6.2309, 4.2309, - 10.2309, 8.2309, 6.2309, 4.2309, 10.2309, 8.2309, 6.2309, 4.2309]).cuda() - assert(torch.equal(torch.round(expected_output), torch.round(output))) - 
Utils.destroy_model_parallel() \ No newline at end of file + expected_output = torch.tensor( + [ + 10.2309, + 8.2309, + 6.2309, + 4.2309, + 10.2309, + 8.2309, + 6.2309, + 4.2309, + 10.2309, + 8.2309, + 6.2309, + 4.2309, + 10.2309, + 8.2309, + 6.2309, + 4.2309, + ] + ).cuda() + assert torch.equal(torch.round(expected_output), torch.round(output)) + Utils.destroy_model_parallel() diff --git a/tests/unit_tests/tensor_parallel/test_data.py b/tests/unit_tests/tensor_parallel/test_data.py index 38a39ce37f..211d48b4fd 100644 --- a/tests/unit_tests/tensor_parallel/test_data.py +++ b/tests/unit_tests/tensor_parallel/test_data.py @@ -1,21 +1,23 @@ -from megatron.core.tensor_parallel.data import broadcast_data import torch + +from megatron.core.tensor_parallel.data import broadcast_data from tests.unit_tests.test_utilities import Utils + def test_broadcast_data(): - Utils.initialize_model_parallel(2,4) + Utils.initialize_model_parallel(2, 4) input_data = { - 0 : torch.ones((8,8)).cuda() * 0.0, - 1 : torch.ones((8,8)).cuda() * 1.0, - 2 : torch.ones((8,8)).cuda() * 2.0, - 3 : torch.ones((8,8)).cuda() * 3.0, - 4 : torch.ones((8,8)).cuda() * 4.0, - 5 : torch.ones((8,8)).cuda() * 5.0, - 6 : torch.ones((8,8)).cuda() * 6.0, - 7 : torch.ones((8,8)).cuda() * 7.0 - } + 0: torch.ones((8, 8)).cuda() * 0.0, + 1: torch.ones((8, 8)).cuda() * 1.0, + 2: torch.ones((8, 8)).cuda() * 2.0, + 3: torch.ones((8, 8)).cuda() * 3.0, + 4: torch.ones((8, 8)).cuda() * 4.0, + 5: torch.ones((8, 8)).cuda() * 5.0, + 6: torch.ones((8, 8)).cuda() * 6.0, + 7: torch.ones((8, 8)).cuda() * 7.0, + } dtype = torch.float32 - actual_output = broadcast_data([0,1],input_data, dtype) - assert(torch.equal(actual_output[0], input_data[0])) - assert(torch.equal(actual_output[1], input_data[1])) - Utils.destroy_model_parallel() \ No newline at end of file + actual_output = broadcast_data([0, 1], input_data, dtype) + assert torch.equal(actual_output[0], input_data[0]) + assert torch.equal(actual_output[1], input_data[1]) + Utils.destroy_model_parallel() diff --git a/tests/unit_tests/tensor_parallel/test_initialization.py b/tests/unit_tests/tensor_parallel/test_initialization.py new file mode 100644 index 0000000000..039ad071a7 --- /dev/null +++ b/tests/unit_tests/tensor_parallel/test_initialization.py @@ -0,0 +1,201 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
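+# Unit tests checking that tensor-parallel weight initialization is reproducible across
+# TP sizes: weights built with TP=1 and TP=4 from the same torch seed should match
+# shard-for-shard for VocabParallelEmbedding, Row/ColumnParallelLinear, and the
+# Transformer Engine variants.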
+ +import pytest +import torch + +import megatron.core.parallel_state as ps +from megatron.core.tensor_parallel.layers import ( + ColumnParallelLinear, + RowParallelLinear, + VocabParallelEmbedding, +) +from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed +from megatron.core.transformer.custom_layers.transformer_engine import ( + TEColumnParallelLinear, + TERowParallelLinear, +) +from megatron.core.transformer.transformer_config import TransformerConfig +from tests.unit_tests.test_utilities import Utils + + +class Test: + + transformer_config = TransformerConfig( + num_layers=1, hidden_size=12, num_attention_heads=4, use_cpu_initialization=True + ) + + def teardown_method(self, method): + Utils.destroy_model_parallel() + + @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") + def test_embedding_init(self): + + Utils.initialize_model_parallel(1, 1) + torch.manual_seed(42) + model_parallel_cuda_manual_seed(42) + + tp1 = VocabParallelEmbedding( + num_embeddings=16, + embedding_dim=4, + init_method=self.transformer_config.init_method, + config=self.transformer_config, + ).weight + Utils.destroy_model_parallel() + + Utils.initialize_model_parallel(4, 1) + torch.manual_seed(42) + model_parallel_cuda_manual_seed(41) # intentionally different. + tp4 = VocabParallelEmbedding( + num_embeddings=16, + embedding_dim=4, + init_method=self.transformer_config.init_method, + config=self.transformer_config, + ).weight + + rank = ps.get_tensor_model_parallel_rank() + assert tp4.shape[0] * 4 == tp1.shape[0] + assert torch.equal(tp1[rank * 4 : (rank + 1) * 4], tp4) + + @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") + def test_row_init(self): + + Utils.initialize_model_parallel(1, 1) + torch.manual_seed(42) + model_parallel_cuda_manual_seed(42) + + tp1 = RowParallelLinear( + input_size=16, + output_size=16, + init_method=self.transformer_config.init_method, + bias=True, + input_is_parallel=False, + config=self.transformer_config, + skip_bias_add=False, + ).weight + Utils.destroy_model_parallel() + + Utils.initialize_model_parallel(4, 1) + torch.manual_seed(42) + model_parallel_cuda_manual_seed(41) # intentionally different. + tp4 = RowParallelLinear( + input_size=16, + output_size=16, + init_method=self.transformer_config.init_method, + bias=True, + input_is_parallel=False, + config=self.transformer_config, + skip_bias_add=False, + ).weight + + rank = ps.get_tensor_model_parallel_rank() + assert tp4.shape[1] * 4 == tp1.shape[1] + assert torch.equal(tp1[:, rank * 4 : (rank + 1) * 4], tp4) + + @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") + def test_col_init(self): + + Utils.initialize_model_parallel(1, 1) + torch.manual_seed(42) + model_parallel_cuda_manual_seed(42) + + tp1 = ColumnParallelLinear( + input_size=16, + output_size=16, + init_method=self.transformer_config.init_method, + bias=True, + config=self.transformer_config, + skip_bias_add=False, + ).weight + Utils.destroy_model_parallel() + + Utils.initialize_model_parallel(4, 1) + torch.manual_seed(42) + model_parallel_cuda_manual_seed(41) # intentionally different. 
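+        # The model-parallel CUDA RNG seed differs from the TP=1 run on purpose: with
+        # use_cpu_initialization=True the shards are drawn from the global torch seed,
+        # so the TP=4 weight should still match the corresponding slice of the TP=1 weight.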
+ tp4 = ColumnParallelLinear( + input_size=16, + output_size=16, + init_method=self.transformer_config.init_method, + bias=True, + config=self.transformer_config, + skip_bias_add=False, + ).weight + + rank = ps.get_tensor_model_parallel_rank() + assert tp4.shape[0] * 4 == tp1.shape[0] + assert torch.equal(tp1[rank * 4 : (rank + 1) * 4], tp4) + + @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") + @pytest.mark.timeout(100) + def test_te_col_init(self): + + Utils.initialize_model_parallel(1, 1) + torch.manual_seed(42) + model_parallel_cuda_manual_seed(42) + + tp1 = TEColumnParallelLinear( + input_size=16, + output_size=16, + init_method=self.transformer_config.init_method, + bias=True, + config=self.transformer_config, + skip_bias_add=False, + gather_output=False, + is_expert=False, + ).weight + Utils.destroy_model_parallel() + + Utils.initialize_model_parallel(4, 1) + torch.manual_seed(42) + model_parallel_cuda_manual_seed(41) # intentionally different. + tp4 = TEColumnParallelLinear( + input_size=16, + output_size=16, + init_method=self.transformer_config.init_method, + bias=True, + config=self.transformer_config, + skip_bias_add=False, + gather_output=False, + is_expert=False, + ).weight + + if torch.distributed.get_rank() == 0: + assert tp4.shape[0] * 4 == tp1.shape[0] + assert torch.allclose(tp1[:4], tp4) + + @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") + @pytest.mark.timeout(100) + def test_te_row_init(self): + + Utils.initialize_model_parallel(1, 1) + torch.manual_seed(42) + model_parallel_cuda_manual_seed(42) + + tp1 = TERowParallelLinear( + input_size=16, + output_size=16, + init_method=self.transformer_config.init_method, + bias=True, + input_is_parallel=True, + config=self.transformer_config, + skip_bias_add=False, + is_expert=False, + ).weight + Utils.destroy_model_parallel() + + Utils.initialize_model_parallel(4, 1) + torch.manual_seed(42) + model_parallel_cuda_manual_seed(41) # intentionally different. + tp4 = TERowParallelLinear( + input_size=16, + output_size=16, + init_method=self.transformer_config.init_method, + bias=True, + input_is_parallel=True, + config=self.transformer_config, + skip_bias_add=False, + is_expert=False, + ).weight + + if torch.distributed.get_rank() == 0: + assert tp4.shape[1] * 4 == tp1.shape[1] + assert torch.allclose(tp1[:, :4], tp4) diff --git a/tests/unit_tests/tensor_parallel/test_layers.py b/tests/unit_tests/tensor_parallel/test_layers.py new file mode 100644 index 0000000000..709fc598ff --- /dev/null +++ b/tests/unit_tests/tensor_parallel/test_layers.py @@ -0,0 +1,54 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +import pytest +import torch + +from megatron.core.tensor_parallel.layers import linear_with_frozen_weight +from megatron.core.tensor_parallel.mappings import gather_from_tensor_model_parallel_region +from tests.unit_tests.test_utilities import Utils + + +@pytest.mark.parametrize("tensor_parallel,allreduce_dgrad", [(1, False), (8, True)]) +def test_LinearWithFrozenWeight(tensor_parallel, allreduce_dgrad): + Utils.initialize_model_parallel(tensor_parallel, 1) + + size_per_partition = int(8 / tensor_parallel) + + # Input is an 8x8 identity matrix. + input_data = torch.eye(8).cuda() + input_data.requires_grad = True + + # Weight is an 8x8 matrix of all ones. If tensor parallelism > 1, the weight is partitioned evenly across GPUs. + weight = torch.ones((size_per_partition, 8)).cuda() + + # Bias is a vector of length 8 of all zeros. 
If tensor parallelism > 1, the bias is partitioned evenly across GPUs + bias = torch.zeros((size_per_partition)).cuda() + + gradient_accumulation_fusion = False + async_grad_allreduce = allreduce_dgrad + sequence_parallel = False + grad_output_buffer = None + wgrad_deferral_limit = None + + output_parallel = linear_with_frozen_weight( + input_data, + weight, + bias, + gradient_accumulation_fusion, + async_grad_allreduce, + sequence_parallel, + grad_output_buffer, + wgrad_deferral_limit, + allreduce_dgrad, + ) + output = gather_from_tensor_model_parallel_region( + output_parallel + ) # no-op if tensor_parallel == 1. + output.sum().backward() + + expected_output = torch.ones(8).cuda() + expected_grad = 8 * torch.ones(8).cuda() + + assert torch.allclose(output, expected_output) + assert torch.allclose(input_data.grad, expected_grad) + + Utils.destroy_model_parallel() diff --git a/tests/unit_tests/tensor_parallel/test_mappings.py b/tests/unit_tests/tensor_parallel/test_mappings.py index 6be486ef3c..d5bc3f2127 100644 --- a/tests/unit_tests/tensor_parallel/test_mappings.py +++ b/tests/unit_tests/tensor_parallel/test_mappings.py @@ -1,135 +1,144 @@ +import torch + from megatron.core.tensor_parallel import mappings from tests.unit_tests.test_utilities import Utils -import torch + def test_CopyToModelParallelRegion(): - Utils.initialize_model_parallel(4,2) - input_data = torch.ones((1)).cuda()*Utils.rank + Utils.initialize_model_parallel(4, 2) + input_data = torch.ones((1)).cuda() * Utils.rank output_data = mappings._CopyToModelParallelRegion.backward(None, input_data) result = torch.ones(1).cuda() result = result * 22 if Utils.rank >= 4 else result * 6 - assert(torch.equal(output_data, result)) - assert(torch.equal(input_data, mappings.copy_to_tensor_model_parallel_region(input_data))) - assert(torch.equal(input_data, mappings._CopyToModelParallelRegion.symbolic(None, input_data))) + assert torch.equal(output_data, result) + assert torch.equal(input_data, mappings.copy_to_tensor_model_parallel_region(input_data)) + assert torch.equal(input_data, mappings._CopyToModelParallelRegion.symbolic(None, input_data)) Utils.destroy_model_parallel() + def test_ReduceFromModelParallelRegion(): - Utils.initialize_model_parallel(4,2) - input_data = torch.ones((1)).cuda()*Utils.rank + Utils.initialize_model_parallel(4, 2) + input_data = torch.ones((1)).cuda() * Utils.rank output_data = mappings._ReduceFromModelParallelRegion.symbolic(None, input_data) result = torch.ones(1).cuda() result = result * 22 if Utils.rank >= 4 else result * 6 - assert(torch.equal(output_data, result)) - input_data = torch.ones((1)).cuda()*Utils.rank - assert(torch.equal(mappings.reduce_from_tensor_model_parallel_region(input_data), result)) - assert(torch.equal(input_data, mappings._ReduceFromModelParallelRegion.backward(None, input_data))) + assert torch.equal(output_data, result) + input_data = torch.ones((1)).cuda() * Utils.rank + assert torch.equal(mappings.reduce_from_tensor_model_parallel_region(input_data), result) + assert torch.equal( + input_data, mappings._ReduceFromModelParallelRegion.backward(None, input_data) + ) Utils.destroy_model_parallel() + def test_ScatterToModelParallelRegion(): - Utils.initialize_model_parallel(4,2) - input_data = torch.rand((8,4)).cuda() + Utils.initialize_model_parallel(4, 2) + input_data = torch.rand((8, 4)).cuda() output_data = mappings.scatter_to_tensor_model_parallel_region(input_data) - req_dim = int(Utils.rank%(Utils.world_size/2)) - assert(torch.equal(output_data, 
input_data[:,req_dim].reshape((8,1)))) + req_dim = int(Utils.rank % (Utils.world_size / 2)) + assert torch.equal(output_data, input_data[:, req_dim].reshape((8, 1))) output_data = mappings._ScatterToModelParallelRegion.symbolic(None, input_data) - assert(torch.equal(output_data, input_data[:, req_dim].reshape((8,1)))) + assert torch.equal(output_data, input_data[:, req_dim].reshape((8, 1))) input_data = torch.ones(8).cuda() * Utils.rank actual_output_data = mappings._ScatterToModelParallelRegion.backward(None, input_data) - expected_output = torch.cat(( - torch.ones(8)*0, - torch.ones(8)*1, - torch.ones(8)*2, - torch.ones(8)*3)).cuda() - if (Utils.rank >= 4): + expected_output = torch.cat( + (torch.ones(8) * 0, torch.ones(8) * 1, torch.ones(8) * 2, torch.ones(8) * 3) + ).cuda() + if Utils.rank >= 4: expected_output = expected_output + 4 - assert(torch.equal(actual_output_data, expected_output)) + assert torch.equal(actual_output_data, expected_output) Utils.destroy_model_parallel() + def test_GatherFromModelParallelRegion(): - Utils.initialize_model_parallel(4,2) - input_data = torch.rand((8,4)).cuda() - req_dim = int(Utils.rank%(Utils.world_size/2)) + Utils.initialize_model_parallel(4, 2) + input_data = torch.rand((8, 4)).cuda() + req_dim = int(Utils.rank % (Utils.world_size / 2)) output_data = mappings._GatherFromModelParallelRegion.backward(None, input_data) - assert(torch.equal(output_data, input_data[:, req_dim].reshape((8,1)))) + assert torch.equal(output_data, input_data[:, req_dim].reshape((8, 1))) input_data = torch.ones(8).cuda() * Utils.rank actual_output_data = mappings.gather_from_tensor_model_parallel_region(input_data) - expected_output = torch.cat(( - torch.ones(8)*0, - torch.ones(8)*1, - torch.ones(8)*2, - torch.ones(8)*3)).cuda() - if (Utils.rank >= 4): + expected_output = torch.cat( + (torch.ones(8) * 0, torch.ones(8) * 1, torch.ones(8) * 2, torch.ones(8) * 3) + ).cuda() + if Utils.rank >= 4: expected_output = expected_output + 4 - assert(torch.equal(actual_output_data, expected_output)) - assert(torch.equal(mappings._GatherFromModelParallelRegion.symbolic(None, input_data), expected_output)) + assert torch.equal(actual_output_data, expected_output) + assert torch.equal( + mappings._GatherFromModelParallelRegion.symbolic(None, input_data), expected_output + ) Utils.destroy_model_parallel() - + + def test_ScatterToSequenceParallelRegion(): - Utils.initialize_model_parallel(4,2) - input_data = torch.rand((8,4)).cuda() - req_dim = int(Utils.rank%(Utils.world_size/2))*2 + Utils.initialize_model_parallel(4, 2) + input_data = torch.rand((8, 4)).cuda() + req_dim = int(Utils.rank % (Utils.world_size / 2)) * 2 output_data = mappings._ScatterToSequenceParallelRegion.symbolic(None, input_data) - assert(torch.equal(output_data, input_data[req_dim:req_dim+2, :])) + assert torch.equal(output_data, input_data[req_dim : req_dim + 2, :]) output_data = mappings.scatter_to_sequence_parallel_region(input_data) - assert(torch.equal(output_data, input_data[req_dim:req_dim+2, :])) + assert torch.equal(output_data, input_data[req_dim : req_dim + 2, :]) input_data = torch.ones(4).cuda() * Utils.rank output_data = mappings._ScatterToModelParallelRegion.backward(None, input_data) - expected_output = torch.concat(( - torch.ones(4)*0, - torch.ones(4)*1, - torch.ones(4)*2, - torch.ones(4)*3)).cuda() - if (Utils.rank >= 4): + expected_output = torch.concat( + (torch.ones(4) * 0, torch.ones(4) * 1, torch.ones(4) * 2, torch.ones(4) * 3) + ).cuda() + if Utils.rank >= 4: expected_output = expected_output 
+ 4 - assert(torch.equal(output_data, expected_output)) + assert torch.equal(output_data, expected_output) Utils.destroy_model_parallel() + def test_GatherFromSequenceParallelRegion(): - Utils.initialize_model_parallel(4,2) + Utils.initialize_model_parallel(4, 2) input_data = torch.ones(4).cuda() * Utils.rank output_data = mappings.gather_from_sequence_parallel_region(input_data) - expected_output = torch.concat(( - torch.ones(4)*0, - torch.ones(4)*1, - torch.ones(4)*2, - torch.ones(4)*3)).cuda() - if (Utils.rank >= 4): + expected_output = torch.concat( + (torch.ones(4) * 0, torch.ones(4) * 1, torch.ones(4) * 2, torch.ones(4) * 3) + ).cuda() + if Utils.rank >= 4: expected_output = expected_output + 4 - assert(torch.equal(output_data, expected_output)) - assert(torch.equal(mappings._GatherFromSequenceParallelRegion.symbolic(None, input_data), expected_output)) - input_data = torch.vstack(( - torch.ones(4)*0, - torch.ones(4)*1, - torch.ones(4)*2, - torch.ones(4)*3)).cuda() + assert torch.equal(output_data, expected_output) + assert torch.equal( + mappings._GatherFromSequenceParallelRegion.symbolic(None, input_data), expected_output + ) + input_data = torch.vstack( + (torch.ones(4) * 0, torch.ones(4) * 1, torch.ones(4) * 2, torch.ones(4) * 3) + ).cuda() + class Ctx: tensor_parallel_output_grad = True + output_split_sizes = None + output_data = mappings._GatherFromSequenceParallelRegion.backward(Ctx(), input_data) - expected_output = torch.ones((1,4)).cuda() * 4 * int(Utils.rank % 4) - assert(torch.equal(output_data[0], expected_output)) + expected_output = torch.ones((1, 4)).cuda() * 4 * int(Utils.rank % 4) + assert torch.equal(output_data[0], expected_output) Utils.destroy_model_parallel() + def test_ReduceScatterToSequenceParallelRegion(): - Utils.initialize_model_parallel(4,2) - input_data = torch.vstack(( - torch.ones(4)*0, - torch.ones(4)*1, - torch.ones(4)*2, - torch.ones(4)*3)).cuda() + Utils.initialize_model_parallel(4, 2) + input_data = torch.vstack( + (torch.ones(4) * 0, torch.ones(4) * 1, torch.ones(4) * 2, torch.ones(4) * 3) + ).cuda() output_data = mappings.reduce_scatter_to_sequence_parallel_region(input_data) expected_output = torch.ones(4).cuda() * 4 * int(Utils.rank % 4) - assert(torch.equal(output_data[0], expected_output)) - assert(torch.equal(mappings._ReduceScatterToSequenceParallelRegion.symbolic(None, input_data) , expected_output.reshape((1,4)))) + assert torch.equal(output_data[0], expected_output) + assert torch.equal( + mappings._ReduceScatterToSequenceParallelRegion.symbolic(None, input_data), + expected_output.reshape((1, 4)), + ) input_data = torch.ones(4).cuda() * Utils.rank - output_data = mappings._ReduceScatterToSequenceParallelRegion.backward(None,input_data) - expected_output = torch.concat(( - torch.ones(4)*0, - torch.ones(4)*1, - torch.ones(4)*2, - torch.ones(4)*3)).cuda() - if (Utils.rank >= 4): + + class Ctx: + input_split_sizes = None + + output_data, _ = mappings._ReduceScatterToSequenceParallelRegion.backward(Ctx(), input_data) + expected_output = torch.concat( + (torch.ones(4) * 0, torch.ones(4) * 1, torch.ones(4) * 2, torch.ones(4) * 3) + ).cuda() + if Utils.rank >= 4: expected_output = expected_output + 4 - assert(torch.equal(output_data, expected_output)) + assert torch.equal(output_data, expected_output) Utils.destroy_model_parallel() - diff --git a/tests/unit_tests/tensor_parallel/test_random.py b/tests/unit_tests/tensor_parallel/test_random.py index 4ee98c96d4..ace500839d 100644 --- a/tests/unit_tests/tensor_parallel/test_random.py +++ 
b/tests/unit_tests/tensor_parallel/test_random.py @@ -1,44 +1,54 @@ -from megatron.core.tensor_parallel.random import CudaRNGStatesTracker -from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed -from megatron.core.tensor_parallel.random import _CUDA_RNG_STATE_TRACKER -from megatron.core.tensor_parallel.random import checkpoint -from tests.unit_tests.test_utilities import Utils import pytest import torch +from megatron.core.tensor_parallel.random import ( + CudaRNGStatesTracker, + checkpoint, + get_cuda_rng_tracker, + model_parallel_cuda_manual_seed, +) +from tests.unit_tests.test_utilities import Utils + + def test_cuda_rng_states_tracker(): rng_tracker = CudaRNGStatesTracker() - rng_tracker.set_states({"state1":1234}) - assert(rng_tracker.get_states()["state1"] == 1234) + rng_tracker.set_states({"state1": 1234}) + assert rng_tracker.get_states()["state1"] == 1234 rng_tracker.reset() - assert(rng_tracker.get_states() == {}) + assert rng_tracker.get_states() == {} seed = 1111 - rng_tracker.add("state2",seed) + rng_tracker.add("state2", seed) with pytest.raises(Exception): - assert(rng_tracker.add("state3",seed)) + assert rng_tracker.add("state3", seed) with pytest.raises(Exception): - assert(rng_tracker.add("state2",111)) - assert(rng_tracker.get_states()['state2'] is not None) + assert rng_tracker.add("state2", 111) + assert rng_tracker.get_states()['state2'] is not None with pytest.raises(Exception): - assert() - + assert () + rng_tracker.fork("state2") torch.cuda.manual_seed(seed) rng_state = torch.cuda.get_rng_state() assert torch.equal(rng_tracker.get_states()['state2'], rng_state) + def test_model_parallel_cuda_manual_seed(): - Utils.initialize_model_parallel(4,2) + Utils.initialize_model_parallel(4, 2) model_parallel_cuda_manual_seed(0) - assert(_CUDA_RNG_STATE_TRACKER.get_states()['model-parallel-rng'] is not None) + rng_tracker = get_cuda_rng_tracker() + assert rng_tracker.get_states()['model-parallel-rng'] is not None Utils.destroy_model_parallel() + def test_checkpoint(): def test_forward(*input): - return input[0]+input[1] - assert(torch.equal(torch.ones(16)*3,checkpoint(test_forward, None, torch.ones(16), torch.ones(16)*2))) + return input[0] + input[1] + + assert torch.equal( + torch.ones(16) * 3, checkpoint(test_forward, None, torch.ones(16), torch.ones(16) * 2) + ) Utils.initialize_model_parallel() - input1 = torch.ones((4,4)) - checkpoint(test_forward, True, input1, torch.ones((4,4))*2) - assert(torch.equal(torch.ones(input1.numel()).cuda(), input1)) - Utils.destroy_model_parallel() \ No newline at end of file + input1 = torch.ones((4, 4)) + checkpoint(test_forward, True, input1, torch.ones((4, 4)) * 2) + assert torch.equal(torch.ones(input1.numel()).cuda(), input1) + Utils.destroy_model_parallel() diff --git a/tests/unit_tests/tensor_parallel/test_tensor_parallel_utils.py b/tests/unit_tests/tensor_parallel/test_tensor_parallel_utils.py index f82e5fa693..5df774e5ff 100644 --- a/tests/unit_tests/tensor_parallel/test_tensor_parallel_utils.py +++ b/tests/unit_tests/tensor_parallel/test_tensor_parallel_utils.py @@ -1,43 +1,55 @@ import torch -import megatron.core.tensor_parallel.utils as util + import megatron.core.parallel_state as ps +import megatron.core.tensor_parallel.utils as util from tests.unit_tests.test_utilities import Utils rank = Utils.rank + def test_split_tensor_along_last_dim(): - input_tensor = torch.rand((3,4)) - torch.equal(input_tensor[0:2,0:2], util.split_tensor_along_last_dim(input_tensor,2)[0]) - torch.equal(input_tensor[2:,2:], 
util.split_tensor_along_last_dim(input_tensor,2)[1]) + input_tensor = torch.rand((3, 4)) + torch.equal(input_tensor[0:2, 0:2], util.split_tensor_along_last_dim(input_tensor, 2)[0]) + torch.equal(input_tensor[2:, 2:], util.split_tensor_along_last_dim(input_tensor, 2)[1]) + def test_split_tensor_into_1d_equal_chunks(): Utils.initialize_model_parallel(tensor_model_parallel_size=2, pipeline_model_parallel_size=4) - input_tensor = torch.rand((3,4)) + input_tensor = torch.rand((3, 4)) output_tensor = util.split_tensor_into_1d_equal_chunks(input_tensor) - if rank % 2 == 0 : + if rank % 2 == 0: start = 0 - end = int(input_tensor.numel()/2) - else : - start = int(input_tensor.numel()/2) + end = int(input_tensor.numel() / 2) + else: + start = int(input_tensor.numel() / 2) end = input_tensor.numel() - + assert torch.equal(output_tensor, input_tensor.flatten()[start:end]) Utils.destroy_model_parallel() + def test_gather_split_1d_tensor(): Utils.initialize_model_parallel(tensor_model_parallel_size=2, pipeline_model_parallel_size=4) - input_tensor = torch.ones((2,4)).cuda() * rank + input_tensor = torch.ones((2, 4)).cuda() * rank actual_output_tensor = util.gather_split_1d_tensor(input_tensor) - if rank %2 == 0: + if rank % 2 == 0: expected_output_tensor = torch.concat((input_tensor.flatten(), input_tensor.flatten() + 1)) - else : + else: expected_output_tensor = torch.concat((input_tensor.flatten() - 1, input_tensor.flatten())) - assert(torch.equal(actual_output_tensor, expected_output_tensor)) + assert torch.equal(actual_output_tensor, expected_output_tensor) Utils.destroy_model_parallel() + def test_vocab(): global_vocab_size = 1600 per_partition_vocab_size = 1600 / Utils.world_size - assert((rank * per_partition_vocab_size, (rank + 1)* per_partition_vocab_size) == (util.VocabUtility.vocab_range_from_per_partition_vocab_size(global_vocab_size // Utils.world_size, rank, Utils.world_size))) - assert((rank * per_partition_vocab_size, (rank + 1)* per_partition_vocab_size) == (util.VocabUtility.vocab_range_from_global_vocab_size(global_vocab_size, rank, Utils.world_size))) - \ No newline at end of file + assert (rank * per_partition_vocab_size, (rank + 1) * per_partition_vocab_size) == ( + util.VocabUtility.vocab_range_from_per_partition_vocab_size( + global_vocab_size // Utils.world_size, rank, Utils.world_size + ) + ) + assert (rank * per_partition_vocab_size, (rank + 1) * per_partition_vocab_size) == ( + util.VocabUtility.vocab_range_from_global_vocab_size( + global_vocab_size, rank, Utils.world_size + ) + ) diff --git a/tests/unit_tests/test_basic.py b/tests/unit_tests/test_basic.py index 915d2c1001..d2a60f92c8 100644 --- a/tests/unit_tests/test_basic.py +++ b/tests/unit_tests/test_basic.py @@ -1,3 +1,2 @@ def test_import(): import megatron - diff --git a/tests/unit_tests/test_imports.py b/tests/unit_tests/test_imports.py new file mode 100644 index 0000000000..bad67cd8d5 --- /dev/null +++ b/tests/unit_tests/test_imports.py @@ -0,0 +1,149 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +import argparse +import importlib +import inspect +import os +import traceback + +import torch +import wrapt + +from megatron.core.transformer.module import MegatronModule + + +def import_class_by_path(path: str): + paths = path.split('.') + path = ".".join(paths[:-1]) + class_name = paths[-1] + mod = __import__(path, fromlist=[class_name]) + mod = getattr(mod, class_name) + return mod + + +def _build_import_path(subdomains: list, imp): + import_path = ["megatron", "core"] + import_path.extend(subdomains) + import_path.append(imp) + path = ".".join(import_path) + return path + + +def _get_class_from_path(subdomains, imp): + path = _build_import_path(subdomains, imp) + print(path) + class_ = None + result = None + try: + class_ = import_class_by_path(path) + if inspect.isclass(class_): + if isinstance(class_, wrapt.FunctionWrapper): + class_ = class_.__wrapped__ + if issubclass(class_, (MegatronModule, torch.nn.Module)): + result = class_ + else: + class_ = None + error = None + except Exception: + error = traceback.format_exc() + return class_, result, error + + +def _test_domain_module_imports(module, subdomains: list): + module_list = [] + failed_list = [] + error_list = [] + + error = None + if len(subdomains) > 0: + basepath = module.__path__[0] + megatron_index = basepath.rfind("megatron") + basepath = basepath[megatron_index:].replace(os.path.sep, ".") + new_path = '.'.join([basepath, *subdomains]) + + try: + module = importlib.import_module(new_path) + except Exception: + print(f"Could not import `{new_path}` ; Traceback below :") + error = traceback.format_exc() + error_list.append(error) + + if error is None: + for imp in dir(module): + class_, result, error = _get_class_from_path(subdomains, imp) + + if result is not None: + module_list.append(class_) + + elif class_ is not None: + failed_list.append(class_) + + if error is not None: + error_list.append(error) + + for module in module_list: + print("Module successfully imported :", module) + + print() + for module in failed_list: + print( + "Module did not match a valid signature of Megatron core Model (hence ignored):", module + ) + + print() + if len(error_list) > 0: + print("Imports crashed with following traceback !") + + for error in error_list: + print("*" * 100) + print() + print(error) + print() + print("*" * 100) + print() + + if len(error_list) > 0: + return False + else: + return True + + +############################### + + +def test_domain_mcore(): + import megatron.core as mcore + + all_passed = _test_domain_module_imports(mcore, subdomains=['models']) + + all_passed = _test_domain_module_imports(mcore, subdomains=['pipeline_parallel']) + + all_passed = _test_domain_module_imports(mcore, subdomains=['tensor_parallel']) + + all_passed = _test_domain_module_imports(mcore, subdomains=['transformer']) + + all_passed = _test_domain_module_imports(mcore, subdomains=['fusions']) + + all_passed = _test_domain_module_imports(mcore, subdomains=['distributed']) + + all_passed = _test_domain_module_imports(mcore, subdomains=['datasets']) + + all_passed = _test_domain_module_imports(mcore, subdomains=['dist_checkpointing']) + + if not all_passed: + exit(1) + + +if __name__ == '__main__': + test_domain_mcore() diff --git a/tests/unit_tests/test_local_multi_tensor_fns.py b/tests/unit_tests/test_local_multi_tensor_fns.py new file mode 100644 index 0000000000..086de6f6d0 --- /dev/null +++ 
b/tests/unit_tests/test_local_multi_tensor_fns.py @@ -0,0 +1,70 @@ +import copy + +import pytest +import torch + +from megatron.core.utils import ( + local_multi_tensor_applier, + local_multi_tensor_l2_norm, + local_multi_tensor_scale, +) + + +def test_local_multi_tensor_l2_norm_and_scale(): + amp_C = pytest.importorskip("amp_C") + multi_tensor_apply = pytest.importorskip("apex.multi_tensor_apply") + + torch.manual_seed(42) + + tensor_list = [torch.rand(5, 5).cuda() for _ in range(10)] + tensor_list_copy = copy.deepcopy(tensor_list) + + norm_apex, _ = multi_tensor_apply.multi_tensor_applier( + amp_C.multi_tensor_l2norm, + torch.tensor([0], dtype=torch.int, device='cuda'), + [tensor_list], + False, + ) + norm_local, _ = multi_tensor_apply.multi_tensor_applier( + local_multi_tensor_l2_norm, + torch.tensor([0], dtype=torch.int, device='cuda'), + [tensor_list_copy], + False, + ) + torch.testing.assert_close(norm_apex, norm_local) + + clip_coeff = 0.05 + multi_tensor_apply.multi_tensor_applier( + amp_C.multi_tensor_scale, + torch.tensor([0], dtype=torch.int, device='cuda'), + [tensor_list, tensor_list], + clip_coeff, + ) + multi_tensor_apply.multi_tensor_applier( + local_multi_tensor_scale, + torch.tensor([0], dtype=torch.int, device='cuda'), + [tensor_list_copy, tensor_list_copy], + clip_coeff, + ) + torch.testing.assert_close(tensor_list, tensor_list_copy) + + +def test_local_multi_tensor_apply(): + amp_C = pytest.importorskip("amp_C") + multi_tensor_apply = pytest.importorskip("apex.multi_tensor_apply") + + tensor_list = [torch.rand(5, 5).cuda() for _ in range(10)] + + norm_apex, _ = multi_tensor_apply.multi_tensor_applier( + amp_C.multi_tensor_l2norm, + torch.tensor([0], dtype=torch.int, device='cuda'), + [tensor_list], + False, + ) + norm_local, _ = local_multi_tensor_applier( + amp_C.multi_tensor_l2norm, + torch.tensor([0], dtype=torch.int, device='cuda'), + [tensor_list], + False, + ) + torch.testing.assert_close(norm_apex, norm_local) diff --git a/tests/unit_tests/test_num_microbatches_calculator.py b/tests/unit_tests/test_num_microbatches_calculator.py new file mode 100644 index 0000000000..9b3356b8af --- /dev/null +++ b/tests/unit_tests/test_num_microbatches_calculator.py @@ -0,0 +1,147 @@ +from typing import List, Optional + +import pytest + +import megatron.core.num_microbatches_calculator as mb_calculator + + +def test_init_num_microbatches_calculator(): + mb_calculator._GLOBAL_NUM_MICROBATCHES_CALCULATOR = None + mb_calculator.init_num_microbatches_calculator(0, None, 32, 8, 2, False) + assert mb_calculator.get_num_microbatches() == 2 + assert mb_calculator.get_current_global_batch_size() == 32 + + with pytest.raises(AssertionError): + mb_calculator.init_num_microbatches_calculator(0, None, 32, 8, 2, False) + + mb_calculator._GLOBAL_NUM_MICROBATCHES_CALCULATOR = None + mb_calculator.init_num_microbatches_calculator(0, None, 32, 8, 3, True) + assert mb_calculator.get_num_microbatches() == 1 + assert mb_calculator.get_current_global_batch_size() == 32 + assert mb_calculator.get_current_running_global_batch_size() == 24 + + mb_calculator._GLOBAL_NUM_MICROBATCHES_CALCULATOR = None + mb_calculator.init_num_microbatches_calculator(0, None, 33, 8, 2, True) + assert mb_calculator.get_num_microbatches() == 2 + assert mb_calculator.get_current_global_batch_size() == 33 + assert mb_calculator.get_current_running_global_batch_size() == 32 + + +def test_reconfigure_num_microbatches_calculator(): + mb_calculator._GLOBAL_NUM_MICROBATCHES_CALCULATOR = None + 
mb_calculator.init_num_microbatches_calculator(0, None, 32, 8, 2, False) + assert mb_calculator.get_num_microbatches() == 2 + assert mb_calculator.get_current_global_batch_size() == 32 + + mb_calculator.reconfigure_num_microbatches_calculator(0, None, 16, 8, 2, False) + assert mb_calculator.get_num_microbatches() == 1 + assert mb_calculator.get_current_global_batch_size() == 16 + + mb_calculator.reconfigure_num_microbatches_calculator(0, [16, 16, 96], 32, 8, 2, False) + assert mb_calculator.get_num_microbatches() == 1 + assert mb_calculator.get_current_global_batch_size() == 16 + + +def test_get_num_microbatches(): + mb_calculator.reconfigure_num_microbatches_calculator(0, None, 16, 8, 2, False) + assert mb_calculator.get_num_microbatches() == 1 + + mb_calculator.reconfigure_num_microbatches_calculator(0, None, 16, 4, 3, True) + assert mb_calculator.get_num_microbatches() == 1 + + +def test_get_current_global_batch_size(): + mb_calculator.reconfigure_num_microbatches_calculator(0, None, 16, 4, 2, False) + assert mb_calculator.get_current_global_batch_size() == 16 + + mb_calculator.reconfigure_num_microbatches_calculator(0, None, 16, 4, 3, True) + assert mb_calculator.get_current_global_batch_size() == 16 + assert mb_calculator.get_current_running_global_batch_size() == 12 + + +def test_get_micro_batch_size(): + mb_calculator.reconfigure_num_microbatches_calculator(0, None, 16, 8, 2, False) + assert mb_calculator.get_micro_batch_size() == 8 + + +def test_update_num_microbatches(): + mb_calculator.reconfigure_num_microbatches_calculator(0, [16, 8, 96], 32, 4, 2, False) + assert mb_calculator.get_num_microbatches() == 2 + mb_calculator.update_num_microbatches(48, False) + assert mb_calculator.get_num_microbatches() == 3 + + mb_calculator.reconfigure_num_microbatches_calculator(0, [16, 8, 96], 32, 8, 2, False) + with pytest.raises(AssertionError): + mb_calculator.update_num_microbatches(49, True) + + mb_calculator.reconfigure_num_microbatches_calculator(0, None, 32, 8, 2, False) + mb_calculator.update_num_microbatches(16) + assert mb_calculator.get_num_microbatches() == 2 + + +def test_build_num_microbatches_calculator(): + temp_calculator = mb_calculator._build_num_microbatches_calculator(0, None, 32, 8, 2, False) + assert temp_calculator.get() == 2 + assert temp_calculator.get_current_global_batch_size() == 32 + assert type(temp_calculator) is mb_calculator.ConstantNumMicroBatchesCalculator + + temp_calculator = mb_calculator._build_num_microbatches_calculator( + 0, [16, 16, 48], 32, 8, 2, False + ) + assert temp_calculator.get() == 1 + assert temp_calculator.get_current_global_batch_size() == 16 + assert type(temp_calculator) is mb_calculator.RampupBatchsizeNumMicroBatchesCalculator + + +class TestConstantNumMicroBatchesCalculator: + def setup_method(self, method): + self.mb_calculator = mb_calculator.ConstantNumMicroBatchesCalculator(32, 8, 2, False, 0) + + def test_constructor(self): + assert type(self.mb_calculator) is mb_calculator.ConstantNumMicroBatchesCalculator + assert self.mb_calculator.num_micro_batches == 2 + assert self.mb_calculator.current_global_batch_size == 32 + assert self.mb_calculator.micro_batch_size == 8 + + def test_get(self): + assert self.mb_calculator.get() == 2 + + def test_get_current_global_batch_size(self): + assert self.mb_calculator.get_current_global_batch_size() == 32 + + +class TestRampupBatchsizeNumMicroBatchesCalculator: + def setup_method(self, method): + self.mb_calculator = mb_calculator.RampupBatchsizeNumMicroBatchesCalculator( + 32, 8, 2, False, 0, 
16, 16, 48 + ) + + def test_constructor(self): + assert type(self.mb_calculator) is mb_calculator.RampupBatchsizeNumMicroBatchesCalculator + assert self.mb_calculator.global_batch_size == 32 + assert self.mb_calculator.micro_batch_size == 8 + assert self.mb_calculator.data_parallel_size == 2 + assert self.mb_calculator.start_global_batch_size == 16 + assert self.mb_calculator.batch_size_increment == 16 + assert self.mb_calculator.ramup_samples == 48 + assert self.mb_calculator.micro_batch_times_data_parallel_size == 16 + assert self.mb_calculator.num_micro_batches == 1 + + def test_get(self): + assert self.mb_calculator.get() == 1 + + def test_get_current_global_batch_size(self): + assert self.mb_calculator.get_current_global_batch_size() == 16 + + +def test_ramp_up(): + mb_calculator.reconfigure_num_microbatches_calculator(0, [16, 16, 96], 32, 8, 2, False) + consumed_samples = 0 + count = 0 + expected_consumed_samples = [0, 16, 32, 48, 64, 80, 96, 128, 160, 192, 224, 256] + + while consumed_samples < 256: + consumed_samples += mb_calculator.get_current_global_batch_size() + count += 1 + assert consumed_samples == expected_consumed_samples[count] + mb_calculator.update_num_microbatches(consumed_samples, True) diff --git a/tests/unit_tests/test_optimizer.py b/tests/unit_tests/test_optimizer.py new file mode 100644 index 0000000000..732a68cfa6 --- /dev/null +++ b/tests/unit_tests/test_optimizer.py @@ -0,0 +1,66 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.optim import SGD, Adam + +from megatron.core.optimizer import ChainedOptimizer + + +class Net(nn.Module): + def __init__(self): + super().__init__() + self.conv1 = nn.Conv2d(3, 6, 5) + self.pool = nn.MaxPool2d(2, 2) + self.conv2 = nn.Conv2d(6, 16, 5) + self.fc1 = nn.Linear(16 * 5 * 5, 120) + self.fc2 = nn.Linear(120, 84) + self.fc3 = nn.Linear(84, 10) + + def forward(self, x): + x = self.pool(F.relu(self.conv1(x))) + x = self.pool(F.relu(self.conv2(x))) + x = torch.flatten(x, 1) # flatten all dimensions except batch + x = F.relu(self.fc1(x)) + x = F.relu(self.fc2(x)) + x = self.fc3(x) + return x + + +def test_chained_optimizer(): + net = Net() + optimizer_1 = Adam(list(net.parameters())[:2], lr=0.01) + optimizer_2 = SGD(list(net.parameters())[2:], lr=0.1, momentum=0.9) + chained_optimizer = ChainedOptimizer([optimizer_1, optimizer_2]) + + # Test the chained optimizer's param groups is a reference of the underlying optimizers' param groups + assert optimizer_1.param_groups[0]["lr"] == 0.01 + chained_optimizer.param_groups[0]["lr"] = 0.02 + assert optimizer_1.param_groups[0]["lr"] == 0.02 + + # Test the chained optimizer's state is a reference of the underlying optimizers' state + # 1. run step on optimizers, make sure there is state + assert len(chained_optimizer.state) == 0 + input = torch.randn(1, 3, 32, 32) + output = net(input) + output.sum().backward() + optimizer_1.step() + optimizer_2.step() + assert len(chained_optimizer.state) != 0 + + # 2. 
check the state is a reference + assert not list(optimizer_1.state.values())[0]["exp_avg"].is_cuda + assert not list(optimizer_2.state.values())[0]["momentum_buffer"].is_cuda + + def to_cuda(d): + for k, v in d.items(): + if isinstance(v, torch.Tensor): + d[k] = v.to("cuda") + elif isinstance(v, dict): + to_cuda(v) + return d + + for k, v in chained_optimizer.state.items(): + chained_optimizer.state[k] = to_cuda(v) + + assert list(optimizer_1.state.values())[0]["exp_avg"].is_cuda + assert list(optimizer_2.state.values())[0]["momentum_buffer"].is_cuda diff --git a/tests/unit_tests/test_optimizer_param_scheduler.py b/tests/unit_tests/test_optimizer_param_scheduler.py new file mode 100644 index 0000000000..9b78169454 --- /dev/null +++ b/tests/unit_tests/test_optimizer_param_scheduler.py @@ -0,0 +1,251 @@ +import math +from unittest.mock import MagicMock + +import pytest + +from megatron.core.optimizer_param_scheduler import ( # Adjust import according to your module path + OptimizerParamScheduler, +) + + +@pytest.fixture +def mock_optimizer(): + optimizer = MagicMock() + optimizer.param_groups = [{'lr': 0.0, 'weight_decay': 0.0}] + return optimizer + + +def test_initialization(mock_optimizer): + scheduler = OptimizerParamScheduler( + optimizer=mock_optimizer, + init_lr=0.01, + max_lr=0.1, + min_lr=0.001, + lr_warmup_steps=100, + lr_decay_steps=1000, + lr_decay_style='linear', + start_wd=0.0, + end_wd=0.1, + wd_incr_steps=1000, + wd_incr_style='linear', + ) + + assert scheduler.init_lr == 0.01 + assert scheduler.max_lr == 0.1 + assert scheduler.min_lr == 0.001 + assert scheduler.lr_warmup_steps == 100 + assert scheduler.lr_decay_steps == 1000 + assert scheduler.lr_decay_style == 'linear' + assert scheduler.start_wd == 0.0 + assert scheduler.end_wd == 0.1 + assert scheduler.wd_incr_steps == 1000 + assert scheduler.wd_incr_style == 'linear' + + +def test_get_wd_constant(mock_optimizer): + scheduler = OptimizerParamScheduler( + optimizer=mock_optimizer, + init_lr=0.01, + max_lr=0.1, + min_lr=0.001, + lr_warmup_steps=100, + lr_decay_steps=1000, + lr_decay_style='linear', + start_wd=0.1, + end_wd=0.1, + wd_incr_steps=1000, + wd_incr_style='constant', + ) + + scheduler.step(500) + wd = scheduler.get_wd() + assert wd == 0.1 + + +def test_get_wd_linear(mock_optimizer): + scheduler = OptimizerParamScheduler( + optimizer=mock_optimizer, + init_lr=0.01, + max_lr=0.1, + min_lr=0.001, + lr_warmup_steps=100, + lr_decay_steps=1000, + lr_decay_style='linear', + start_wd=0.0, + end_wd=0.1, + wd_incr_steps=1000, + wd_incr_style='linear', + ) + + scheduler.step(500) + wd = scheduler.get_wd() + assert wd == 0.05 + + +def test_get_wd_cosine(mock_optimizer): + scheduler = OptimizerParamScheduler( + optimizer=mock_optimizer, + init_lr=0.01, + max_lr=0.1, + min_lr=0.001, + lr_warmup_steps=100, + lr_decay_steps=1000, + lr_decay_style='cosine', + start_wd=0.0, + end_wd=0.1, + wd_incr_steps=1000, + wd_incr_style='cosine', + ) + + scheduler.step(500) + wd = scheduler.get_wd() + expected_wd = 0.05 * (math.cos(math.pi * (1 - 0.5)) + 1.0) + assert math.isclose(wd, expected_wd, rel_tol=1e-5) + + +def test_get_lr_linear(mock_optimizer): + scheduler = OptimizerParamScheduler( + optimizer=mock_optimizer, + init_lr=0.01, + max_lr=0.1, + min_lr=0.001, + lr_warmup_steps=100, + lr_decay_steps=1000, + lr_decay_style='linear', + start_wd=0.0, + end_wd=0.1, + wd_incr_steps=1000, + wd_incr_style='linear', + ) + + param_group = {'max_lr': 0.1, 'min_lr': 0.001} + + scheduler.step(50) + lr = scheduler.get_lr(param_group) + expected_lr = 
0.01 + (0.1 - 0.01) * (50 / 100) + assert math.isclose(lr, expected_lr, rel_tol=1e-5) + + scheduler.step(450) + lr = scheduler.get_lr(param_group) + expected_lr = 0.1 - ((0.1 - 0.001) * ((500 - 100) / (1000 - 100))) + assert math.isclose(lr, expected_lr, rel_tol=1e-5) + + scheduler.step(501) + lr = scheduler.get_lr(param_group) + expected_lr = 0.001 + assert math.isclose(lr, expected_lr, rel_tol=1e-5) + + +def test_get_lr_cosine(mock_optimizer): + scheduler = OptimizerParamScheduler( + optimizer=mock_optimizer, + init_lr=0.01, + max_lr=0.1, + min_lr=0.001, + lr_warmup_steps=100, + lr_decay_steps=1000, + lr_decay_style='cosine', + start_wd=0.0, + end_wd=0.1, + wd_incr_steps=1000, + wd_incr_style='linear', + ) + + scheduler.step(500) + param_group = {'max_lr': 0.1, 'min_lr': 0.001} + lr = scheduler.get_lr(param_group) + expected_lr = 0.001 + (0.1 - 0.001) * 0.5 * ( + math.cos(math.pi * ((500 - 100) / (1000 - 100))) + 1.0 + ) + assert math.isclose(lr, expected_lr, rel_tol=1e-5) + + +def test_step_function(mock_optimizer): + scheduler = OptimizerParamScheduler( + optimizer=mock_optimizer, + init_lr=0.01, + max_lr=0.1, + min_lr=0.001, + lr_warmup_steps=100, + lr_decay_steps=1000, + lr_decay_style='linear', + start_wd=0.0, + end_wd=0.1, + wd_incr_steps=1000, + wd_incr_style='linear', + ) + + scheduler.step(100) + assert scheduler.num_steps == 100 + param_group = mock_optimizer.param_groups[0] + assert math.isclose(param_group['lr'], 0.01 + (0.1 - 0.01) * (100 / 100), rel_tol=1e-5) + assert math.isclose(param_group['weight_decay'], 0.01, rel_tol=1e-5) + + +def test_state_dict(mock_optimizer): + scheduler = OptimizerParamScheduler( + optimizer=mock_optimizer, + init_lr=0.01, + max_lr=0.1, + min_lr=0.001, + lr_warmup_steps=100, + lr_decay_steps=1000, + lr_decay_style='linear', + start_wd=0.0, + end_wd=0.1, + wd_incr_steps=1000, + wd_incr_style='linear', + ) + + state_dict = scheduler.state_dict() + assert state_dict['max_lr'] == 0.1 + assert state_dict['lr_warmup_steps'] == 100 + assert state_dict['num_steps'] == 0 + assert state_dict['lr_decay_style'] == 'linear' + assert state_dict['lr_decay_steps'] == 1000 + assert state_dict['min_lr'] == 0.001 + assert state_dict['start_wd'] == 0.0 + assert state_dict['end_wd'] == 0.1 + assert state_dict['wd_incr_style'] == 'linear' + assert state_dict['wd_incr_steps'] == 1000 + + +def test_load_state_dict(mock_optimizer): + scheduler = OptimizerParamScheduler( + optimizer=mock_optimizer, + init_lr=0.01, + max_lr=0.1, + min_lr=0.001, + lr_warmup_steps=100, + lr_decay_steps=1000, + lr_decay_style='linear', + start_wd=0.0, + end_wd=0.1, + wd_incr_steps=1000, + wd_incr_style='linear', + ) + + state_dict = { + 'max_lr': 0.2, + 'min_lr': 0.0005, + 'lr_warmup_steps': 200, + 'lr_decay_steps': 2000, + 'lr_decay_style': 'cosine', + 'num_steps': 500, + 'start_wd': 0.01, + 'end_wd': 0.2, + 'wd_incr_steps': 500, + 'wd_incr_style': 'cosine', + } + + scheduler.load_state_dict(state_dict) + assert scheduler.max_lr == 0.2 + assert scheduler.min_lr == 0.0005 + assert scheduler.lr_warmup_steps == 200 + assert scheduler.lr_decay_steps == 2000 + assert scheduler.lr_decay_style == 'cosine' + assert scheduler.num_steps == 500 + assert scheduler.start_wd == 0.01 + assert scheduler.end_wd == 0.2 + assert scheduler.wd_incr_steps == 500 + assert scheduler.wd_incr_style == 'cosine' diff --git a/tests/unit_tests/test_parallel_state.py b/tests/unit_tests/test_parallel_state.py index 552c0acdf9..6dbf0394a9 100644 --- a/tests/unit_tests/test_parallel_state.py +++ 
b/tests/unit_tests/test_parallel_state.py @@ -1,104 +1,512 @@ +import os + +import pytest import torch + import megatron.core.parallel_state as ps -import pytest from tests.unit_tests.test_utilities import Utils -import os rank = Utils.rank world_size = Utils.world_size +test_parallel_order = ['tp-cp-ep-dp-pp', 'tp-cp-pp-ep-dp'] -def test_initialize__and_destroy_model_parallel(): + +@pytest.mark.parametrize('order', test_parallel_order) +def test_initialize_and_destroy_model_parallel(order): with pytest.raises(AssertionError): - assert(ps.initialize_model_parallel()) + assert ps.initialize_model_parallel(order=order) Utils.initialize_distributed() with pytest.raises(RuntimeError): - assert(ps.initialize_model_parallel(tensor_model_parallel_size=2*world_size)) + assert ps.initialize_model_parallel(tensor_model_parallel_size=2 * world_size, order=order) with pytest.raises(RuntimeError): - assert(ps.initialize_model_parallel(pipeline_model_parallel_size=2*world_size)) + assert ps.initialize_model_parallel( + pipeline_model_parallel_size=2 * world_size, order=order + ) with pytest.raises(RuntimeError): - assert(ps.initialize_model_parallel(pipeline_model_parallel_size=world_size, tensor_model_parallel_size=world_size)) + assert ps.initialize_model_parallel( + pipeline_model_parallel_size=world_size, + tensor_model_parallel_size=world_size, + order=order, + ) with pytest.raises(RuntimeError): - assert(ps.initialize_model_parallel(virtual_pipeline_model_parallel_size=2)) - Utils.initialize_model_parallel(tensor_model_parallel_size=2, pipeline_model_parallel_size=4) + assert ps.initialize_model_parallel(virtual_pipeline_model_parallel_size=2, order=order) + Utils.initialize_model_parallel( + tensor_model_parallel_size=2, pipeline_model_parallel_size=4, order=order + ) - assert(ps.model_parallel_is_initialized()) - assert(ps.get_model_parallel_group() is not None) - assert(ps.get_tensor_model_parallel_group() is not None) - assert(ps.get_pipeline_model_parallel_group() is not None) - assert(ps.get_data_parallel_group() is not None) + assert ps.model_parallel_is_initialized() + assert ps.get_model_parallel_group() is not None + assert ps.get_tensor_model_parallel_group() is not None + assert ps.get_pipeline_model_parallel_group() is not None + assert ps.get_data_parallel_group() is not None Utils.destroy_model_parallel() - assert(ps._MODEL_PARALLEL_GROUP is None) + assert ps._MODEL_PARALLEL_GROUP is None + -def test_pipeline_parallel_initializations(): - Utils.initialize_model_parallel(tensor_model_parallel_size=2, pipeline_model_parallel_size=4) - assert(ps.get_pipeline_model_parallel_first_rank() == rank % 2 ) - assert(ps.get_data_parallel_src_rank() == rank) - assert(ps.get_pipeline_model_parallel_next_rank() == ((rank + 2) % world_size)) - assert(ps.get_pipeline_model_parallel_prev_rank() == ((rank - 2) % world_size)) +@pytest.mark.parametrize('order', test_parallel_order) +def test_pipeline_parallel_initializations(order): + Utils.initialize_model_parallel( + tensor_model_parallel_size=2, pipeline_model_parallel_size=4, order=order + ) + assert ps.get_pipeline_model_parallel_first_rank() == rank % 2 + assert ps.get_data_parallel_src_rank() == rank + assert ps.get_pipeline_model_parallel_next_rank() == ((rank + 2) % world_size) + assert ps.get_pipeline_model_parallel_prev_rank() == ((rank - 2) % world_size) Utils.destroy_model_parallel() -def test_data_parallel_initializations(): - Utils.initialize_model_parallel(pipeline_model_parallel_size=world_size) - assert(ps.get_data_parallel_src_rank() 
== rank) - assert(ps.get_data_parallel_world_size() == 1) - assert(ps.get_data_parallel_rank() == 0) + +@pytest.mark.parametrize('order', test_parallel_order) +def test_data_parallel_initializations(order): + Utils.initialize_model_parallel(pipeline_model_parallel_size=world_size, order=order) + assert ps.get_data_parallel_src_rank() == rank + assert ps.get_data_parallel_world_size() == 1 + assert ps.get_data_parallel_rank() == 0 Utils.destroy_model_parallel() - -def test_tensor_model_parellel_world_size(): - Utils.initialize_model_parallel(tensor_model_parallel_size=world_size) - assert(ps.get_tensor_model_parallel_world_size() == world_size) + +@pytest.mark.parametrize('order', test_parallel_order) +def test_tensor_model_parellel_world_size(order): + Utils.initialize_model_parallel(tensor_model_parallel_size=world_size, order=order) + assert ps.get_tensor_model_parallel_world_size() == world_size ps.set_tensor_model_parallel_world_size(None) - assert(ps.get_tensor_model_parallel_world_size() == world_size) + assert ps.get_tensor_model_parallel_world_size() == world_size Utils.destroy_model_parallel() - -def test_pipeline_model_parallel_world_size(): - Utils.initialize_model_parallel(pipeline_model_parallel_size=world_size) - assert(ps.get_pipeline_model_parallel_world_size() == world_size) + +@pytest.mark.parametrize('order', test_parallel_order) +def test_pipeline_model_parallel_world_size(order): + Utils.initialize_model_parallel(pipeline_model_parallel_size=world_size, order=order) + assert ps.get_pipeline_model_parallel_world_size() == world_size ps.set_pipeline_model_parallel_world_size(None) - assert(ps.get_pipeline_model_parallel_world_size() == world_size) - Utils.destroy_model_parallel() - + assert ps.get_pipeline_model_parallel_world_size() == world_size + Utils.destroy_model_parallel() -def test_tensor_model_parallel_rank(): - Utils.initialize_model_parallel(tensor_model_parallel_size=world_size) - assert(ps.get_tensor_model_parallel_rank() == rank) + +@pytest.mark.parametrize('order', test_parallel_order) +def test_tensor_model_parallel_rank(order): + Utils.initialize_model_parallel(tensor_model_parallel_size=world_size, order=order) + assert ps.get_tensor_model_parallel_rank() == rank ps.set_tensor_model_parallel_rank(None) - assert(ps.get_tensor_model_parallel_rank() == rank) - Utils.destroy_model_parallel() - + assert ps.get_tensor_model_parallel_rank() == rank + Utils.destroy_model_parallel() -def test_pipeline_model_parallel_rank(): - Utils.initialize_model_parallel(pipeline_model_parallel_size=world_size) - assert(ps.get_pipeline_model_parallel_rank() == rank) + +@pytest.mark.parametrize('order', test_parallel_order) +def test_pipeline_model_parallel_rank(order): + Utils.initialize_model_parallel(pipeline_model_parallel_size=world_size, order=order) + assert ps.get_pipeline_model_parallel_rank() == rank ps.set_pipeline_model_parallel_rank(None) - assert(ps.get_pipeline_model_parallel_rank() == rank) + assert ps.get_pipeline_model_parallel_rank() == rank Utils.destroy_model_parallel() - -def test_is_pipeline_first_stage(): - Utils.initialize_model_parallel(pipeline_model_parallel_size=world_size) - assert(ps.is_pipeline_first_stage(ignore_virtual=True) == (rank == 0)) - assert(ps.is_pipeline_first_stage() == (rank == 0)) + +def test_context_parallel_rank(): + Utils.initialize_model_parallel(context_parallel_size=world_size) + assert ps.get_context_parallel_rank() == rank Utils.destroy_model_parallel() - -def test_is_pipeline_last_stage(): - 
Utils.initialize_model_parallel(pipeline_model_parallel_size=world_size) - assert(ps.is_pipeline_last_stage(ignore_virtual=True) == (rank == world_size-1)) - assert(ps.is_pipeline_last_stage() == (rank == world_size-1)) + +def test_expert_model_parallel_rank(): + Utils.initialize_model_parallel(expert_model_parallel_size=world_size) + assert ps.get_expert_model_parallel_rank() == rank + ps.set_expert_model_parallel_rank(None) + assert ps.get_expert_model_parallel_rank() == rank Utils.destroy_model_parallel() - -def test_virtual_pipeline_model_parallel_rank(): - Utils.initialize_model_parallel(pipeline_model_parallel_size=world_size) + +@pytest.mark.parametrize('order', test_parallel_order) +def test_is_pipeline_first_stage(order): + Utils.initialize_model_parallel(pipeline_model_parallel_size=world_size, order=order) + assert ps.is_pipeline_first_stage(ignore_virtual=True) == (rank == 0) + assert ps.is_pipeline_first_stage() == (rank == 0) + Utils.destroy_model_parallel() + + +@pytest.mark.parametrize('order', test_parallel_order) +def test_is_pipeline_last_stage(order): + Utils.initialize_model_parallel(pipeline_model_parallel_size=world_size, order=order) + assert ps.is_pipeline_last_stage(ignore_virtual=True) == (rank == world_size - 1) + assert ps.is_pipeline_last_stage() == (rank == world_size - 1) + Utils.destroy_model_parallel() + + +@pytest.mark.parametrize('order', test_parallel_order) +def test_virtual_pipeline_model_parallel_rank(order): + Utils.initialize_model_parallel(pipeline_model_parallel_size=world_size, order=order) ps.set_virtual_pipeline_model_parallel_rank(rank) - assert(ps.get_virtual_pipeline_model_parallel_rank() == rank) + assert ps.get_virtual_pipeline_model_parallel_rank() == rank + Utils.destroy_model_parallel() + + +@pytest.mark.parametrize('order', test_parallel_order) +def test_get_tensor_model_parallel_src_rank(order): + Utils.initialize_model_parallel(tensor_model_parallel_size=world_size, order=order) + assert ps.get_tensor_model_parallel_src_rank() == ((rank // world_size) * world_size) + Utils.destroy_model_parallel() + + +@pytest.mark.parametrize('order', test_parallel_order) +def test_encoder_tensor_pipeline_parallelism(order): + Utils.initialize_model_parallel( + tensor_model_parallel_size=5, + pipeline_model_parallel_size=1, + encoder_pipeline_model_parallel_size=1, + encoder_tensor_model_parallel_size=3, + order=order, + ) + if rank < 2: + assert ps.get_tensor_model_parallel_world_size() == 3 + assert isinstance(ps._PIPELINE_GLOBAL_RANKS[0], list) + elif rank == 2: + assert ps.get_tensor_model_parallel_world_size() == 3 + assert isinstance(ps._PIPELINE_GLOBAL_RANKS[0], int) + else: + assert ps.get_tensor_model_parallel_world_size() == 5 + assert isinstance(ps._PIPELINE_GLOBAL_RANKS[0], int) + Utils.destroy_model_parallel() + + +@pytest.mark.parametrize( + 'src_tp_pp, ep_size', + [ + ((1, 8), 1), + ((2, 4), 1), + ((4, 2), 1), + ((8, 1), 1), + ((4, 1), 2), + ((1, 1), 8), + ((1, 1), 2), + ((2, 1), 4), + ], +) +def test_different_initialize_order_consistency(src_tp_pp, ep_size): + Utils.initialize_model_parallel( + *src_tp_pp, expert_model_parallel_size=ep_size, order='tp-ep-dp-pp' + ) + tp_rank = ps.get_tensor_model_parallel_rank() + dp_rank = ps.get_data_parallel_rank() + pp_rank = ps.get_pipeline_model_parallel_rank() + ep_rank = ps.get_expert_model_parallel_rank() + + tp_g = torch.distributed.get_process_group_ranks(ps.get_tensor_model_parallel_group()) + dp_g = torch.distributed.get_process_group_ranks(ps.get_data_parallel_group(False)) + pp_g = 
torch.distributed.get_process_group_ranks(ps.get_pipeline_model_parallel_group()) + dp_no_ep_g = torch.distributed.get_process_group_ranks( + ps.get_data_modulo_expert_parallel_group() + ) + cp_g = torch.distributed.get_process_group_ranks(ps.get_context_parallel_group()) + mp_g = torch.distributed.get_process_group_ranks(ps.get_model_parallel_group()) + tp_ep_g = torch.distributed.get_process_group_ranks(ps.get_tensor_and_expert_parallel_group()) + tp_dp_g = torch.distributed.get_process_group_ranks( + ps.get_tensor_and_data_parallel_group(False) + ) + + Utils.destroy_model_parallel() + + Utils.initialize_model_parallel( + *src_tp_pp, expert_model_parallel_size=ep_size, order='tp-pp-ep-dp' + ) + assert tp_rank == ps.get_tensor_model_parallel_rank() + assert dp_rank == ps.get_data_parallel_rank() + assert pp_rank == ps.get_pipeline_model_parallel_rank() + assert ep_rank == ps.get_expert_model_parallel_rank() + + assert tp_g == torch.distributed.get_process_group_ranks(ps.get_tensor_model_parallel_group()) + assert dp_g == torch.distributed.get_process_group_ranks(ps.get_data_parallel_group(False)) + assert pp_g == torch.distributed.get_process_group_ranks(ps.get_pipeline_model_parallel_group()) + assert dp_no_ep_g == torch.distributed.get_process_group_ranks( + ps.get_data_modulo_expert_parallel_group() + ) + assert cp_g == torch.distributed.get_process_group_ranks(ps.get_context_parallel_group()) + assert mp_g == torch.distributed.get_process_group_ranks(ps.get_model_parallel_group()) + assert tp_ep_g == torch.distributed.get_process_group_ranks( + ps.get_tensor_and_expert_parallel_group() + ) + assert tp_dp_g == torch.distributed.get_process_group_ranks( + ps.get_tensor_and_data_parallel_group(False) + ) + Utils.destroy_model_parallel() - -def test_get_tensor_model_parallel_src_rank(): - Utils.initialize_model_parallel(tensor_model_parallel_size=world_size) - assert(ps.get_tensor_model_parallel_src_rank() == ((rank // world_size) * world_size)) - Utils.destroy_model_parallel() \ No newline at end of file + +@pytest.mark.parametrize( + 'src_tp_pp, ep_size', + [((1, 2), 1), ((1, 4), 1), ((2, 2), 1), ((1, 2), 2), ((1, 4), 2), ((2, 2), 2)], +) +def test_different_initialize_order_unconsistency(src_tp_pp, ep_size): + Utils.initialize_model_parallel( + *src_tp_pp, expert_model_parallel_size=ep_size, order='tp-ep-dp-pp' + ) + + tp_g = torch.distributed.get_process_group_ranks(ps.get_tensor_model_parallel_group()) + dp_g = torch.distributed.get_process_group_ranks(ps.get_data_parallel_group(False)) + pp_g = torch.distributed.get_process_group_ranks(ps.get_pipeline_model_parallel_group()) + cp_g = torch.distributed.get_process_group_ranks(ps.get_context_parallel_group()) + amax_g = torch.distributed.get_process_group_ranks(ps.get_amax_reduction_group(False)) + mp_g = torch.distributed.get_process_group_ranks(ps.get_model_parallel_group()) + + Utils.destroy_model_parallel() + + Utils.initialize_model_parallel( + *src_tp_pp, expert_model_parallel_size=ep_size, order='tp-pp-ep-dp' + ) + assert tp_g == torch.distributed.get_process_group_ranks(ps.get_tensor_model_parallel_group()) + assert dp_g != torch.distributed.get_process_group_ranks(ps.get_data_parallel_group(False)) + assert pp_g != torch.distributed.get_process_group_ranks(ps.get_pipeline_model_parallel_group()) + assert cp_g == torch.distributed.get_process_group_ranks(ps.get_context_parallel_group()) + assert amax_g != torch.distributed.get_process_group_ranks(ps.get_amax_reduction_group(False)) + assert mp_g != 
torch.distributed.get_process_group_ranks(ps.get_model_parallel_group()) + + Utils.destroy_model_parallel() + + +@pytest.mark.parametrize( + 'nodes, num_gpu, tp, pp, cp, ep', + [ + (1, 1, 1, 1, 1, 1), + (1, 8, 8, 1, 1, 1), + (1, 8, 2, 2, 1, 1), + (1, 8, 2, 4, 1, 1), + (3, 8, 8, 3, 1, 1), + (4, 8, 2, 4, 1, 1), + (8, 8, 8, 8, 1, 1), + (8, 8, 2, 1, 1, 4), + (8, 8, 2, 2, 2, 4), + (8, 8, 2, 1, 4, 8), + (8, 8, 2, 2, 2, 8), + (16, 8, 4, 8, 1, 1), + (16, 8, 4, 8, 1, 4), + (16, 8, 4, 8, 4, 1), + (16, 8, 8, 8, 1, 1), + (16, 8, 4, 8, 1, 1), + (16, 8, 8, 8, 1, 1), + (32, 8, 4, 8, 1, 1), + (32, 8, 8, 8, 1, 1), + (32, 8, 4, 8, 1, 4), + (32, 8, 8, 8, 4, 1), + (64, 8, 4, 2, 8, 8), + (64, 8, 4, 8, 1, 1), + (64, 8, 8, 8, 1, 1), + (96, 8, 4, 8, 1, 1), + (128, 8, 4, 2, 8, 8), + (128, 8, 4, 8, 1, 1), + (256, 8, 4, 8, 1, 1), + (316, 8, 4, 8, 1, 1), + (384, 8, 4, 8, 1, 1), + (512, 8, 4, 8, 1, 1), + (768, 8, 4, 8, 1, 1), + (1024, 8, 4, 8, 1, 1), + (1280, 8, 4, 8, 1, 1), + (1344, 8, 4, 8, 1, 1), + ], +) +def test_rank_generator_for_tp_dp_pp(nodes, num_gpu, tp, pp, cp, ep): + def golden_rank_result_from_past_code( + world_size: int, + tensor_model_parallel_size: int = 1, + pipeline_model_parallel_size: int = 1, + context_parallel_size: int = 1, + expert_model_parallel_size: int = 1, + ): + data_parallel_size: int = world_size // ( + tensor_model_parallel_size * pipeline_model_parallel_size * context_parallel_size + ) + num_tensor_model_parallel_groups: int = world_size // tensor_model_parallel_size + num_pipeline_model_parallel_groups: int = world_size // pipeline_model_parallel_size + + dp_groups = [] + dp_groups_with_cp = [] + + all_data_parallel_group_ranks_with_cp = [] + for i in range(pipeline_model_parallel_size): + start_rank = i * num_pipeline_model_parallel_groups + end_rank = (i + 1) * num_pipeline_model_parallel_groups + for j in range(context_parallel_size * tensor_model_parallel_size): + ranks = range( + start_rank + j, end_rank, context_parallel_size * tensor_model_parallel_size + ) + dp_groups.append(list(ranks)) + for j in range(tensor_model_parallel_size): + ranks_with_cp = range(start_rank + j, end_rank, tensor_model_parallel_size) + all_data_parallel_group_ranks_with_cp.append(list(ranks_with_cp)) + dp_groups_with_cp.append(list(ranks_with_cp)) + + cp_group = [] + for i in range(pipeline_model_parallel_size): + for j in range(data_parallel_size): + start_rank = ( + i * num_pipeline_model_parallel_groups + + j * tensor_model_parallel_size * context_parallel_size + ) + end_rank = ( + i * num_pipeline_model_parallel_groups + + (j + 1) * tensor_model_parallel_size * context_parallel_size + ) + for k in range(tensor_model_parallel_size): + ranks = range(start_rank + k, end_rank, tensor_model_parallel_size) + cp_group.append(list(ranks)) + + mp_group = [] + for i in range(data_parallel_size * context_parallel_size): + ranks = [ + data_parallel_group_ranks_with_cp[i] + for data_parallel_group_ranks_with_cp in all_data_parallel_group_ranks_with_cp + ] + mp_group.append(list(ranks)) + + tp_group = [] + for i in range(num_tensor_model_parallel_groups): + ranks = range(i * tensor_model_parallel_size, (i + 1) * tensor_model_parallel_size) + tp_group.append(list(ranks)) + + pp_group = [] + for i in range(num_pipeline_model_parallel_groups): + ranks = range(i, world_size, num_pipeline_model_parallel_groups) + pp_group.append(list(ranks)) + + tp_dp_group = [] + tp_dp_cp_group = [] + tensor_and_data_group_size_with_cp: int = ( + tensor_model_parallel_size * data_parallel_size * context_parallel_size + ) + 
num_tensor_and_data_groups_with_cp: int = world_size // tensor_and_data_group_size_with_cp + for i in range(num_tensor_and_data_groups_with_cp): + start_rank = i * tensor_and_data_group_size_with_cp + end_rank = start_rank + tensor_and_data_group_size_with_cp + ranks = range(start_rank, end_rank) + tp_dp_cp_group.append(list(ranks)) + + for j in range(context_parallel_size): + ranks = [] + for k in range(data_parallel_size): + start_rank = ( + i * tensor_and_data_group_size_with_cp + + j * tensor_model_parallel_size + + k * tensor_model_parallel_size * context_parallel_size + ) + end_rank = start_rank + tensor_model_parallel_size + ranks = ranks + list(range(start_rank, end_rank)) + tp_dp_group.append(list(ranks)) + + tp_ep_group = [] + dp_no_ep_group = [] + dp_no_ep_group_with_cp = [] + + all_ranks = torch.arange(world_size).reshape( + ( + pipeline_model_parallel_size, + data_parallel_size // expert_model_parallel_size, + expert_model_parallel_size, + context_parallel_size, + tensor_model_parallel_size, + ) + ) + # 'pp edp ep cp tp -> (pp edp cp) (ep tp)' + tp_ep_rearrange = torch.transpose(all_ranks, 2, 3) + tp_ep_rearrange = torch.reshape( + tp_ep_rearrange, (-1, expert_model_parallel_size * tensor_model_parallel_size) + ) + tp_ep_rearrange = tp_ep_rearrange.tolist() + tp_ep_rearrange.sort() + for tensor_and_expert_parallel_ranks in tp_ep_rearrange: + tensor_and_expert_parallel_ranks = list(tensor_and_expert_parallel_ranks) + tensor_and_expert_parallel_ranks.sort() + tp_ep_group.append(tensor_and_expert_parallel_ranks) + # 'pp edp ep cp tp -> (pp ep cp tp) edp' + edp_rearrange = torch.transpose(all_ranks, 1, 4) + edp_rearrange = torch.reshape( + edp_rearrange, (-1, data_parallel_size // expert_model_parallel_size) + ) + edp_rearrange = edp_rearrange.tolist() + edp_rearrange.sort() + for expert_data_parallel_ranks in edp_rearrange: + expert_data_parallel_ranks = list(expert_data_parallel_ranks) + expert_data_parallel_ranks.sort() + dp_no_ep_group.append(expert_data_parallel_ranks) + # 'pp edp ep cp tp -> (pp ep tp) (cp edp)' + edp_cp_rearrange = torch.transpose(all_ranks, 1, 2) + edp_cp_rearrange = torch.transpose(edp_cp_rearrange, 2, 4) + edp_cp_rearrange = torch.reshape( + edp_cp_rearrange, + (-1, context_parallel_size * data_parallel_size // expert_model_parallel_size), + ) + edp_cp_rearrange = edp_cp_rearrange.tolist() + edp_cp_rearrange.sort() + for expert_data_parallel_ranksj_with_cp in edp_cp_rearrange: + expert_data_parallel_ranksj_with_cp = list(expert_data_parallel_ranksj_with_cp) + expert_data_parallel_ranksj_with_cp.sort() + dp_no_ep_group_with_cp.append(expert_data_parallel_ranksj_with_cp) + + return ( + dp_groups, + dp_groups_with_cp, + cp_group, + mp_group, + tp_group, + pp_group, + tp_dp_group, + tp_dp_cp_group, + tp_ep_group, + dp_no_ep_group, + dp_no_ep_group_with_cp, + ) + + world_size = nodes * num_gpu + dp = world_size // (tp * pp * cp) + assert dp % ep == 0, f"dp size ({dp}) is not divisible by ep {ep} ." + assert ( + world_size % (tp * pp * cp) == 0 + ), f"world_size ({world_size}) is not divisible by tp {tp} x pp {pp} x cp {cp}." 
+ ( + dp_groups, + dp_groups_with_cp, + cp_group, + mp_group, + tp_group, + pp_group, + tp_dp_group, + tp_dp_cp_group, + tp_ep_group, + dp_no_ep_group, + dp_no_ep_group_with_cp, + ) = golden_rank_result_from_past_code( + world_size=world_size, + tensor_model_parallel_size=tp, + pipeline_model_parallel_size=pp, + context_parallel_size=cp, + expert_model_parallel_size=ep, + ) + rank_generator = ps.RankGenerator(tp=tp, ep=ep, dp=dp, pp=pp, cp=cp, order="tp-cp-ep-dp-pp") + assert dp_groups == rank_generator.get_ranks( + "dp" + ), f"{dp_groups} != {rank_generator.get_ranks('dp')}" + assert dp_groups_with_cp == rank_generator.get_ranks( + 'dp-cp' + ), f"{dp_groups_with_cp} != {rank_generator.get_ranks('dp-cp')}" + assert cp_group == rank_generator.get_ranks( + "cp" + ), f"{cp_group} != {rank_generator.get_ranks('cp')}." + assert mp_group == rank_generator.get_ranks( + "tp-pp" + ), f"{mp_group} != {rank_generator.get_ranks('tp-pp')}" + assert tp_group == rank_generator.get_ranks( + "tp" + ), f"{tp_group} != {rank_generator.get_ranks('tp')}" + assert pp_group == rank_generator.get_ranks( + "pp" + ), f"{pp_group} != {rank_generator.get_ranks('pp')}" + assert tp_dp_group == rank_generator.get_ranks( + "tp-dp" + ), f"{tp_dp_group} != {rank_generator.get_ranks('tp-dp')}" + assert tp_dp_cp_group == rank_generator.get_ranks( + "tp-dp-cp" + ), f"{tp_dp_cp_group} != {rank_generator.get_ranks('tp-dp-cp')}" + assert tp_ep_group == rank_generator.get_ranks( + "tp-ep", independent_ep=True + ), f"{tp_ep_group} != {rank_generator.get_ranks('tp-ep', independent_ep=True)}." + assert dp_no_ep_group == rank_generator.get_ranks( + "dp", independent_ep=True + ), f"{dp_no_ep_group} != {rank_generator.get_ranks('dp', independent_ep=True)}." + assert dp_no_ep_group_with_cp == rank_generator.get_ranks( + "dp-cp", independent_ep=True + ), f"{dp_no_ep_group_with_cp} != {rank_generator.get_ranks('dp-cp', independent_ep=True)}." 
diff --git a/tests/unit_tests/test_tokenizer.py b/tests/unit_tests/test_tokenizer.py new file mode 100644 index 0000000000..13e222953b --- /dev/null +++ b/tests/unit_tests/test_tokenizer.py @@ -0,0 +1,193 @@ +import base64 +import json +from argparse import Namespace +from pathlib import Path + +import pytest +import requests + +from megatron.training import tokenizer +from megatron.training.tokenizer.gpt2_tokenization import PRETRAINED_VOCAB_ARCHIVE_MAP + +TOKENIZER_DIR = Path("~/data/tokenizers").expanduser() + +# Copied over from test_preprocess_data.py +__LOCAL_GPT2_VOCAB = "/home/gitlab-runner/data/gpt3_data/gpt2-vocab.json" + + +def offsets_to_substrs(offsets, string): + return [string[start:end] for start, end in zip([0] + offsets, offsets + [len(string)])] + + +def local_test_specs(): + return [ + Namespace( + rank=0, + tensor_model_parallel_size=8, + make_vocab_size_divisible_by=128, + tokenizer_type="GPTSentencePieceTokenizer", + tokenizer_model=f"{TOKENIZER_DIR}/nemotron_2_256k.model", + ), + Namespace( + rank=0, + vocab_size=131072, + make_vocab_size_divisible_by=128, + tensor_model_parallel_size=8, + tokenizer_type="TikTokenizer", + tokenizer_model=f"{TOKENIZER_DIR}/multiMixV8.gpt4o_nc_sd.500000.128k.vocab.json", + tiktoken_pattern="v2", + tiktoken_num_special_tokens=1000, + tiktoken_special_tokens=["", "", ""], + ), + Namespace( + rank=0, + vocab_size=131072, + make_vocab_size_divisible_by=128, + tensor_model_parallel_size=8, + tokenizer_type="TikTokenizer", + tokenizer_model=f"{TOKENIZER_DIR}/multiMixV5_fix_default_500000_128k.vocab.json", + tiktoken_pattern="v1", + tiktoken_num_special_tokens=1000, + tiktoken_special_tokens=["", "", ""], + ), + Namespace( + rank=0, + vocab_size=128000, + make_vocab_size_divisible_by=128, + tensor_model_parallel_size=8, + tokenizer_type="HuggingFaceTokenizer", + tokenizer_model="meta-llama/Llama-2-7b-hf", + ), + Namespace( + rank=0, + vocab_size=128000, + make_vocab_size_divisible_by=128, + tensor_model_parallel_size=8, + tokenizer_type="HuggingFaceTokenizer", + tokenizer_model="meta-llama/Meta-Llama-3.1-8B", + ), + ] + + +@pytest.fixture(scope="session") +def gpt2_tiktok_vocab(tmp_path_factory): + + if Path(__LOCAL_GPT2_VOCAB).exists(): + with open(__LOCAL_GPT2_VOCAB, "r", encoding="utf-8") as reader: + gpt2_vocab = json.load(reader) + else: + gpt2_vocab = json.loads(requests.get(PRETRAINED_VOCAB_ARCHIVE_MAP["gpt2"]).content) + + N = 256 + tiktok_vocab = [ + {"token_bytes": base64.b64encode(bytes([i])).decode("utf-8"), "token_str": str(i)} + for i in range(N) + ] + tiktok_vocab_bytes = {x["token_bytes"] for x in tiktok_vocab} + + tiktok_vocab += [ + {"token_bytes": base64.b64encode(token.encode('utf-8')).decode("utf-8"), "token_str": token} + for token in gpt2_vocab + if base64.b64encode(token.encode('utf-8')).decode("utf-8") not in tiktok_vocab_bytes + ] + + for i, entry in enumerate(tiktok_vocab): + entry["rank"] = i + + for i, x in enumerate(tiktok_vocab): + assert x.keys() == {"rank", "token_bytes", "token_str"} + assert x["rank"] == i + merge = base64.b64decode(x["token_bytes"]) + assert i >= 256 or merge == bytes([i]), f"{i} {merge} {bytes([i])}" + + file_name = tmp_path_factory.mktemp("data") / "gpt2_vocab.json" + with open(file_name, "w") as f: + json.dump(tiktok_vocab, f) + + return Namespace( + rank=0, + vocab_size=32768, + make_vocab_size_divisible_by=128, + tensor_model_parallel_size=8, + tokenizer_type="TikTokenizer", + tokenizer_model=str(file_name), + tiktoken_pattern="v1", + tiktoken_num_special_tokens=1000, + 
tiktoken_special_tokens=["", "", ""], + ) + + +def specs(): + if TOKENIZER_DIR.exists(): + return local_test_specs() + return [] + + +@pytest.mark.parametrize("args", specs()) +def test_tokenizer(args): + tok = tokenizer.build_tokenizer(args) + run_tokenizer_tests(tok) + + +def test_gpt2_tiktok_tokenizer(gpt2_tiktok_vocab): + tok = tokenizer.build_tokenizer(gpt2_tiktok_vocab) + run_tokenizer_tests(tok) + + +def run_tokenizer_tests(tok): + string1 = ( + "The following are multiple choice questions (with answers) about college biology.\n" + "Monoclonal antisera are distinguished from polyclonal antisera in which of the " + "following ways?\n" + "A. Each type of antibody in a monoclonal antiserum reacts against a single region of " + "a single antigen; each type of antibody in a polyclonal antiserum reacts against " + "multiple regions of different antigens.\n" + "B. A monoclonal antibody reacts against multiple regions of a single antigen; a " + "polyclonal antibody reacts against a single region of related antigens.\n" + "C. A monoclonal antiserum contains antibodies secreted from the descendants of a " + "single B lymphocyte; a polyclonal antiserum contains antibodies secreted from the " + "descendants of different B lymphocytes.\n" + "D. A monoclonal antiserum contains antibodies secreted from the descendants of a " + "single B lymphocyte; a polyclonal antiserum contains antibodies secreted from the " + "descendants of both B and T lymphocytes.\n" + "Answer: C" + ) + string2 = "Жизнь прекрасна и удивительна" + string3 = "お誕生日おめでとう" + strings = [string1, string2, string3] + + for test_string in strings: + toks = tok.tokenize(test_string) + offsets = tok.offsets(toks, test_string) + dec = offsets_to_substrs(offsets, test_string) + detok_str = ''.join(dec) + # the following is not necessarily true by construction above, + # since the many tokenizers may operate at the byte level and not + # only at the character level. + assert ( + detok_str == test_string + ), f"Detokenized string {detok_str} does not match original {test_string}" + assert len(toks) == len( + offsets + ), f"Tokenized string {toks} does not match original {offsets}" + + +def test_null_tokenizer(): + args = Namespace( + tokenizer_type="NullTokenizer", + rank=0, + vocab_size=128000, + make_vocab_size_divisible_by=128, + tensor_model_parallel_size=8, + ) + tok = tokenizer.build_tokenizer(args) + test_string = "1 23 456 789" + toks = tok.tokenize(test_string) + offsets = tok.offsets(toks, test_string) + dec = offsets_to_substrs(offsets, test_string) + detok_str = ''.join(dec) + + assert ( + detok_str == test_string + ), f"Detokenized string {detok_str} does not match original {test_string}" + assert len(toks) == len(offsets), f"Tokenized string {toks} does not match original {offsets}" diff --git a/tests/unit_tests/test_training.py b/tests/unit_tests/test_training.py new file mode 100644 index 0000000000..a23496f981 --- /dev/null +++ b/tests/unit_tests/test_training.py @@ -0,0 +1,68 @@ +from types import SimpleNamespace + +from megatron.training.global_vars import set_args +from megatron.training.tokenizer.tokenizer import _vocab_size_with_padding +from megatron.training.training import build_train_valid_test_data_iterators +from tests.unit_tests.test_utilities import Utils + + +def mock_train_valid_test_datasets_provider(train_val_test_num_samples): + return 1, 2, 3 + + +def create_test_args(): + # Set dummy values for the args. 
+ args = SimpleNamespace() + args.iteration = 0 + args.train_samples = 1 + args.train_iters = 1 + args.eval_interval = 1 + args.eval_iters = 1 + args.global_batch_size = 1 + args.consumed_train_samples = 1 + args.consumed_valid_samples = 1 + args.dataloader_type = "external" + args.skip_train = False + + return args + + +class TestTraining: + def setup_method(self, method): + Utils.initialize_model_parallel(1, 1) + args = create_test_args() + set_args(args) + + def test_build_train_valid_test_data_iterators(self): + train_iter, valid_iter, test_iter = build_train_valid_test_data_iterators( + mock_train_valid_test_datasets_provider + ) + + assert (train_iter, valid_iter, test_iter) == (1, 2, 3) + + def test_closed_formula_vocab_size_with_padding(self): + def old_round_impl(after, multiple): + while (after % multiple) != 0: + after += 1 + return after + + args = SimpleNamespace() + args.rank = 0 + args.tensor_model_parallel_size = 1 + + for vocab in range(1, 600000, 1000): + for mult in [1, 17, 32, 64, 128]: + args.make_vocab_size_divisible_by = mult + assert old_round_impl(vocab, mult) == _vocab_size_with_padding( + vocab, args, False + ), (vocab, mult) + + for vocab in range(1, 10_000, 500): + for mult in range(1, 1024 + 1): + args.make_vocab_size_divisible_by = mult + assert old_round_impl(vocab, mult) == _vocab_size_with_padding( + vocab, args, False + ), (vocab, mult) + + def teardown_method(self, method): + Utils.destroy_model_parallel() diff --git a/tests/unit_tests/test_utilities.py b/tests/unit_tests/test_utilities.py index b35c77b58d..29aef63c88 100644 --- a/tests/unit_tests/test_utilities.py +++ b/tests/unit_tests/test_utilities.py @@ -1,30 +1,104 @@ import os +from datetime import timedelta + import torch +from torch._C._distributed_c10d import PrefixStore +from torch.distributed import rendezvous + import megatron.core.parallel_state as ps + +class TestModel(torch.nn.Module): + def __init__( + self, + input_dim: int, + output_dim: int, + num_layers: int, + bias: bool, + shared_embedding: bool = False, + ): + super().__init__() + self.layers = torch.nn.ModuleList( + [torch.nn.Linear(input_dim, output_dim, bias) for _ in range(num_layers)] + ) + if shared_embedding: + self.layers[-1].weight.shared_embedding = True + + class Utils: world_size = torch.cuda.device_count() rank = int(os.environ['LOCAL_RANK']) + inited = False + store = None @staticmethod def initialize_distributed(): - print(f'Initializing torch.distributed with rank: {Utils.rank}, world_size: {Utils.world_size}') - torch.cuda.set_device(Utils.rank % torch.cuda.device_count()) - init_method = 'tcp://' - master_ip = os.getenv('MASTER_ADDR', 'localhost') - master_port = os.getenv('MASTER_PORT', '6000') - init_method += master_ip + ':' + master_port - torch.distributed.init_process_group(backend='nccl', world_size=Utils.world_size, rank=Utils.rank, init_method=init_method) - + if not torch.distributed.is_initialized() and Utils.rank >= 0: + print( + f'Initializing torch.distributed with rank: {Utils.rank}, ' + f'world_size: {Utils.world_size}' + ) + torch.cuda.set_device(Utils.rank % torch.cuda.device_count()) + init_method = 'tcp://' + master_ip = os.getenv('MASTER_ADDR', 'localhost') + master_port = os.getenv('MASTER_PORT', '6000') + init_method += master_ip + ':' + master_port + rendezvous_iterator = rendezvous( + init_method, Utils.rank, Utils.world_size, timeout=timedelta(minutes=1) + ) + store, rank, world_size = next(rendezvous_iterator) + store.set_timeout(timedelta(minutes=1)) + + # Use a PrefixStore to avoid 
accidental overrides of keys used by + # different systems (e.g. RPC) in case the store is multi-tenant. + store = PrefixStore("default_pg", store) + Utils.store = store + + torch.distributed.init_process_group( + backend='nccl', world_size=Utils.world_size, rank=Utils.rank, store=store + ) + + torch.distributed.barrier() + Utils.inited = True + + @staticmethod + def set_world_size(world_size=None, rank=None): + Utils.world_size = torch.cuda.device_count() if world_size is None else world_size + if ( + torch.distributed.is_initialized() + and Utils.world_size != torch.distributed.get_world_size() + ): + torch.distributed.destroy_process_group() + + if rank is None: + Utils.rank = int(os.environ['LOCAL_RANK']) + if Utils.rank >= Utils.world_size: + Utils.rank = -1 + else: + Utils.rank = rank + @staticmethod def destroy_model_parallel(): - ps.destroy_model_parallel() + if not Utils.inited: + return torch.distributed.barrier() + ps.destroy_model_parallel() + Utils.inited = False @staticmethod - def initialize_model_parallel(tensor_model_parallel_size = 1, pipeline_model_parallel_size = 1, virtual_pipeline_model_parallel_size = None, pipeline_model_parallel_split_rank = None): + def initialize_model_parallel( + tensor_model_parallel_size=1, + pipeline_model_parallel_size=1, + virtual_pipeline_model_parallel_size=None, + **kwargs, + ): ps.destroy_model_parallel() - if not torch.distributed.is_initialized(): - Utils.initialize_distributed() - ps.initialize_model_parallel(tensor_model_parallel_size, pipeline_model_parallel_size, virtual_pipeline_model_parallel_size, pipeline_model_parallel_split_rank) \ No newline at end of file + Utils.initialize_distributed() + ps.initialize_model_parallel( + tensor_model_parallel_size, + pipeline_model_parallel_size, + virtual_pipeline_model_parallel_size, + **kwargs, + ) + Utils.inited = True diff --git a/tests/unit_tests/test_utils.py b/tests/unit_tests/test_utils.py index fda10450d8..229cead1c3 100644 --- a/tests/unit_tests/test_utils.py +++ b/tests/unit_tests/test_utils.py @@ -1,36 +1,213 @@ +import os +import time +import urllib.request as req + +import numpy as np import pytest import torch + import megatron.core.utils as util -import numpy as np +from tests.unit_tests.test_utilities import Utils + def test_divide_properly(): - assert util.divide(4,2) == 2 + assert util.divide(4, 2) == 2 + def test_divide_improperly(): with pytest.raises(AssertionError): - util.divide(4,5) + util.divide(4, 5) + def test_global_memory_buffer(): global_memory_buffer = util.GlobalMemoryBuffer() - obtained_tensor = global_memory_buffer.get_tensor((3,2), torch.float32, "test_tensor") - expected_tensor = torch.empty((3,2), dtype=torch.float32, device=torch.cuda.current_device()) - assert torch.equal(obtained_tensor, expected_tensor) + obtained_tensor = global_memory_buffer.get_tensor((3, 2), torch.float32, "test_tensor") + expected_tensor = torch.empty((3, 2), dtype=torch.float32, device=torch.cuda.current_device()) + assert obtained_tensor.shape == expected_tensor.shape + def test_make_viewless_tensor(): - inp = torch.rand((3,4)) - assert(torch.equal(inp, util.make_viewless_tensor(inp, True, True))) - assert(torch.equal(inp, util.make_viewless_tensor(inp, True, False))) + inp = torch.rand((3, 4)) + assert torch.equal(inp, util.make_viewless_tensor(inp, True, True)) + assert torch.equal(inp, util.make_viewless_tensor(inp, True, False)) + def test_safely_set_viewless_tensor_data(): - tensor = torch.zeros((3,4)) - new_data_tensor = torch.tensor(np.random.rand(3,4)) + tensor = 
torch.zeros((3, 4)) + new_data_tensor = torch.tensor(np.random.rand(3, 4)) util.safely_set_viewless_tensor_data(tensor, new_data_tensor) - assert(torch.equal(tensor, new_data_tensor)) + assert torch.equal(tensor, new_data_tensor) + def test_assert_viewless_tensor(): - tensor = torch.rand((3,4)) - assert(torch.equal(util.assert_viewless_tensor(tensor), tensor)) - input_tensor_list=[tensor,tensor,tensor] + tensor = torch.rand((3, 4)) + assert torch.equal(util.assert_viewless_tensor(tensor), tensor) + input_tensor_list = [tensor, tensor, tensor] output_tensor_list = util.assert_viewless_tensor(input_tensor_list) - for inp,out in zip(input_tensor_list, output_tensor_list): - assert(torch.equal(inp,out)) + for inp, out in zip(input_tensor_list, output_tensor_list): + assert torch.equal(inp, out) + + +# Initialize torch.distributed; do not call init_process_group here, call +# Utils.initialize_distributed() instead. +def _init_distributed(world, rank): + Utils.initialize_distributed() + assert torch.distributed.is_initialized() == True + assert torch.distributed.get_rank() == rank + assert torch.cuda.device_count() == world + torch.distributed.barrier() + + +# Deinitialization and cleanup. +# Do not call torch.distributed.destroy_process_group, may be needed by other tests. +def _deinit_distributed(): + assert torch.distributed.is_initialized() == True + torch.distributed.barrier() + + +def test_check_param_hashes_across_dp_replicas(): + world = int(os.getenv('WORLD_SIZE', '1')) + rank = int(os.getenv('RANK', '0')) + + # Setup. + _init_distributed(world, rank) + Utils.initialize_model_parallel() + model = torch.nn.Linear(100, 100, bias=False) + + # First check case where all replicas agree. + model.weight.data.fill_(1.0) + assert util.check_param_hashes_across_dp_replicas([model]) + + # Now check case where replica 0 disagrees with all other replicas. + if rank == 0: + model.weight.data.fill_(0.0) + param_hashes_match = util.check_param_hashes_across_dp_replicas([model]) + expected_param_hashes_match = rank == 0 + assert param_hashes_match == expected_param_hashes_match + + # Teardown. + _deinit_distributed() + + +def test_cross_check_param_hashes_across_dp_replicas(): + world = int(os.getenv('WORLD_SIZE', '1')) + rank = int(os.getenv('RANK', '0')) + + # Setup. + _init_distributed(world, rank) + Utils.initialize_model_parallel() + model = torch.nn.Linear(100, 100, bias=False) + + # First check case where all replicas agree. + model.weight.data.fill_(1.0) + assert util.check_param_hashes_across_dp_replicas([model], True) + + # Now check case where replica 0 disagrees with all other replicas. + if rank == 0: + model.weight.data.fill_(0.0) + assert not util.check_param_hashes_across_dp_replicas([model], True) + + # Teardown. + _deinit_distributed() + + +def test_straggler_detector(): + world = int(os.getenv('WORLD_SIZE', '1')) + rank = int(os.getenv('RANK', '0')) + master = os.getenv('MASTER_ADDR', 'localhost') + port = 65535 + + # Checks if the instance is disabled. + def straggler_detector_disabled(): + assert stimer.enabled == False + + # Checks if the instance is enabled. + def straggler_detector_enabled(): + assert stimer.enabled == True + + # Enable. + def straggler_detector_enable(): + if rank == 0: + resp = req.urlopen(f"http://{master}:{port}").read().decode().split() + assert resp[3] == "ON" + # Call the report function, this will propagate the change. + stimer.report() + + # Time an operation. + def straggler_detector_timeit(): + s = 2 # Sleep for 2 seconds. 
+ M = 20 + K = 30 + N = 40 + mat1 = torch.randn(M, K, device='cuda') + mat2 = torch.randn(K, N, device='cuda') + # batch_data. + with stimer(bdata=True): + time.sleep(s) + # GEMM. + with stimer: + res = torch.matmul(mat1, mat2) + delta, batch_delta, _, _, _, _ = stimer.elapsed() + assert delta > 0.0 + assert batch_delta >= s + + # Test function to raise ValueError + def straggler_value_error(): + raise ValueError("Exception value raised") + + # Check that exception is not suppressed. + def straggler_detector_exception_propagate(): + # batch_data + with pytest.raises(ZeroDivisionError): + with stimer(bdata=True): + x = 1 / 0 + # non-batch-data + with pytest.raises(ValueError, match=r".* value .*"): + with stimer(): + straggler_value_error() + + # Reporting. + def straggler_detector_report(): + s = 2 # Sleep for 2 seconds. + N = 20 + P = 30 + M = 40 + mat1 = torch.randn(N, P, device='cuda') + mat2 = torch.randn(P, M, device='cuda') + tfp = (N * M) * (2 * P - 1) # Theoretical. + iter = 10 # Mock. + # batch_data. + with stimer(bdata=True): + time.sleep(s) + # GEMM. + with stimer: + res = torch.matmul(mat1, mat2) + r = stimer.report(total_flops=tfp, log_interval=iter) + rb = True if rank == 0 else False + assert r == rb + + # Start test. + # Setup. + _init_distributed(world, rank) + + # Create a straggler_detector with enabled set to false. + stimer = util.StragglerDetector() + stimer.configure(world, rank, enabled=False, port=port) + # Check if configuration was success. + assert stimer.configured == True + + # Check if the instance is in disabled state. + straggler_detector_disabled() + # Enable it now, must call report. + straggler_detector_enable() + # Check if all ranks have straggler detector enabled. + straggler_detector_enabled() + # Time some operation. + straggler_detector_timeit() + # Report only from rank 0. + straggler_detector_report() + # Check that exception is not suppressed. + straggler_detector_exception_propagate() + util.StragglerDetector._configured = False + # Teardown. + _deinit_distributed() diff --git a/tests/unit_tests/transformer/__init__.py b/tests/unit_tests/transformer/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/unit_tests/transformer/moe/__init__.py b/tests/unit_tests/transformer/moe/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/unit_tests/transformer/moe/test_a2a_token_dispatcher.py b/tests/unit_tests/transformer/moe/test_a2a_token_dispatcher.py new file mode 100644 index 0000000000..ad829881d0 --- /dev/null +++ b/tests/unit_tests/transformer/moe/test_a2a_token_dispatcher.py @@ -0,0 +1,87 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
+ +import pytest +import torch + +from megatron.core.transformer.moe.moe_utils import permute, unpermute +from tests.unit_tests.test_utilities import Utils +from tests.unit_tests.transformer.moe.test_token_dispatcher import MoEModelTestContainer + + +class TestAlltoAllDispatcher: + def setup_method(self, method): + pass + + def teardown_method(self, method): + Utils.destroy_model_parallel() + + @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") + @pytest.mark.internal + @pytest.mark.timeout(120) + @pytest.mark.parametrize("tp_size,ep_size", [(1, 8), (8, 1), (4, 2), (1, 1)]) + def test_forward_backward(self, tp_size, ep_size): + container = MoEModelTestContainer( + tp_size=tp_size, + ep_size=ep_size, + pp_size=1, + num_moe_experts=8, + moe_router_topk=2, + moe_router_load_balancing_type="aux_loss", + moe_token_dispatcher_type="alltoall", + ) + container.dispatcher_dropless_test() + + @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") + @pytest.mark.internal + @pytest.mark.timeout(120) + @pytest.mark.parametrize("tp_size,ep_size", [(1, 8), (8, 1), (4, 2), (1, 1)]) + def test_a2aseq_forward_backward(self, tp_size, ep_size): + container = MoEModelTestContainer( + tp_size=tp_size, + ep_size=ep_size, + pp_size=1, + num_moe_experts=8, + moe_router_topk=2, + moe_router_load_balancing_type="aux_loss", + moe_token_dispatcher_type="alltoall_seq", + ) + container.dispatcher_dropless_test() + + @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") + @pytest.mark.internal + @pytest.mark.timeout(120) + @pytest.mark.parametrize("tp_size,ep_size", [(1, 8), (8, 1), (4, 2), (1, 1)]) + def test_capacity_forward_backward(self, tp_size, ep_size): + container = MoEModelTestContainer( + tp_size=tp_size, + ep_size=ep_size, + pp_size=1, + num_moe_experts=8, + moe_router_topk=2, + moe_router_load_balancing_type="aux_loss", + moe_token_dispatcher_type="alltoall", + moe_token_drop_policy="probs", + moe_expert_capacity_factor=0.5, + moe_pad_expert_input_to_capacity=False, + ) + container.dispacher_capacity_test() + + @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") + @pytest.mark.internal + @pytest.mark.timeout(120) + @pytest.mark.parametrize("tp_size,ep_size", [(1, 8), (8, 1), (4, 2), (1, 1)]) + @pytest.mark.flaky + def test_capacity_padding_forward_backward(self, tp_size, ep_size): + container = MoEModelTestContainer( + tp_size=tp_size, + ep_size=ep_size, + pp_size=1, + num_moe_experts=8, + moe_router_topk=2, + moe_router_load_balancing_type="aux_loss", + moe_token_dispatcher_type="alltoall", + moe_token_drop_policy="probs", + moe_expert_capacity_factor=0.5, + moe_pad_expert_input_to_capacity=True, + ) + container.dispatcher_drop_and_pad_test() diff --git a/tests/unit_tests/transformer/moe/test_aux_loss.py b/tests/unit_tests/transformer/moe/test_aux_loss.py new file mode 100644 index 0000000000..2b7b2e109b --- /dev/null +++ b/tests/unit_tests/transformer/moe/test_aux_loss.py @@ -0,0 +1,95 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
+ +import pytest +import torch + +from megatron.core import parallel_state +from megatron.core.transformer.moe.moe_utils import clear_aux_losses_tracker +from tests.unit_tests.test_utilities import Utils +from tests.unit_tests.transformer.moe.test_token_dispatcher import MoEModelTestContainer + + +class AuxlossTestContainer(MoEModelTestContainer): + def partition_input(self, input): + partitioned_input = input.chunk( + parallel_state.get_tensor_and_context_parallel_world_size(), dim=1 + )[parallel_state.get_tensor_and_context_parallel_rank()] + output = partitioned_input.clone().detach() + output.requires_grad = True + return output + + def aux_loss_test(self, input, baseline_grad): + partitioned_input = self.partition_input(input) + moe_layer = self.moe_layer + probs, indices = moe_layer.router(partitioned_input) + probs.sum().mul_(0).backward() + aux_loss_grad = partitioned_input.grad + torch.distributed.barrier() + ans = self.partition_input(baseline_grad) + assert torch.allclose(aux_loss_grad, ans), f"Diff: {(aux_loss_grad/ans).mean()}" + loss = parallel_state.get_moe_layer_wise_logging_tracker()['load_balancing_loss'] + clear_aux_losses_tracker() + + +class TestAuxLoss: + def setup_method(self, method): + baseline_container = AuxlossTestContainer( + tp_size=1, + ep_size=1, + pp_size=1, + cp_size=1, + num_moe_experts=8, + moe_router_topk=2, + moe_router_load_balancing_type="aux_loss", + moe_token_dispatcher_type="alltoall", + moe_aux_loss_coeff=0.1, + ) + moe_layer = baseline_container.moe_layer + self.input = torch.randn((32, 8, moe_layer.config.hidden_size)).cuda() + self.input.requires_grad = True + probs, indices = moe_layer.router(self.input) + probs.sum().mul_(0).backward() # zero out the main gradients + self.baseline_grad = self.input.grad + self.input.grad = None + clear_aux_losses_tracker() + + def teardown_method(self, method): + Utils.destroy_model_parallel() + + @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") + @pytest.mark.internal + @pytest.mark.parametrize( + "tp_size,ep_size,cp_size", [(8, 1, 1), (4, 2, 1), (1, 1, 8), (2, 1, 4), (2, 2, 2)] + ) + def test_allgather_dispatcher(self, tp_size, ep_size, cp_size): + container = AuxlossTestContainer( + tp_size=tp_size, + ep_size=ep_size, + pp_size=1, + cp_size=cp_size, + num_moe_experts=8, + moe_router_topk=2, + moe_router_load_balancing_type="aux_loss", + moe_token_dispatcher_type="allgather", + moe_aux_loss_coeff=0.1, + ) + container.aux_loss_test(self.input, self.baseline_grad) + + @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") + @pytest.mark.internal + @pytest.mark.parametrize( + "tp_size,ep_size,cp_size", [(8, 1, 1), (4, 2, 1), (1, 1, 8), (2, 1, 4), (2, 2, 2)] + ) + def test_a2a_dispatcher(self, tp_size, ep_size, cp_size): + container = AuxlossTestContainer( + tp_size=tp_size, + ep_size=ep_size, + pp_size=1, + cp_size=cp_size, + num_moe_experts=8, + moe_router_topk=2, + moe_router_load_balancing_type="aux_loss", + moe_token_dispatcher_type="alltoall", + moe_aux_loss_coeff=0.1, + ) + container.aux_loss_test(self.input, self.baseline_grad) diff --git a/tests/unit_tests/transformer/moe/test_grouped_mlp.py b/tests/unit_tests/transformer/moe/test_grouped_mlp.py new file mode 100644 index 0000000000..043bdc8c58 --- /dev/null +++ b/tests/unit_tests/transformer/moe/test_grouped_mlp.py @@ -0,0 +1,388 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
+
+import pytest
+import torch
+import torch.nn.functional as F
+
+from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_with_transformer_engine_spec
+from megatron.core.transformer.moe import grouped_gemm_util as gg
+from megatron.core.transformer.moe.experts import TEGroupedMLP
+from megatron.core.transformer.moe.moe_layer import MoELayer
+from megatron.core.transformer.transformer_config import TransformerConfig
+from megatron.core.utils import is_te_min_version
+from megatron.legacy.model import Float16Module
+from megatron.training.arguments import parse_args
+from megatron.training.initialize import _set_random_seed
+from tests.unit_tests.test_utilities import Utils
+
+DEVICE_CAPABILITY = None
+if torch.cuda.is_available():
+    DEVICE_CAPABILITY = torch.cuda.get_device_capability()
+
+
+class TestParallelGroupedMLP:
+
+    def setup_method(self, method, use_cpu_initialization=False, swiglu=True):
+        print("============")
+        print(
+            "Test for use_cpu_initialization={} and swiglu={}.".format(
+                use_cpu_initialization, swiglu
+            )
+        )
+        print("============")
+        Utils.initialize_model_parallel(1, 1)
+        num_layers = 1  # 2
+        self.hidden_size = (
+            16  # must be a multiple of 16, otherwise it triggers a CUTLASS misaligned-address issue
+        )
+        self.num_experts = 2
+        self.gated_linear_unit = swiglu
+        self.activation_func = F.silu if swiglu else F.gelu
+        self.use_cpu_initialization = use_cpu_initialization
+
+        tf_config = TransformerConfig(
+            num_layers=num_layers,
+            hidden_size=self.hidden_size,
+            num_attention_heads=4,
+            num_moe_experts=self.num_experts,
+            use_cpu_initialization=self.use_cpu_initialization,
+            add_bias_linear=False,
+            gated_linear_unit=self.gated_linear_unit,
+            activation_func=self.activation_func,
+            bias_activation_fusion=False,
+            bf16=True,
+            params_dtype=torch.bfloat16,
+            moe_router_load_balancing_type="sinkhorn",
+            moe_router_topk=1,
+        )
+
+        self.fc1_ffn_hidden_size = tf_config.ffn_hidden_size
+        self.fc2_ffn_hidden_size = tf_config.ffn_hidden_size
+        # If using swiglu, double the output width; see https://arxiv.org/pdf/2002.05202.pdf
+        if self.gated_linear_unit:
+            self.fc1_ffn_hidden_size *= 2
+
+        ## Vanilla sequential GEMM
+        # Set random seed for reproducibility
+        _set_random_seed(seed_=123, data_parallel_random_init=False)
+        transformer_layer_spec = get_gpt_layer_with_transformer_engine_spec(
+            self.num_experts, moe_grouped_gemm=False
+        )
+        self.sequential_mlp = MoELayer(tf_config, transformer_layer_spec.submodules.mlp.submodules)
+
+        self.args = parse_args(ignore_unknown_args=True)
+        self.args.bf16 = True
+        # Bias is not currently supported in grouped GEMM, so we disable the
+        # bias in the linear layer.
+        self.args.add_bias_linear = False
+        self.sequential_mlp = Float16Module(self.sequential_mlp, self.args).module
+        print("done initializing for sequential gemm")
+
+        ## Grouped GEMM
+        _set_random_seed(seed_=123, data_parallel_random_init=False)
+        tf_config.moe_grouped_gemm = True
+        transformer_layer_spec = get_gpt_layer_with_transformer_engine_spec(
+            self.num_experts, moe_grouped_gemm=True
+        )
+        self.grouped_mlp = MoELayer(tf_config, transformer_layer_spec.submodules.mlp.submodules)
+        self.grouped_mlp = Float16Module(self.grouped_mlp, self.args).module
+        print("done initializing for grouped gemm")
+
+    def teardown_method(self, method):
+        Utils.destroy_model_parallel()
+
+    @pytest.mark.internal
+    def test_constructor(self):
+        assert isinstance(self.sequential_mlp, MoELayer)
+        assert isinstance(self.grouped_mlp, MoELayer)
+
+        num_weights_smm = sum([p.numel() for p in self.sequential_mlp.parameters()])
+        num_weights_gmm = sum([p.numel() for p in self.grouped_mlp.parameters()])
+
+        # For model configs whose hyper-parameters differ only in `moe_grouped_gemm`,
+        # the grouped GEMM and sequential GEMM variants should hold the same number of params.
+        assert num_weights_smm == num_weights_gmm
+        # expected num weights: router linear weights+bias + MLP weights (no bias) of all experts
+        expected_num_weights = (
+            self.hidden_size * self.num_experts
+            + self.hidden_size
+            * (self.fc1_ffn_hidden_size + self.fc2_ffn_hidden_size)
+            * self.num_experts
+        )
+        assert num_weights_smm == expected_num_weights
+
+        assert torch.equal(self.sequential_mlp.router.weight, self.grouped_mlp.router.weight)
+
+        # weight1: [h, num_experts*4h]
+        # weight2: [num_experts*4h, h]
+        assert self.grouped_mlp.experts.weight1.shape[0] == self.hidden_size
+        assert (
+            self.grouped_mlp.experts.weight1.shape[1] == self.num_experts * self.fc1_ffn_hidden_size
+        )
+        if self.gated_linear_unit:
+            assert (
+                self.grouped_mlp.experts.weight2.shape[0]
+                == self.num_experts * self.fc2_ffn_hidden_size
+            )
+            assert self.grouped_mlp.experts.weight2.shape[1] == self.hidden_size
+        else:
+            assert (
+                self.grouped_mlp.experts.weight1.shape == self.grouped_mlp.experts.weight2.t().shape
+            )
+
+    @pytest.mark.internal
+    def test_weight_init_value_the_same(self):
+        gmm_w1 = self.grouped_mlp.experts.weight1.view(self.num_experts, -1, self.hidden_size)
+        gmm_w2 = self.grouped_mlp.experts.weight2.view(self.num_experts, self.hidden_size, -1)
+        gmm_expert1_fc1 = gmm_w1[0]
+        gmm_expert1_fc2 = gmm_w2[0]
+        gmm_expert2_fc1 = gmm_w1[1]
+        gmm_expert2_fc2 = gmm_w2[1]
+
+        smm_expert1_fc1 = self.sequential_mlp.experts.local_experts[0].linear_fc1.weight
+        smm_expert1_fc2 = self.sequential_mlp.experts.local_experts[0].linear_fc2.weight
+        smm_expert2_fc1 = self.sequential_mlp.experts.local_experts[1].linear_fc1.weight
+        smm_expert2_fc2 = self.sequential_mlp.experts.local_experts[1].linear_fc2.weight
+
+        assert torch.equal(gmm_expert1_fc1, smm_expert1_fc1)
+        if not self.use_cpu_initialization:
+            assert torch.equal(gmm_expert1_fc2, smm_expert1_fc2)
+        # The param init values are not exactly the same between gmm and smm for the
+        # remaining weights, so the following check is skipped.
+        # TODO: is it necessary to keep smm and gmm sharing exactly the same init params?
+ # assert torch.equal(gmm_expert2_fc1, smm_expert2_fc1) + if self.use_cpu_initialization: + assert torch.equal(gmm_expert2_fc2, smm_expert2_fc2) + + @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") + @pytest.mark.internal + @pytest.mark.skipif( + not DEVICE_CAPABILITY or DEVICE_CAPABILITY[0] < 8, + reason='GroupedGEMM kernels are not supported on this device.', + ) + def test_gpu_forward(self): + self.sequential_mlp.cuda() + self.grouped_mlp.cuda() + # [sequence length, batch size, hidden size] + seq_len = 3 # 32 + batch_size = 2 + hidden_states = torch.rand( + (seq_len, batch_size, self.sequential_mlp.config.hidden_size), dtype=torch.bfloat16 + ) + hidden_states = hidden_states.cuda() + output_smm, _ = self.sequential_mlp(hidden_states) + output_gmm, _ = self.grouped_mlp(hidden_states) + + # The following assert fails due to the param init value is not exactly + # the same between gmm and smm (refer to test_weight_init_value_the_same.) + # assert torch.equal(output_smm, output_gmm) + + @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") + @pytest.mark.internal + @pytest.mark.skipif( + not DEVICE_CAPABILITY or DEVICE_CAPABILITY[0] < 8, + reason='GroupedGEMM kernels are not supported on this device.', + ) + def test_gpu_forward_with_no_tokens_allocated(self): + """Test the case when no token is allocated for groupedGEMM kernels.""" + w1 = self.grouped_mlp.experts.weight1.view(self.num_experts, -1, self.hidden_size) + num_allocated_tokens = 0 + tokens_per_expert = torch.zeros(self.num_experts) + hidden_states = torch.rand((num_allocated_tokens, self.hidden_size), dtype=torch.bfloat16) + hidden_states = hidden_states.cuda() + try: + gg.ops.gmm(hidden_states, w1, tokens_per_expert, trans_b=False) + except Exception as e: + print("Expected error message from groupedGEMM:", e) + assert str(e) == "Input batch_sizes should not be all zeros!" 
+ + @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") + @pytest.mark.internal + @pytest.mark.skipif( + not DEVICE_CAPABILITY or DEVICE_CAPABILITY[0] < 8, + reason='GroupedGEMM kernels are not supported on this device.', + ) + def test_gradient_with_no_tokens_allocated(self): + """Test that when no token is passed in, the parameters of the grouped MLP will also have gradients.""" + self.grouped_mlp.cuda() + num_allocated_tokens = 0 + tokens_per_expert = torch.zeros(self.num_experts) + hidden_states = torch.rand((num_allocated_tokens, self.hidden_size), dtype=torch.bfloat16) + hidden_states = hidden_states.cuda() + output_gmm, _ = self.grouped_mlp.experts(hidden_states, tokens_per_expert=tokens_per_expert) + output_gmm.mean().backward() + assert self.grouped_mlp.experts.weight1.grad is not None + + +@pytest.mark.skipif( + not is_te_min_version("1.9.0.dev0"), + reason="TE Grouped MLP is only supported in TE 1.9.0.dev0 and later.", +) +class TestTEGroupedMLP: + + def setup_method(self, method, use_cpu_initialization=False, swiglu=True): + Utils.initialize_model_parallel(1, 1) + num_layers = 1 + self.hidden_size = 16 + self.num_experts = 2 + self.gated_linear_unit = swiglu + self.activation_func = F.silu if swiglu else F.gelu + self.use_cpu_initialization = use_cpu_initialization + + tf_config = TransformerConfig( + num_layers=num_layers, + hidden_size=self.hidden_size, + num_attention_heads=4, + num_moe_experts=self.num_experts, + use_cpu_initialization=self.use_cpu_initialization, + add_bias_linear=False, + gated_linear_unit=self.gated_linear_unit, + activation_func=self.activation_func, + bias_activation_fusion=False, + bf16=True, + params_dtype=torch.bfloat16, + moe_router_load_balancing_type="sinkhorn", + moe_router_topk=1, + ) + + self.fc1_ffn_hidden_size = tf_config.ffn_hidden_size + self.fc2_ffn_hidden_size = tf_config.ffn_hidden_size + # If using swiglu double the output width, see https://arxiv.org/pdf/2002.05202.pdf + if self.gated_linear_unit: + self.fc1_ffn_hidden_size *= 2 + + ## Vanilla sequential GEMM + # Set random seed for reproducability + _set_random_seed(seed_=123, data_parallel_random_init=False) + transformer_layer_spec = get_gpt_layer_with_transformer_engine_spec( + self.num_experts, moe_grouped_gemm=False + ) + self.sequential_mlp = MoELayer(tf_config, transformer_layer_spec.submodules.mlp.submodules) + + self.args = parse_args(ignore_unknown_args=True) + self.args.bf16 = True + # Bias is not supported in grouped gemm currently, thus we disable the + # bias in the linear layer. 
+ self.args.add_bias_linear = False + self.sequential_mlp = Float16Module(self.sequential_mlp, self.args).module + + ## Grouped GEMM + _set_random_seed(seed_=123, data_parallel_random_init=False) + transformer_layer_spec = get_gpt_layer_with_transformer_engine_spec( + self.num_experts, moe_grouped_gemm=True + ) + tf_config.moe_grouped_gemm = True + self.grouped_mlp = MoELayer(tf_config, transformer_layer_spec.submodules.mlp.submodules) + assert isinstance(self.grouped_mlp.experts, TEGroupedMLP) + self.grouped_mlp = Float16Module(self.grouped_mlp, self.args).module + + def teardown_method(self, method): + Utils.destroy_model_parallel() + + @pytest.mark.internal + def test_constructor(self): + assert isinstance(self.sequential_mlp, MoELayer) + assert isinstance(self.grouped_mlp, MoELayer) + + num_weights_smm = sum([p.numel() for p in self.sequential_mlp.parameters()]) + num_weights_gmm = sum([p.numel() for p in self.grouped_mlp.parameters()]) + + # For the same hyper-parm model configs except the `moe_grouped_gemm`, + # GroupedGEMM and sequential GEMMs should hold the same number of parms. + assert num_weights_smm == num_weights_gmm + # expected num weights: router linear weights+bias + MLP weights(no bias) of all experts + expected_num_weights = ( + self.hidden_size * self.num_experts + + self.hidden_size + * (self.fc1_ffn_hidden_size + self.fc2_ffn_hidden_size) + * self.num_experts + ) + assert num_weights_smm == expected_num_weights + + assert torch.equal(self.sequential_mlp.router.weight, self.grouped_mlp.router.weight) + + # weights of linear_fc1: [fc1_ffn_hidden_size, hidden_size] + # weights of linear_fc2: [hidden_size, fc2_ffn_hidden_size] + for i in range(self.num_experts): + assert getattr(self.grouped_mlp.experts.linear_fc1, f"weight{i}").shape == ( + self.fc1_ffn_hidden_size, + self.hidden_size, + ) + assert getattr(self.grouped_mlp.experts.linear_fc2, f"weight{i}").shape == ( + self.hidden_size, + self.fc2_ffn_hidden_size, + ) + + @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") + @pytest.mark.internal + def test_gpu_forward_backward(self): + self.sequential_mlp.cuda() + self.grouped_mlp.cuda() + # Copy the weights to ensure the same init value + with torch.no_grad(): + for i in range(self.num_experts): + self.sequential_mlp.experts.local_experts[i].linear_fc1.weight.copy_( + getattr(self.grouped_mlp.experts.linear_fc1, f"weight{i}") + ) + self.sequential_mlp.experts.local_experts[i].linear_fc2.weight.copy_( + getattr(self.grouped_mlp.experts.linear_fc2, f"weight{i}") + ) + # [sequence length, batch size, hidden size] + seq_len = 32 + batch_size = 2 + hidden_states = torch.rand( + (seq_len, batch_size, self.hidden_size), + dtype=torch.bfloat16, + device="cuda", + requires_grad=True, + ) + hidden_states.retain_grad() + + output_smm, _ = self.sequential_mlp(hidden_states) + output_smm.mean().backward() + smm_results = [output_smm, hidden_states.grad] + for i in range(self.num_experts): + smm_results.append(self.sequential_mlp.experts.local_experts[i].linear_fc1.weight.grad) + smm_results.append(self.sequential_mlp.experts.local_experts[i].linear_fc2.weight.grad) + + hidden_states.grad = None + output_gmm, _ = self.grouped_mlp(hidden_states) + output_gmm.mean().backward() + gmm_results = [output_gmm, hidden_states.grad] + for i in range(self.num_experts): + gmm_results.append(getattr(self.grouped_mlp.experts.linear_fc1, f"weight{i}").grad) + gmm_results.append(getattr(self.grouped_mlp.experts.linear_fc2, f"weight{i}").grad) + + for smm_result, 
gmm_result in zip(smm_results, gmm_results):
+            torch.testing.assert_close(smm_result, gmm_result)
+
+    @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+    @pytest.mark.internal
+    def test_gpu_forward_backward_with_no_tokens_allocated(self):
+        """Test the case when no token is allocated for groupedGEMM kernels."""
+        self.grouped_mlp.cuda()
+        num_allocated_tokens = 0
+        tokens_per_expert = torch.zeros(self.num_experts, dtype=torch.int32)
+        hidden_states = torch.rand((num_allocated_tokens, self.hidden_size), dtype=torch.bfloat16)
+        hidden_states = hidden_states.cuda()
+        output, _ = self.grouped_mlp.experts(hidden_states, tokens_per_expert=tokens_per_expert)
+        assert torch.equal(output, torch.zeros_like(output))
+        assert output.shape == (num_allocated_tokens, self.hidden_size)
+
+        output.mean().backward()
+        for i in range(self.num_experts):
+            assert getattr(self.grouped_mlp.experts.linear_fc1, f"weight{i}").grad is not None
+            assert getattr(self.grouped_mlp.experts.linear_fc2, f"weight{i}").grad is not None
+
+
+if __name__ == "__main__":
+    for use_cpu_initialization in [True, False]:
+        for swiglu in [True, False]:
+            GMLP_test = TestParallelGroupedMLP()
+            GMLP_test.setup_method(
+                method=None, use_cpu_initialization=use_cpu_initialization, swiglu=swiglu
+            )
+            GMLP_test.test_constructor()
+            GMLP_test.test_weight_init_value_the_same()
+            GMLP_test.test_gpu_forward()
+            GMLP_test.test_gpu_forward_with_no_tokens_allocated()
+            GMLP_test.teardown_method(method=None)
diff --git a/tests/unit_tests/transformer/moe/test_moe_layer.py b/tests/unit_tests/transformer/moe/test_moe_layer.py
new file mode 100644
index 0000000000..e65e7f2253
--- /dev/null
+++ b/tests/unit_tests/transformer/moe/test_moe_layer.py
@@ -0,0 +1,73 @@
+# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
+ +import pytest +import torch + +from megatron.core.models.gpt.gpt_layer_specs import ( + get_gpt_layer_local_spec, + get_gpt_layer_with_transformer_engine_spec, +) +from megatron.core.transformer.moe.moe_layer import MoELayer +from megatron.core.transformer.moe.router import Router +from megatron.core.transformer.transformer_config import TransformerConfig +from megatron.training.initialize import _set_random_seed +from tests.unit_tests.test_utilities import Utils + + +class TestMoELayerInit: + def setup_method(self, method): + pass + + @pytest.mark.parametrize("moe_token_dispatcher_type", ["allgather", "alltoall"]) + @pytest.mark.parametrize("num_moe_experts", [1, 2]) + @pytest.mark.parametrize("grouped_gemm", [True, False]) + def test_te_moe_layer(self, num_moe_experts, moe_token_dispatcher_type, grouped_gemm): + Utils.initialize_model_parallel(1, 1) + _set_random_seed(seed_=123, data_parallel_random_init=False) + self.transformer_config = TransformerConfig( + num_layers=1, + hidden_size=12, + num_attention_heads=4, + num_moe_experts=num_moe_experts, + use_cpu_initialization=True, + moe_token_dispatcher_type=moe_token_dispatcher_type, + moe_router_topk=2, + moe_aux_loss_coeff=0.01, + moe_grouped_gemm=grouped_gemm, + add_bias_linear=False, + ) + transformer_layer_spec = get_gpt_layer_with_transformer_engine_spec( + num_experts=num_moe_experts, moe_grouped_gemm=grouped_gemm + ) + moe_layer = MoELayer( + self.transformer_config, transformer_layer_spec.submodules.mlp.submodules + ) + Utils.destroy_model_parallel() + + @pytest.mark.parametrize("moe_token_dispatcher_type", ["allgather", "alltoall"]) + @pytest.mark.parametrize("num_moe_experts", [1, 2]) + def test_legacy_moe_layer(self, num_moe_experts, moe_token_dispatcher_type): + Utils.initialize_model_parallel(1, 1) + _set_random_seed(seed_=123, data_parallel_random_init=False) + num_moe_experts = 4 + self.transformer_config = TransformerConfig( + num_layers=1, + hidden_size=12, + num_attention_heads=4, + num_moe_experts=num_moe_experts, + use_cpu_initialization=True, + moe_router_load_balancing_type="aux_loss", + moe_router_topk=2, + moe_aux_loss_coeff=0.01, + add_bias_linear=False, + ) + transformer_layer_spec = get_gpt_layer_local_spec( + num_experts=num_moe_experts, moe_grouped_gemm=False + ) + moe_layer = MoELayer( + self.transformer_config, transformer_layer_spec.submodules.mlp.submodules + ) + Utils.destroy_model_parallel() + + def teardown_method(self, method): + Utils.destroy_model_parallel() diff --git a/tests/unit_tests/transformer/moe/test_routers.py b/tests/unit_tests/transformer/moe/test_routers.py new file mode 100644 index 0000000000..c1633834b6 --- /dev/null +++ b/tests/unit_tests/transformer/moe/test_routers.py @@ -0,0 +1,83 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
+
+import pytest
+import torch
+
+from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_with_transformer_engine_spec
+from megatron.core.transformer.moe.moe_layer import MoELayer
+from megatron.core.transformer.moe.router import Router
+from megatron.core.transformer.transformer_config import TransformerConfig
+from megatron.training.initialize import _set_random_seed
+from tests.unit_tests.test_utilities import Utils
+
+
+class TestTop2Router:
+    def setup_method(self, method):
+        Utils.initialize_model_parallel(1, 1)
+        _set_random_seed(seed_=123, data_parallel_random_init=False)
+        print("done initializing")
+        num_moe_experts = 4
+        self.transformer_config = TransformerConfig(
+            num_layers=2,
+            hidden_size=12,
+            num_attention_heads=4,
+            num_moe_experts=num_moe_experts,
+            use_cpu_initialization=True,
+            moe_router_load_balancing_type="aux_loss",
+            moe_router_topk=2,
+            moe_aux_loss_coeff=0,
+        )
+        transformer_layer_spec = get_gpt_layer_with_transformer_engine_spec(
+            num_experts=num_moe_experts, moe_grouped_gemm=False
+        )
+        self.sequential_mlp = MoELayer(
+            self.transformer_config, transformer_layer_spec.submodules.mlp.submodules
+        )
+        self.router = self.sequential_mlp.router
+
+    def teardown_method(self, method):
+        Utils.destroy_model_parallel()
+
+    def test_constructor(self):
+        assert isinstance(self.router, Router)
+
+        num_weights = sum([p.numel() for p in self.router.parameters()])
+        assert num_weights == 12 * 4, num_weights
+
+    @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+    @pytest.mark.internal
+    @pytest.mark.parametrize("moe_router_pre_softmax", [(True), (False)])
+    def test_router_forward(self, moe_router_pre_softmax):
+        with torch.no_grad():
+            self.router = self.router.cuda()
+            self.router.config.moe_router_pre_softmax = moe_router_pre_softmax
+            # [sequence length, batch size, hidden size]
+            hidden_states = torch.randn((32, 2, self.router.config.hidden_size))
+            hidden_states = hidden_states.cuda()
+            scores, indices = self.router(hidden_states)
+
+    @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+    @pytest.mark.internal
+    def test_aux_loss(self):
+        self.sequential_mlp = self.sequential_mlp.cuda()
+
+        # Without aux loss
+        hidden_states = torch.randn((32, 2, self.router.config.hidden_size))
+        hidden_states = hidden_states.cuda()
+        out = self.sequential_mlp(hidden_states)[0]
+        out.sum().mul_(0).backward()
+        assert self.sequential_mlp.router.weight.grad.abs().sum() == 0
+
+        # With aux loss
+        self.transformer_config.moe_aux_loss_coeff = 1
+        out = self.sequential_mlp(hidden_states)[0]
+        out.sum().mul_(0).backward()
+        assert self.sequential_mlp.router.weight.grad.abs().sum() > 0
+
+        # With Z loss
+        self.transformer_config.moe_aux_loss_coeff = 0
+        self.transformer_config.moe_z_loss_coeff = 1
+        self.sequential_mlp.router.weight.grad.fill_(0)
+        out = self.sequential_mlp(hidden_states)[0]
+        out.sum().mul_(0).backward()
+        assert self.sequential_mlp.router.weight.grad.abs().sum() > 0
diff --git a/tests/unit_tests/transformer/moe/test_sequential_mlp.py b/tests/unit_tests/transformer/moe/test_sequential_mlp.py
new file mode 100644
index 0000000000..514e098bfd
--- /dev/null
+++ b/tests/unit_tests/transformer/moe/test_sequential_mlp.py
@@ -0,0 +1,211 @@
+# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
+from importlib.metadata import version + +import pytest +import torch + +from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_with_transformer_engine_spec +from megatron.core.tensor_parallel.layers import ColumnParallelLinear, RowParallelLinear +from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed +from megatron.core.transformer.custom_layers.transformer_engine import ( + TEColumnParallelLinear, + TERowParallelLinear, +) +from megatron.core.transformer.mlp import MLPSubmodules +from megatron.core.transformer.moe.experts import SequentialMLP +from megatron.core.transformer.moe.moe_layer import MoELayer +from megatron.core.transformer.transformer_config import TransformerConfig +from megatron.core.utils import is_te_min_version +from tests.unit_tests.test_utilities import Utils + + +class TestParallelSequentialMLP: + + def setup_method(self, method): + Utils.initialize_model_parallel(1, 1) + model_parallel_cuda_manual_seed(123) + print("done intializing") + num_moe_experts = 2 + transformer_config = TransformerConfig( + num_layers=2, + hidden_size=12, + num_attention_heads=4, + num_moe_experts=num_moe_experts, + use_cpu_initialization=True, + activation_func=torch.nn.functional.silu, + gated_linear_unit=True, + bias_activation_fusion=True, + moe_router_load_balancing_type="sinkhorn", + moe_router_topk=1, + ) + transformer_layer_spec = get_gpt_layer_with_transformer_engine_spec( + num_experts=num_moe_experts, moe_grouped_gemm=False + ) + self.sequential_mlp = MoELayer( + transformer_config, transformer_layer_spec.submodules.mlp.submodules + ) + + def teardown_method(self, method): + Utils.destroy_model_parallel() + + @pytest.mark.internal + def test_constructor(self): + assert isinstance(self.sequential_mlp, MoELayer) + + num_weights = sum([p.numel() for p in self.sequential_mlp.parameters()]) + assert num_weights == 3696 + + @pytest.mark.internal + @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") + def test_gpu_forward(self): + sequential_mlp = self.sequential_mlp + sequential_mlp.cuda() + # [sequence length, batch size, hidden size] + hidden_states = torch.ones((32, 2, sequential_mlp.config.hidden_size)) + hidden_states = hidden_states.cuda() + output, output_bias = sequential_mlp(hidden_states) + assert output.shape[0] == 32 + assert output.shape[1] == 2 + assert output.shape[2] == sequential_mlp.config.hidden_size + assert output_bias.shape[2] == sequential_mlp.config.hidden_size + assert output.dtype == torch.float32 + assert output.device.type == 'cuda' + assert output_bias.device.type == 'cuda' + + +class TestTEParallelSequentialMLP: + def setup_method(self, method): + Utils.initialize_model_parallel(tensor_model_parallel_size=2, expert_model_parallel_size=2) + model_parallel_cuda_manual_seed(123) + num_moe_experts = 4 + self.transformer_config = TransformerConfig( + num_layers=2, + hidden_size=12, + num_attention_heads=4, + num_moe_experts=num_moe_experts, + use_cpu_initialization=False, + activation_func=torch.nn.functional.silu, + gated_linear_unit=True, + bias_activation_fusion=False, + moe_router_load_balancing_type="sinkhorn", + moe_router_topk=1, + params_dtype=torch.bfloat16, + expert_model_parallel_size=2, + tensor_model_parallel_size=2, + sequence_parallel=True, + ) + + self.local_mlp_spec = MLPSubmodules( + linear_fc1=ColumnParallelLinear, linear_fc2=RowParallelLinear + ) + self.te_mlp_spec = MLPSubmodules( + linear_fc1=TEColumnParallelLinear, linear_fc2=TERowParallelLinear + ) + print("Done 
intializing") + + self.num_local_experts = 2 + model_parallel_cuda_manual_seed(123) + self.local_sequential_mlp = SequentialMLP( + self.num_local_experts, self.transformer_config, self.local_mlp_spec + ) + + model_parallel_cuda_manual_seed(123) + self.te_sequential_mlp = SequentialMLP( + self.num_local_experts, self.transformer_config, self.te_mlp_spec + ) + + @pytest.mark.skipif( + not is_te_min_version("1.7.0"), + reason="Transformer Engine under v1.7.0 doesn't support MoE training.", + ) + @pytest.mark.internal + def test_constructor(self): + for i in range(self.num_local_experts): + assert torch.equal( + self.local_sequential_mlp.local_experts[i].linear_fc1.weight, + self.te_sequential_mlp.local_experts[i].linear_fc1.weight, + ) + assert torch.equal( + self.local_sequential_mlp.local_experts[i].linear_fc2.weight, + self.te_sequential_mlp.local_experts[i].linear_fc2.weight, + ) + + @pytest.mark.skipif( + not is_te_min_version("1.7.0"), + reason="Transformer Engine under v1.7.0 doesn't support MoE training.", + ) + @pytest.mark.internal + def test_gpu_forward(self): + self.local_sequential_mlp.cuda() + self.te_sequential_mlp.cuda() + seq_len = 4 + batch_size = 2 + + tokens_per_expert = torch.tensor([2, 2], device="cuda") + hidden_states = torch.rand( + (seq_len, batch_size, self.local_sequential_mlp.config.hidden_size), + dtype=torch.bfloat16, + device="cuda", + ) + + output_local, _ = self.local_sequential_mlp(hidden_states, tokens_per_expert) + output_te, _ = self.te_sequential_mlp(hidden_states, tokens_per_expert) + assert torch.equal(output_local, output_te) + + @pytest.mark.skipif( + not is_te_min_version("1.7.0"), + reason="Transformer Engine under v1.7.0 doesn't support MoE training.", + ) + @pytest.mark.internal + def test_gpu_forward_with_one_local_expert(self): + model_parallel_cuda_manual_seed(123) + local_sequential_mlp = SequentialMLP(1, self.transformer_config, self.local_mlp_spec) + model_parallel_cuda_manual_seed(123) + te_sequential_mlp = SequentialMLP(1, self.transformer_config, self.te_mlp_spec) + seq_len = 4 + batch_size = 2 + + tokens_per_expert = torch.tensor([4], device="cuda") + hidden_states = torch.rand( + (seq_len, batch_size, self.local_sequential_mlp.config.hidden_size), + dtype=torch.bfloat16, + device="cuda", + ) + + output_local, _ = local_sequential_mlp(hidden_states, tokens_per_expert) + output_te, _ = te_sequential_mlp(hidden_states, tokens_per_expert) + assert torch.equal(output_local, output_te) + + @pytest.mark.skipif( + not is_te_min_version("1.7.0"), + reason="Transformer Engine under v1.7.0 doesn't support MoE training.", + ) + @pytest.mark.internal + def test_gpu_forward_with_no_tokens_allocated(self): + self.local_sequential_mlp.cuda() + self.te_sequential_mlp.cuda() + seq_len = 4 + batch_size = 2 + + tokens_per_expert = torch.tensor([0, 4], device="cuda") + hidden_states = torch.rand( + (seq_len, batch_size, self.local_sequential_mlp.config.hidden_size), + dtype=torch.bfloat16, + device="cuda", + ) + output_local, _ = self.local_sequential_mlp(hidden_states, tokens_per_expert) + output_te, _ = self.te_sequential_mlp(hidden_states, tokens_per_expert) + assert torch.equal(output_local, output_te) + + def teardown_method(self, method): + Utils.destroy_model_parallel() + + +if __name__ == "__main__": + MLP_test = TestTEParallelSequentialMLP() + MLP_test.setup_method(method=None) + MLP_test.test_constructor() + MLP_test.test_gpu_forward() + MLP_test.test_gpu_forward_with_one_local_expert() + MLP_test.test_gpu_forward_with_no_tokens_allocated() + 
MLP_test.teardown_method(method=None) diff --git a/tests/unit_tests/transformer/moe/test_shared_experts.py b/tests/unit_tests/transformer/moe/test_shared_experts.py new file mode 100644 index 0000000000..0cacf30836 --- /dev/null +++ b/tests/unit_tests/transformer/moe/test_shared_experts.py @@ -0,0 +1,126 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +import pytest +import torch + +from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_with_transformer_engine_spec +from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed +from megatron.core.transformer.moe.moe_layer import MoELayer +from megatron.core.transformer.transformer_config import TransformerConfig +from tests.unit_tests.test_utilities import Utils + + +class TestSharedExperts: + + def setup_method(self, method): + pass + + def teardown_method(self, method): + Utils.destroy_model_parallel() + + @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") + @pytest.mark.internal + def test_gpu_forward(self): + Utils.initialize_model_parallel(1, 1) + model_parallel_cuda_manual_seed(123) + print("done intializing") + num_moe_experts = 2 + transformer_config = TransformerConfig( + num_layers=1, + hidden_size=12, + num_attention_heads=4, + num_moe_experts=num_moe_experts, + moe_shared_expert_intermediate_size=32, + use_cpu_initialization=True, + activation_func=torch.nn.functional.silu, + gated_linear_unit=True, + bias_activation_fusion=True, + moe_router_load_balancing_type="sinkhorn", + moe_router_topk=1, + add_bias_linear=False, + ) + transformer_layer_spec = get_gpt_layer_with_transformer_engine_spec( + num_experts=num_moe_experts, moe_grouped_gemm=False + ) + self.moe_layer = MoELayer( + transformer_config, transformer_layer_spec.submodules.mlp.submodules + ) + + assert isinstance(self.moe_layer, MoELayer) + + num_weights = sum([p.numel() for p in self.moe_layer.parameters()]) + assert num_weights == 3480 + 1152 + assert self.moe_layer.shared_experts is not None + assert self.moe_layer.shared_experts.stream is None + assert self.moe_layer.token_dispatcher.shared_experts is None + + moe_layer = self.moe_layer + moe_layer.cuda() + # [sequence length, batch size, hidden size] + hidden_states = torch.ones((32, 2, moe_layer.config.hidden_size)) + hidden_states = hidden_states.cuda() + output, _ = moe_layer(hidden_states) + assert output.shape[0] == 32 + assert output.shape[1] == 2 + assert output.shape[2] == moe_layer.config.hidden_size + assert output.dtype == torch.float32 + assert output.device.type == 'cuda' + + +class TestSharedExpertsOverlap: + + def setup_method(self, method): + pass + + def teardown_method(self, method): + Utils.destroy_model_parallel() + + @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") + @pytest.mark.internal + def test_gpu_forward(self): + Utils.initialize_model_parallel(1, 1) + model_parallel_cuda_manual_seed(123) + print("done intializing") + num_moe_experts = 2 + transformer_config = TransformerConfig( + num_layers=1, + hidden_size=12, + num_attention_heads=4, + num_moe_experts=num_moe_experts, + moe_shared_expert_intermediate_size=32, + moe_shared_expert_overlap=True, + moe_token_dispatcher_type="alltoall", + use_cpu_initialization=True, + activation_func=torch.nn.functional.silu, + gated_linear_unit=True, + bias_activation_fusion=True, + moe_router_load_balancing_type="sinkhorn", + moe_router_topk=1, + add_bias_linear=False, + ) + transformer_layer_spec = 
get_gpt_layer_with_transformer_engine_spec( + num_experts=num_moe_experts, moe_grouped_gemm=False + ) + self.moe_layer = MoELayer( + transformer_config, transformer_layer_spec.submodules.mlp.submodules + ) + + assert isinstance(self.moe_layer, MoELayer) + + num_weights = sum([p.numel() for p in self.moe_layer.parameters()]) + assert num_weights == 3480 + 1152 + assert self.moe_layer.shared_experts is not None + assert self.moe_layer.shared_experts.stream is not None + assert self.moe_layer.token_dispatcher.shared_experts is not None + + moe_layer = self.moe_layer + moe_layer.cuda() + # [sequence length, batch size, hidden size] + hidden_states = torch.ones((32, 2, moe_layer.config.hidden_size)) + hidden_states = hidden_states.cuda() + output, _ = moe_layer(hidden_states) + assert output.shape[0] == 32 + assert output.shape[1] == 2 + assert output.shape[2] == moe_layer.config.hidden_size + assert output.dtype == torch.float32 + assert output.device.type == 'cuda' diff --git a/tests/unit_tests/transformer/moe/test_token_dispatcher.py b/tests/unit_tests/transformer/moe/test_token_dispatcher.py new file mode 100644 index 0000000000..e85f8512b4 --- /dev/null +++ b/tests/unit_tests/transformer/moe/test_token_dispatcher.py @@ -0,0 +1,265 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +import copy + +import pytest +import torch + +from megatron.core import parallel_state +from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_with_transformer_engine_spec +from megatron.core.transformer.moe.moe_layer import MoELayer +from megatron.core.transformer.moe.moe_utils import permute, unpermute +from megatron.core.transformer.transformer_config import TransformerConfig +from megatron.training.initialize import _set_random_seed +from tests.unit_tests.test_utilities import Utils + + +class MoEModelTestContainer: + def __init__( + self, + tp_size, + ep_size, + pp_size, + cp_size=1, + data_parallel_random_init=False, + num_moe_experts=8, + moe_router_topk=2, + moe_router_load_balancing_type="aux_loss", + moe_token_dispatcher_type="alltoall", + moe_expert_capacity_factor=None, + moe_pad_expert_input_to_capacity=False, + moe_aux_loss_coeff=0.1, + **kwargs, + ): + self.num_local_experts = num_moe_experts // ep_size + Utils.initialize_model_parallel( + tensor_model_parallel_size=tp_size, + pipeline_model_parallel_size=pp_size, + expert_model_parallel_size=ep_size, + context_parallel_size=cp_size, + ) + _set_random_seed(seed_=123, data_parallel_random_init=data_parallel_random_init) + local_expert_indices_offset = ( + parallel_state.get_expert_model_parallel_rank() * self.num_local_experts + ) + self.local_expert_indices = [ + local_expert_indices_offset + i for i in range(self.num_local_experts) + ] + + self.config = TransformerConfig( + tensor_model_parallel_size=tp_size, + expert_model_parallel_size=ep_size, + pipeline_model_parallel_size=pp_size, + context_parallel_size=cp_size, + moe_router_topk=moe_router_topk, + num_moe_experts=num_moe_experts, + moe_router_load_balancing_type=moe_router_load_balancing_type, + moe_token_dispatcher_type=moe_token_dispatcher_type, + moe_expert_capacity_factor=moe_expert_capacity_factor, + moe_pad_expert_input_to_capacity=moe_pad_expert_input_to_capacity, + moe_aux_loss_coeff=moe_aux_loss_coeff, + num_layers=1, + moe_extended_tp=kwargs.get("moe_extended_tp", False), + moe_grouped_gemm=kwargs.get("moe_grouped_gemm", False), + hidden_size=kwargs.get("hidden_size", 1024), + num_attention_heads=kwargs.get("num_attention_heads", 8), + 
use_cpu_initialization=kwargs.get("use_cpu_initialization", True), + sequence_parallel=tp_size > 1, + add_bias_linear=kwargs.get("add_bias_linear", False), + ) + + # init moe layer + transformer_layer_spec = get_gpt_layer_with_transformer_engine_spec( + num_experts=num_moe_experts, moe_grouped_gemm=kwargs.get("moe_grouped_gemm", False) + ) + self.moe_layer = MoELayer( + self.config, transformer_layer_spec.submodules.mlp.submodules + ).cuda() + self.moe_layer.set_layer_number(0) + + def __del__(self): + torch.distributed.barrier() + torch.cuda.synchronize() + Utils.destroy_model_parallel() + + def dispatcher_dropless_test(self): + moe_layer = self.moe_layer + bs = 32 + seql = 8 + hidden_states = torch.randn((bs, seql, moe_layer.config.hidden_size)) + hidden_states = hidden_states.cuda() + ans = hidden_states / 2 + hidden_states.requires_grad = True + probs, indices = moe_layer.router(hidden_states) + probs = torch.ones_like(probs) / moe_layer.router.topk / 2 + + ## Uncomment these lines to assist in bug location. + # hidden_states = torch.ones_like(hidden_states) * torch.distributed.get_rank() + # hidden_states.requires_grad = True + # indices = torch.ones_like(indices) * torch.distributed.get_rank() + # print(permuted_local_hidden_states) + + (permuted_local_hidden_states, tokens_per_expert) = ( + moe_layer.token_dispatcher.token_permutation(hidden_states, probs, indices) + ) + + if self.config.moe_extended_tp: + scale = ( + moe_layer.config.tensor_model_parallel_size + * moe_layer.config.expert_model_parallel_size + ) + else: + scale = moe_layer.config.tensor_model_parallel_size + + permuted_local_hidden_states /= scale + + restored_hidden_states, restored_bias = moe_layer.token_dispatcher.token_unpermutation( + permuted_local_hidden_states + ) + + assert torch.allclose( + restored_hidden_states, ans + ), "Restored hidden states do not match original hidden states" + + # check if the grad of the hidden states is same as the hidden states + torch.autograd.backward(restored_hidden_states, hidden_states) + assert torch.allclose( + hidden_states.grad, ans + ), "Restored hidden states do not match original hidden states" + + def dispacher_capacity_test(self): + moe_layer = self.moe_layer + hidden_states = torch.randn((256, moe_layer.config.hidden_size)) + hidden_states = hidden_states.cuda() + hidden_states.requires_grad = True + probs, indices = moe_layer.router(hidden_states) + tp_size = moe_layer.config.tensor_model_parallel_size + tp_rank = parallel_state.get_tensor_model_parallel_rank() + + # Create the answer. 
+        prob_mask = probs != 0
+        probs = torch.ones_like(probs) * prob_mask / moe_layer.router.topk
+        local_probs = probs
+        restored_hidden_states_answer = hidden_states * local_probs.sum(dim=1).unsqueeze(1)
+
+        (permuted_local_hidden_states, tokens_per_expert) = (
+            moe_layer.token_dispatcher.token_permutation(hidden_states, probs, indices)
+        )
+
+        print(f"Dispatched tokens per expert: {tokens_per_expert}")
+
+        permuted_local_hidden_states /= moe_layer.config.tensor_model_parallel_size
+
+        restored_hidden_states, restored_bias = moe_layer.token_dispatcher.token_unpermutation(
+            permuted_local_hidden_states
+        )
+        assert torch.allclose(
+            restored_hidden_states, restored_hidden_states_answer
+        ), "Restored hidden states do not match the expected values"
+
+        # Check that the grad of the hidden states is the same as the hidden states
+        torch.autograd.backward(restored_hidden_states, hidden_states)
+        assert torch.allclose(
+            hidden_states.grad, restored_hidden_states_answer
+        ), "Gradient of hidden states should be the same as the hidden states"
+
+    def dispatcher_drop_and_pad_test(self):
+        "Test that tokens are dropped and padded correctly"
+        moe_layer = self.moe_layer
+        moe_layer_2 = copy.deepcopy(moe_layer)
+        hidden_states = torch.randn((256, moe_layer.config.hidden_size)).cuda()
+        hidden_states.requires_grad = True
+
+        # Create the answer.
+        moe_layer.config.moe_pad_expert_input_to_capacity = False
+        moe_layer.token_dispatcher.drop_and_pad = False
+
+        # Uncomment these lines to help locate bugs.
+        # hidden_states = torch.ones((8, moe_layer.config.hidden_size)).cuda()
+        # hidden_states = hidden_states * torch.range(1, 8).unsqueeze(1).cuda()
+        # hidden_states.requires_grad = True
+        # indices_1 = torch.tensor([[0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [5, 5], [6, 6], [7, 7]]).cuda()
+        # probs_1 = torch.ones_like(indices_1)
+        # indices_2 = torch.tensor([[0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [5, 5], [6, 6], [7, 7]]).cuda()
+        # probs_2 = torch.ones_like(indices_2)
+        # num_local_tokens_per_expert = torch.tensor([2, 2, 2, 2, 2, 2, 2, 2]).cuda()
+
+        probs_1, indices_1 = moe_layer.router(hidden_states)
+        (permuted_input_1, tokens_per_expert) = moe_layer.token_dispatcher.token_permutation(
+            hidden_states, probs_1, indices_1
+        )
+        torch.distributed.barrier()
+        forward_answer, restored_bias = moe_layer.token_dispatcher.token_unpermutation(
+            permuted_input_1
+        )
+        torch.autograd.backward(forward_answer, forward_answer)
+        backward_answer = hidden_states.grad.clone()
+        hidden_states.grad = None
+        torch.cuda.synchronize()
+        # End
+
+        probs_2, indices_2 = moe_layer_2.router(hidden_states)
+        (permuted_input_2, tokens_per_expert) = moe_layer_2.token_dispatcher.token_permutation(
+            hidden_states, probs_2, indices_2
+        )
+        restored_hidden_states, restored_bias = moe_layer_2.token_dispatcher.token_unpermutation(
+            permuted_input_2
+        )
+        torch.distributed.barrier()
+        assert torch.allclose(
+            restored_hidden_states, forward_answer
+        ), "Restored hidden states do not match the expected values"
+
+        # Check that the grad of the hidden states is the same as the hidden states
+        torch.autograd.backward(restored_hidden_states, restored_hidden_states)
+        assert torch.allclose(
+            hidden_states.grad, backward_answer
+        ), "Gradient of hidden states should be the same as the hidden states"
+
+    def set_params(self):
+        # TODO: Set consistent parameters for various parallelisms.
+ raise NotImplementedError + + def destroy(self): + Utils.destroy_model_parallel() + + +class TestAllgatherDispatcher: + def setup_method(self, method): + pass + + def teardown_method(self, method): + Utils.destroy_model_parallel() + + @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") + @pytest.mark.internal + @pytest.mark.parametrize("tp_size,ep_size", [(8, 1), (1, 8), (2, 4), (1, 1)]) + def test_forward_backward(self, tp_size, ep_size): + container = MoEModelTestContainer( + tp_size=tp_size, + ep_size=ep_size, + pp_size=1, + num_moe_experts=8, + moe_router_topk=2, + moe_router_load_balancing_type="aux_loss", + moe_token_dispatcher_type="allgather", + ) + + container.dispatcher_dropless_test() + + @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") + @pytest.mark.internal + @pytest.mark.parametrize("tp_size,ep_size", [(2, 4)]) + def test_extend_tp_forward_backward(self, tp_size, ep_size): + container = MoEModelTestContainer( + tp_size=tp_size, + ep_size=ep_size, + pp_size=1, + num_moe_experts=8, + moe_router_topk=2, + moe_router_load_balancing_type="aux_loss", + moe_token_dispatcher_type="allgather", + moe_extended_tp=True, + ) + + container.dispatcher_dropless_test() diff --git a/tests/unit_tests/transformer/moe/test_upcycling.py b/tests/unit_tests/transformer/moe/test_upcycling.py new file mode 100644 index 0000000000..b5a98c3713 --- /dev/null +++ b/tests/unit_tests/transformer/moe/test_upcycling.py @@ -0,0 +1,193 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +import sys + +import pytest +import torch +import torch.distributed + +from megatron.core import mpu +from megatron.core.enums import ModelType +from megatron.core.models.gpt.gpt_layer_specs import ( + get_gpt_layer_with_transformer_engine_spec as gpt_te_spec, +) +from megatron.core.models.gpt.gpt_model import GPTModel +from megatron.core.num_microbatches_calculator import destroy_num_microbatches_calculator +from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed +from megatron.core.transformer.moe import upcycling_utils +from megatron.training.arguments import core_transformer_config_from_args, parse_args, validate_args +from megatron.training.global_vars import ( + destroy_global_vars, + get_args, + set_args, + set_global_variables, +) +from megatron.training.training import get_model, setup_model_and_optimizer +from megatron.training.utils import ( + get_batch_on_this_cp_rank, + get_batch_on_this_tp_rank, + unwrap_model, +) +from tests.unit_tests.test_utilities import Utils + +_SEED = 42 + + +def model_provider(pre_process=True, post_process=True, layer_spec_fn=gpt_te_spec, **config_kwargs): + model_parallel_cuda_manual_seed(_SEED) + args = get_args() + + config = core_transformer_config_from_args(args) + + model = GPTModel( + config=config, + transformer_layer_spec=gpt_te_spec( + args.num_experts, args.moe_grouped_gemm, args.qk_layernorm + ), + vocab_size=args.vocal_size, + max_sequence_length=args.max_position_embeddings, + pre_process=pre_process, + post_process=post_process, + fp16_lm_cross_entropy=args.fp16_lm_cross_entropy, + parallel_output=True, + share_embeddings_and_output_weights=not args.untie_embeddings_and_output_weights, + position_embedding_type=args.position_embedding_type, + rotary_percent=args.rotary_percent, + ) + + return model + + +def create_test_args( + tensor_model_parallel_size, pipeline_model_parallel_size, enable_vp, enable_grouped_gemm +): + destroy_global_vars() + 
destroy_num_microbatches_calculator() + + sys.argv = ['test_upcycling.py'] + args = parse_args() + args.num_layers = 2 + args.vocal_size = 256 + args.hidden_size = 128 + args.num_attention_heads = 8 + args.max_position_embeddings = 256 + args.micro_batch_size = 1 + args.create_attention_mask_in_dataloader = True + args.seq_length = 256 + args.pipeline_model_parallel_size = pipeline_model_parallel_size + args.tensor_model_parallel_size = tensor_model_parallel_size + args.context_parallel_size = 1 + args.num_experts = None + args.train_iters = 1 + if enable_vp: + args.num_layers_per_virtual_pipeline_stage = 1 + args.ckpt_format = 'torch_dist' + args.moe_router_topk = 2 + args.moe_router_pre_softmax = False + args.moe_token_dispatcher_type = "alltoall" + args.lr = 3e-5 + args.attention_dropout = 0.0 + args.hidden_dropout = 0.0 + args.async_tensor_model_parallel_allreduce = False + args.no_save_optim = True + args.no_load_optim = True + args.no_load_rng = True + args.moe_grouped_gemm = enable_grouped_gemm + args.add_bias_linear = False + + validate_args(args) + set_global_variables(args, False) + return args + + +def set_upcycling_args(enable_grouped_gemm, ep): + args = get_args() + args.moe_use_upcycling = True + args.num_experts = 2 + args.moe_grouped_gemm = enable_grouped_gemm + args.expert_model_parallel_size = ep + set_args(args) + + +def get_batch(data_iterator): + if (not mpu.is_pipeline_first_stage()) and (not mpu.is_pipeline_last_stage()): + return None, None, None, None, None + + batch = get_batch_on_this_tp_rank(data_iterator) + batch = get_batch_on_this_cp_rank(batch) + + return batch.values() + + +class TestGPTModel: + def setup_method(self, method): + Utils.destroy_model_parallel() + + def teardown_method(self, method): + Utils.destroy_model_parallel() + destroy_global_vars() + destroy_num_microbatches_calculator() + + @pytest.mark.internal + @pytest.mark.flaky # TODO: Fix the test + @pytest.mark.parametrize( + ('tp_pp_ep', 'enable_vp', 'enable_grouped_gemm'), [((1, 1, 2), (False), (False))] + ) + def test_upcycling(self, tp_pp_ep, enable_vp, enable_grouped_gemm): + tp = tp_pp_ep[0] + pp = tp_pp_ep[1] + ep = tp_pp_ep[2] + args = create_test_args(tp, pp, enable_vp, enable_grouped_gemm) + set_args(args) + + torch.manual_seed(_SEED) + Utils.initialize_model_parallel( + tensor_model_parallel_size=tp, + pipeline_model_parallel_size=pp, + virtual_pipeline_model_parallel_size=args.virtual_pipeline_model_parallel_size, + ) + + dense_model, optimizer, opt_param_scheduler = setup_model_and_optimizer( + model_provider, ModelType.encoder_or_decoder + ) + + Utils.destroy_model_parallel() + Utils.initialize_model_parallel( + tensor_model_parallel_size=tp, + pipeline_model_parallel_size=pp, + expert_model_parallel_size=ep, + virtual_pipeline_model_parallel_size=args.virtual_pipeline_model_parallel_size, + ) + set_upcycling_args(enable_grouped_gemm, ep) + # model_parallel_cuda_manual_seed(_SEED+1) + moe_model = get_model(model_provider, ModelType.encoder_or_decoder) + + # Upcycle the dense model to the MoE model + moe_model = unwrap_model(moe_model) + dense_model = unwrap_model(dense_model) + + data = list(range(args.seq_length)) + input_ids = torch.tensor(data, dtype=torch.int64).repeat((args.micro_batch_size, 1)).cuda() + position_ids = ( + torch.tensor(data, dtype=torch.int64).repeat((args.micro_batch_size, 1)).cuda() + ) + attention_mask = torch.ones( + (args.micro_batch_size, 1, args.seq_length, args.seq_length), dtype=bool + ).cuda() + + dense_logits = dense_model[0].forward( + 
input_ids=input_ids, position_ids=position_ids, attention_mask=attention_mask + ) + + state_dict = upcycling_utils.upcycle_state_dict(moe_model, dense_model) + if len(moe_model) == 1: + moe_model[0].load_state_dict(state_dict['model'], strict=True) + else: + for i in range(len(moe_model)): + mpu.set_virtual_pipeline_model_parallel_rank(i) + moe_model[i].load_state_dict(state_dict['model%d' % i], strict=True) + + moe_logits = moe_model[0].forward( + input_ids=input_ids, position_ids=position_ids, attention_mask=attention_mask + ) + + torch.allclose(dense_logits, moe_logits, rtol=1e-03, atol=1e-03) diff --git a/tests/unit_tests/transformer/test_attention.py b/tests/unit_tests/transformer/test_attention.py new file mode 100644 index 0000000000..8c13ff3f8c --- /dev/null +++ b/tests/unit_tests/transformer/test_attention.py @@ -0,0 +1,123 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +import pytest +import torch + +from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_with_transformer_engine_spec +from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed +from megatron.core.transformer.attention import SelfAttention +from megatron.core.transformer.transformer_config import TransformerConfig +from tests.unit_tests.test_utilities import Utils + + +class TestParallelAttention: + + def setup_method(self, method): + Utils.initialize_model_parallel(1, 1) + model_parallel_cuda_manual_seed(123) + self.transformer_config = TransformerConfig( + num_layers=2, hidden_size=12, num_attention_heads=4, use_cpu_initialization=True + ) + self.parallel_attention = SelfAttention( + self.transformer_config, + get_gpt_layer_with_transformer_engine_spec().submodules.self_attention.submodules, + layer_number=1, + ) + + def teardown_method(self, method): + Utils.destroy_model_parallel() + + def test_constructor(self): + assert isinstance(self.parallel_attention, SelfAttention) + assert self.parallel_attention.layer_number == 1 + + num_weights = sum([p.numel() for p in self.parallel_attention.parameters()]) + assert num_weights == 648 + + def test_cpu_forward(self): + # we can't currently do this because the global memory buffer is on GPU + pass + + def test_gpu_forward(self): + + config = self.parallel_attention.config + sequence_length = 32 + micro_batch_size = 2 + + self.parallel_attention.cuda() + + # [sequence length, batch size, hidden size] + hidden_states = torch.ones( + (sequence_length, micro_batch_size, self.parallel_attention.config.hidden_size) + ) + hidden_states = hidden_states.cuda() + + attention_mask = torch.ones((1, 1, sequence_length, sequence_length), dtype=bool).cuda() + + output, bias = self.parallel_attention(hidden_states, attention_mask) + + assert config.recompute_granularity is None + assert output.shape[0] == sequence_length + assert output.shape[1] == micro_batch_size + assert output.shape[2] == config.hidden_size + assert bias.shape[0] == config.hidden_size + + def test_fused_rope_gpu_forward(self): + self.parallel_attention.config.apply_rope_fusion = True + config = self.parallel_attention.config + sequence_length = 32 + micro_batch_size = 2 + + self.parallel_attention.cuda() + + # [sequence length, batch size, hidden size] + hidden_states = torch.ones( + (sequence_length, micro_batch_size, self.parallel_attention.config.hidden_size) + ) + hidden_states = hidden_states.cuda() + + attention_mask = torch.ones((1, 1, sequence_length, sequence_length), dtype=bool).cuda() + rotary_pos_emb = torch.ones( + sequence_length, 1, 1, 
self.parallel_attention.config.kv_channels + ).cuda() + output, bias = self.parallel_attention( + hidden_states, attention_mask, rotary_pos_emb=rotary_pos_emb + ) + + assert config.recompute_granularity is None + assert output.shape[0] == sequence_length + assert output.shape[1] == micro_batch_size + assert output.shape[2] == config.hidden_size + assert bias.shape[0] == config.hidden_size + self.parallel_attention.config.apply_rope_fusion = False + + def test_checkpointed_gpu_forward(self): + transformer_config = self.transformer_config + transformer_config.recompute_granularity = 'selective' + checkpointed_parallel_attention = SelfAttention( + transformer_config, + get_gpt_layer_with_transformer_engine_spec().submodules.self_attention.submodules, + layer_number=1, + ) + config = checkpointed_parallel_attention.config + + sequence_length = 32 + micro_batch_size = 2 + + checkpointed_parallel_attention.cuda() + + # [sequence length, batch size, hidden size] + hidden_states = torch.ones( + (sequence_length, micro_batch_size, checkpointed_parallel_attention.config.hidden_size) + ) + hidden_states = hidden_states.cuda() + + attention_mask = torch.ones((1, 1, sequence_length, sequence_length), dtype=bool).cuda() + + output, bias = checkpointed_parallel_attention(hidden_states, attention_mask) + + assert config.recompute_granularity == 'selective' + assert output.shape[0] == sequence_length + assert output.shape[1] == micro_batch_size + assert output.shape[2] == config.hidden_size + assert bias.shape[0] == config.hidden_size diff --git a/tests/unit_tests/transformer/test_attention_packed_seq.py b/tests/unit_tests/transformer/test_attention_packed_seq.py new file mode 100644 index 0000000000..66371e842f --- /dev/null +++ b/tests/unit_tests/transformer/test_attention_packed_seq.py @@ -0,0 +1,172 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +import pytest +import torch + +from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_with_transformer_engine_spec +from megatron.core.packed_seq_params import PackedSeqParams +from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed +from megatron.core.transformer.attention import SelfAttention +from megatron.core.transformer.enums import AttnMaskType +from megatron.core.transformer.transformer_config import TransformerConfig +from tests.unit_tests.test_utilities import Utils + +# Note: this test requires TE >= 0.13 as well as Flash Attention to run +# FIXME this unit test doesn't work in the current test container. 
to be fixed soon +""" +def make_test_packed_seq_params(sequence_length): + cu_seqlens = torch.IntTensor([0, 6, 19, 22, sequence_length]).cuda() + seqlens = cu_seqlens[1:] - cu_seqlens[:-1] + max_seqlen, _ = seqlens.max(dim=0, keepdim=True) + packed_seq_params = PackedSeqParams( + cu_seqlens_q=cu_seqlens, + cu_seqlens_kv=cu_seqlens, + max_seqlen_q=max_seqlen, + max_seqlen_kv=max_seqlen, + qkv_format='thd', + ) + return packed_seq_params + +def make_test_packed_padded_seq_params(sequence_length): + cu_seqlens = torch.IntTensor([0, 18, 44, 52, 96, 118]).cuda() + cu_seqlens_padded = torch.IntTensor([0, 20, 48, 56, 100, sequence_length]).cuda() + seqlens = cu_seqlens_padded[1:] - cu_seqlens_padded[:-1] + max_seqlen, _ = seqlens.max(dim=0, keepdim=True) + packed_seq_params = PackedSeqParams( + cu_seqlens_q=cu_seqlens, + cu_seqlens_kv=cu_seqlens, + cu_seqlens_q_padded=cu_seqlens_padded, + cu_seqlens_kv_padded=cu_seqlens_padded, + max_seqlen_q=max_seqlen, + max_seqlen_kv=max_seqlen, + qkv_format='thd', + ) + return packed_seq_params + + +class TestParallelAttentionWithPackedSequence: + + def setup_method(self, method): + Utils.initialize_model_parallel(1,1) + model_parallel_cuda_manual_seed(123) + # use BF16 and a large enough hidden size to enable FlashAttention for thd format. + self.transformer_config = TransformerConfig(num_layers=2, hidden_size=64, num_attention_heads=4, use_cpu_initialization=True, + bf16=True, params_dtype=torch.bfloat16, + pipeline_dtype=torch.bfloat16, autocast_dtype=torch.bfloat16) + self.parallel_attention = SelfAttention(self.transformer_config, + get_gpt_layer_with_transformer_engine_spec().submodules.self_attention.submodules, + layer_number=1, + attn_mask_type=AttnMaskType.causal) + + def teardown_method(self, method): + Utils.destroy_model_parallel() + + def test_cpu_forward(self): + # we can't currently do this because the global memory buffer is on GPU + pass + + def test_gpu_forward(self): + + config = self.parallel_attention.config + sequence_length = 32 + micro_batch_size = 1 + + self.parallel_attention.cuda() + + # [sequence length, batch size, hidden size] + hidden_states = torch.ones((sequence_length, micro_batch_size, self.parallel_attention.config.hidden_size)) + hidden_states = hidden_states.cuda().to(torch.bfloat16) + + attention_mask = None + + packed_seq_params = make_test_packed_seq_params(sequence_length) + output, bias = self.parallel_attention(hidden_states, attention_mask, packed_seq_params=packed_seq_params) + + assert config.recompute_granularity is None + assert output.shape[0] == sequence_length + assert output.shape[1] == micro_batch_size + assert output.shape[2] == config.hidden_size + assert bias.shape[0] == config.hidden_size + + def test_fused_rope_gpu_forward(self): + self.parallel_attention.config.apply_rope_fusion = True + config = self.parallel_attention.config + sequence_length = 32 + micro_batch_size = 1 + + self.parallel_attention.cuda() + + # [sequence length, batch size, hidden size] + hidden_states = torch.ones((sequence_length, micro_batch_size, self.parallel_attention.config.hidden_size)) + hidden_states = hidden_states.cuda().to(torch.bfloat16) + + attention_mask = None + rotary_pos_emb = torch.ones(sequence_length, 1, 1, self.parallel_attention.config.kv_channels).cuda() + + packed_seq_params = make_test_packed_seq_params(sequence_length) + output, bias = self.parallel_attention(hidden_states, attention_mask, packed_seq_params=packed_seq_params) + + assert config.recompute_granularity is None + assert output.shape[0] == 
sequence_length + assert output.shape[1] == micro_batch_size + assert output.shape[2] == config.hidden_size + assert bias.shape[0] == config.hidden_size + self.parallel_attention.config.apply_rope_fusion = False + + def test_checkpointed_gpu_forward(self): + transformer_config = self.transformer_config + transformer_config.recompute_granularity='selective' + checkpointed_parallel_attention = SelfAttention(transformer_config, + get_gpt_layer_with_transformer_engine_spec().submodules.self_attention.submodules, + layer_number=1, + attn_mask_type=AttnMaskType.causal) + config = checkpointed_parallel_attention.config + + sequence_length = 32 + micro_batch_size = 1 + + checkpointed_parallel_attention.cuda() + + # [sequence length, batch size, hidden size] + hidden_states = torch.ones( + (sequence_length, micro_batch_size, checkpointed_parallel_attention.config.hidden_size) + ) + hidden_states = hidden_states.cuda().to(torch.bfloat16) + + attention_mask = None + + packed_seq_params = make_test_packed_seq_params(sequence_length) + output, bias = checkpointed_parallel_attention(hidden_states, attention_mask, packed_seq_params=packed_seq_params) + + assert config.recompute_granularity == 'selective' + assert output.shape[0] == sequence_length + assert output.shape[1] == micro_batch_size + assert output.shape[2] == config.hidden_size + assert bias.shape[0] == config.hidden_size + +# Note: this test requires TE >= 1.8 as well as cuDNN FusedAttention to run +class TestParallelAttentionWithPackedPaddedSequence(TestParallelAttentionWithPackedSequence): + + def test_gpu_forward(self): + + config = self.parallel_attention.config + sequence_length = 128 + micro_batch_size = 1 + + self.parallel_attention.cuda() + + # [sequence length, batch size, hidden size] + hidden_states = torch.ones((sequence_length, micro_batch_size, self.parallel_attention.config.hidden_size)) + hidden_states = hidden_states.cuda().to(torch.bfloat16) + + attention_mask = None + + packed_seq_params = make_test_packed_padded_seq_params(sequence_length) + output, bias = self.parallel_attention(hidden_states, attention_mask, packed_seq_params=packed_seq_params) + + assert config.recompute_granularity is None + assert output.shape[0] == sequence_length + assert output.shape[1] == micro_batch_size + assert output.shape[2] == config.hidden_size + assert bias.shape[0] == config.hidden_size +""" diff --git a/tests/unit_tests/transformer/test_core_attention.py b/tests/unit_tests/transformer/test_core_attention.py new file mode 100644 index 0000000000..d8710e2242 --- /dev/null +++ b/tests/unit_tests/transformer/test_core_attention.py @@ -0,0 +1,64 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
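A brief note on the `thd` packed-sequence convention exercised by `make_test_packed_seq_params` above: `cu_seqlens` holds cumulative token offsets into one packed buffer, so per-sequence lengths and the `max_seqlen_*` values follow by simple differencing. A minimal, self-contained illustration using the same values as the test (this snippet is an editorial sketch, not part of the diff):

```python
import torch

# cu_seqlens from make_test_packed_seq_params with sequence_length == 32.
cu_seqlens = torch.tensor([0, 6, 19, 22, 32], dtype=torch.int32)

# Per-sequence lengths of the packed ("thd") buffer and the max length that
# feeds max_seqlen_q / max_seqlen_kv in PackedSeqParams.
seqlens = cu_seqlens[1:] - cu_seqlens[:-1]   # tensor([ 6, 13,  3, 10])
max_seqlen = int(seqlens.max())              # 13

# The last boundary equals the total number of packed tokens.
assert int(cu_seqlens[-1]) == int(seqlens.sum()) == 32
```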
+ + +import pytest +import torch + +from megatron.core.transformer.attention import CrossAttention + +""" + +@pytest.fixture +def core_attention(transformer_config): + return CrossAttention(transformer_config) + + +class TestCoreAttention: + def test_constructor(self, core_attention): + assert isinstance(core_attention, CrossAttention) + assert core_attention.layer_number == 1 + + num_weights = sum([p.numel() for p in core_attention.parameters()]) + assert num_weights == 0 + + def test_cpu_forward(self, core_attention): + # we can't currently do this because the global memory buffer is on GPU + pass + + def test_gpu_forward(self, core_attention): + + # destroy_global_memory_buffer() + # _set_global_memory_buffer() + # model_parallel_cuda_manual_seed(123) + + core_attention.cuda() + config = core_attention.config + sequence_length = 32 + micro_batch_size = 2 + # query_layer (float): [sequence_length, micro_batch_size, num_attention_heads, hidden_size / num_attention_heads] + query_layer = torch.ones( + ( + sequence_length, + micro_batch_size, + config.num_attention_heads, + config.hidden_size // config.num_attention_heads, + ) + ).cuda() + + key_layer = torch.ones_like(query_layer).cuda() + + value_layer = torch.ones_like(query_layer).cuda() + + attention_mask = torch.ones((1, 1, sequence_length, sequence_length), dtype=bool).cuda() + + context_layer = core_attention( + query_layer=query_layer, key_layer=key_layer, value_layer=value_layer, attention_mask=attention_mask + ) + + assert context_layer.shape[0] == sequence_length + assert context_layer.shape[1] == micro_batch_size + assert context_layer.shape[2] == config.hidden_size + assert context_layer.device.type == 'cuda' + assert context_layer.dtype == torch.float32 + +""" diff --git a/tests/unit_tests/transformer/test_mlp.py b/tests/unit_tests/transformer/test_mlp.py new file mode 100644 index 0000000000..d2c25e0cc5 --- /dev/null +++ b/tests/unit_tests/transformer/test_mlp.py @@ -0,0 +1,58 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
+ +import pytest +import torch + +from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_local_spec +from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed +from megatron.core.transformer.mlp import MLP +from megatron.core.transformer.transformer_config import TransformerConfig +from tests.unit_tests.test_utilities import Utils + + +class TestParallelMLP: + + def setup_method(self, method): + Utils.initialize_model_parallel(1, 1) + model_parallel_cuda_manual_seed(123) + transformer_config = TransformerConfig( + num_layers=2, hidden_size=12, num_attention_heads=4, use_cpu_initialization=True + ) + self.mlp = MLP(transformer_config, get_gpt_layer_local_spec().submodules.mlp.submodules) + + def teardown_method(self, method): + Utils.destroy_model_parallel() + + def test_constructor(self): + assert isinstance(self.mlp, MLP) + + num_weights = sum([p.numel() for p in self.mlp.parameters()]) + assert num_weights == 1212 + + """ + def test_cpu_forward(self, mlp): + # [sequence length, micro batch size, hidden size] + hidden_states = torch.ones((32, 2, mlp.config.hidden_size)) + output, output_bias = mlp(hidden_states) + assert output.shape[0] == 32 + assert output.shape[1] == 2 + assert output.shape[2] == mlp.config.hidden_size + assert output_bias.shape[0] == mlp.config.hidden_size + assert output.dtype == torch.float32 + """ + + @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") + def test_gpu_forward(self): + mlp = self.mlp + mlp.cuda() + # [sequence length, batch size, hidden size] + hidden_states = torch.ones((32, 2, mlp.config.hidden_size)) + hidden_states = hidden_states.cuda() + output, output_bias = mlp(hidden_states) + assert output.shape[0] == 32 + assert output.shape[1] == 2 + assert output.shape[2] == mlp.config.hidden_size + assert output_bias.shape[0] == mlp.config.hidden_size + assert output.dtype == torch.float32 + assert output.device.type == 'cuda' + assert output_bias.device.type == 'cuda' diff --git a/tests/unit_tests/transformer/test_module.py b/tests/unit_tests/transformer/test_module.py new file mode 100644 index 0000000000..64826a0ee5 --- /dev/null +++ b/tests/unit_tests/transformer/test_module.py @@ -0,0 +1,102 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
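On the `num_weights == 1212` assertion in `TestParallelMLP.test_constructor` above: with `hidden_size=12` and, assuming the default expansion `ffn_hidden_size = 4 * hidden_size`, the two biased MLP projections account for exactly that parameter count. A back-of-the-envelope check (a sketch of the arithmetic only, not a re-derivation of Megatron internals):

```python
hidden_size = 12
ffn_hidden_size = 4 * hidden_size          # 48 (assumed default expansion factor)

fc1 = ffn_hidden_size * hidden_size + ffn_hidden_size   # weight + bias = 624
fc2 = hidden_size * ffn_hidden_size + hidden_size       # weight + bias = 588

assert fc1 + fc2 == 1212   # matches the test's num_weights assertion
```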
+ +import pytest +import torch + +from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed +from megatron.core.transformer.module import Float16Module, MegatronModule +from megatron.core.transformer.transformer_config import TransformerConfig +from tests.unit_tests.test_utilities import Utils + +DEVICE_CAPABILITY = None +if torch.cuda.is_available(): + DEVICE_CAPABILITY = torch.cuda.get_device_capability() + + +class DummyModule(MegatronModule): + # def __init__(self, config: TransformerConfig, share_embeddings_and_output_weights=True): + def __init__(self, config: TransformerConfig): + super().__init__(config) + + self.linear = torch.nn.modules.Linear(in_features=2, out_features=1) + + def forward(self, x): + return self.linear(x) + + +class TestMegatronModule: + + def setup_method(self, method): + Utils.initialize_model_parallel(1, 1) + model_parallel_cuda_manual_seed(123) + transformer_config = TransformerConfig( + num_layers=2, hidden_size=12, num_attention_heads=4, use_cpu_initialization=True + ) + self.megatron_module = DummyModule(config=transformer_config).cuda() + + def teardown_method(self, method): + Utils.destroy_model_parallel() + + def test_megatron_module(self): + megatron_module = self.megatron_module + assert megatron_module + assert megatron_module.config.hidden_size == 12 + assert megatron_module.config.ffn_hidden_size == 48 + assert megatron_module.linear.weight.dtype == torch.float32 + + x = torch.ones((2, 2)).cuda() + assert megatron_module(x).dtype == torch.float32 + + # TODO: test bad configs actually fail + # failed_module = megatron_module + # failed_module.fp16 = True + # failed_module.bf16 = True + + +class TestFloat16Module: + + def setup_method(self, method): + Utils.initialize_model_parallel(1, 1) + model_parallel_cuda_manual_seed(123) + self.transformer_config = TransformerConfig( + num_layers=2, hidden_size=12, num_attention_heads=4, use_cpu_initialization=True + ) + self.megatron_module = DummyModule(config=self.transformer_config).cuda() + + def teardown_method(self, method): + Utils.destroy_model_parallel() + + def test_fp16_module(self): + transformer_config = self.transformer_config + megatron_module = self.megatron_module + transformer_config.fp16 = True + fp16_module = Float16Module(config=transformer_config, module=megatron_module) + + assert fp16_module + assert fp16_module.config.hidden_size == 12 + assert fp16_module.config.ffn_hidden_size == 48 + assert fp16_module.module.linear.weight.dtype == torch.float16 + + x = torch.ones((2, 2)).cuda() + # inputs are converted to fp16 then outputs are converted to fp32 + assert fp16_module(x).dtype == torch.float32 + + pytest.mark.skipif( + not DEVICE_CAPABILITY or DEVICE_CAPABILITY[0] < 8, + reason='bfloat16 is not supported on this device', + ) + + def test_bf16_module(self): + transformer_config = self.transformer_config + megatron_module = self.megatron_module + transformer_config.bf16 = True + bf16_module = Float16Module(config=transformer_config, module=megatron_module) + + assert bf16_module + assert bf16_module.config.hidden_size == 12 + assert bf16_module.config.ffn_hidden_size == 48 + assert bf16_module.module.linear.weight.dtype == torch.bfloat16 + + x = torch.ones((2, 2)).cuda() + # inputs are converted to bf16 then outputs are converted to fp32 + assert bf16_module(x).dtype == torch.float32 diff --git a/tests/unit_tests/transformer/test_multi_latent_attention.py b/tests/unit_tests/transformer/test_multi_latent_attention.py new file mode 100644 index 
0000000000..4188d7b069 --- /dev/null +++ b/tests/unit_tests/transformer/test_multi_latent_attention.py @@ -0,0 +1,160 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +import os +from importlib.metadata import version + +import pytest +import torch +import transformer_engine as te + +from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_with_transformer_engine_spec +from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed +from megatron.core.transformer.multi_latent_attention import MLASelfAttention +from megatron.core.transformer.transformer_config import MLATransformerConfig +from megatron.core.utils import is_te_min_version +from tests.unit_tests.test_utilities import Utils + + +class TestParallelMLAAttention: + + def setup_method(self, method): + Utils.initialize_model_parallel(1, 1) + model_parallel_cuda_manual_seed(123) + self.transformer_config = MLATransformerConfig( + num_layers=2, + hidden_size=12, + num_attention_heads=4, + use_cpu_initialization=True, + q_lora_rank=32, + kv_lora_rank=32, + qk_head_dim=128, + v_head_dim=128, + qk_pos_emb_head_dim=64, + rotary_base=10000, + ) + self.parallel_attention = MLASelfAttention( + self.transformer_config, + get_gpt_layer_with_transformer_engine_spec( + multi_latent_attention=True + ).submodules.self_attention.submodules, + layer_number=1, + ) + + def teardown_method(self, method): + Utils.destroy_model_parallel() + + def test_constructor(self): + assert isinstance(self.parallel_attention, MLASelfAttention) + assert self.parallel_attention.layer_number == 1 + + num_weights = sum([p.numel() for p in self.parallel_attention.parameters()]) + assert num_weights == 65036 + + def test_cpu_forward(self): + # we can't currently do this because the global memory buffer is on GPU + pass + + def test_gpu_forward(self): + if is_te_min_version("1.10.0"): + + # use flash attention for hopper, future may support fused attention for ampere + os.environ['NVTE_FUSED_ATTN'] = "0" + os.environ['NVTE_FLASH_ATTN'] = "1" + + config = self.parallel_attention.config + sequence_length = 32 + micro_batch_size = 2 + + self.parallel_attention.cuda() + + # [sequence length, batch size, hidden size] + hidden_states = torch.ones( + (sequence_length, micro_batch_size, self.parallel_attention.config.hidden_size) + ) + hidden_states = hidden_states.cuda() + + attention_mask = torch.ones((1, 1, sequence_length, sequence_length), dtype=bool).cuda() + + output, bias = self.parallel_attention(hidden_states, attention_mask) + + assert config.recompute_granularity is None + assert output.shape[0] == sequence_length + assert output.shape[1] == micro_batch_size + assert output.shape[2] == config.hidden_size + assert bias.shape[0] == config.hidden_size + + def test_fused_rope_gpu_forward(self): + if is_te_min_version("1.10.0"): + # use flash attention for hopper, future may support fused attention for ampere + os.environ['NVTE_FUSED_ATTN'] = "0" + os.environ['NVTE_FLASH_ATTN'] = "1" + + self.parallel_attention.config.apply_rope_fusion = True + config = self.parallel_attention.config + sequence_length = 32 + micro_batch_size = 2 + + self.parallel_attention.cuda() + + # [sequence length, batch size, hidden size] + hidden_states = torch.ones( + (sequence_length, micro_batch_size, self.parallel_attention.config.hidden_size) + ) + hidden_states = hidden_states.cuda() + + attention_mask = torch.ones((1, 1, sequence_length, sequence_length), dtype=bool).cuda() + rotary_pos_emb = torch.ones( + sequence_length, 1, 1, 
self.parallel_attention.config.kv_channels + ).cuda() + output, bias = self.parallel_attention( + hidden_states, attention_mask, rotary_pos_emb=rotary_pos_emb + ) + + assert config.recompute_granularity is None + assert output.shape[0] == sequence_length + assert output.shape[1] == micro_batch_size + assert output.shape[2] == config.hidden_size + assert bias.shape[0] == config.hidden_size + self.parallel_attention.config.apply_rope_fusion = False + + def test_checkpointed_gpu_forward(self): + if is_te_min_version("1.10.0"): + # use flash attention for hopper, future may support fused attention for ampere + os.environ['NVTE_FUSED_ATTN'] = "0" + os.environ['NVTE_FLASH_ATTN'] = "1" + + transformer_config = self.transformer_config + transformer_config.recompute_granularity = 'selective' + checkpointed_parallel_attention = MLASelfAttention( + transformer_config, + get_gpt_layer_with_transformer_engine_spec( + multi_latent_attention=True + ).submodules.self_attention.submodules, + layer_number=1, + ) + config = checkpointed_parallel_attention.config + + sequence_length = 32 + micro_batch_size = 2 + + checkpointed_parallel_attention.cuda() + + # [sequence length, batch size, hidden size] + hidden_states = torch.ones( + ( + sequence_length, + micro_batch_size, + checkpointed_parallel_attention.config.hidden_size, + ) + ) + hidden_states = hidden_states.cuda() + + attention_mask = torch.ones((1, 1, sequence_length, sequence_length), dtype=bool).cuda() + + output, bias = checkpointed_parallel_attention(hidden_states, attention_mask) + + assert config.recompute_granularity == 'selective' + assert output.shape[0] == sequence_length + assert output.shape[1] == micro_batch_size + assert output.shape[2] == config.hidden_size + assert bias.shape[0] == config.hidden_size diff --git a/tests/unit_tests/transformer/test_retro_attention.py b/tests/unit_tests/transformer/test_retro_attention.py new file mode 100644 index 0000000000..6fe68518fe --- /dev/null +++ b/tests/unit_tests/transformer/test_retro_attention.py @@ -0,0 +1,200 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +import types + +import pytest +import torch + +from megatron.core.models.retro import RetroConfig, get_retro_decoder_block_spec +from megatron.core.models.retro.decoder_attention import ( + RetroDecoderBiasDropoutAdd, + RetroDecoderCrossAttention, +) +from megatron.core.models.retro.encoder_attention import ( + RetroEncoderBiasDropoutAdd, + RetroEncoderCrossAttention, + RetroEncoderLayerNorm, +) +from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed +from megatron.core.transformer.transformer_block import TransformerBlock +from tests.unit_tests.test_utilities import Utils + + +class TestRetroAttention: + + @classmethod + def get_config(cls): + return RetroConfig( + num_layers=12, + hidden_size=16, + num_attention_heads=4, + use_cpu_initialization=True, + retro_num_neighbors=2, + retro_chunk_length=4, + retro_retrieved_length=8, + retro_split_preprocessing="98,2,0", + ) + + @classmethod + def get_modules(cls, config, use_transformer_engine, use_gpu): + + # Retro decoder layer. + decoder_block_spec = get_retro_decoder_block_spec( + config, use_transformer_engine=use_transformer_engine + ) + decoder_block = TransformerBlock(config=config, spec=decoder_block_spec) + decoder_layers = [ + layer + for layer in decoder_block.layers + if isinstance(layer.cross_attention, RetroDecoderCrossAttention) + ] + decoder_layer = decoder_layers[0] + + # Retro encoder layer. 
+ encoder_block = decoder_layer.cross_attention.encoder + encoder_layers = [ + layer + for layer in encoder_block.layers + if isinstance(layer.cross_attention, RetroEncoderCrossAttention) + ] + encoder_layer = encoder_layers[0] + + # Modules. + modules = types.SimpleNamespace( + decoder_attn=decoder_layer.cross_attention, + decoder_bda=decoder_layer.cross_attn_bda, + encoder_attn=encoder_layer.cross_attention, + encoder_bda=encoder_layer.cross_attn_bda, + encoder_norm=encoder_layer.pre_mlp_layernorm, + ) + + # GPU. + if use_gpu: + [m.cuda() for m in vars(modules).values()] + + return modules + + def setup_method(self, method): + Utils.initialize_model_parallel(1, 1) + model_parallel_cuda_manual_seed(123) + + def teardown_method(self, method): + Utils.destroy_model_parallel() + + @pytest.mark.flaky_in_dev + def test_constructor(self): + + config = self.get_config() + modules = self.get_modules(config, use_transformer_engine=True, use_gpu=False) + + assert isinstance(modules.decoder_attn, RetroDecoderCrossAttention) + assert isinstance(modules.decoder_bda, RetroDecoderBiasDropoutAdd) + assert isinstance(modules.encoder_attn, RetroEncoderCrossAttention) + assert isinstance(modules.encoder_bda, RetroEncoderBiasDropoutAdd) + assert isinstance(modules.encoder_norm, RetroEncoderLayerNorm) + + assert modules.decoder_attn.attn.layer_number == 6 + assert modules.encoder_attn.attn.layer_number == 1 + + get_nparams = lambda m: sum(p.numel() for p in m.parameters()) + assert get_nparams(modules.decoder_attn) == 8768 + assert get_nparams(modules.decoder_bda) == 0 + assert get_nparams(modules.encoder_attn) == 1088 + assert get_nparams(modules.encoder_bda) == 0 + assert get_nparams(modules.encoder_norm) == 32 + + def test_cpu_forward(self): + # we can't currently do this because the global memory buffer is on GPU + pass + + def run_gpu_forward(self, recompute_granularity, use_transformer_engine): + + config = self.get_config() + config.recompute_granularity = recompute_granularity + modules = self.get_modules(config, use_transformer_engine, use_gpu=True) + + seq_length = 32 + micro_batch_size = 2 + n_chunks_per_sample = seq_length // config.retro_chunk_length + + # Init tensors. + hidden_states = torch.ones((seq_length, micro_batch_size, config.hidden_size)).cuda() + attention_mask = None + decoder_context = torch.ones( + ( + config.retro_retrieved_length, + config.retro_num_neighbors * micro_batch_size * n_chunks_per_sample, + config.hidden_size, + ) + ).cuda() + encoder_context = torch.ones( + (config.retro_chunk_length, micro_batch_size * n_chunks_per_sample, config.hidden_size) + ).cuda() + + # Forward decoder. + decoder_attn_output = modules.decoder_attn(hidden_states, attention_mask, decoder_context) + with torch.enable_grad(): + decoder_bda_output = modules.decoder_bda(True, True)( + decoder_attn_output, hidden_states, config.hidden_dropout + ) + + # Forward encoder. + encoder_attn_output_tuples = modules.encoder_attn(decoder_context, None, encoder_context) + with torch.enable_grad(): + encoder_bda_output = modules.encoder_bda(True, True)( + encoder_attn_output_tuples, decoder_context, config.retro_encoder_hidden_dropout + ) + encoder_norm_output = modules.encoder_norm(encoder_bda_output) + + # Verify decoder. 
+ assert set(decoder_attn_output.keys()) == set( + ["ns", "bs", "d", "l", "pad", "attention_output", "attention_bias", "context"] + ) + assert decoder_attn_output["ns"] == seq_length + assert decoder_attn_output["bs"] == micro_batch_size + assert decoder_attn_output["d"] == config.hidden_size + assert decoder_attn_output["l"] == n_chunks_per_sample + assert decoder_attn_output["pad"] == 3 + assert tuple(decoder_attn_output["attention_output"].shape) == ( + config.retro_chunk_length, + micro_batch_size * n_chunks_per_sample, + config.hidden_size, + ) + assert tuple(decoder_attn_output["attention_bias"].shape) == (config.hidden_size,) + assert decoder_attn_output["context"].shape == ( + config.retro_retrieved_length * config.retro_num_neighbors, + micro_batch_size * n_chunks_per_sample, + config.hidden_size, + ) + assert decoder_bda_output.shape == hidden_states.shape + + # Verify encoder. + assert len(encoder_attn_output_tuples) == config.retro_num_neighbors + for output, bias, residual in encoder_attn_output_tuples: + assert tuple(output.shape) == ( + config.retro_retrieved_length, + micro_batch_size * n_chunks_per_sample, + config.hidden_size, + ) + assert tuple(bias.shape) == (config.hidden_size,) + assert tuple(residual.shape) == ( + config.retro_retrieved_length, + micro_batch_size * n_chunks_per_sample, + config.hidden_size, + ) + assert encoder_bda_output.shape == ( + config.retro_retrieved_length, + config.retro_num_neighbors * micro_batch_size * n_chunks_per_sample, + config.hidden_size, + ) + assert encoder_norm_output.shape == ( + config.retro_retrieved_length, + config.retro_num_neighbors * micro_batch_size * n_chunks_per_sample, + config.hidden_size, + ) + + @pytest.mark.flaky_in_dev + def test_gpu_forward(self): + for recompute_granularity in (None, 'selective'): + for use_transformer_engine in (True, False): + self.run_gpu_forward(recompute_granularity, use_transformer_engine) diff --git a/tests/unit_tests/transformer/test_rope.py b/tests/unit_tests/transformer/test_rope.py new file mode 100644 index 0000000000..d5ed85391b --- /dev/null +++ b/tests/unit_tests/transformer/test_rope.py @@ -0,0 +1,54 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
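The shape bookkeeping in `run_gpu_forward` above follows directly from the retro config built in `get_config`; re-deriving the dimensions the asserts check, using only values that appear in the test (plain arithmetic, offered as a reading aid):

```python
seq_length, micro_batch_size, hidden_size = 32, 2, 16
chunk_length, retrieved_length, num_neighbors = 4, 8, 2

n_chunks_per_sample = seq_length // chunk_length                                  # 8
decoder_context_batch = num_neighbors * micro_batch_size * n_chunks_per_sample   # 32
encoder_context_batch = micro_batch_size * n_chunks_per_sample                    # 16
context_rows = retrieved_length * num_neighbors                                    # 16

# decoder_context: (8, 32, 16), encoder_context: (4, 16, 16),
# decoder_attn_output["context"]: (16, 16, 16) -- as asserted above.
assert (decoder_context_batch, encoder_context_batch, context_rows) == (32, 16, 16)
```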
+ +import pytest +import torch + +from megatron.core.models.common.embeddings.rotary_pos_embedding import RotaryEmbedding +from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed +from tests.unit_tests.test_utilities import Utils + + +class TestRotaryEmbedding: + def setup_method(self): + Utils.initialize_model_parallel(1, 1) + model_parallel_cuda_manual_seed(123) + self.kv_channels = 8 + self.rotary_percent = 1.0 + self.rope_cpu_init = RotaryEmbedding( + self.kv_channels, self.rotary_percent, use_cpu_initialization=True + ) + self.rope_gpu_init = RotaryEmbedding( + self.kv_channels, self.rotary_percent, use_cpu_initialization=False + ) + + def teardown_method(self, method): + del self.rope_gpu_init + del self.rope_cpu_init + Utils.destroy_model_parallel() + + @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") + def test_constructor(self): + assert isinstance(self.rope_cpu_init, RotaryEmbedding) + assert self.rope_cpu_init.inv_freq.device.type == 'cpu' + assert isinstance(self.rope_gpu_init, RotaryEmbedding) + assert self.rope_gpu_init.inv_freq.device.type == 'cuda' + + @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") + def test_gpu_forward(self): + output = self.rope_gpu_init(64) + assert output.shape[0] == 64 + assert output.shape[1] == 1 + assert output.shape[2] == 1 + assert output.shape[3] == self.kv_channels + assert output.dtype == torch.float32 + assert output.device.type == 'cuda' + + @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") + def test_cpu_forward(self): + output = self.rope_cpu_init(64) + assert output.shape[0] == 64 + assert output.shape[1] == 1 + assert output.shape[2] == 1 + assert output.shape[3] == self.kv_channels + assert output.dtype == torch.float32 + assert output.device.type == 'cuda' diff --git a/tests/unit_tests/transformer/test_spec_customization.py b/tests/unit_tests/transformer/test_spec_customization.py new file mode 100755 index 0000000000..a9a245b861 --- /dev/null +++ b/tests/unit_tests/transformer/test_spec_customization.py @@ -0,0 +1,241 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
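For context on the `[seq_len, 1, 1, kv_channels]` output shape asserted in the rotary-embedding tests above, here is a minimal sketch of the standard RoPE frequency table (rotary base 10000, full `rotary_percent`). The actual `RotaryEmbedding` in megatron.core may differ in implementation details, so treat this only as an illustration of the shape and dtype the tests check:

```python
import torch

def rope_table(seq_len: int, dim: int, base: float = 10000.0) -> torch.Tensor:
    # Standard RoPE inverse frequencies over the even channel indices.
    inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
    positions = torch.arange(seq_len, dtype=torch.float32)
    freqs = torch.outer(positions, inv_freq)      # [seq_len, dim // 2]
    emb = torch.cat((freqs, freqs), dim=-1)       # [seq_len, dim]
    return emb[:, None, None, :]                  # [seq_len, 1, 1, dim]

out = rope_table(64, 8)
assert out.shape == (64, 1, 1, 8) and out.dtype == torch.float32
```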
+ +import sys +from dataclasses import dataclass, fields + +import pytest +import torch +import transformer_engine as te + +from megatron.core.extensions.transformer_engine import ( + TEDotProductAttention, + TELayerNormColumnParallelLinear, + TENorm, + TERowParallelLinear, +) +from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add +from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_local_spec +from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed +from megatron.core.transformer.attention import SelfAttention, SelfAttentionSubmodules +from megatron.core.transformer.dot_product_attention import DotProductAttention +from megatron.core.transformer.enums import AttnMaskType +from megatron.core.transformer.identity_op import IdentityFuncOp, IdentityOp +from megatron.core.transformer.spec_utils import ModuleSpec, build_module, import_module +from megatron.core.transformer.transformer_block import TransformerBlock, TransformerBlockSubmodules +from megatron.core.transformer.transformer_config import TransformerConfig +from megatron.core.transformer.transformer_layer import TransformerLayer, TransformerLayerSubmodules +from megatron.core.utils import is_te_min_version +from tests.unit_tests.test_utilities import Utils + + +class TestSpecCustomization: + def setup_method(self, method): + Utils.initialize_model_parallel(1, 1) + model_parallel_cuda_manual_seed(123) + self.config = TransformerConfig( + num_layers=2, hidden_size=12, num_attention_heads=4, use_cpu_initialization=True + ) + + # specify Transformer Layer spec with all identity ops + self.transformer_layer_spec = TransformerLayerSubmodules() + + # specify attention spec using already imported class + self.attention_spec = ModuleSpec( + module=SelfAttention, + params={"attn_mask_type": AttnMaskType.causal}, + submodules=SelfAttentionSubmodules( + linear_qkv=TELayerNormColumnParallelLinear, + core_attention=TEDotProductAttention, + linear_proj=TERowParallelLinear, + q_layernorm=IdentityOp, + k_layernorm=IdentityOp, + ), + ) + + # specify layernorm spec with module path to test dynamic importing + self.layernorm_spec = ModuleSpec( + module=("megatron.core.extensions.transformer_engine", "TENorm") + ) + + # specify bias dropout add with module path + self.bda_spec = ModuleSpec( + module=("megatron.core.fusions.fused_bias_dropout", "get_bias_dropout_add") + ) + + def teardown_method(self, method): + Utils.destroy_model_parallel() + + def test_import_module(self): + self_attention_cls = import_module( + module_path=('megatron.core.transformer.attention', 'SelfAttention') + ) + assert id(self_attention_cls) == id(SelfAttention) + + layernorm_cls = import_module(module_path=self.layernorm_spec.module) + assert id(layernorm_cls) == id(TENorm) + + def test_build_module(self): + # Check NoOp TransformerLayer + random_input = 12 + noop_transformer_layer = [ + build_module(getattr(self.transformer_layer_spec, field.name)) + for field in fields(self.transformer_layer_spec) + if field.name != 'sharded_state_dict_keys_map' + ] + + x = random_input + for mod in noop_transformer_layer: + # checking for `IdentityFuncOp` before `IdentityOp` because former + # is derived from the latter and so the second if statement will + # always be `True`. 
+ if isinstance(mod, IdentityFuncOp): + x = mod()(x) + elif isinstance(mod, IdentityOp): + x = mod(x) + + assert x == random_input + + # Check SelfAttention + self_attention = build_module(self.attention_spec, config=self.config, layer_number=1) + assert isinstance(self_attention, SelfAttention) + assert self_attention.layer_number == 1 + assert self_attention.attn_mask_type == self.attention_spec.params['attn_mask_type'] + + num_weights = sum([p.numel() for p in self_attention.parameters()]) + assert num_weights == 648 + + # Check SelfAttention but with already initialized module + # `self_attention`. In this test, `build_module` acts as a no op as it + # simply returns the initialized module. + # NOTE: (sudhakars) Uncomment this test once this feature gets added + # back. + # self_attention2 = build_module( + # self_attention, config=self.config, spec=self.attention_spec, + # ) + # assert isinstance(self_attention2, SelfAttention) + # assert self_attention2.layer_number == 1 + # assert self_attention2.attn_mask_type == self.attention_spec.params['attn_mask_type'] + + # num_weights = sum([p.numel() for p in self_attention2.parameters()]) + # assert num_weights == 648 + + # Check LayerNorm + layernorm = build_module( + self.layernorm_spec, + config=self.config, + hidden_size=self.config.hidden_size, + eps=self.config.layernorm_epsilon, + ) + assert isinstance(layernorm, te.pytorch.LayerNorm) + + # Check BiasDropoutAdd + bda_op = build_module(self.bda_spec) + assert id(bda_op) == id(get_bias_dropout_add) + + def test_sliding_window_attention(self): + if not is_te_min_version("1.2.0"): + print("SWA not tested because TE version is not >= 1.2.0", file=sys.stderr) + return + + config = TransformerConfig( + num_layers=2, + hidden_size=12, + num_attention_heads=4, + use_cpu_initialization=True, + window_size=[10, 0], + ) + # Make sure DotProductAttention throws (swa unsupported). + threw = False + try: + attn = DotProductAttention( + config, layer_number=1, attn_mask_type=AttnMaskType.causal, attention_type='self' + ) + except: + threw = True + finally: + assert threw, 'Expected DotProductAttention to throw exception for SWA' + + # Test TEDotProductAttention + attn = TEDotProductAttention( + config, layer_number=1, attn_mask_type=AttnMaskType.causal, attention_type='self' + ) + # Make sure window-size is what we expect. + assert attn.window_size == config.window_size + + # Single integer window-size unsupported, make sure it throws + threw = False + try: + config.window_size = 11 + attn = TEDotProductAttention( + config, layer_number=1, attn_mask_type=AttnMaskType.causal, attention_type='self' + ) + except: + threw = True + finally: + assert threw, "Expected TEDotProductAttention to throw for integer window-size" + + # `None` makes this causal. + config.window_size = None + attn = TEDotProductAttention( + config, layer_number=1, attn_mask_type=AttnMaskType.causal, attention_type='self' + ) + # Make sure it's causal. + assert attn.window_size == (-1, 0) + + def test_transformer_block_custom(self): + """ + This test checks that the two ways of passing `layer_spec` to a + `TransformerBlock` result in an identical model: + 1. ModuleSpec(module=..., submodules=...) + 2. TransformerBlockSubmodules(layer_specs=...) 
+ """ + + transformer_config = TransformerConfig( + num_layers=2, hidden_size=12, num_attention_heads=4, use_cpu_initialization=True + ) + layer_local_spec = get_gpt_layer_local_spec() + + # The following way can be used to pass a different `TransformerLayer` + # and internally the `TransformerBlock` would fan out the single + # `ModuleSpec` layer spec provided to all the layers of the block. + layer_spec1 = ModuleSpec(module=TransformerLayer, submodules=layer_local_spec.submodules) + model_parallel_cuda_manual_seed(123) + torch.manual_seed(0) + parallel_transformer_block1 = TransformerBlock(transformer_config, layer_spec1) + + layer_spec2 = TransformerBlockSubmodules( + layer_specs=[ + ModuleSpec(module=TransformerLayer, submodules=layer_local_spec.submodules) + ] + * transformer_config.num_layers, + layer_norm=TENorm, + ) + # make sure the model init conditions are identical + model_parallel_cuda_manual_seed(123) + torch.manual_seed(0) + parallel_transformer_block2 = TransformerBlock(transformer_config, layer_spec2) + + sequence_length = 32 + micro_batch_size = 2 + parallel_transformer_block1.cuda() + parallel_transformer_block2.cuda() + + # [sequence length, batch size, hidden size] + hidden_states = torch.ones( + (sequence_length, micro_batch_size, transformer_config.hidden_size) + ) + hidden_states = hidden_states.cuda() + + attention_mask = torch.ones((1, 1, sequence_length, sequence_length), dtype=bool).cuda() + + out1 = parallel_transformer_block1( + hidden_states=hidden_states, attention_mask=attention_mask + ) + out2 = parallel_transformer_block2( + hidden_states=hidden_states, attention_mask=attention_mask + ) + + assert torch.all(torch.eq(out1, out2)) + assert out1.shape[0] == sequence_length == out2.shape[0] + assert out1.shape[1] == micro_batch_size == out2.shape[1] + assert out1.shape[2] == transformer_config.hidden_size == out2.shape[2] diff --git a/tests/unit_tests/transformer/test_transformer_block.py b/tests/unit_tests/transformer/test_transformer_block.py new file mode 100644 index 0000000000..02702a9ff7 --- /dev/null +++ b/tests/unit_tests/transformer/test_transformer_block.py @@ -0,0 +1,136 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
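The spec-customization tests above resolve modules both from already-imported classes and from `("package.module", "Name")` path tuples. As a rough sketch of what such a resolver is expected to do for the tuple form (the real `megatron.core.transformer.spec_utils.import_module` may handle additional cases), it boils down to `importlib` plus `getattr`:

```python
import importlib

def resolve(module_path):
    # e.g. ("megatron.core.extensions.transformer_engine", "TENorm") -> the TENorm class,
    # the same object that `from ... import TENorm` would bind.
    module_name, attr_name = module_path
    return getattr(importlib.import_module(module_name), attr_name)
```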
+ +import os + +import pytest +import torch + +from megatron.core import dist_checkpointing +from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_with_transformer_engine_spec +from megatron.core.packed_seq_params import PackedSeqParams +from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed +from megatron.core.transformer.transformer_block import TransformerBlock +from megatron.core.transformer.transformer_config import TransformerConfig +from megatron.core.transformer.transformer_layer import TransformerLayer +from tests.unit_tests.test_utilities import Utils + + +class TestParallelTransformerBlock: + + def setup_method(self, method): + Utils.initialize_model_parallel(1, 1) + model_parallel_cuda_manual_seed(123) + self.transformer_config = TransformerConfig( + num_layers=2, hidden_size=64, num_attention_heads=4, use_cpu_initialization=True + ) + self.parallel_transformer_block = TransformerBlock( + self.transformer_config, get_gpt_layer_with_transformer_engine_spec() + ) + + def teardown_method(self, method): + Utils.destroy_model_parallel() + + def test_constructor(self): + parallel_transformer_block = self.parallel_transformer_block + assert isinstance(parallel_transformer_block, TransformerBlock) + num_weights = sum([p.numel() for p in parallel_transformer_block.parameters()]) + assert num_weights == 100096 + assert parallel_transformer_block.num_layers_per_pipeline_rank == 2 + assert len(parallel_transformer_block.layers) == 2 + layer_0: TransformerLayer = parallel_transformer_block._get_layer(0) + assert layer_0.layer_number == 1 + layer_1: TransformerLayer = parallel_transformer_block._get_layer(1) + assert layer_1.layer_number == 2 + + def test_gpu_forward(self): + parallel_transformer_block = self.parallel_transformer_block + config: TransformerConfig = parallel_transformer_block.config + + sequence_length = 32 + micro_batch_size = 2 + parallel_transformer_block.cuda() + + # [sequence length, batch size, hidden size] + hidden_states = torch.ones((sequence_length, micro_batch_size, config.hidden_size)) + hidden_states = hidden_states.cuda() + + attention_mask = torch.ones((1, 1, sequence_length, sequence_length), dtype=bool).cuda() + + hidden_states = parallel_transformer_block( + hidden_states=hidden_states, attention_mask=attention_mask + ) + assert hidden_states.shape[0] == sequence_length + assert hidden_states.shape[1] == micro_batch_size + assert hidden_states.shape[2] == config.hidden_size + + def test_gpu_forward_full_checkpoint(self): + self._run_full_checkpoint_test(fp8=None) + + def test_gpu_forward_full_checkpoint_fp8(self): + self._run_full_checkpoint_test(fp8="e4m3") + + def test_gpu_forward_selective_checkpoint(self): + self._run_selective_checkpoint_test(fp8=None) + + def test_gpu_forward_selective_checkpoint_fp8(self): + self._run_selective_checkpoint_test(fp8="e4m3") + + def _run_full_checkpoint_test(self, fp8): + transformer_config = self.transformer_config + config = transformer_config + config.recompute_granularity = 'full' + config.recompute_method = 'block' + config.fp8 = fp8 + config.recompute_num_layers = config.num_layers + full_transformer_block = TransformerBlock( + config, get_gpt_layer_with_transformer_engine_spec() + ) + assert full_transformer_block.config.recompute_granularity == 'full' + assert full_transformer_block.config.recompute_method == 'block' + assert full_transformer_block.config.fp8 == fp8 + + sequence_length = 32 + micro_batch_size = 2 + full_transformer_block.cuda() + + # [sequence length, batch 
size, hidden size] + hidden_states = torch.ones((sequence_length, micro_batch_size, config.hidden_size)) + hidden_states = hidden_states.cuda() + + attention_mask = torch.ones((1, 1, sequence_length, sequence_length), dtype=bool).cuda() + + hidden_states = full_transformer_block( + hidden_states=hidden_states, attention_mask=attention_mask + ) + assert hidden_states.shape[0] == sequence_length + assert hidden_states.shape[1] == micro_batch_size + assert hidden_states.shape[2] == config.hidden_size + + def _run_selective_checkpoint_test(self, fp8): + transformer_config = self.transformer_config + config = transformer_config + config.recompute_granularity = 'selective' + config.fp8 = fp8 + selective_transformer_block = TransformerBlock( + config, get_gpt_layer_with_transformer_engine_spec() + ) + assert selective_transformer_block.config.recompute_granularity == 'selective' + assert selective_transformer_block.checkpoint_core_attention + assert selective_transformer_block.config.fp8 == fp8 + + sequence_length = 32 + micro_batch_size = 2 + selective_transformer_block.cuda() + + # [sequence length, batch size, hidden size] + hidden_states = torch.ones((sequence_length, micro_batch_size, config.hidden_size)) + hidden_states = hidden_states.cuda() + + attention_mask = torch.ones((1, 1, sequence_length, sequence_length), dtype=bool).cuda() + + hidden_states = selective_transformer_block( + hidden_states=hidden_states, attention_mask=attention_mask + ) + assert hidden_states.shape[0] == sequence_length + assert hidden_states.shape[1] == micro_batch_size + assert hidden_states.shape[2] == config.hidden_size diff --git a/tests/unit_tests/transformer/test_transformer_layer.py b/tests/unit_tests/transformer/test_transformer_layer.py new file mode 100644 index 0000000000..ad8d3ea0f2 --- /dev/null +++ b/tests/unit_tests/transformer/test_transformer_layer.py @@ -0,0 +1,115 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
+ + +import pytest +import torch + +from megatron.core import parallel_state +from megatron.core.dist_checkpointing.mapping import ShardedObject, ShardedTensor +from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_with_transformer_engine_spec +from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed +from megatron.core.transformer.transformer_config import TransformerConfig +from megatron.core.transformer.transformer_layer import TransformerLayer +from tests.unit_tests.test_utilities import Utils + + +class TestParallelTransformerLayer: + + def setup_method(self, method): + Utils.initialize_model_parallel(1, 1) + model_parallel_cuda_manual_seed(123) + transformer_config = TransformerConfig( + num_layers=2, hidden_size=12, num_attention_heads=4, use_cpu_initialization=True + ) + self.parallel_transformer_layer = TransformerLayer( + transformer_config, get_gpt_layer_with_transformer_engine_spec().submodules + ) + + def teardown_method(self, method): + Utils.destroy_model_parallel() + + def test_constructor(self): + parallel_transformer_layer = self.parallel_transformer_layer + assert isinstance(parallel_transformer_layer, TransformerLayer) + assert parallel_transformer_layer.layer_number == 1 + + num_weights = sum([p.numel() for p in parallel_transformer_layer.parameters()]) + assert num_weights == 1884 + + def test_gpu_forward(self): + parallel_transformer_layer = self.parallel_transformer_layer + config: TransformerConfig = parallel_transformer_layer.config + sequence_length = 32 + micro_batch_size = 2 + parallel_transformer_layer.cuda() + + # [sequence length, batch size, hidden size] + hidden_states = torch.ones((sequence_length, micro_batch_size, config.hidden_size)) + hidden_states = hidden_states.cuda() + + attention_mask = torch.ones((1, 1, sequence_length, sequence_length), dtype=bool).cuda() + + hidden_states, context = parallel_transformer_layer( + hidden_states=hidden_states, attention_mask=attention_mask + ) + assert hidden_states.shape[0] == sequence_length + assert hidden_states.shape[1] == micro_batch_size + assert hidden_states.shape[2] == config.hidden_size + + @pytest.mark.parametrize('order', ['tp-pp-dp', 'tp-dp-pp']) + @pytest.mark.parametrize('tp_pp', [(4, 2), (1, 1), (8, 1), (2, 2)]) + def test_sharded_state_dict(self, tp_pp, order): + Utils.destroy_model_parallel() + Utils.initialize_model_parallel(*tp_pp, order=order) + + model_parallel_cuda_manual_seed(123) + transformer_config = TransformerConfig( + num_layers=2, hidden_size=128, num_attention_heads=8, use_cpu_initialization=True + ) + parallel_transformer_layer = TransformerLayer( + transformer_config, get_gpt_layer_with_transformer_engine_spec().submodules + ) + + sharded_state_dict = parallel_transformer_layer.sharded_state_dict() + + extra_states = {k: v for k, v in sharded_state_dict.items() if k.endswith('extra_state')} + sharded_tensors = { + k: v for k, v in sharded_state_dict.items() if not k.endswith('extra_state') + } + assert all(isinstance(t, ShardedObject) for t in extra_states.values()) + assert all(isinstance(t, ShardedTensor) for t in sharded_tensors.values()) + + # Test all local shapes + tensor_local_shapes = {k: v.local_shape for k, v in sharded_tensors.items()} + tp_size = parallel_state.get_tensor_model_parallel_world_size() + assert tensor_local_shapes == get_tensor_shapes_for_tp(transformer_config, tp_size) + + # Test all global shapes. 
Prepend num layers in front of expected shapes + tensor_global_shapes = {k: v.global_shape for k, v in sharded_tensors.items()} + expected_global_shapes = get_tensor_shapes_for_tp(transformer_config, 1) + assert tensor_global_shapes == expected_global_shapes + + # Test ShardedTensor keys + for state_dict_key, sh_ten in sharded_tensors.items(): + assert state_dict_key == sh_ten.key + + Utils.destroy_model_parallel() + Utils.initialize_model_parallel(1, 1) + + +def get_tensor_shapes_for_tp(transformer_config, tp_size): + hs = transformer_config.hidden_size + return { + 'mlp.linear_fc1.layer_norm_weight': (hs,), + 'mlp.linear_fc1.layer_norm_bias': (hs,), + 'mlp.linear_fc1.weight': (hs * 4 // tp_size, hs), + 'mlp.linear_fc1.bias': (hs * 4 // tp_size,), + 'mlp.linear_fc2.weight': (hs, hs * 4 // tp_size), + 'mlp.linear_fc2.bias': (hs,), + 'self_attention.linear_proj.weight': (hs, hs // tp_size), + 'self_attention.linear_proj.bias': (hs,), + 'self_attention.linear_qkv.layer_norm_weight': (hs,), + 'self_attention.linear_qkv.layer_norm_bias': (hs,), + 'self_attention.linear_qkv.weight': (hs * 3 // tp_size, hs), + 'self_attention.linear_qkv.bias': (hs * 3 // tp_size,), + } diff --git a/tools/autoformat.sh b/tools/autoformat.sh new file mode 100755 index 0000000000..4595b9cbdc --- /dev/null +++ b/tools/autoformat.sh @@ -0,0 +1,39 @@ +#!/bin/bash +set -euox pipefail + +GIT_VERSION=$(git version | awk '{print $3}') +GIT_MAJOR=$(echo $GIT_VERSION | awk -F. '{print $1}') +GIT_MINOR=$(echo $GIT_VERSION | awk -F. '{print $2}') + +if [[ $GIT_MAJOR -eq 2 && $GIT_MINOR -lt 31 ]]; then + echo "Git version must be at least 2.31.0. Found $GIT_VERSION" + exit 1 +fi + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +CHECK_ONLY=${CHECK_ONLY:-false} +SKIP_DOCS=${SKIP_DOCS:-false} + +BASE_REF=${BASE_REF:-main} +CHANGED_FILES=$(git diff --name-only --diff-filter=d --merge-base origin/${BASE_REF} megatron/core tests/ | grep '\.py$' || true) +ADDITIONAL_ARGS="" +ADDITIONAL_BLACK_ARGS="" +ADDITIONAL_PYLINT_ARGS="" + + +if [[ $CHECK_ONLY == true ]]; then + ADDITIONAL_ARGS="--check" + ADDITIONAL_BLACK_ARGS="--diff" +fi + +if [[ $SKIP_DOCS == true ]]; then + ADDITIONAL_PYLINT_ARGS="--disable=C0115,C0116" +fi + +if [[ -n "$CHANGED_FILES" ]]; then + black --skip-magic-trailing-comma $ADDITIONAL_ARGS $ADDITIONAL_BLACK_ARGS --verbose $CHANGED_FILES + isort $ADDITIONAL_ARGS $CHANGED_FILES + pylint $ADDITIONAL_PYLINT_ARGS $CHANGED_FILES +else + echo Changeset is empty, all good. +fi diff --git a/tools/bert_embedding/dataset.py b/tools/bert_embedding/dataset.py index 72eb1f4d58..da165b8b10 100644 --- a/tools/bert_embedding/dataset.py +++ b/tools/bert_embedding/dataset.py @@ -1,10 +1,9 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. import numpy as np import torch -from megatron import get_args, get_tokenizer -from megatron.data.bert_dataset import build_training_sample +from megatron.training import get_args, get_tokenizer class BertEmbeddingDataset(torch.utils.data.Dataset): @@ -18,24 +17,25 @@ def __init__(self, text_dataset, max_seq_length): # Dataset, tokenizer. self.text_dataset = text_dataset - self.bert_tokenizer = get_tokenizer() - - # Params to store. self.max_seq_length = max_seq_length - self.seed = args.seed - self.masked_lm_prob = args.mask_prob - - # Vocab stuff. 
- self.vocab_id_list = list(self.bert_tokenizer.inv_vocab.keys()) - self.vocab_id_to_token_dict = self.bert_tokenizer.inv_vocab - self.cls_id = self.bert_tokenizer.cls - self.sep_id = self.bert_tokenizer.sep - self.mask_id = self.bert_tokenizer.mask - self.pad_id = self.bert_tokenizer.pad + self.bert_tokenizer = get_tokenizer() def __len__(self): return len(self.text_dataset) + @classmethod + def build_sample(cls, tokenizer, token_ids): + get_constant_array = lambda c : np.full((len(token_ids) + 2,), c, "int64") + return { + "text" : np.array([ tokenizer.cls, *token_ids, tokenizer.sep ], dtype="int64"), + "types" : get_constant_array(0), + "labels" : get_constant_array(-1), + "is_random" : 0, + "loss_mask" : get_constant_array(0), + "padding_mask" : get_constant_array(1), + "truncated" : 0, + } + def __getitem__(self, idx): # Text. @@ -49,20 +49,7 @@ def __getitem__(self, idx): if not bert_token_ids: bert_token_ids = [ self.bert_tokenizer.pad_id ] # hack when empty seq - # Note that this rng state should be numpy and not python since - # python randint is inclusive whereas the numpy one is exclusive. - # We % 2**32 since numpy requres the seed to be between 0 and 2**32 - 1 - np_rng = np.random.RandomState(seed=((self.seed + idx) % 2**32)) + # Bert sample. + sample = self.build_sample(self.bert_tokenizer, bert_token_ids) - # Build sample. - sample = build_training_sample([bert_token_ids], - len(bert_token_ids), - len(bert_token_ids) + 2, # for cls+sep - self.vocab_id_list, - self.vocab_id_to_token_dict, - self.cls_id, self.sep_id, - self.mask_id, self.pad_id, - self.masked_lm_prob, np_rng, - binary_head=False) - sample["seq_length"] = len(sample["text"]) return sample diff --git a/tools/bert_embedding/embed.py b/tools/bert_embedding/embed.py index dfe2c1d6ba..2236182a75 100644 --- a/tools/bert_embedding/embed.py +++ b/tools/bert_embedding/embed.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. from functools import partial import numpy as np @@ -9,89 +9,19 @@ from torch.utils.data._utils.collate import default_collate from tqdm import tqdm -from megatron import get_args, get_tokenizer, print_rank_0 +from megatron.training import get_args, get_tokenizer, print_rank_0 from megatron import core +from megatron.training.arguments import core_transformer_config_from_args +from megatron.core.datasets.retro.utils import get_blocks_by_rank from megatron.core.enums import ModelType from megatron.core.pipeline_parallel import get_forward_backward_func -from megatron.model import BertModel -from megatron.training import setup_model_and_optimizer +from megatron.legacy.model import BertModel +from megatron.training.training import setup_model_and_optimizer +from pretrain_bert import model_provider, get_batch, loss_func, forward_step from .dataset import BertEmbeddingDataset from .external_libs import h5py from .huggingface import HuggingfaceEmbedder -from .utils import get_missing_blocks_by_rank - - -def model_provider(pre_process=True, post_process=True): - """Build the model.""" - - print_rank_0(" > build Bert model.") - - args = get_args() - num_tokentypes = 2 if args.bert_binary_head else 0 - model = BertModel( - num_tokentypes=num_tokentypes, - add_binary_head=args.bert_binary_head, - parallel_output=True, - pre_process=pre_process, - post_process=post_process) - - return model - - -def get_batch(data_iterator): - """Build the batch.""" - - # Items and their type. 
- keys = ['text', 'types', 'labels', 'is_random', 'loss_mask', 'padding_mask', - 'seq_length'] - datatype = torch.int64 - - # Broadcast data. - if data_iterator is not None: - data = next(data_iterator) - else: - data = None - data_b = core.tensor_parallel.broadcast_data(keys, data, datatype) - - # Unpack. - tokens = data_b['text'].long() - types = data_b['types'].long() - sentence_order = data_b['is_random'].long() - loss_mask = data_b['loss_mask'].float() - lm_labels = data_b['labels'].long() - padding_mask = data_b['padding_mask'].long() - seq_lengths = data_b['seq_length'].long() - - return tokens, types, sentence_order, loss_mask, lm_labels, padding_mask, \ - seq_lengths - - -def loss_func(loss_mask, sentence_order, seq_lengths, - output_tensor, non_loss_data): - """Loss function. Sequence lengths returned here for progress print-outs.""" - assert non_loss_data - return seq_lengths, output_tensor - - -def forward_step(data_iterator, model): - """Forward step.""" - - args = get_args() - - # Get the batch. - tokens, types, sentence_order, loss_mask, lm_labels, padding_mask, \ - seq_lengths = get_batch(data_iterator) - - if not args.bert_binary_head: - types = None - - # Forward pass through the model. - output_tensor = model(tokens, padding_mask, tokentype_ids=types, - lm_labels=lm_labels) - - return output_tensor, partial(loss_func, loss_mask, sentence_order, - seq_lengths) def collate_batch(samples): @@ -163,7 +93,7 @@ def get_data_loader(dataset, batch_size): return data_loader -def embed_data_loader(models, data_loader): +def embed_data_loader(models, data_loader, tag): '''Iterate data loader and compute embeddings.''' # Verify no model parallelism. @@ -181,7 +111,12 @@ def embed_data_loader(models, data_loader): # Embed. embeddings = [] - for _ in tqdm(range(len(data_loader)), "mt embed"): + for _ in tqdm( + range(len(data_loader)), + " embed%s" % ("" if tag is None else " / '%s'" % tag), + miniters=len(data_loader) // 10, + disable=torch.distributed.get_rank() != 0, + ): with torch.no_grad(): result = forward_step(data_iterator, models[0]) embeddings.append(result[0].detach().cpu().numpy()) @@ -192,10 +127,26 @@ def embed_data_loader(models, data_loader): return embeddings +class TextDataset(torch.utils.data.Dataset): + '''Dataset that holds a list of strings.''' + + def __init__(self, texts): + assert isinstance(texts, list) + for t in texts: + assert isinstance(t, str) + self.texts = texts + + def __len__(self): + return len(self.texts) + + def __getitem__(self, i): + return {"text": self.texts[i]} + + class BertEmbedder: '''Compute Bert embeddings, from a text dataset.''' - def __init__(self, batch_size, max_bert_seq_length, embedder_type): + def __init__(self, batch_size, max_bert_seq_length, embedder_type, warmup=True): args = get_args() @@ -216,7 +167,25 @@ def __init__(self, batch_size, max_bert_seq_length, embedder_type): else: raise Exception("specialize for embedder type '%s'." % embedder_type) - def embed_text_dataset(self, text_dataset): + # Warm-up JIT. + # - Important to separately warm up: + # 1. batch_size == 1 + # 2. 
batch_size > 1 + if warmup: + warmup_dataset = TextDataset([ + "great fleas have lesser fleas, upon their backs to bite’em,", + "and lesser fleas have lesser fleas, and so, ad infinitum,", + "and those great fleas, themselves, in turn have greater fleas to go on,", + "while those again have greater still, and greater still, and so on.", + ]) + print_rank_0("bert / warmup single.") + for _ in range(3): + self.embed_text("hi, bert.") # batch size == 1 + print_rank_0("bert / warmup batch.") + for _ in range(3): + self.embed_text_dataset(warmup_dataset) # batch size > 1 + + def embed_text_dataset(self, text_dataset, tag=None): '''Embed a text dataset.''' # Huggingface. @@ -229,7 +198,7 @@ def embed_text_dataset(self, text_dataset): # Embed. data_loader = get_data_loader(bert_dataset, self.batch_size) - embeddings = embed_data_loader(self.models, data_loader) + embeddings = embed_data_loader(self.models, data_loader, tag) return embeddings @@ -240,18 +209,8 @@ def embed_text(self, text): analysis or debugging. For large scale, use 'embed_text_dataset()'. ''' - class SingleTextDataset(torch.utils.data.Dataset): - '''Dataset that holds single string.''' - def __init__(self, text): - assert isinstance(text, str) - self.text = text - def __len__(self): - return 1 - def __getitem__(self, i): - return {"text": self.text} - # Embed text. - text_ds = SingleTextDataset(text) + text_ds = TextDataset([ text ]) embed = self.embed_text_dataset(text_ds)[0] return embed @@ -260,13 +219,12 @@ def __getitem__(self, i): class DiskDataParallelBertEmbedder: '''Process embeddings in blocks & save to disk.''' - def __init__(self, batch_size, max_bert_seq_length, block_size, - embedder_type): - self.embedder = BertEmbedder(batch_size, max_bert_seq_length, - embedder_type) + def __init__(self, embedder, block_size): + assert isinstance(embedder, BertEmbedder) + self.embedder = embedder self.block_size = block_size - def embed_text_blocks(self, name, workdir, text_dataset, + def embed_text_blocks(self, name, dirname, text_dataset, missing_embedding_blocks): '''Process a text dataset in blocks.''' @@ -298,17 +256,17 @@ def embed_text_blocks(self, name, workdir, text_dataset, print_rank_0(" > waiting for other ranks to finish block.") torch.distributed.barrier() - def embed_text_dataset(self, name, workdir, text_dataset): + def embed_text_dataset(self, name, dirname, text_dataset): '''Embed a text dataset.''' - # Dataset workdir. - os.makedirs(workdir, exist_ok=True) + # Dataset dir. + os.makedirs(dirname, exist_ok=True) # Missing embedding blocks (stored on disk). def validate(f): assert f["data"].shape[1] == 1024 - n_missing_world, missing_embedding_blocks = get_missing_blocks_by_rank( - workdir, + blocks = get_blocks_by_rank( + dirname, len(text_dataset), self.block_size, validate=validate) @@ -317,5 +275,4 @@ def validate(f): torch.distributed.barrier() # Embed batches. - self.embed_text_blocks(name, workdir, text_dataset, - missing_embedding_blocks) + self.embed_text_blocks(name, dirname, text_dataset, blocks.missing) diff --git a/tools/bert_embedding/utils.py b/tools/bert_embedding/utils.py deleted file mode 100644 index 27a8fe13c8..0000000000 --- a/tools/bert_embedding/utils.py +++ /dev/null @@ -1,193 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
- -from collections import defaultdict -import glob -import numpy as np -import os -import torch -from tqdm import tqdm - -from megatron import print_rank_0 -from megatron.core import parallel_state - -from .external_libs import h5py - - -def save_data(data_map, *args): - '''Save map of numpy arrays to hdf5 file.''' - - # Parse args. - if len(args) == 1: - path = args[0] - elif len(args) == 2: - dir_path, file_name = args - path = os.path.join(dir_path, file_name) - else: - raise Exception("specialize for len(args) == %d." % len(args)) - - # Save data. - if not os.path.isfile(path): - f = h5py.File(path, "w") - for k, v in data_map.items(): - f.create_dataset(k, data=v) - f.close() - - return path - - -def load_data(paths): - '''Load multiple hdf5 files to single numpy array.''' - - # Read data shapes. - shape_map = defaultdict(lambda : (0, None)) - for p in paths: - f = h5py.File(p, "r") - for k in f.keys(): - shape = tuple(f[k].shape) - shape_map[k] = (shape_map[k][0] + shape[0], shape[1]) - f.close() - - # Allocate output array. - data_map = { k : np.empty(s, dtype="f4") for k, s in shape_map.items() } - start_map = { k : 0 for k in shape_map } - - # Load files. - for pi, p in enumerate(tqdm(paths, "load data")): - f = h5py.File(p, "r") - for k in f.keys(): - i0 = start_map[k] - i1 = i0 + len(f[k]) - data_map[k][i0:i1] = f[k] - start_map[k] += len(f[k]) - f.close() - - return data_map - - -def get_missing_blocks(workdir, n_samples, block_size, - validate=lambda f : None): - '''Divide range [0, num_samples) to sequence of block ranges. - - This is a core method within the concept of block processing. The idea - is to divide a range (size n_samples) into a sequence of blocks. Each - block corresponds to a file within 'workdir' with name - '{start_idx}-{end_idx}.hdf5'. This method checks for the existence of - these files, and returns a list of the ones that are missing. - ''' - - # Block ranges. - block_start_idxs = list(range(0, n_samples, block_size)) - block_end_idxs = [ min(n_samples, i + block_size) for i in block_start_idxs ] - block_ranges = list(zip(block_start_idxs, block_end_idxs)) - - # All block files (existing + missing). - n_digits = int(np.ceil(np.log(n_samples) / np.log(10)) + 1) - all_blocks = [{ - "range" : r, - "path" : os.path.join( - workdir, - "%s-%s.hdf5" % tuple([ str(i).zfill(n_digits) for i in r ]), - ) - } for r in block_ranges] - all_block_path_set = set(block["path"] for block in all_blocks) - - # Delete corrupt files. - if torch.distributed.get_rank() == 0: - existing_block_paths = [block["path"] - for block in all_blocks - if os.path.exists(block["path"])] - for index, path in enumerate( - tqdm(existing_block_paths, "validating block.")): - - assert path in all_block_path_set, "unexpected filename, '%s'." % path - - try: - f = h5py.File(path, "r") - except: - # raise Exception("unable to open/validate '%s'." % path) - os.remove(path) - continue - - try: - validate(f) - except: - # raise Exception("delete block file '%s'." % path) - os.remove(path) - finally: - f.close() - - # Wait for files to be deleted. - torch.distributed.barrier() - - # Filter missing files. - missing_blocks = [block - for block in all_blocks - if not os.path.exists(block["path"])] - - return missing_blocks - - -def get_missing_blocks_by_rank(workdir, n_samples, block_size, - validate=lambda f : None): - '''Divide missing blocks evenly across all ranks. - - See 'get_missing_blocks()' above for description. 
The returned list of - missing blocks is split evenly across ranks via interleaving. This way, - each rank has a roughly equal number of blocks to process for a - downstream operation. - ''' - - missing_blocks = get_missing_blocks(workdir, n_samples, block_size, - validate) - - # This rank's missing files. - data_parallel_rank = parallel_state.get_data_parallel_rank() - data_parallel_world_size = parallel_state.get_data_parallel_world_size() - rank_missing_blocks = missing_blocks[data_parallel_rank:len(missing_blocks):data_parallel_world_size] - - # Extend rank's missing blocks (with None) such that all ranks have equal - # length lists. This allows for easier tracking of global progress. - n_missing_tensor = torch.cuda.LongTensor([len(rank_missing_blocks)]) - torch.distributed.all_reduce(n_missing_tensor, - op=torch.distributed.ReduceOp.MAX) - max_n_missing = n_missing_tensor.item() - rank_missing_blocks += [None] * (max_n_missing - len(rank_missing_blocks)) - - return len(missing_blocks), rank_missing_blocks - - -class BlockPathMap: - '''Map an index to its containing block path. - - The common use for this class is to have a directory of files containing - blocks of processed data, of uniform block size (e.g., 100k samples per - file). Each file must follow a naming convention of 'startIdx-endIdx.[ext]', - where 'endIdx' minus 'startIdx' must equal the block size, with the possible - exception of the final block. Given an input index, this class maps the - index to the containing block file. - ''' - - @classmethod - def from_dir(cls, _dir, block_size, ext="hdf5"): - '''Get list of block files, and create map.''' - assert os.path.isdir(_dir), f"directory not found, '{_dir}'." - return cls(sorted(glob.glob(_dir + f"/*.{ext}")), block_size) - - def __init__(self, block_paths, block_size): - self.max_idx = 0 - self.block_path_map = {} - for block_path in block_paths: - name = os.path.splitext(os.path.basename(block_path))[0] - start_idx, end_idx = [ int(i) for i in name.split("-") ] - self.block_path_map[start_idx] = block_path - self.max_idx = max(self.max_idx, end_idx) - self.block_size = block_size - - def __str__(self): - return "%d paths" % len(self.block_path_map) - - def __getitem__(self, idx): - '''Get block path from index.''' - block_start_idx = self.block_size * (idx // self.block_size) - block_path = self.block_path_map[block_start_idx] - return block_path diff --git a/tools/checkpoint_util.py b/tools/checkpoint/convert.py similarity index 91% rename from tools/checkpoint_util.py rename to tools/checkpoint/convert.py index 628ce47c62..935613b143 100644 --- a/tools/checkpoint_util.py +++ b/tools/checkpoint/convert.py @@ -1,7 +1,8 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
+ import argparse import importlib import torch.multiprocessing as mp -import os import sys # A loader is a python file with at least two functions @@ -48,14 +49,14 @@ # (for each transformer layer): # { # "name": "transformer layer N" -# "input layernorm weight" -# "input layernorm bias" +# "input norm weight" +# "input norm bias" # "qkv weight" # "qkv bias" # "dense weight" # "dense bias" -# "post layernorm weight" -# "post layernorm bias" +# "post norm weight" +# "post norm bias" # "mlp l0 weight" # "mlp l0 bias" # "mlp l1 weight" @@ -76,8 +77,8 @@ # "name": "lm head" # "dense weight" # "dense bias" -# "layernorm weight" -# "layernorm bias" +# "norm weight" +# "norm bias" # } # { # "name": "binary head" @@ -87,14 +88,16 @@ # - "done" def load_plugin(plugin_type, name): - module_name = f"checkpoint_{plugin_type}_{name}" + module_name = f"{plugin_type}_{name}" try: plugin = importlib.import_module(module_name) - except ModuleNotFoundError: + except ModuleNotFoundError as e: + print(e) module_name = name try: plugin = importlib.import_module(module_name) - except ModuleNotFoundError: + except ModuleNotFoundError as e: + print(e) sys.exit(f"Unable to load {plugin_type} plugin {name}. Exiting.") if not hasattr(plugin, 'add_arguments'): @@ -105,7 +108,7 @@ def load_plugin(plugin_type, name): def main(): import argparse - parser = argparse.ArgumentParser(description="Megatron Checkpoint Utility Arguments", + parser = argparse.ArgumentParser(description="Megatron Checkpoint Converter Arguments", allow_abbrev=False, conflict_handler='resolve') parser.add_argument('--model-type', type=str, required=True, @@ -114,7 +117,7 @@ def main(): parser.add_argument('--loader', type=str, default='megatron', help='Module name to load checkpoint, should be on python path') parser.add_argument('--saver', type=str, default='megatron', - help='Module name to save checkpoint, shdoul be on python path') + help='Module name to save checkpoint, should be on python path') parser.add_argument('--load-dir', type=str, required=True, help='Directory to load model checkpoint from') parser.add_argument('--save-dir', type=str, required=True, diff --git a/tools/checkpoint/hybrid_conversion.py b/tools/checkpoint/hybrid_conversion.py new file mode 100644 index 0000000000..19a4c014b1 --- /dev/null +++ b/tools/checkpoint/hybrid_conversion.py @@ -0,0 +1,398 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +# Note (rwaleffe): This is a temporary file for hybrid mamba-transformer model checkpoint conversion. +# This functionality should be integrated with the megatron core checkpoint loader/saver. 
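+#
+# Rough flow, for orientation (see main() below): for every input pipeline rank, the tensor-
+# parallel shards are loaded and stitched back into full tensors along their split dimensions
+# (tp_split_dim / combine_tp_tensors), giving one un-sharded model. That model is then re-split
+# for --target-tp-size / --target-pp-size (split_tensor_for_tp) and written back out under
+# mp_rank_XX[_YYY]/model_optim_rng.pt, together with an updated latest_checkpointed_iteration.txt.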
+ + +import copy +import os +import re +import shutil +from collections import OrderedDict + +import torch +import argparse + + +tp_split_dim = { + 'word_embeddings.weight': 0, + 'norm.weight': -1, + 'final_norm.weight': -1, + 'output_layer.weight': 0, + # mamba1/2 + 'A_log': 0, + 'D': 0, + 'dt_bias': 0, + 'in_proj.weight': 0, + 'conv1d.weight': 0, + 'conv1d.bias': 0, + 'x_proj.weight': 1, + 'dt_proj.weight': 0, + 'dt_proj.bias': 0, + 'out_proj.weight': 1, + 'mixer.norm.weight': 0, + # mlp + 'linear_fc1.layer_norm_weight': -1, + 'linear_fc1.weight': 0, + 'linear_fc2.weight': 1, + # attention + 'self_attention.linear_proj.weight': 1, + 'self_attention.linear_qkv.layer_norm_weight': -1, + 'self_attention.linear_qkv.weight': 0, +} + + +def get_split_dim(tensor_name): + # norm.weight will match tensor_name of mixer.norm.weight and norm.weight, need to distinguish + if 'norm.weight' in tensor_name: + if 'mixer.norm.weight' in tensor_name: + return tp_split_dim['mixer.norm.weight'] + else: + return tp_split_dim['norm.weight'] + + for key in tp_split_dim.keys(): + if key in tensor_name: + return tp_split_dim[key] + raise Exception("Unknown tensor name {}".format(tensor_name)) + + +def combine_tp_tensors(params, key, dim, tensors): + tp_size = len(tensors) + + if 'mixer.in_proj.weight' in key and params.mamba_version == 1: + xs = []; zs = [] + for tensor in tensors: + x, z = torch.split(tensor, [params.mamba_d_inner//tp_size, + params.mamba_d_inner//tp_size], dim=dim) + xs.append(x); zs.append(z) + return torch.cat([torch.cat(xs, dim=dim), torch.cat(zs, dim=dim)], dim=dim) + + elif 'mixer.in_proj.weight' in key and params.mamba_version == 2: + xs = []; zs = []; Bs = []; Cs = []; dts = [] + for tensor in tensors: + x, z, B, C, dt = torch.split(tensor, [params.mamba_d_inner // tp_size, + params.mamba_d_inner // tp_size, + (params.mamba2_n_groups // tp_size) * args.mamba_d_state, + (params.mamba2_n_groups // tp_size) * args.mamba_d_state, + params.mamba2_n_heads // tp_size], dim=dim) + xs.append(x); zs.append(z); Bs.append(B); Cs.append(C); dts.append(dt) + + for ii in range(len(Bs)): + Bs[ii] = torch.reshape(Bs[ii], (-1, params.mamba_d_state, Bs[ii].shape[-1])) + Cs[ii] = torch.reshape(Cs[ii], (-1, params.mamba_d_state, Cs[ii].shape[-1])) + B = torch.cat(Bs, dim=dim); C = torch.cat(Cs, dim=dim) + x = torch.cat(xs, dim=dim); z = torch.cat(zs, dim=dim); dt = torch.cat(dts, dim=dim) + + return torch.cat([x, z, B.flatten(0, 1), C.flatten(0, 1), dt], dim=dim) + + elif 'mixer.conv1d' in key and params.mamba_version == 2: + xs = []; Bs = []; Cs = [] + for tensor in tensors: + x, B, C = torch.split(tensor, [params.mamba_d_inner//tp_size, + (params.mamba2_n_groups // tp_size) * params.mamba_d_state, + (params.mamba2_n_groups // tp_size) * params.mamba_d_state], dim=dim) + xs.append(x); Bs.append(B); Cs.append(C) + + for ii in range(len(Bs)): + if 'weight' in key: + Bs[ii] = torch.reshape(Bs[ii], (-1, params.mamba_d_state, Bs[ii].shape[-2], Bs[ii].shape[-1])) + Cs[ii] = torch.reshape(Cs[ii], (-1, params.mamba_d_state, Cs[ii].shape[-2], Cs[ii].shape[-1])) + elif 'bias' in key: + Bs[ii] = torch.reshape(Bs[ii], (-1, params.mamba_d_state)) + Cs[ii] = torch.reshape(Cs[ii], (-1, params.mamba_d_state)) + else: + raise Exception("Unknown key") + B = torch.cat(Bs, dim=dim); C = torch.cat(Cs, dim=dim) + x = torch.cat(xs, dim=dim) + + return torch.cat([x, B.flatten(0, 1), C.flatten(0, 1)], dim=dim) + + else: + return torch.cat(tensors, dim=dim) + + +def split_tensor_for_tp(params, key, dim, tensor): + tp_size = 
params.target_tp_size + tensor_sliced = [] + + if 'mixer.in_proj.weight' in key and params.mamba_version == 1: + x, z = torch.split(tensor, [params.mamba_d_inner, params.mamba_d_inner], dim=dim) + x_sliced = torch.chunk(x, tp_size, dim=dim) + z_sliced = torch.chunk(z, tp_size, dim=dim) + for (x, z) in zip(x_sliced, z_sliced): + tensor_sliced.append(torch.cat((x, z), dim=dim)) + + elif 'mixer.in_proj.weight' in key and params.mamba_version == 2: + x, z, B, C, dt = torch.split(tensor, [params.mamba_d_inner, params.mamba_d_inner, + params.mamba2_n_groups * params.mamba_d_state, + params.mamba2_n_groups * params.mamba_d_state, + params.mamba2_n_heads], dim=dim) + B = torch.reshape(B, (-1, params.mamba_d_state, B.shape[-1])) + C = torch.reshape(C, (-1, params.mamba_d_state, C.shape[-1])) + + B_sliced = torch.chunk(B, tp_size, dim=dim) + C_sliced = torch.chunk(C, tp_size, dim=dim) + x_sliced = torch.chunk(x, tp_size, dim=dim) + z_sliced = torch.chunk(z, tp_size, dim=dim) + dt_sliced = torch.chunk(dt, tp_size, dim=dim) + + tensor_sliced = [] + for (x, z, B, C, dt) in zip(x_sliced, z_sliced, B_sliced, C_sliced, dt_sliced): + tensor_sliced.append(torch.cat((x, z, B.flatten(0, 1), C.flatten(0, 1), dt), dim=dim)) + + elif 'mixer.conv1d' in key and params.mamba_version == 2: + x, B, C = torch.split(tensor, [params.mamba_d_inner, + params.mamba2_n_groups * params.mamba_d_state, + params.mamba2_n_groups * params.mamba_d_state], dim=dim) + if 'weight' in key: + B = torch.reshape(B, (-1, params.mamba_d_state, B.shape[-2], B.shape[-1])) + C = torch.reshape(C, (-1, params.mamba_d_state, C.shape[-2], C.shape[-1])) + elif 'bias' in key: + B = torch.reshape(B, (-1, params.mamba_d_state)) + C = torch.reshape(C, (-1, params.mamba_d_state)) + else: + raise Exception("Unknown key") + + B_sliced = torch.chunk(B, tp_size, dim=dim) + C_sliced = torch.chunk(C, tp_size, dim=dim) + x_sliced = torch.chunk(x, tp_size, dim=dim) + + tensor_sliced = [] + for (x, B, C) in zip(x_sliced, B_sliced, C_sliced): + tensor_sliced.append(torch.cat((x, B.flatten(0, 1), C.flatten(0, 1)), dim=dim)) + + else: + tensor_sliced = torch.chunk(tensor, tp_size, dim=dim) + + return tensor_sliced + + +def finalize_checkpoint(sample_model, model, params, verbose=False): + # make sure the rest of the checkpoint is how we want it from the original (i.e., other than the 'model') + reset_iterations = params.reset_iterations + + # checkpoint 'args' + model['args'] = copy.deepcopy(sample_model['args']) + model['args'].tensor_model_parallel_size = params.target_tp_size + model['args'].pipeline_model_parallel_size = params.target_pp_size + if reset_iterations: + model['args'].iteration = 0 + model['args'].consumed_valid_samples = 0 + model['args'].consumed_train_samples = 0 + model['args'].train_iters = 0 + model['args'].train_samples = 0 + + # checkpoint 'checkpoint_version' + model['checkpoint_version'] = copy.deepcopy(sample_model['checkpoint_version']) + + # checkpoint 'iteration' + model['iteration'] = copy.deepcopy(sample_model['iteration']) + if reset_iterations: + model['iteration'] = 0 + + # checkpoint 'optimizer' + # ignore + + # checkpoint 'opt_param_scheduler' + if 'opt_param_scheduler' in sample_model.keys(): + model['opt_param_scheduler'] = copy.deepcopy(sample_model['opt_param_scheduler']) + + # checkpoint 'rng_state' + model['rng_state'] = copy.deepcopy(sample_model['rng_state']) + + # report on argument difference + if verbose: + original_args = sample_model['args'].__dict__ + final_args = model['args'].__dict__ + for key in 
original_args: + if key in final_args: + if final_args[key] != original_args[key]: + print("KEY MISMATCH: {}".format(key)) + print("\toriginal: {}\n\tfinal: {}".format(original_args[key], final_args[key])) + else: + print("KEY MISSING from final: {}, value {}".format(key, original_args[key])) + print("") + for key in final_args: + if key not in original_args: + print("KEY ADDED to final: {}, value {}".format(key, final_args[key])) + + return model + + +def main(args): + print("\n====RUNNING CHECKPOINT CONVERSION====\n") + + args.mamba_d_inner = args.d_model * 2 + args.mamba2_n_heads = args.mamba_d_inner // args.mamba2_head_dim + + # get the latest iteration + tracker_filename = os.path.join(args.load_dir, 'latest_checkpointed_iteration.txt') + with open(tracker_filename, 'r') as f: + metastring = f.read().strip() + try: + iteration = int(metastring) + except ValueError: + raise Exception("") + out_iteration = iteration if not args.reset_iterations else 0 + + # get model directory and model parallel ranks + input_model_dir = os.path.join(args.load_dir, 'iter_{:07d}'.format(iteration)) + input_sub_models = os.listdir(input_model_dir) + # input_sub_models = sorted(input_sub_models, key=lambda x: int(re.search(r'\d+', x).group())) + + # load one of the model parallel ranks to get arguments + sample_model_file = os.path.join(input_model_dir, input_sub_models[0], "model_optim_rng.pt") + sample_model = torch.load(sample_model_file) + print(f"Sample model {sample_model_file} is loaded.\n") + + # input tensor and pipeline parallel size + input_tp_rank = sample_model['args'].tensor_model_parallel_size + input_pp_rank = sample_model['args'].pipeline_model_parallel_size + num_layers_per_pipeline_rank = sample_model['args'].num_layers // input_pp_rank + + # construct full model + full_model = OrderedDict() + for pp in range(input_pp_rank): + print("[INFO] Processing input pipeline rank {}".format(pp)) + tp_models = [] + for tp in range(input_tp_rank): + dir_name = "mp_rank_{:02d}".format(tp) + if input_pp_rank > 1: + dir_name += "_{:03d}".format(pp) + model_file = os.path.join(input_model_dir, dir_name, "model_optim_rng.pt") + + tp_models.append(torch.load(model_file)) + print(f"Model {model_file} is loaded.") + + if input_tp_rank > 1: + combined_tp_model = OrderedDict() + for ii, (key, original_tensor) in enumerate(tp_models[0]['model'].items()): + if "_extra_state" in key: + combined_tp_model[key] = original_tensor + continue + + split_dim = get_split_dim(key) + original_shape = list(original_tensor.shape) + combined_shape = copy.deepcopy(original_shape) + combined_shape[split_dim] *= input_tp_rank + # print("{}, {}, {}".format(ii, key, split_dim)) + + if split_dim != -1: + # slice together model + # print("\tshape mismatch: original {}, combined {}".format(original_shape, combined_shape)) + combined_tensor = combine_tp_tensors(args, key, split_dim, + [tp_models[jj]['model'][key].cpu() for jj in range(input_tp_rank)]) + combined_tp_model[key] = combined_tensor + else: + # copy model + combined_tp_model[key] = original_tensor + else: + combined_tp_model = tp_models[0]['model'] + # print("Combined tp model: {}".format(combined_tp_model.keys())) + + for ii, (key, original_tensor) in enumerate(combined_tp_model.items()): + try: + layer_num = int(re.findall(r'\d+', key)[0]) + new_key = key.replace(str(layer_num), str(layer_num + pp*num_layers_per_pipeline_rank), 1) + except Exception: + new_key = key + full_model[new_key] = original_tensor + # print("Combined model: {}".format(full_model.keys())) + 
print("\n[INFO] Loaded combined model\n") + + # sort by layer + # full_model_sorted = dict(sorted(people.items(), key=lambda item: item[1])) + + # create new split model + pp_offset = 0 + num_layers_per_pipeline_rank = sample_model['args'].num_layers // args.target_pp_size + + for pp in range(args.target_pp_size): + print("[INFO] Processing output pipeline rank {}".format(pp)) + tp_models = [] + for ii in range(args.target_tp_size): + tp_models.append({'model': OrderedDict()}) + + for ii, (key, original_tensor) in enumerate(full_model.items()): + try: + layer_num = int(re.findall(r'\d+', key)[0]) + if layer_num >= num_layers_per_pipeline_rank * (pp+1): + break + new_key = key.replace(str(layer_num), str(layer_num - (pp * num_layers_per_pipeline_rank)), 1) + except Exception: + new_key = key + + if ii < pp_offset: + continue + else: + pp_offset += 1 + + if "_extra_state" in new_key: + # copy + for jj in range(args.target_tp_size): + tp_models[jj]['model'][new_key] = original_tensor + continue + + split_dim = get_split_dim(new_key) + original_shape = list(original_tensor.shape) + v0 = original_shape[split_dim] + split_size = v0 // args.target_tp_size + split_shape = copy.deepcopy(original_shape) + split_shape[split_dim] = split_size + # print("{}, {}, {}".format(ii, new_key, split_dim)) + + if split_dim != -1: + # split model + # print("\tshape mismatch: original {}, combined {}".format(original_shape, split_shape)) + tensor_sliced = split_tensor_for_tp(args, new_key, split_dim, original_tensor) + for jj in range(args.target_tp_size): + tp_models[jj]['model'][new_key] = tensor_sliced[jj] + else: + # copy model + for jj in range(args.target_tp_size): + tp_models[jj]['model'][new_key] = original_tensor + # print(tp_models[0]['model'].keys()) + + for tp in range(args.target_tp_size): + dir_name = "mp_rank_{:02d}".format(tp) + if args.target_pp_size > 1: + dir_name += "_{:03d}".format(pp) + + model = finalize_checkpoint(sample_model, tp_models[tp], args, verbose=False) + + save_dir = os.path.join(args.save_dir, 'iter_{:07d}'.format(out_iteration), dir_name) + os.makedirs(save_dir, exist_ok=True) + model_file = os.path.join(save_dir, "model_optim_rng.pt") + torch.save(model, model_file) + print(f"Model {model_file} is saved.") + + # shutil.copyfile(tracker_filename, os.path.join(args.save_dir, 'latest_checkpointed_iteration.txt')) + tracker_filename = os.path.join(args.save_dir, 'latest_checkpointed_iteration.txt') + with open(tracker_filename, 'w') as f: + f.write(str(out_iteration)) + + +if __name__ == "__main__": + # example run command: + # python hybrid_conversion.py + # --load-dir mamba2-840m-test/checkpoints/ + # --save-dir mamba2-840m-test-conversion/checkpoints/ + # --target-pp-size 1 + # --target-tp-size 1 + + parser = argparse.ArgumentParser() + parser.add_argument('--load-dir', type=str) + parser.add_argument('--save-dir', type=str) + parser.add_argument('--target-tp-size', type=int, default=1) + parser.add_argument('--target-pp-size', type=int, default=1) + parser.add_argument('--reset-iterations', action='store_true') + + parser.add_argument('--d-model', type=int, default=4096) + parser.add_argument('--mamba-version', type=int, default=2) + parser.add_argument('--mamba-d-state', type=int, default=128) + parser.add_argument('--mamba2-n-groups', type=int, default=8) + parser.add_argument('--mamba2-head-dim', type=int, default=64) + + args = parser.parse_args() + + main(args) diff --git a/tools/checkpoint/loader_llama_mistral.py b/tools/checkpoint/loader_llama_mistral.py new file mode 
100644 index 0000000000..0667fad522 --- /dev/null +++ b/tools/checkpoint/loader_llama_mistral.py @@ -0,0 +1,665 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +import json +import os +import sys +import torch +try: + import transformers +except ImportError: + raise ImportError("The 'transformers' package is not installed.") +import gc +import shutil +from tqdm import tqdm +import types + + +def add_arguments(parser): + group = parser.add_argument_group(title='Llama/Mistral loader.') + + # TODO(jbarker): Need assertion to make sure *exactly* one of these is used + parser.add_argument('--model-size', type=str, required=True, + choices=['llama2-7B', 'llama2-13B', 'llama2-70B', 'llama2-7Bf', 'llama2-13Bf', 'llama2-70Bf', 'llama3-8B', 'llama3-70B', 'llama3-8Bf', 'llama3-70Bf', 'mistral-7B', 'mistral-7Bf', 'yi-34B', 'qwen2.5-7B', 'qwen2.5-72B', 'qwen2.5-7Bf', 'qwen2.5-72Bf'], + help='Model size can be `llama2-7B`, `llama2-13B`, `llama2-70B`, `llama3-8B`, `llama3-70B`, `mistral-7B`, `qwen2.5-7B`, `qwen2.5-72B` (for pretrained models), ' + 'and `llama2-7Bf`, `llama2-13Bf`, `llama2-70Bf`, `llama3-8Bf`, `llama3-70bf`, `mistral-7Bf`, `qwen2.5-7Bf`, and `qwen2.5-72Bf` (for chat-finetuned models).') + parser.add_argument('--checkpoint-type', type=str, required=True, + help='Type of checkpoint to convert, options are "meta" or "hf"') + parser.add_argument('--bf16', action='store_true', help='Whether to load weights in bf16.') + parser.add_argument('--fp16', action='store_true', help='Whether to load weights in fp16.') + group.add_argument('--true-vocab-size', type=int, default=None, + help='original size of vocab, if specified will trim padding from embedding table.') + group.add_argument('--vocab-file', type=str, default=None, + help='Path to the vocab file. 
If specified will use this to get vocab size and ' + 'trim padding from the embedding table.') + group.add_argument('--tokenizer-model', required=True, + help='Tokenizer model file.') + group.add_argument('--megatron-path', type=str, default=None, + help='Base directory of Megatron repository') + group.add_argument('--loader-transformer-impl', default='local', + choices=['local', 'transformer_engine'], + help='Which Transformer implementation to use.') + + +def verify_transformers_version(): + major, minor, patch = map(int, transformers.__version__.split('.')) + assert major >= 4 and minor >= 31 + + +NUM_SHARDS = { + "llama2-7B": 1, + "llama2-7Bf": 1, + "llama2-13B": 2, + "llama2-13Bf": 2, + "llama2-70B": 8, + "llama2-70Bf": 8, + "llama3-8B": 1, + "llama3-8Bf": 1, + "llama3-70B": 8, + "llama3-70Bf": 8, + "mistral-7B": 1, + "mistral-7Bf": 1, + "yi-34B": 8, + "qwen2.5-7B": 1, + "qwen2.5-7Bf": 1, + "qwen2.5-72B": 8, + "qwen2.5-72Bf": 8, +} + + +def compute_intermediate_size(n, ffn_dim_multiplier=1, multiple_of=256): + return multiple_of * ((int(ffn_dim_multiplier * int(8 * n / 3)) + multiple_of - 1) // multiple_of) + + +def read_json(path): + with open(path, "r") as f: + return json.load(f) + + +def write_json(text, path): + with open(path, "w") as f: + json.dump(text, f) + + +# This conversion is adapted from +# https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/convert_llama_weights_to_hf.py +def convert_to_hf(model_path, input_base_path, model_size, tokenizer_path): + + if "llama2" in model_size: + from transformers import LlamaConfig as ModelConfig + from transformers import LlamaTokenizer, LlamaTokenizerFast + elif "llama3" in model_size: + from transformers import LlamaConfig as ModelConfig + elif "mistral" in model_size: + from transformers import MistralConfig as ModelConfig + + # for backward compatibility, before you needed the repo to be called `my_repo/model_size` + if not os.path.isfile(os.path.join(input_base_path, "params.json")): + input_base_path = os.path.join(input_base_path, model_size) + + os.makedirs(model_path, exist_ok=True) + + params = read_json(os.path.join(input_base_path, "params.json")) + num_shards = NUM_SHARDS[model_size] + params = params.get("model", params) + n_layers = params["n_layers"] + n_heads = params["n_heads"] + n_heads_per_shard = n_heads // num_shards + dim = params["dim"] + dims_per_head = dim // n_heads + base = params.get("rope_theta", 10000.0) + inv_freq = 1.0 / (base ** (torch.arange(0, dims_per_head, 2).float() / dims_per_head)) + if base > 10000.0: + max_position_embeddings = 32768 if "mistral" in model_size else 16384 + else: + max_position_embeddings = 4096 if "mistral" in model_size else 2048 + + if "llama2" in model_size: + tokenizer_class = LlamaTokenizer if LlamaTokenizerFast is None else LlamaTokenizerFast + elif model_size in ["llama3", "mistral"]: + tokenizer_class = transformers.AutoTokenizer.from_pretrained + else: + raise AttributeError(f"model_size={model_size} not supported") + if tokenizer_path is not None: + if "llama" in model_size: + tokenizer = tokenizer_class(tokenizer_path) + if "llama2" in model_size: + tokenizer.save_pretrained(model_path) + vocab_size = tokenizer.vocab_size if tokenizer_path is not None else 32000 + elif "llama3" in model_size: + vocab_size = 128256 + elif "mistral" in model_size: + tokenizer = tokenizer_class.from_file(tokenizer_path) + vocab_size = 32768 + else: + raise AttributeError(f"model_size={model_size} is not supported") + + if params.get("n_kv_heads", None) 
is not None: + num_key_value_heads = params["n_kv_heads"] # for GQA / MQA + num_local_key_value_heads = n_heads_per_shard // num_key_value_heads + key_value_dim = dim // num_key_value_heads + else: # compatibility with other checkpoints + num_key_value_heads = n_heads + num_local_key_value_heads = n_heads_per_shard + key_value_dim = dim + + # permute for sliced rotary + def permute(w, n_heads=n_heads, dim1=dim, dim2=dim): + return w.view(n_heads, dim1 // n_heads // 2, 2, dim2).transpose(1, 2).reshape(dim1, dim2) + + print(f"Fetching all parameters from the checkpoint at {input_base_path}.") + # Load weights + if num_shards == 1: + # Not sharded + # (The sharded implementation would also work, but this is simpler.) + loaded = torch.load(os.path.join(input_base_path, "consolidated.00.pth"), map_location="cpu") + else: + # Sharded + loaded = [ + torch.load(os.path.join(input_base_path, f"consolidated.{i:02d}.pth"), map_location="cpu") + for i in range(num_shards) + ] + param_count = 0 + index_dict = {"weight_map": {}} + for layer_i in range(n_layers): + filename = f"pytorch_model-{layer_i + 1}-of-{n_layers + 1}.bin" + if num_shards == 1: + # Unsharded + q_proj = loaded[f"layers.{layer_i}.attention.wq.weight"] + k_proj = loaded[f"layers.{layer_i}.attention.wk.weight"] + if ("llama2" in model_size) or ("mistral" in model_size): + q_proj = permute(q_proj) + k_proj = permute(k_proj) + state_dict = { + f"model.layers.{layer_i}.self_attn.q_proj.weight": q_proj, + f"model.layers.{layer_i}.self_attn.k_proj.weight": k_proj, + f"model.layers.{layer_i}.self_attn.v_proj.weight": loaded[f"layers.{layer_i}.attention.wv.weight"], + f"model.layers.{layer_i}.self_attn.o_proj.weight": loaded[f"layers.{layer_i}.attention.wo.weight"], + f"model.layers.{layer_i}.mlp.gate_proj.weight": loaded[f"layers.{layer_i}.feed_forward.w1.weight"], + f"model.layers.{layer_i}.mlp.down_proj.weight": loaded[f"layers.{layer_i}.feed_forward.w2.weight"], + f"model.layers.{layer_i}.mlp.up_proj.weight": loaded[f"layers.{layer_i}.feed_forward.w3.weight"], + f"model.layers.{layer_i}.input_layernorm.weight": loaded[f"layers.{layer_i}.attention_norm.weight"], + f"model.layers.{layer_i}.post_attention_layernorm.weight": loaded[f"layers.{layer_i}.ffn_norm.weight"], + } + else: + # Sharded + # Note that attention.w{q,k,v,o}, feed_fordward.w[1,2,3], attention_norm.weight and ffn_norm.weight share + # the same storage object, saving attention_norm and ffn_norm will save other weights too, which is + # redundant as other weights will be stitched from multiple shards. To avoid that, they are cloned. 
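+            # In the sharded branch below, the attention q/k/v weights are viewed per head and
+            # concatenated across shards along dim 0, the MLP w1/w3 weights are concatenated
+            # along dim 0, and the wo / MLP w2 weights along dim 1, so the reassembled tensors
+            # end up with the same shapes as in the unsharded branch above.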
+ + state_dict = { + f"model.layers.{layer_i}.input_layernorm.weight": loaded[0][ + f"layers.{layer_i}.attention_norm.weight" + ].clone(), + f"model.layers.{layer_i}.post_attention_layernorm.weight": loaded[0][ + f"layers.{layer_i}.ffn_norm.weight" + ].clone(), + } + state_dict[f"model.layers.{layer_i}.self_attn.q_proj.weight"] = permute( + torch.cat( + [ + loaded[i][f"layers.{layer_i}.attention.wq.weight"].view(n_heads_per_shard, dims_per_head, dim) + for i in range(num_shards) + ], + dim=0, + ).reshape(dim, dim) + ) + state_dict[f"model.layers.{layer_i}.self_attn.k_proj.weight"] = permute( + torch.cat( + [ + loaded[i][f"layers.{layer_i}.attention.wk.weight"].view( + num_local_key_value_heads, dims_per_head, dim + ) + for i in range(num_shards) + ], + dim=0, + ).reshape(key_value_dim, dim), + num_key_value_heads, + key_value_dim, + dim, + ) + state_dict[f"model.layers.{layer_i}.self_attn.v_proj.weight"] = torch.cat( + [ + loaded[i][f"layers.{layer_i}.attention.wv.weight"].view( + num_local_key_value_heads, dims_per_head, dim + ) + for i in range(num_shards) + ], + dim=0, + ).reshape(key_value_dim, dim) + + state_dict[f"model.layers.{layer_i}.self_attn.o_proj.weight"] = torch.cat( + [loaded[i][f"layers.{layer_i}.attention.wo.weight"] for i in range(num_shards)], dim=1 + ) + state_dict[f"model.layers.{layer_i}.mlp.gate_proj.weight"] = torch.cat( + [loaded[i][f"layers.{layer_i}.feed_forward.w1.weight"] for i in range(num_shards)], dim=0 + ) + state_dict[f"model.layers.{layer_i}.mlp.down_proj.weight"] = torch.cat( + [loaded[i][f"layers.{layer_i}.feed_forward.w2.weight"] for i in range(num_shards)], dim=1 + ) + state_dict[f"model.layers.{layer_i}.mlp.up_proj.weight"] = torch.cat( + [loaded[i][f"layers.{layer_i}.feed_forward.w3.weight"] for i in range(num_shards)], dim=0 + ) + + state_dict[f"model.layers.{layer_i}.self_attn.rotary_emb.inv_freq"] = inv_freq + for k, v in state_dict.items(): + index_dict["weight_map"][k] = filename + param_count += v.numel() + torch.save(state_dict, os.path.join(model_path, filename)) + + filename = f"pytorch_model-{n_layers + 1}-of-{n_layers + 1}.bin" + if num_shards == 1: + # Unsharded + state_dict = { + "model.embed_tokens.weight": loaded["tok_embeddings.weight"], + "model.norm.weight": loaded["norm.weight"], + "lm_head.weight": loaded["output.weight"], + } + else: + d = 0 if "llama3" in model_size else 1 + state_dict = { + "model.norm.weight": loaded[0]["norm.weight"], + "model.embed_tokens.weight": torch.cat( + [loaded[i]["tok_embeddings.weight"] for i in range(num_shards)], dim=d + ), + "lm_head.weight": torch.cat([loaded[i]["output.weight"] for i in range(num_shards)], dim=0), + } + + for k, v in state_dict.items(): + index_dict["weight_map"][k] = filename + param_count += v.numel() + torch.save(state_dict, os.path.join(model_path, filename)) + + # Write configs + index_dict["metadata"] = {"total_size": param_count * 2} + write_json(index_dict, os.path.join(model_path, "pytorch_model.bin.index.json")) + ffn_dim_multiplier = params["ffn_dim_multiplier"] if "ffn_dim_multiplier" in params else 1 + multiple_of = params["multiple_of"] if "multiple_of" in params else 256 + config = ModelConfig( + hidden_size=dim, + intermediate_size=compute_intermediate_size(dim, ffn_dim_multiplier, multiple_of), + num_attention_heads=params["n_heads"], + num_hidden_layers=params["n_layers"], + rms_norm_eps=params["norm_eps"], + num_key_value_heads=num_key_value_heads, + vocab_size=vocab_size, + rope_theta=base, + max_position_embeddings=max_position_embeddings, + ) + 
config.save_pretrained(model_path) + + # Make space so we can load the model properly now. + del state_dict + del loaded + gc.collect() + + return model_path + + +def load_args_from_checkpoint(args): + + # Read Llama args. + model_args_path = os.path.join(args.load, "config.json") + with open(model_args_path) as f: + model_args = json.load(f) + # Update Megatron args. + args.seq_length = 4096 + args.max_position_embeddings = model_args["max_position_embeddings"] + args.hidden_size = model_args["hidden_size"] + args.num_attention_heads = model_args["num_attention_heads"] + args.num_layers = model_args["num_hidden_layers"] + args.global_batch_size = 1024 + args.norm_epsilon = model_args["rms_norm_eps"] + args.iteration = 1 # '0', 'release' don't work + args.position_embedding_type = "rope" + args.swiglu = True + args.normalization = "RMSNorm" + args.add_bias_linear = False + args.untie_embeddings_and_output_weights = True + args.vocab_size = model_args["vocab_size"] + args.padded_vocab_size = model_args["vocab_size"] + args.ffn_hidden_size = model_args["intermediate_size"] + + if "num_key_value_heads" in model_args: + args.group_query_attention = True + args.num_query_groups = model_args["num_key_value_heads"] + + +def set_preprocess_state(args, model, hf_model): + '''Set embedding params.''' + model.language_model.embedding.word_embeddings.weight.data.copy_( + hf_model.model.embed_tokens.weight) + + +def set_postprocess_state(args, model, hf_model): + '''Set output layer & norm params.''' + model.language_model.encoder.final_norm.weight.data.copy_(hf_model.model.norm.weight) + model.language_model.output_layer.weight.data.copy_(hf_model.lm_head.weight) + + +def set_attn_state(args, layer, hf_layer): + '''Set self-attention params.''' + + # Get attention layer & state. + attn = layer.self_attention + hf_attn = hf_layer.self_attn + + # Reshape loaded weights. + tp = args.tensor_model_parallel_size + nh = args.num_attention_heads // tp + ng = (args.num_query_groups if args.group_query_attention \ + else args.num_attention_heads) // tp + dim = args.kv_channels + assert nh % ng == 0 + + # Copy weights (re-order dimensions for Megatron). 
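+    # The target query_key_value tensor groups the weights per query group: for each of the ng
+    # groups, the group's nh/ng query heads are followed by its single key and value head. The
+    # reshapes below build that (ng, nh/ng*dim + 2*dim, hidden) layout from the separate HF
+    # q/k/v projections before flattening back to 2D.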
+ attn.query_key_value.weight.data.copy_(torch.cat([ + hf_attn.q_proj.weight.reshape((ng, dim*nh//ng, -1)), + hf_attn.k_proj.weight.reshape((ng, dim, -1)), + hf_attn.v_proj.weight.reshape((ng, dim, -1)), + ], dim=1).reshape((-1, args.hidden_size))) + if args.add_qkv_bias: + attn.query_key_value.bias.data.copy_(torch.cat([ + hf_attn.q_proj.bias.reshape((ng, dim*nh//ng)), + hf_attn.k_proj.bias.reshape((ng, dim)), + hf_attn.v_proj.bias.reshape((ng, dim)), + ], dim=1).reshape(-1)) + + attn.dense.weight.data.copy_(hf_attn.o_proj.weight) + + +def set_mlp_state(args, layer, hf_layer): + '''Set MLP params.''' + + mlp = layer.mlp + hf_mlp = hf_layer.mlp + + mlp.dense_h_to_4h.weight.data.copy_(torch.cat([ + hf_mlp.gate_proj.weight, + hf_mlp.up_proj.weight, + ], dim=0)) + mlp.dense_4h_to_h.weight.data.copy_(hf_mlp.down_proj.weight) + + +def set_layer_state(args, model, hf_model, layer_idx): + '''Set transformer layer params.''' + + layer = model.language_model.encoder.layers[layer_idx] + hf_layer = hf_model.model.layers[layer_idx] + + set_attn_state(args, layer, hf_layer) + set_mlp_state(args, layer, hf_layer) + layer.input_norm.weight.data.copy_(hf_layer.input_layernorm.weight) + layer.post_attention_norm.weight.data.copy_(hf_layer.post_attention_layernorm.weight) + + +def load_checkpoint_to_model(args): + '''Set model params.''' + + from pretrain_gpt import model_provider + from transformers import AutoModelForCausalLM + + # Load Huggingface model. + hf_model = AutoModelForCausalLM.from_pretrained(args.load, torch_dtype=args.params_dtype, low_cpu_mem_usage=True, device_map="cpu") + + # Init Megatron model. + model = model_provider(True, True).to(args.params_dtype) + + # Set model state. + set_preprocess_state(args, model, hf_model) + set_postprocess_state(args, model, hf_model) + for layer_idx in tqdm(range(args.num_layers), "set layer states"): + set_layer_state(args, model, hf_model, layer_idx) + + return model + + +def _load_checkpoint(queue, args): + + verify_transformers_version() + + # Search in directory above this. + sys.path.append(os.path.abspath( + os.path.join(os.path.dirname(__file__), + os.path.pardir, + os.path.pardir))) + if args.megatron_path is not None: + sys.path.insert(0, args.megatron_path) + + # Convert Meta checkpoint to HF format as an intermediate step + if args.checkpoint_type == "meta": + model_tmp_path = convert_to_hf(model_path=os.path.join(args.save_dir, 'tmp'), input_base_path=args.load_dir, model_size=args.model_size, tokenizer_path=args.tokenizer_model) + args.load_dir = model_tmp_path + + try: + from megatron.training.arguments import parse_args, validate_args + from megatron.training.global_vars import set_args, set_global_variables + from megatron.legacy.model import module + from megatron.core import mpu + from megatron.core.enums import ModelType + from megatron.legacy import fused_kernels + except ModuleNotFoundError: + print("Unable to import Megatron, please specify the path to Megatron using --megatron-path. Exiting.") + queue.put("exit") + exit(1) + + # We want all arguments to come from us. 
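+    # Megatron's parse_args() reads sys.argv, so it is replaced with a fixed, minimal flag list
+    # here; the remaining model hyper-parameters are filled in afterwards from the HF config via
+    # load_args_from_checkpoint(margs).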
+ sys.argv = ['script.py', + '--no-masked-softmax-fusion', + '--no-bias-gelu-fusion', + '--no-bias-dropout-fusion', + '--no-async-tensor-model-parallel-allreduce', + '--use-cpu-initialization', + '--micro-batch-size', '1', + '--no-load-optim', + '--no-load-rng', + '--no-save-optim', + '--no-save-rng', + '--mock-data', # To pass the "blend data checks" in arguments.py + '--no-initialization', + '--load', args.load_dir + ] + + margs = parse_args() + margs.tokenizer_model = args.tokenizer_model + load_args_from_checkpoint(margs) + + if "llama2" in args.model_size or "yi" in args.model_size: + margs.tokenizer_type = "Llama2Tokenizer" + elif "llama3" in args.model_size: + margs.tokenizer_type = "HuggingFaceTokenizer" + elif "mistral" in args.model_size: + margs.tokenizer_type = "HuggingFaceTokenizer" + elif "qwen2.5" in args.model_size: + margs.tokenizer_type = "HuggingFaceTokenizer" + margs.add_qkv_bias = True + + # Arguments do sanity checks on the world size, but we don't care, + # so trick it into thinking we are plenty of processes. + margs.world_size = margs.tensor_model_parallel_size * margs.pipeline_model_parallel_size + + margs = validate_args(margs) + + margs.use_legacy_models = True + margs.transformer_impl = args.loader_transformer_impl + + margs.position_embedding_type = "rope" + + def check_for_arg(arg_name, default=None): + if getattr(margs, arg_name, None) is None: + if default is not None: + setattr(margs, arg_name, default) + else: + print(f"Checkpoint does not specify the argument {arg_name}. Exiting.") + print(f"Arguments: {margs}") + queue.put("exit") + exit(1) + + check_for_arg('tensor_model_parallel_size') + check_for_arg('pipeline_model_parallel_size') + check_for_arg('num_layers') + check_for_arg('hidden_size') + check_for_arg('seq_length') + check_for_arg('num_attention_heads') + check_for_arg('max_position_embeddings') + check_for_arg('position_embedding_type') + check_for_arg('iteration') + check_for_arg('bert_binary_head') + check_for_arg('disable_bias_linear', False) + check_for_arg('params_dtype') + check_for_arg('swiglu', False) + + # Determine how to make our models. + assert args.model_type == 'GPT', 'Llama-2, Llama-3 and Mistral are GPT models.' + margs.model_type = ModelType.encoder_or_decoder + margs.params_dtype = torch.bfloat16 if args.bf16 else torch.float16 if args.fp16 else torch.float32 + + # Suppress warning about torch.distributed not being initialized. + module.MegatronModule.embedding_warning_printed = True + + set_global_variables(margs, build_tokenizer=False) + mpu.set_tensor_model_parallel_world_size(margs.tensor_model_parallel_size) + mpu.set_pipeline_model_parallel_world_size(margs.pipeline_model_parallel_size) + mpu.set_virtual_pipeline_model_parallel_world_size(margs.virtual_pipeline_model_parallel_size) + fused_kernels.load(margs) + + # Short aliases. + tp_size = margs.tensor_model_parallel_size + pp_size = margs.pipeline_model_parallel_size + vp_size = margs.virtual_pipeline_model_parallel_size + if vp_size is None: + vp_size = 1 + + # Metadata. 
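+    # This namespace is the first message placed on the queue (queue.put(md) below); the saver
+    # side reads it to configure itself before the per-layer weight messages, which follow the
+    # format documented in convert.py, start arriving.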
+ md = types.SimpleNamespace() + md.model_type = args.model_type + md.num_layers = margs.num_layers + md.hidden_size = margs.hidden_size + md.seq_length = margs.seq_length + md.num_attention_heads = margs.num_attention_heads + md.max_position_embeddings = margs.max_position_embeddings + md.tokenizer_type = margs.tokenizer_type + md.iteration = margs.iteration + md.params_dtype = margs.params_dtype + md.bert_binary_head = margs.bert_binary_head + md.output_layer = margs.untie_embeddings_and_output_weights + md.position_embedding_type = margs.position_embedding_type + md.linear_bias = margs.add_bias_linear + md.qkv_bias = margs.add_qkv_bias + md.norm_has_bias = False + md.swiglu = margs.swiglu + md.previous_tensor_parallel_size = margs.tensor_model_parallel_size + md.previous_pipeline_parallel_size = margs.pipeline_model_parallel_size + md.make_vocab_size_divisible_by = None + md.checkpoint_args = margs + md.consumed_train_samples = 0 + md.consumed_valid_samples = 0 + + margs.model_size = args.model_size + + # Get true (non-padded) vocab size + tokenizer = transformers.AutoTokenizer.from_pretrained(margs.tokenizer_model) + md.true_vocab_size = tokenizer._tokenizer.get_vocab_size(with_added_tokens=True) + + # Get first pipe stage. + mpu.set_tensor_model_parallel_rank(0) + mpu.set_pipeline_model_parallel_rank(0) + model = load_checkpoint_to_model(margs) + + queue.put(md) + + def queue_put(name, msg): + print(f"sending {name}") + msg["name"] = name + queue.put(msg) + + # Send embeddings. + message = { + "word embeddings": model.language_model.embedding.word_embeddings.weight.data + } + if md.position_embedding_type == 'learned_absolute': + message["position embeddings"] = model.language_model.embedding.position_embeddings.weight.data + else: + assert not hasattr(model.language_model.embedding, 'position_embeddings') + + queue_put("embeddings", message) + + for layer_num in range(margs.num_layers): + message = {} + + # Get non-parallel tensors from tp_rank 0. + layer = model.language_model.encoder.layers[layer_num] + message["input norm weight"] = layer.input_norm.weight.data + message["post norm weight"] = layer.post_attention_norm.weight.data + if md.linear_bias: + message["dense bias"] = layer.self_attention.dense.bias.data + message["mlp l1 bias"] = layer.mlp.dense_4h_to_h.bias.data + + # Grab all parallel tensors for this layer. + qkv_weight = [] + qkv_bias = [] + dense_weight = [] + mlp_l0_weight = [] + mlp_l0_bias = [] + mlp_l1_weight = [] + layer = model.language_model.encoder.layers[layer_num] + qkv_weight.append(layer.self_attention.query_key_value.weight.data) + dense_weight.append(layer.self_attention.dense.weight.data) + mlp_l0_weight.append(layer.mlp.dense_h_to_4h.weight.data) + mlp_l1_weight.append(layer.mlp.dense_4h_to_h.weight.data) + + if md.qkv_bias: + qkv_bias.append(layer.self_attention.query_key_value.bias.data) + if md.linear_bias: + mlp_l0_bias.append(layer.mlp.dense_h_to_4h.bias.data) + + # Handle gated linear units. + if md.swiglu: + # Concat all the first halves ('W's) and all the second halves ('V's). + for tp_rank in range(tp_size): + mlp_l0_weight[tp_rank] = torch.chunk(mlp_l0_weight[tp_rank], 2, dim=0) + message["mlp l0 weight W"] = torch.cat([w[0] for w in mlp_l0_weight], dim=0) + message["mlp l0 weight V"] = torch.cat([w[1] for w in mlp_l0_weight], dim=0) + else: + message["mlp l0 weight"] = torch.cat(mlp_l0_weight, dim=0) + + # Simple concat of the rest. 
+ message["qkv weight"] = torch.cat(qkv_weight, dim=0) + message["dense weight"] = torch.cat(dense_weight, dim=1) + message["mlp l1 weight"] = torch.cat(mlp_l1_weight, dim=1) + if md.qkv_bias: + message["qkv bias"] = torch.cat(qkv_bias, dim=0) + if md.linear_bias: + if md.swiglu: + for tp_rank in range(tp_size): + mlp_l0_bias[tp_rank] = torch.chunk(mlp_l0_bias[tp_rank], 2, dim=0) + message["mlp l0 bias W"] = torch.cat([b[0] for b in mlp_l0_bias],dim=0) + message["mlp l0 bias V"] = torch.cat([b[1] for b in mlp_l0_bias],dim=0) + else: + message["mlp l0 bias"] = torch.cat(mlp_l0_bias, dim=0) + + queue_put(f"transformer layer {layer_num}", message) + + # Send final norm from tp_rank 0. + message = { + "weight": model.language_model.encoder.final_norm.weight.data, + } + queue_put("final norm", message) + + if md.output_layer: + message = { + "weight": model.language_model.output_layer.weight.data + } + queue_put("output layer", message) + + queue.put("done") + + if args.checkpoint_type == "meta": + shutil.rmtree(os.path.join(args.save_dir, 'tmp')) + + +def load_checkpoint(queue, args): + try: + _load_checkpoint(queue, args) + except Exception: + queue.put("exit") + raise diff --git a/tools/checkpoint/loader_mcore.py b/tools/checkpoint/loader_mcore.py new file mode 100644 index 0000000000..0be90c2ab6 --- /dev/null +++ b/tools/checkpoint/loader_mcore.py @@ -0,0 +1,384 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +import json +import os +import sys +import torch +import types + +from utils import get_mcore_transformer_block_key, print_memory_usage + + +def add_arguments(parser): + group = parser.add_argument_group(title='Megatron loader') + + group.add_argument('--true-vocab-size', type=int, default=None, + help='original size of vocab, if specified will trim padding from embedding table.') + group.add_argument('--vocab-file', type=str, default=None, + help='Path to the vocab file. If specified will use this to get vocab size and ' + 'trim padding from the embedding table.') + group.add_argument('--megatron-path', type=str, default=None, + help='Base directory of Megatron repository') + group.add_argument('--position-embedding-type', + type=str, + default='learned_absolute', + choices=['learned_absolute', 'rope'], + help='Position embedding type.') + group.add_argument('--loader-transformer-impl', default='transformer_engine', + choices=['local', 'transformer_engine'], + help='Which Transformer implementation to use.') + + +def _load_checkpoint(queue, args): + + # Search in directory above this + sys.path.append(os.path.abspath( + os.path.join(os.path.dirname(__file__), + os.path.pardir))) + if args.megatron_path is not None: + sys.path.insert(0, args.megatron_path) + + try: + from megatron.training.arguments import parse_args, validate_args + from megatron.training.global_vars import set_args, set_global_variables + from megatron.training.checkpointing import load_args_from_checkpoint, load_checkpoint + from megatron.legacy.model import module + from megatron.core import mpu + from megatron.core.enums import ModelType + from megatron.legacy import fused_kernels + except ModuleNotFoundError: + print("Unable to import Megatron, please specify the path to Megatron using --megatron-path. 
Exiting.") + queue.put("exit") + exit(1) + + # We want all arguments to come from us + sys.argv = ['script.py', + '--no-masked-softmax-fusion', + '--no-bias-gelu-fusion', + '--no-bias-dropout-fusion', + '--no-async-tensor-model-parallel-allreduce', + '--use-cpu-initialization', + '--micro-batch-size', '1', + '--no-load-optim', + '--no-load-rng', + '--no-save-optim', + '--no-save-rng', + '--no-initialization', + '--mock-data', # To pass the "blend data checks" in arguments.py + '--load', args.load_dir, + '--position-embedding-type', args.position_embedding_type, + '--exit-on-missing-checkpoint', + ] + + margs = parse_args() + margs, checkpoint_args = load_args_from_checkpoint(margs) + + # Arguments do sanity checks on the world size, but we don't care, + # so trick it into thinking we are plenty of processes + margs.world_size = margs.tensor_model_parallel_size * margs.pipeline_model_parallel_size + + # Explicitly copy data types from checkpoint. + margs.fp16 = checkpoint_args.fp16 + margs.bf16 = checkpoint_args.bf16 + + # Validate margs. + margs = validate_args(margs) + + margs.use_legacy_models = False + margs.transformer_impl = args.loader_transformer_impl + + def check_for_arg(arg_name, default=None): + if getattr(margs, arg_name, None) is None: + if default is not None: + setattr(margs, arg_name, default) + else: + print(f"Checkpoint does not specify the argument {arg_name}. Exiting.") + print(f"Arguments: {margs}") + queue.put("exit") + exit(1) + + check_for_arg('tensor_model_parallel_size') + check_for_arg('pipeline_model_parallel_size') + check_for_arg('num_layers') + check_for_arg('hidden_size') + check_for_arg('seq_length') + check_for_arg('num_attention_heads') + check_for_arg('max_position_embeddings') + check_for_arg('position_embedding_type') + check_for_arg('tokenizer_type') + check_for_arg('iteration') + check_for_arg('bert_binary_head') + check_for_arg('disable_bias_linear', False) + check_for_arg('params_dtype') + check_for_arg('swiglu', False) + + # Determine how to make our models + if args.model_type == 'GPT': + from pretrain_gpt import model_provider + margs.model_type = ModelType.encoder_or_decoder + elif args.model_type == 'BERT': + from pretrain_bert import model_provider + margs.model_type = ModelType.encoder_or_decoder + else: + raise Exception(f'unrecognized model type: {args.model_type}') + + # supress warning about torch.distributed not being initialized + module.MegatronModule.embedding_warning_printed = True + + consumed_train_samples = None + consumed_valid_samples = None + def get_models(count, dtype): + nonlocal consumed_train_samples + nonlocal consumed_valid_samples + model_array_len = margs.virtual_pipeline_model_parallel_size + if model_array_len is None: + model_array_len = 1 + models = [[] for _ in range(model_array_len)] + pre_process = mpu.is_pipeline_first_stage() + post_process = mpu.is_pipeline_last_stage() + for rank in range(count): + mpu.set_tensor_model_parallel_rank(rank) + if margs.virtual_pipeline_model_parallel_size is not None: + model_ = [] + for i in range(margs.virtual_pipeline_model_parallel_size): + mpu.set_virtual_pipeline_model_parallel_rank(i) + # Set pre_process and post_process only after virtual rank is set. 
+ pre_process = mpu.is_pipeline_first_stage() + post_process = mpu.is_pipeline_last_stage() + this_model = model_provider( + pre_process=pre_process, + post_process=post_process + ).to(dtype) + model_.append(this_model) + else: + pre_process = mpu.is_pipeline_first_stage() + post_process = mpu.is_pipeline_last_stage() + model_rank = 0 + model_ = [model_provider(pre_process, post_process).to(dtype)] + margs.consumed_train_samples = 0 + margs.consumed_valid_samples = 0 + margs.exit_on_missing_checkpoint = True + load_checkpoint(model_, None, None) + + if consumed_train_samples is not None: + assert(margs.consumed_train_samples == consumed_train_samples) + else: + consumed_train_samples = margs.consumed_train_samples + if consumed_valid_samples is not None: + assert(margs.consumed_valid_samples == consumed_valid_samples) + else: + consumed_valid_samples = margs.consumed_valid_samples + for vp_rank in range(model_array_len): + models[vp_rank].append(model_[vp_rank]) + + # Print memory usage. + print_memory_usage("loader", rank, count) + + return models + + set_global_variables(margs, build_tokenizer=False) + mpu.set_tensor_model_parallel_world_size(margs.tensor_model_parallel_size) + mpu.set_pipeline_model_parallel_world_size(margs.pipeline_model_parallel_size) + mpu.set_virtual_pipeline_model_parallel_world_size(margs.virtual_pipeline_model_parallel_size) + fused_kernels.load(margs) + + # Get true (non-padded) vocab size + if args.true_vocab_size is not None: + true_vocab_size = args.true_vocab_size + elif args.vocab_file is not None: + vocab = json.load(open(args.vocab_file)) + true_vocab_size = len(vocab) + if args.true_vocab_size is not None and true_vocab_size != args.true_vocab_size: + print("Both --true-vocab-size and --vocab-file specified and the vocab size does not match, aborting.") + queue.put("exit") + exit(1) + else: + true_vocab_size = None + + # short aliases + tp_size = margs.tensor_model_parallel_size + pp_size = margs.pipeline_model_parallel_size + vp_size = margs.virtual_pipeline_model_parallel_size + if vp_size is None: + vp_size = 1 + + # Layernorm has bias; RMSNorm does not. + if hasattr(checkpoint_args, 'normalization'): + norm_has_bias = checkpoint_args.normalization == "LayerNorm" + else: + # older models only supported LayerNorm + norm_has_bias = True + + # metadata + md = types.SimpleNamespace() + md.model_type = args.model_type + md.num_layers = margs.num_layers + md.hidden_size = margs.hidden_size + md.seq_length = margs.seq_length + md.num_attention_heads = margs.num_attention_heads + md.max_position_embeddings = margs.max_position_embeddings + md.tokenizer_type = margs.tokenizer_type + md.iteration = margs.iteration + md.params_dtype = margs.params_dtype + md.bert_binary_head = margs.bert_binary_head + md.output_layer = margs.untie_embeddings_and_output_weights + md.position_embedding_type = margs.position_embedding_type + md.linear_bias = margs.add_bias_linear + md.norm_has_bias = norm_has_bias + md.swiglu = margs.swiglu + md.previous_tensor_parallel_size = margs.tensor_model_parallel_size + md.previous_pipeline_parallel_size = margs.pipeline_model_parallel_size + md.true_vocab_size = true_vocab_size + md.make_vocab_size_divisible_by = margs.make_vocab_size_divisible_by + md.checkpoint_args = checkpoint_args + md.use_legacy_models = margs.use_legacy_models + + # Get transformer block (named either 'encoder' or 'decoder'). 
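+    # get_mcore_transformer_block_key (from the local utils helper) returns the attribute name
+    # of the transformer block for this model type, so the send loop below can walk GPT and
+    # BERT models with the same code.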
+ transformer_block_key = get_mcore_transformer_block_key(md.model_type) + def get_transformer_block(_model): + return getattr(_model, transformer_block_key) + + # Get first pipe stage + mpu.set_pipeline_model_parallel_rank(0) + all_models = [get_models(tp_size, md.params_dtype)] + models = all_models[0][0] + + md.consumed_train_samples = consumed_train_samples + md.consumed_valid_samples = consumed_valid_samples + queue.put(md) + + def queue_put(name, msg): + print(f"sending {name}") + msg["name"] = name + queue.put(msg) + + # Send embeddings + message = { + "word embeddings": torch.cat( + [models[tp_rank].embedding.word_embeddings.weight.data for tp_rank in range(tp_size)], + dim = 0) + } + if md.position_embedding_type == 'learned_absolute': + message["position embeddings"] = models[0].embedding.position_embeddings.weight.data + else: + assert not hasattr(models[0].embedding, 'position_embeddings') + + queue_put("embeddings", message) + + total_layer_num = 0 + for vp_rank in range(vp_size): + mpu.set_virtual_pipeline_model_parallel_rank(vp_rank) + for pp_rank in range(pp_size): + if pp_rank > 0: + mpu.set_pipeline_model_parallel_rank(pp_rank) + if vp_rank == 0: + all_models.append(get_models(tp_size, md.params_dtype)) + models = all_models[pp_rank][vp_rank] + for layer_num in range(len(get_transformer_block(models[0]).layers)): + message = {} + + # Get non-parallel tensors from tp_rank 0 + layer = get_transformer_block(models[0]).layers[layer_num] + message["input norm weight"] = layer.self_attention.linear_qkv.layer_norm_weight.data + if norm_has_bias: + message["input norm bias"] = layer.self_attention.linear_qkv.layer_norm_bias.data + message["post norm weight"] = layer.mlp.linear_fc1.layer_norm_weight.data + if norm_has_bias: + message["post norm bias"] = layer.mlp.linear_fc1.layer_norm_bias.data + if md.linear_bias: + message["dense bias"] = layer.self_attention.linear_proj.bias.data + message["mlp l1 bias"] = layer.mlp.linear_fc2.bias.data + + # Grab all parallel tensors for this layer + qkv_weight = [] + qkv_bias = [] + dense_weight = [] + mlp_l0_weight = [] + mlp_l0_bias = [] + mlp_l1_weight = [] + for tp_rank, model in enumerate(models): + layer = get_transformer_block(model).layers[layer_num] + qkv_weight.append(layer.self_attention.linear_qkv.weight.data) + dense_weight.append(layer.self_attention.linear_proj.weight.data) + mlp_l0_weight.append(layer.mlp.linear_fc1.weight.data) + mlp_l1_weight.append(layer.mlp.linear_fc2.weight.data) + if md.linear_bias: + qkv_bias.append(layer.self_attention.linear_qkv.bias.data) + mlp_l0_bias.append(layer.mlp.linear_fc1.bias.data) + + # Handle gated linear units + if md.swiglu: + # concat all the first halves ('W's) and all the second halves ('V's) + for tp_rank in range(tp_size): + mlp_l0_weight[tp_rank] = torch.chunk(mlp_l0_weight[tp_rank], 2, dim=0) + message["mlp l0 weight W"] = torch.cat([w[0] for w in mlp_l0_weight], dim=0) + message["mlp l0 weight V"] = torch.cat([w[1] for w in mlp_l0_weight], dim=0) + else: + message["mlp l0 weight"] = torch.cat(mlp_l0_weight, dim=0) + + # simple concat of the rest + message["qkv weight"] = torch.cat(qkv_weight, dim=0) + message["dense weight"] = torch.cat(dense_weight, dim=1) + message["mlp l1 weight"] = torch.cat(mlp_l1_weight, dim=1) + if md.linear_bias: + message["qkv bias"] = torch.cat(qkv_bias, dim=0) + if md.swiglu: + for tp_rank in range(tp_size): + mlp_l0_bias[tp_rank] = torch.chunk(mlp_l0_bias[tp_rank], 2, dim=0) + message["mlp l0 bias W"] = torch.cat([b[0] for b in mlp_l0_bias],dim=0) + 
message["mlp l0 bias V"] = torch.cat([b[1] for b in mlp_l0_bias],dim=0) + else: + message["mlp l0 bias"] = torch.cat(mlp_l0_bias, dim=0) + + queue_put(f"transformer layer {total_layer_num}", message) + + total_layer_num = total_layer_num + 1 + + # Send final norm from tp_rank 0 + message = { + "weight": get_transformer_block(models[0]).final_layernorm.weight.data, + } + if norm_has_bias: + message["bias"] = get_transformer_block(models[0]).final_layernorm.bias.data + queue_put("final norm", message) + + if md.output_layer: + message = { + "weight": torch.cat( + [models[tp_rank].output_layer.weight.data for tp_rank in range(tp_size)], + dim = 0) + } + queue_put("output layer", message) + + + # Send BERT lm head and binary head if it exists + if md.model_type == 'BERT': + message = { + "weight": models[0].pooler.dense.weight.data, + "bias": models[0].pooler.dense.bias.data + } + queue_put("pooler", message) + + message = { + "dense weight": models[0].lm_head.dense.weight.data, + "dense bias": models[0].lm_head.dense.bias.data, + "norm weight": models[0].lm_head.layer_norm.weight.data, + } + if norm_has_bias: + message["norm bias"] = models[0].lm_head.layer_norm.bias.data + queue_put("lm head", message) + + if md.bert_binary_head: + message = { + "weight": models[0].binary_head.weight.data, + "bias": models[0].binary_head.bias.data + } + queue_put("binary head", message) + queue.put("done") + +def load_checkpoint(queue, args): + try: + _load_checkpoint(queue, args) + except Exception: + queue.put("exit") + raise diff --git a/tools/checkpoint_loader_megatron.py b/tools/checkpoint/loader_megatron.py similarity index 81% rename from tools/checkpoint_loader_megatron.py rename to tools/checkpoint/loader_megatron.py index 1cd4937152..72edcd9dbf 100644 --- a/tools/checkpoint_loader_megatron.py +++ b/tools/checkpoint/loader_megatron.py @@ -1,3 +1,5 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + import json import os import sys @@ -5,6 +7,7 @@ import torch + def add_arguments(parser): group = parser.add_argument_group(title='Megatron loader') @@ -14,7 +17,15 @@ def add_arguments(parser): help='Path to the vocab file. 
If specified will use this to get vocab size and ' 'trim padding from the embedding table.') group.add_argument('--megatron-path', type=str, default=None, - help='Base directory of deepspeed repository') + help='Base directory of Megatron repository') + group.add_argument('--position-embedding-type', + type=str, + default='learned_absolute', + choices=['learned_absolute', 'rope'], + help='Position embedding type.') + group.add_argument('--loader-transformer-impl', default='local', + choices=['local', 'transformer_engine'], + help='Which Transformer implementation to use.') def _load_checkpoint(queue, args): @@ -26,13 +37,13 @@ def _load_checkpoint(queue, args): sys.path.insert(0, args.megatron_path) try: - from megatron.arguments import parse_args, validate_args - from megatron.global_vars import set_args, set_global_variables - from megatron.checkpointing import load_args_from_checkpoint, load_checkpoint - from megatron.model import module + from megatron.training.arguments import parse_args, validate_args + from megatron.training.global_vars import set_args, set_global_variables + from megatron.training.checkpointing import load_args_from_checkpoint, load_checkpoint + from megatron.legacy.model import module from megatron.core import mpu from megatron.core.enums import ModelType - from megatron import fused_kernels + from megatron.legacy import fused_kernels except ModuleNotFoundError: print("Unable to import Megatron, please specify the path to Megatron using --megatron-path. Exiting.") queue.put("exit") @@ -50,8 +61,11 @@ def _load_checkpoint(queue, args): '--no-load-rng', '--no-save-optim', '--no-save-rng', + '--mock-data', # To pass the "blend data checks" in arguments.py '--no-initialization', - '--load', args.load_dir + '--load', args.load_dir, + '--position-embedding-type', args.position_embedding_type, + '--exit-on-missing-checkpoint', ] margs = parse_args() @@ -61,8 +75,16 @@ def _load_checkpoint(queue, args): # so trick it into thinking we are plenty of processes margs.world_size = margs.tensor_model_parallel_size * margs.pipeline_model_parallel_size + # Explicitly copy data types from checkpoint. + margs.fp16 = checkpoint_args.fp16 + margs.bf16 = checkpoint_args.bf16 + + # Validate margs. 
margs = validate_args(margs) + margs.use_legacy_models = True + margs.transformer_impl = args.loader_transformer_impl + def check_for_arg(arg_name, default=None): if getattr(margs, arg_name, None) is None: if default is not None: @@ -80,8 +102,7 @@ def check_for_arg(arg_name, default=None): check_for_arg('seq_length') check_for_arg('num_attention_heads') check_for_arg('max_position_embeddings') - check_for_arg('add_position_embedding', True) - check_for_arg('use_rotary_position_embeddings', False) + check_for_arg('position_embedding_type') check_for_arg('tokenizer_type') check_for_arg('iteration') check_for_arg('bert_binary_head') @@ -134,6 +155,7 @@ def get_models(count, dtype): model_ = [model_provider(pre_process, post_process).to(dtype)] margs.consumed_train_samples = 0 margs.consumed_valid_samples = 0 + margs.exit_on_missing_checkpoint = True load_checkpoint(model_, None, None) if consumed_train_samples is not None: @@ -148,7 +170,7 @@ def get_models(count, dtype): models[vp_rank].append(model_[vp_rank]) return models - set_global_variables(margs) + set_global_variables(margs, build_tokenizer=False) mpu.set_tensor_model_parallel_world_size(margs.tensor_model_parallel_size) mpu.set_pipeline_model_parallel_world_size(margs.pipeline_model_parallel_size) mpu.set_virtual_pipeline_model_parallel_world_size(margs.virtual_pipeline_model_parallel_size) @@ -174,6 +196,13 @@ def get_models(count, dtype): if vp_size is None: vp_size = 1 + # Layernorm has bias; RMSNorm does not. + if hasattr(checkpoint_args, 'normalization'): + norm_has_bias = checkpoint_args.normalization == "LayerNorm" + else: + # older models only supported LayerNorm + norm_has_bias = True + # metadata md = types.SimpleNamespace() md.model_type = args.model_type @@ -187,8 +216,9 @@ def get_models(count, dtype): md.params_dtype = margs.params_dtype md.bert_binary_head = margs.bert_binary_head md.output_layer = margs.untie_embeddings_and_output_weights - md.position_embeddings = margs.add_position_embedding + md.position_embedding_type = margs.position_embedding_type md.linear_bias = margs.add_bias_linear + md.norm_has_bias = norm_has_bias md.swiglu = margs.swiglu md.previous_tensor_parallel_size = margs.tensor_model_parallel_size md.previous_pipeline_parallel_size = margs.pipeline_model_parallel_size @@ -216,8 +246,10 @@ def queue_put(name, msg): [models[tp_rank].language_model.embedding.word_embeddings.weight.data for tp_rank in range(tp_size)], dim = 0) } - if md.position_embeddings: + if md.position_embedding_type == 'learned_absolute': message["position embeddings"] = models[0].language_model.embedding.position_embeddings.weight.data + else: + assert not hasattr(models[0].language_model.embedding, 'position_embeddings') queue_put("embeddings", message) @@ -235,10 +267,12 @@ def queue_put(name, msg): # Get non-parallel tensors from tp_rank 0 layer = models[0].language_model.encoder.layers[layer_num] - message["input layernorm weight"] = layer.input_layernorm.weight.data - message["input layernorm bias"] = layer.input_layernorm.bias.data - message["post layernorm weight"] = layer.post_attention_layernorm.weight.data - message["post layernorm bias"] = layer.post_attention_layernorm.bias.data + message["input norm weight"] = layer.input_norm.weight.data + if norm_has_bias: + message["input norm bias"] = layer.input_norm.bias.data + message["post norm weight"] = layer.post_attention_norm.weight.data + if norm_has_bias: + message["post norm bias"] = layer.post_attention_norm.bias.data if md.linear_bias: message["dense bias"] = 
layer.self_attention.dense.bias.data message["mlp l1 bias"] = layer.mlp.dense_4h_to_h.bias.data @@ -288,12 +322,13 @@ def queue_put(name, msg): total_layer_num = total_layer_num + 1 - # Send final layernorm from tp_rank 0 + # Send final norm from tp_rank 0 message = { - "weight": models[0].language_model.encoder.final_layernorm.weight.data, - "bias": models[0].language_model.encoder.final_layernorm.bias.data + "weight": models[0].language_model.encoder.final_norm.weight.data, } - queue_put("final layernorm", message) + if norm_has_bias: + message["bias"] = models[0].language_model.encoder.final_norm.bias.data + queue_put("final norm", message) if md.output_layer: message = { @@ -315,9 +350,10 @@ def queue_put(name, msg): message = { "dense weight": models[0].lm_head.dense.weight.data, "dense bias": models[0].lm_head.dense.bias.data, - "layernorm weight": models[0].lm_head.layernorm.weight.data, - "layernorm bias": models[0].lm_head.layernorm.bias.data + "norm weight": models[0].lm_head.norm.weight.data, } + if norm_has_bias: + message["norm bias"] = models[0].lm_head.norm.bias.data queue_put("lm head", message) if md.bert_binary_head: @@ -331,6 +367,6 @@ def queue_put(name, msg): def load_checkpoint(queue, args): try: _load_checkpoint(queue, args) - except: + except Exception: queue.put("exit") raise diff --git a/tools/checkpoint/loader_mixtral_hf.py b/tools/checkpoint/loader_mixtral_hf.py new file mode 100644 index 0000000000..9ff09f8df9 --- /dev/null +++ b/tools/checkpoint/loader_mixtral_hf.py @@ -0,0 +1,335 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +import json +import os +import sys +import torch +import transformers +from tqdm import tqdm +import types + + +def add_arguments(parser): + group = parser.add_argument_group(title='Mixtral HF loader.') + + group.add_argument('--true-vocab-size', type=int, default=None, + help='original size of vocab, if specified will trim padding from embedding table.') + group.add_argument('--vocab-file', type=str, default=None, + help='Path to the vocab file. If specified will use this to get vocab size and ' + 'trim padding from the embedding table.') + group.add_argument('--tokenizer-model', required=True, + help='Sentencepiece tokenizer model.') + group.add_argument('--megatron-path', type=str, default=None, + help='Base directory of Megatron repository') + + +def load_args_from_checkpoint(args): + # Read Mixtral 8x7B args. + from transformers import MixtralConfig + mixtral_config = MixtralConfig.from_pretrained(args.load) + + # Update Megatron args. 
+ args.untie_embeddings_and_output_weights = True + args.seq_length = 4096 + args.global_batch_size = 1024 + args.iteration = 1 # '0', 'release' don't work + args.add_position_embedding = False + args.use_rotary_position_embeddings = True + args.swiglu = True + args.bf16 = True + args.add_bias_linear = False + args.normalization = "RMSNorm" + args.tokenizer_type = "Llama2Tokenizer" + args.disable_bias_linear = True + + args.max_position_embeddings = mixtral_config.max_position_embeddings + args.hidden_size = mixtral_config.hidden_size + args.num_attention_heads = mixtral_config.num_attention_heads + args.num_layers = mixtral_config.num_hidden_layers + args.norm_epsilon = mixtral_config.rms_norm_eps + args.vocab_size = mixtral_config.vocab_size + args.padded_vocab_size = mixtral_config.vocab_size + args.mixtral = mixtral_config + args.ffn_hidden_size = mixtral_config.intermediate_size + args.num_experts = mixtral_config.num_local_experts + args.sequence_parallel = True + + if mixtral_config.num_key_value_heads: + args.group_query_attention = True + args.num_query_groups = mixtral_config.num_key_value_heads + +def verify_transformers_version(): + major, minor, patch = map(int, transformers.__version__.split('.')) + assert major >= 4 and minor >= 36 + +def set_preprocess_state(args, model, hf_model): + '''Set embedding params.''' + model.embedding.word_embeddings.weight.data.copy_( + hf_model.model.embed_tokens.weight) + +def set_postprocess_state(args, model, hf_model): + '''Set output layer & norm params.''' + model.decoder.final_layernorm.weight.data.copy_(hf_model.model.norm.weight) + model.output_layer.weight.data.copy_(hf_model.lm_head.weight) + +def set_attn_state(args, layer, hf_layer): + '''Set self-attention params.''' + + # Get attention layer & state. + attn = layer.self_attention + hf_attn = hf_layer.self_attn + + # Reshape loaded weights. + tp = args.tensor_model_parallel_size + num_heads = args.num_attention_heads // tp + num_query_groups = (args.num_query_groups if args.group_query_attention else args.num_attention_heads) // tp + num_querys_per_group = num_heads // num_query_groups + dim = args.kv_channels + assert num_heads % num_querys_per_group == 0 + + # Copy weights (re-order dimensions for Megatron). 
+ attn.linear_qkv.weight.data.copy_(torch.cat([ + hf_attn.q_proj.weight.reshape((num_query_groups, num_querys_per_group*dim, -1)), + hf_attn.k_proj.weight.reshape((num_query_groups, dim, -1)), + hf_attn.v_proj.weight.reshape((num_query_groups, dim, -1)), + ], dim=1).reshape((-1, args.hidden_size))) + attn.linear_proj.weight.data.copy_(hf_attn.o_proj.weight) + +def set_mlp_state(args, layer, hf_layer): + '''Set MLP params.''' + + layer.mlp.router.weight.data.copy_(hf_layer.block_sparse_moe.gate.weight) + + mcore_experts = layer.mlp.experts.local_experts + hf_experts = hf_layer.block_sparse_moe.experts + for expert_idx in range(args.num_experts): + mcore_experts[expert_idx].linear_fc1.weight.data.copy_( + torch.cat([ + hf_experts[expert_idx].w1.weight, + hf_experts[expert_idx].w3.weight + ], dim=0) + ) + mcore_experts[expert_idx].linear_fc2.weight.data.copy_( + hf_experts[expert_idx].w2.weight + ) + +def set_layer_state(args, model, hf_model, layer_idx): + '''Set transformer layer params.''' + + layer = model.decoder.layers[layer_idx] + hf_layer = hf_model.model.layers[layer_idx] + + set_attn_state(args, layer, hf_layer) + set_mlp_state(args, layer, hf_layer) + + layer.self_attention.linear_qkv.layer_norm_weight.data.copy_(hf_layer.input_layernorm.weight) + layer.pre_mlp_layernorm.weight.data.copy_(hf_layer.post_attention_layernorm.weight) + +def load_checkpoint_to_model(args): + '''Set model params.''' + + from pretrain_gpt import model_provider + from transformers import MixtralForCausalLM, MixtralConfig + + # Load Huggingface model. + + hf_model = MixtralForCausalLM.from_pretrained(args.load, device_map="cpu") + + # Init Megatron model. + model = model_provider(True, True).to(args.params_dtype) + + # Set model state. + set_preprocess_state(args, model, hf_model) + set_postprocess_state(args, model, hf_model) + for layer_idx in tqdm(range(args.num_layers), "set layer states"): + set_layer_state(args, model, hf_model, layer_idx) + return model + + +def _load_checkpoint(queue, args): + + # Mixtral requires HF transformers >=4.36.0. + verify_transformers_version() + + # Search in directory above this. + sys.path.append(os.path.abspath( + os.path.join(os.path.dirname(__file__), + os.path.pardir, + os.path.pardir))) + if args.megatron_path is not None: + sys.path.insert(0, args.megatron_path) + + try: + from megatron.training.arguments import parse_args, validate_args + from megatron.training.global_vars import set_args, set_global_variables + from megatron.legacy.model import module + from megatron.core import mpu + from megatron.core.enums import ModelType + from megatron.legacy import fused_kernels + except ModuleNotFoundError: + print("Unable to import Megatron, please specify the path to Megatron using --megatron-path. Exiting.") + queue.put("exit") + exit(1) + + # We want all arguments to come from us. 
+ sys.argv = ['script.py', + '--use-mcore-models', + '--disable-bias-linear', + '--no-masked-softmax-fusion', + '--no-bias-gelu-fusion', + '--no-bias-dropout-fusion', + '--no-async-tensor-model-parallel-allreduce', + '--use-cpu-initialization', + '--micro-batch-size', '1', + '--no-load-optim', + '--no-load-rng', + '--no-save-optim', + '--no-save-rng', + '--no-initialization', + '--mock-data', # To pass the "blend data checks" in arguments.py + '--transformer-impl', 'transformer_engine', + '--load', args.load_dir + ] + + margs = parse_args() + margs.tokenizer_model = args.tokenizer_model + load_args_from_checkpoint(margs) + + # Arguments do sanity checks on the world size, but we don't care, + # so trick it into thinking we are plenty of processes. + margs.world_size = margs.tensor_model_parallel_size * margs.pipeline_model_parallel_size + + margs = validate_args(margs) + + def check_for_arg(arg_name, default=None): + if getattr(margs, arg_name, None) is None: + if default is not None: + setattr(margs, arg_name, default) + else: + print(f"Checkpoint does not specify the argument {arg_name}. Exiting.") + print(f"Arguments: {margs}") + queue.put("exit") + exit(1) + + check_for_arg('tensor_model_parallel_size') + check_for_arg('pipeline_model_parallel_size') + check_for_arg('num_layers') + check_for_arg('hidden_size') + check_for_arg('seq_length') + check_for_arg('num_attention_heads') + check_for_arg('max_position_embeddings') + check_for_arg('position_embedding_type') + check_for_arg('tokenizer_type') + check_for_arg('iteration') + check_for_arg('disable_bias_linear') + check_for_arg('params_dtype') + check_for_arg('swiglu') + + # Determine how to make our models. + assert args.model_type == 'GPT', 'Mixtral is a GPT model.' + margs.model_type = ModelType.encoder_or_decoder + + # Suppress warning about torch.distributed not being initialized. + module.MegatronModule.embedding_warning_printed = True + + set_global_variables(margs, build_tokenizer=False) + mpu.set_tensor_model_parallel_world_size(margs.tensor_model_parallel_size) + mpu.set_pipeline_model_parallel_world_size(margs.pipeline_model_parallel_size) + mpu.set_virtual_pipeline_model_parallel_world_size(margs.virtual_pipeline_model_parallel_size) + mpu.set_expert_model_parallel_world_size(margs.expert_model_parallel_size) + fused_kernels.load(margs) + + # Metadata. + md = types.SimpleNamespace() + md.model_type = args.model_type + md.num_layers = margs.num_layers + md.hidden_size = margs.hidden_size + md.seq_length = margs.seq_length + md.num_attention_heads = margs.num_attention_heads + md.max_position_embeddings = margs.max_position_embeddings + md.tokenizer_type = margs.tokenizer_type + md.iteration = margs.iteration + md.params_dtype = margs.params_dtype + md.bert_binary_head = margs.bert_binary_head + md.output_layer = margs.untie_embeddings_and_output_weights + md.position_embedding_type = margs.position_embedding_type + md.linear_bias = margs.add_bias_linear + md.norm_has_bias = False + md.swiglu = margs.swiglu + md.previous_tensor_parallel_size = margs.tensor_model_parallel_size + md.previous_pipeline_parallel_size = margs.pipeline_model_parallel_size + md.true_vocab_size = margs.vocab_size # skips padding in saver + md.make_vocab_size_divisible_by = None + md.checkpoint_args = margs + md.consumed_train_samples = 0 + md.consumed_valid_samples = 0 + md.num_experts = margs.num_experts + + # Get first pipe stage. 
+ mpu.set_tensor_model_parallel_rank(0) + mpu.set_pipeline_model_parallel_rank(0) + mpu.set_expert_model_parallel_rank(0) + model = load_checkpoint_to_model(margs) + + queue.put(md) + + def queue_put(name, msg): + print(f"sending {name}") + msg["name"] = name + queue.put(msg) + + # Send embeddings. + message = { + "word embeddings": model.embedding.word_embeddings.weight.data + } + if md.position_embedding_type == 'learned_absolute': + message["position embeddings"] = model.embedding.position_embeddings.weight.data + else: + assert not hasattr(model.embedding, 'position_embeddings') + + queue_put("embeddings", message) + + for layer_idx in range(margs.num_layers): + message = {} + + # Get non-parallel tensors from tp_rank 0. + layer = model.decoder.layers[layer_idx] + message["input norm weight"] = layer.self_attention.linear_qkv.layer_norm_weight.data + message["post norm weight"] = layer.pre_mlp_layernorm.weight.data + + # Simple concat of the rest. + message["qkv weight"] = layer.self_attention.linear_qkv.weight.data + message["dense weight"] = layer.self_attention.linear_proj.weight.data + + # Grab all parallel tensors for this layer. + layer = model.decoder.layers[layer_idx] + experts = layer.mlp.experts.local_experts + + message["router weight"] = layer.mlp.router.weight.data + if md.swiglu: + chunked_mlp_l0_weight = [torch.chunk(local_expert.linear_fc1.weight.data, 2, dim=0) for local_expert in experts] + message["mlp l0 weight W"] = torch.stack([local_weight[0] for local_weight in chunked_mlp_l0_weight], dim=0) + message["mlp l0 weight V"] = torch.stack([local_weight[1] for local_weight in chunked_mlp_l0_weight], dim=0) + else: + message["mlp l0 weight"] = torch.stack([local_expert.linear_fc1.weight.data for local_expert in experts]) + message["mlp l1 weight"] = torch.stack([local_expert.linear_fc2.weight.data for local_expert in experts], dim=0) + + queue_put(f"transformer layer {layer_idx}", message) + + queue_put("final norm", { + "weight": model.decoder.final_layernorm.weight.data, + }) + + if md.output_layer: + queue_put("output layer", { + "weight": model.output_layer.weight.data + }) + + queue.put("done") + +def load_checkpoint(queue, args): + try: + _load_checkpoint(queue, args) + except Exception: + queue.put("exit") + raise diff --git a/tools/checkpoint/saver_mcore.py b/tools/checkpoint/saver_mcore.py new file mode 100644 index 0000000000..e1779b8969 --- /dev/null +++ b/tools/checkpoint/saver_mcore.py @@ -0,0 +1,801 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+ +import os +import sys +import torch + +from setter import ModelSetter +from utils import get_mcore_transformer_block_key, print_memory_usage +from megatron.core.utils import get_te_version, is_te_min_version + + +class MCoreSetter(ModelSetter): + + transformer_block_key = None + + @classmethod + def get_transformer_block(cls, model): + return getattr(model, cls.transformer_block_key) + + @classmethod + def has_position_embeddings(cls, model): + return hasattr(model.embedding, "position_embeddings") + + @classmethod + def set_embeddings( + cls, + model, + word=None, + pos=None, + ): + cls.set_tensor(model.embedding.word_embeddings.weight, word) + if pos is not None: + cls.set_tensor(model.embedding.position_embeddings.weight, pos) + + @classmethod + def set_final_norm( + cls, + model, + weight=None, + bias=None, + ): + block = cls.get_transformer_block(model) + cls.set_tensor(block.final_layernorm.weight, weight) + if bias is not None: + cls.set_tensor(block.final_layernorm.bias, bias) + + @classmethod + def set_output_word_embeddings( + cls, + model, + emb=None, + ): + cls.set_tensor(model.embedding.word_embeddings.weight, emb) + + @classmethod + def set_output_layer( + cls, + model, + weight=None, + ): + cls.set_tensor(model.output_layer.weight, weight) + + @classmethod + def set_pooler( + cls, + model, + weight=None, + bias=None, + ): + cls.set_tensor(model.pooler.dense.weight, weight) + if bias is not None: + cls.set_tensor(model.pooler.dense.bias, bias) + + @classmethod + def set_lm_head( + cls, + model, + dense_weight=None, + dense_bias=None, + norm_weight=None, + norm_bias=None, + ): + + cls.set_tensor(model.lm_head.dense.weight, dense_weight) + if dense_bias is not None: + cls.set_tensor(model.lm_head.dense.bias, dense_bias) + + cls.set_tensor(model.lm_head.layer_norm.weight, norm_weight) + if norm_bias is not None: + cls.set_tensor(model.lm_head.layer_norm.bias, norm_bias) + + @classmethod + def set_binary_head( + cls, + model, + weight=None, + bias=None, + ): + cls.set_tensor(model.binary_head.weight, weight) + if bias is not None: + cls.set_tensor(model.binary_head.bias, bias) + + +class MCoreLocalSetter(MCoreSetter): + + @classmethod + def set_layer( + cls, + model, + layer_idx, + self_attn_norm_weight=None, + self_attn_norm_bias=None, + self_attn_qkv_weight=None, + self_attn_qkv_bias=None, + self_attn_proj_weight=None, + self_attn_proj_bias=None, + mlp_norm_weight=None, + mlp_norm_bias=None, + mlp_fc1_weight=None, + mlp_fc1_bias=None, + mlp_fc2_weight=None, + mlp_fc2_bias=None, + ): + + block = cls.get_transformer_block(model) + l = block.layers[layer_idx] + + # Self attention. + cls.set_tensor(l.input_layernorm.weight, self_attn_norm_weight) + if self_attn_norm_bias is not None: + cls.set_tensor(l.input_layernorm.bias, self_attn_norm_bias) + + cls.set_tensor(l.self_attention.linear_qkv.weight, self_attn_qkv_weight) + if self_attn_qkv_bias is not None: + cls.set_tensor(l.self_attention.linear_qkv.bias, self_attn_qkv_bias) + + cls.set_tensor(l.self_attention.linear_proj.weight, self_attn_proj_weight) + if self_attn_proj_bias is not None: + cls.set_tensor(l.self_attention.linear_proj.bias, self_attn_proj_bias) + + # MLP. 
+ cls.set_tensor(l.pre_mlp_layernorm.weight, mlp_norm_weight) + if mlp_norm_bias is not None: + cls.set_tensor(l.pre_mlp_layernorm.bias, mlp_norm_bias) + + cls.set_tensor(l.mlp.linear_fc1.weight, mlp_fc1_weight) + if mlp_fc1_bias is not None: + cls.set_tensor(l.mlp.linear_fc1.bias, mlp_fc1_bias) + + cls.set_tensor(l.mlp.linear_fc2.weight, mlp_fc2_weight) + if mlp_fc2_bias is not None: + cls.set_tensor(l.mlp.linear_fc2.bias, mlp_fc2_bias) + + +class MCoreTESetter(MCoreSetter): + + @classmethod + def set_layer( + cls, + model, + layer_idx, + self_attn_norm_weight=None, + self_attn_norm_bias=None, + self_attn_qkv_weight=None, + self_attn_qkv_bias=None, + self_attn_proj_weight=None, + self_attn_proj_bias=None, + mlp_norm_weight=None, + mlp_norm_bias=None, + mlp_fc1_weight=None, + mlp_fc1_bias=None, + mlp_fc2_weight=None, + mlp_fc2_bias=None, + ): + + block = cls.get_transformer_block(model) + l = block.layers[layer_idx] + + # Self attention. + cls.set_tensor(l.self_attention.linear_qkv.layer_norm_weight, self_attn_norm_weight) + if self_attn_norm_bias is not None: + cls.set_tensor(l.self_attention.linear_qkv.layer_norm_bias, self_attn_norm_bias) + + cls.set_tensor(l.self_attention.linear_qkv.weight, self_attn_qkv_weight) + if self_attn_qkv_bias is not None: + cls.set_tensor(l.self_attention.linear_qkv.bias, self_attn_qkv_bias) + + cls.set_tensor(l.self_attention.linear_proj.weight, self_attn_proj_weight) + if self_attn_proj_bias is not None: + cls.set_tensor(l.self_attention.linear_proj.bias, self_attn_proj_bias) + + # MLP. + cls.set_tensor(l.mlp.linear_fc1.layer_norm_weight, mlp_norm_weight) + if mlp_norm_bias is not None: + cls.set_tensor(l.mlp.linear_fc1.layer_norm_bias, mlp_norm_bias) + + cls.set_tensor(l.mlp.linear_fc1.weight, mlp_fc1_weight) + if mlp_fc1_bias is not None: + cls.set_tensor(l.mlp.linear_fc1.bias, mlp_fc1_bias) + + cls.set_tensor(l.mlp.linear_fc2.weight, mlp_fc2_weight) + if mlp_fc2_bias is not None: + cls.set_tensor(l.mlp.linear_fc2.bias, mlp_fc2_bias) + +class MCoreMoETESetter(MCoreSetter): + + @classmethod + def set_layer( + cls, + model, + layer_idx, + router_weight=None, + self_attn_norm_weight=None, + self_attn_norm_bias=None, + self_attn_qkv_weight=None, + self_attn_qkv_bias=None, + self_attn_proj_weight=None, + self_attn_proj_bias=None, + mlp_norm_weight=None, + mlp_norm_bias=None, + mlp_fc1_weight=None, + mlp_fc1_bias=None, + mlp_fc2_weight=None, + mlp_fc2_bias=None, + ): + + block = cls.get_transformer_block(model) + l = block.layers[layer_idx] + + # Self attention. + cls.set_tensor(l.self_attention.linear_qkv.layer_norm_weight, self_attn_norm_weight) + if self_attn_norm_bias is not None: + cls.set_tensor(l.self_attention.linear_qkv.layer_norm_bias, self_attn_norm_bias) + cls.set_tensor(l.self_attention.linear_qkv.weight, self_attn_qkv_weight) + if self_attn_qkv_bias is not None: + cls.set_tensor(l.self_attention.linear_qkv.bias, self_attn_qkv_bias) + cls.set_tensor(l.self_attention.linear_proj.weight, self_attn_proj_weight) + if self_attn_proj_bias is not None: + cls.set_tensor(l.self_attention.linear_proj.bias, self_attn_proj_bias) + + # MLP. 
+ cls.set_tensor(l.pre_mlp_layernorm.weight, mlp_norm_weight) + if model.config.normalization == "LayerNorm": + cls.set_tensor(l.pre_mlp_layernorm.bias, mlp_norm_bias) + + cls.set_tensor(l.mlp.router.weight, router_weight) + + num_local_experts = mlp_fc1_weight.shape[0] + for expert_idx in range(num_local_experts): + cls.set_tensor(l.mlp.experts.local_experts[expert_idx].linear_fc1.weight, mlp_fc1_weight[expert_idx]) + cls.set_tensor(l.mlp.experts.local_experts[expert_idx].linear_fc2.weight, mlp_fc2_weight[expert_idx]) + + +def get_model_setter(model_type, transformer_impl, num_experts=0): + if num_experts is not None and num_experts > 0: + # Only support TE setter for MOE + assert transformer_impl == "transformer_engine" + setter = MCoreMoETESetter + else: + setter = { + "local" : MCoreLocalSetter, + "transformer_engine" : MCoreTESetter, + }[transformer_impl] + setter.transformer_block_key = get_mcore_transformer_block_key(model_type) + return setter + + +def add_arguments(parser): + group = parser.add_argument_group(title='M-Core saver') + + group.add_argument('--megatron-path', type=str, default=None, + help='Base directory of Megatron repository') + + group.add_argument('--target-tensor-parallel-size', type=int, + help='Target tensor model parallel size, defaults to the tensor parallel size ' + 'in the input checkpoint if provided by the loader, otherwise to 1') + group.add_argument('--target-pipeline-parallel-size', type=int, + help='Target pipeline model parallel size, defaults to the pipeline parallel size ' + 'in the input checkpoint if provided by the loader, otherwise to 1') + group.add_argument('--saver-transformer-impl', default='transformer_engine', + choices=['local', 'transformer_engine'], + help='Which Transformer implementation to use.') + group.add_argument('--target-expert-parallel-size', type=int, default=1, + help='Target expert model parallel size, default to 1') + + +def save_checkpoint(queue, args): + + # Transformer engine >= 0.12.0, for CPU initialization. + assert is_te_min_version("0.12.0"), \ + "transformer engine version: %s (>=0.12.0 required)." % get_te_version() + + # Search in directory above this + sys.path.append(os.path.abspath( + os.path.join(os.path.dirname(__file__), + os.path.pardir, + os.path.pardir))) + if args.megatron_path is not None: + sys.path.insert(0, args.megatron_path) + + try: + from megatron.training.arguments import (parse_args, validate_args) + from megatron.training.checkpointing import save_checkpoint + from megatron.training.global_vars import set_global_variables, get_args + from megatron.core.enums import ModelType + from megatron.training.tokenizer.tokenizer import _vocab_size_with_padding + from megatron.legacy import fused_kernels + from megatron.core import mpu + except ModuleNotFoundError: + print("Unable to import Megatron, please specify the path to Megatron using --megatron-path. Exiting.") + exit(1) + + def queue_get(name=None): + val = queue.get() + if val == "exit": + print("Loader exited, exiting saver") + exit(1) + if name is not None and args.checking and val["name"] != name: + val_name = val["name"] + print(f'Unexpected message. Expecting "{name}" but got "{val_name}". Exiting saver.') + exit(1) + if name is not None: + print(f"received {name}") + return val + + def check_message(msg): + if not args.checking: + return + msg_name = msg.pop("name") + if len(msg.keys()) > 0: + print(f"Unexpected values in {msg_name}:") + for key in msg.keys(): + print(f" {key}") + print(f"Exiting. 
If you want to ignore this, use the argument --no-checking.") + exit(1) + + + md = queue_get() + + if args.target_tensor_parallel_size is None: + if hasattr(md, 'previous_tensor_parallel_size'): + args.target_tensor_parallel_size = md.previous_tensor_parallel_size + else: + print("loader did not provide a tensor parallel size and --target-tensor-parallel-size not provided on command line. " + "Default to 1.") + args.target_tensor_parallel_size = 1 + + if args.target_pipeline_parallel_size is None: + if hasattr(md, 'previous_pipeline_parallel_size'): + args.target_pipeline_parallel_size = md.previous_pipeline_parallel_size + else: + print("loader did not provide a pipeline parallel size and --target-pipeline-parallel-size not provided on command line. " + "Default to 1.") + args.target_pipeline_parallel_size = 1 + + + # Arguments do sanity checks on the world size, but we don't care, + # so trick it into thinking we are plenty of processes + if args.target_tensor_parallel_size is not None and args.target_pipeline_parallel_size is not None: + if args.target_expert_parallel_size is not None: + os.environ["WORLD_SIZE"] = f'{args.target_tensor_parallel_size * args.target_pipeline_parallel_size * args.target_expert_parallel_size}' + else: + os.environ["WORLD_SIZE"] = f'{args.target_tensor_parallel_size * args.target_pipeline_parallel_size}' + + # We want all arguments to come from us + sys.argv = ['script.py', + '--num-layers', str(md.num_layers), + '--hidden-size', str(md.hidden_size), + '--seq-length', str(md.seq_length), + '--num-experts', str(getattr(md, "num_experts", 0)), + '--num-attention-heads', str(md.num_attention_heads), + '--max-position-embeddings', str(md.max_position_embeddings), + '--position-embedding-type', str(md.position_embedding_type), + '--tokenizer-type', str(md.tokenizer_type), + '--tensor-model-parallel-size', str(args.target_tensor_parallel_size), + '--pipeline-model-parallel-size', str(args.target_pipeline_parallel_size), + '--expert-model-parallel-size', str(args.target_expert_parallel_size), + '--no-masked-softmax-fusion', + '--no-bias-gelu-fusion', + '--no-bias-dropout-fusion', + '--no-async-tensor-model-parallel-allreduce', + '--use-cpu-initialization', + '--micro-batch-size', '1', + '--no-load-optim', + '--no-load-rng', + '--no-save-optim', + '--no-save-rng', + '--no-initialization', + '--save-interval', '1', + '--save', args.save_dir, + '--ckpt-format', 'torch', # only 'torch' supported for conversion + ] + + if md.make_vocab_size_divisible_by is not None: + sys.argv.extend(['--make-vocab-size-divisible-by', str(md.make_vocab_size_divisible_by)]) + if md.params_dtype == torch.float16: + sys.argv.append('--fp16') + elif md.params_dtype == torch.bfloat16: + sys.argv.append('--bf16') + + if md.output_layer: + sys.argv.append('--untie-embeddings-and-output-weights') + if not md.linear_bias: + sys.argv.append('--disable-bias-linear') + + if md.model_type == 'BERT' and not md.bert_binary_head: + sys.argv.append('--bert-no-binary-head') + + margs = parse_args() + + if hasattr (md, 'checkpoint_args'): + # These are arguments that we are either changing, or cause problems for validation if they are set + # Note that some of these deal with T5 so will need to be changed if we support T5. 
+ args_to_keep = ['tensor_model_parallel_size', 'pipeline_model_parallel_size', 'expert_model_parallel_size', 'world_size', 'params_dtype', + 'num_layers_per_virtual_pipeline_stage', 'virtual_pipeline_model_parallel_size', + 'masked_softmax_fusion', 'bias_gelu_fusion', 'bias_dropout_fusion', + 'sequence_parallel', 'async_tensor_model_parallel_allreduce', + 'no_load_optim', 'no_load_rng', 'no_save_optim', 'no_save_rng', + 'vocab_file', 'tokenizer_model', + 'save_interval', 'save', + 'perform_initialization', 'use_cpu_initialization', + 'recompute_granularity', 'recompute_num_layers', 'recompute_method', + 'encoder_num_layers', 'encoder_seq_length', + 'distribute_saved_activations', + 'train_iters', 'lr_decay_iters', 'lr_warmup_iters', 'lr_warmup_fraction', + 'start_weight_decay', 'end_weight_decay', + 'ckpt_format', + ] + + for arg, value in vars(md.checkpoint_args).items(): + if arg in args_to_keep: + continue + if not hasattr(margs, arg): + print(f"Checkpoint had argument {arg} but the new arguments do not have it.") + continue + if getattr(margs, arg) != value: + print(f"Overwriting default {arg} value {getattr(margs, arg)} with value from checkpoint {value}.") + setattr(margs, arg, value) + + # Explicitly copy sequence_parallel, apply_query_key_layer_scaling. + margs.sequence_parallel = md.checkpoint_args.sequence_parallel + margs.apply_query_key_layer_scaling = md.checkpoint_args.apply_query_key_layer_scaling + + # Sequence parallelism is required when using both tensor parallelism and MoE. + if margs.num_experts is not None and args.target_tensor_parallel_size is not None: + if margs.num_experts > 1 and args.target_tensor_parallel_size > 1: + margs.sequence_parallel = True + + validate_args(margs) + + # Use M-core models & unset loaded paths. + margs.use_legacy_models = False + margs.blendable_index_path = None + margs.data_path = [] + margs.load = None + margs.save = args.save_dir + margs.tensorboard_dir = None + margs.tokenizer_model = None + margs.transformer_impl = args.saver_transformer_impl + + set_global_variables(margs, build_tokenizer=False) + + # Megatron args. 
(i.e., 'margs') + margs = get_args() + + if hasattr(md, 'consumed_train_samples'): + margs.consumed_train_samples = md.consumed_train_samples + margs.consumed_valid_samples = md.consumed_valid_samples + print(f"Setting consumed_train_samples to {margs.consumed_train_samples}" + f" and consumed_valid_samples to {margs.consumed_valid_samples}") + else: + print("consumed_train_samples not provided.") + + # Determine how to make our models + if md.model_type == 'GPT': + from pretrain_gpt import model_provider + margs.model_type = ModelType.encoder_or_decoder + elif md.model_type == 'BERT': + from pretrain_bert import model_provider + margs.model_type = ModelType.encoder_or_decoder + else: + raise Exception(f'unrecognized model type: {args.model_type}') + + # fake initializing distributed + mpu.set_tensor_model_parallel_world_size(args.target_tensor_parallel_size) + mpu.set_pipeline_model_parallel_world_size(args.target_pipeline_parallel_size) + mpu.set_expert_model_parallel_world_size(args.target_expert_parallel_size) + mpu.set_tensor_model_parallel_rank(0) + mpu.set_pipeline_model_parallel_rank(0) + mpu.set_expert_model_parallel_rank(0) + fused_kernels.load(margs) + + # Embeddings + #----------- + embeddings_msg = queue_get("embeddings") + + pos_embed = None + if md.position_embedding_type == 'learned_absolute': + pos_embed = embeddings_msg.pop("position embeddings") + orig_word_embed = embeddings_msg.pop("word embeddings") + check_message(embeddings_msg) + + # Deal with padding + def pad_weight(orig_word_embed, true_vocab_size): + if true_vocab_size is not None: + # figure out what our padded vocab size is + orig_vocab_size = orig_word_embed.shape[0] + margs.padded_vocab_size = _vocab_size_with_padding(true_vocab_size, margs) + + # Cut out extra padding we don't need + if orig_vocab_size > margs.padded_vocab_size: + full_word_embed = orig_word_embed[0:margs.padded_vocab_size,:] + + # Expanding embedding to larger size by replicating final entry + elif orig_vocab_size < margs.padded_vocab_size: + padding_size = margs.padded_vocab_size - orig_vocab_size + + full_word_embed = torch.cat(( + orig_word_embed, + orig_word_embed[-1].unsqueeze(0).expand(padding_size, -1))) + + # Same size! + else: + full_word_embed = orig_word_embed + else: + print("Original vocab size not specified, leaving embedding table as-is. " + "If you've changed the tensor parallel size this could cause problems.") + margs.padded_vocab_size = orig_word_embed.shape[0] + full_word_embed = orig_word_embed + return full_word_embed + + full_word_embed = pad_weight(orig_word_embed, md.true_vocab_size) + + # Split into new tensor model parallel sizes + out_word_embed = torch.chunk(full_word_embed, args.target_tensor_parallel_size, dim=0) + + # Parameter setter class. + setter = get_model_setter(md.model_type, margs.transformer_impl, margs.num_experts) + + # Construct a 3D (PP x EP x TP) array for models, filled with None + models = [[[None for _ in range(args.target_tensor_parallel_size)] for _ in range(args.target_expert_parallel_size)] for _ in range(args.target_pipeline_parallel_size)] + + # Models are lazily instantiated on first use + def get_local_model(pp_rank, ep_rank, tp_rank): + if models[pp_rank][ep_rank][tp_rank] is None: + pre_process = True if pp_rank == 0 else False + post_process = True if pp_rank == args.target_pipeline_parallel_size - 1 else False + models[pp_rank][ep_rank][tp_rank] = model_provider(pre_process, post_process).to(md.params_dtype) + return models[pp_rank][ep_rank][tp_rank] + + # Set embeddings. 
+ # -------------- + for ep_rank in range(args.target_expert_parallel_size): + for tp_rank in range(args.target_tensor_parallel_size): + model = get_local_model(0, ep_rank, tp_rank) + if pos_embed is None: + assert not setter.has_position_embeddings(model) + setter.set_embeddings( + model, + word=out_word_embed[tp_rank], + pos=pos_embed, + ) + + def chunk_weight(weight, parallel_mode, tp_size=1, ep_size=1): + assert parallel_mode in ["row", "column"] + if weight.dim() == 3: + num_experts, out_features, in_features = weight.shape + if parallel_mode == "column": + weight = weight.reshape(ep_size, num_experts // ep_size, tp_size, out_features // tp_size, in_features) + weight = weight.permute(0, 2, 1, 3, 4) + else: + weight = weight.reshape(ep_size, num_experts // ep_size, out_features, tp_size, in_features // tp_size) + weight = weight.permute(0, 3, 1, 2, 4) + return weight # (ep_size, tp_size, local_eps, output_features, in_features) + else: + out_features, in_features = weight.shape + if parallel_mode == "column": + weight = weight.reshape(tp_size, out_features // tp_size, in_features) + else: + weight = weight.reshape(out_features, tp_size, in_features // tp_size).permute(1, 0, 2) + return weight # (tp_size, output_features, in_features) + + def chunk_bias(bias, parallel_mode, tp_size=1, ep_size=1): + assert parallel_mode in ["row", "column"] + if bias.dim() == 2: + num_experts, hidden_size = bias.shape + if parallel_mode == 'column': + bias = bias.reshape(ep_size, num_experts // ep_size, tp_size, hidden_size // tp_size) + bias = bias.permute(0, 2, 1, 3) # (ep_size, tp_size, local_eps, hidden_size) + else: + bias = bias.reshape(ep_size, num_experts // ep_size, hidden_size) # (ep_size, local_eps, hidden_size) + return bias + else: + hidden_size = bias.shape + if parallel_mode == "column": + bias = bias.reshape(tp_size, hidden_size[0] // tp_size) # (tp_size, hidden_size) + return bias + + # Transformer layers. + # ------------------ + total_layer_num = 0 + for pp_rank in range(args.target_pipeline_parallel_size): + # initial the first module in pp stage to get the layer_num, pooler, lm_head. 
binary_head + get_local_model(pp_rank,0,0) + for layer_id in range(len(setter.get_transformer_block(models[pp_rank][0][0]).layers)): + msg = queue_get(f"transformer layer {total_layer_num}") + + # duplicated tensors + input_norm_weight = msg.pop("input norm weight") + post_norm_weight = msg.pop("post norm weight") + if md.norm_has_bias: + input_norm_bias = msg.pop("input norm bias") + post_norm_bias = msg.pop("post norm bias") + + # Split up the parallel tensors + qkv_weight = chunk_weight(msg.pop("qkv weight"), "column", args.target_tensor_parallel_size) + dense_weight = chunk_weight(msg.pop("dense weight"), "row", args.target_tensor_parallel_size) + mlp_l1_weight = chunk_weight(msg.pop("mlp l1 weight"), "row", args.target_tensor_parallel_size, args.target_expert_parallel_size) + + if margs.num_experts: + router = msg.pop("router weight") + + # Special handling for swiglu + if md.swiglu: + mlp_l0_weight_W = chunk_weight(msg.pop("mlp l0 weight W"), "column", args.target_tensor_parallel_size, args.target_expert_parallel_size) + mlp_l0_weight_V = chunk_weight(msg.pop("mlp l0 weight V"), "column", args.target_tensor_parallel_size, args.target_expert_parallel_size) + mlp_l0_weight = torch.cat((mlp_l0_weight_W, mlp_l0_weight_V), dim=-2) + else: + mlp_l0_weight = chunk_weight(msg.pop("mlp l0 weight"), "column", args.target_tensor_parallel_size, args.target_expert_parallel_size) + + if md.qkv_bias: + qkv_bias = chunk_bias(msg.pop("qkv bias"), 'column', args.target_tensor_parallel_size) + if md.linear_bias: + dense_bias = msg.pop("dense bias") + mlp_l1_bias = chunk_bias(msg.pop("mlp l1 bias"), 'row', args.target_tensor_parallel_size, args.target_expert_parallel_size) + if md.swiglu: + mlp_l0_bias_W = chunk_bias(msg.pop("mlp l0 bias W"), 'column', args.target_tensor_parallel_size, args.target_expert_parallel_size) + mlp_l0_bias_V = chunk_bias(msg.pop("mlp l0 bias V"), 'column', args.target_tensor_parallel_size, args.target_expert_parallel_size) + mlp_l0_bias = torch.cat((mlp_l0_bias_W, mlp_l0_bias_V), dim=-1) + else: + mlp_l0_bias = chunk_bias(msg.pop("mlp l0 bias"), 'column', args.target_tensor_parallel_size, args.target_expert_parallel_size) + + # Save them to the model + for ep_rank in range(args.target_expert_parallel_size): + for tp_rank in range(args.target_tensor_parallel_size): + params_dict = { + "self_attn_norm_weight" : input_norm_weight, + "self_attn_qkv_weight" : qkv_weight[tp_rank], + "self_attn_proj_weight" : dense_weight[tp_rank], + "mlp_norm_weight" : post_norm_weight + } + if margs.num_experts: + params_dict.update({ + "mlp_fc1_weight" : mlp_l0_weight[ep_rank][tp_rank], + "mlp_fc2_weight" : mlp_l1_weight[ep_rank][tp_rank] + }) + else: + params_dict.update({ + "mlp_fc1_weight" : mlp_l0_weight[tp_rank], + "mlp_fc2_weight" : mlp_l1_weight[tp_rank] + }) + params_dict.update({ + "self_attn_norm_bias" : input_norm_bias if md.norm_has_bias else None, + "mlp_norm_bias" : post_norm_bias if md.norm_has_bias else None, + }) + if md.qkv_bias: + params_dict.update({ + "self_attn_qkv_bias" : qkv_bias[tp_rank] + }) + if md.linear_bias: + params_dict.update({ + "self_attn_proj_bias" : dense_bias + }) + if margs.num_experts: + params_dict.update({ + "mlp_fc1_bias" : mlp_l0_bias[ep_rank][tp_rank], + "mlp_fc2_bias" : mlp_l1_bias[ep_rank] + }) + else : + params_dict.update({ + "mlp_fc1_bias" : mlp_l0_bias[tp_rank], + "mlp_fc2_bias" : mlp_l1_bias + }) + if margs.num_experts: + params_dict.update({ + "router_weight": router + }) + model = get_local_model(pp_rank, ep_rank, tp_rank) + 
setter.set_layer(model, layer_id, **params_dict) + + total_layer_num = total_layer_num + 1 + check_message(msg) + + + if pp_rank == args.target_pipeline_parallel_size - 1: + msg = queue_get("final norm") + final_norm_weight = msg.pop("weight") + if md.norm_has_bias: + final_norm_bias = msg.pop("bias") + pp_local_models = [get_local_model(pp_rank, ep_rank, tp_rank) for ep_rank in range(args.target_expert_parallel_size) + for tp_rank in range(args.target_tensor_parallel_size)] + for eptp_rank, model in enumerate(pp_local_models): + tp_rank = eptp_rank % args.target_tensor_parallel_size + setter.set_final_norm( + model, + weight=final_norm_weight, + bias=final_norm_bias if md.norm_has_bias else None, + ) + if pp_rank != 0 and not md.output_layer: + # Copy word embeddings to final pipeline rank + setter.set_output_word_embeddings( + model, + emb=out_word_embed[tp_rank], + ) + del final_norm_weight + if md.norm_has_bias: + del final_norm_bias + check_message(msg) + + if md.output_layer: + msg = queue_get("output layer") + if not hasattr(pp_local_models[0], 'output_layer'): + print("ERROR: got an output layer, but model does not have one") + exit(1) + output_layer_weight = pad_weight(msg.pop("weight"), md.true_vocab_size) + output_layer_weight = torch.chunk(output_layer_weight, args.target_tensor_parallel_size, dim=0) + for eptp_rank, model in enumerate(pp_local_models): + tp_rank = eptp_rank % args.target_tensor_parallel_size + setter.set_output_layer(model, output_layer_weight[tp_rank]) + check_message(msg) + + msg = queue_get() + if msg != "done" and msg["name"] == "pooler": + if not hasattr(models[pp_rank][0][0], 'pooler'): + print("ERROR: got a pooler, but model does not have one") + exit(1) + print("received pooler") + pooler_weight = msg.pop("weight") + pooler_bias = msg.pop("bias") + for model in pp_local_models: + setter.set_pooler( + model=model, + weight=pooler_weight, + bias=pooler_bias, + ) + del pooler_weight + del pooler_bias + check_message(msg) + msg = queue_get() + + if msg != "done" and msg["name"] == "lm head": + if not hasattr(models[pp_rank][0][0], 'lm_head'): + print("ERROR: got an lm head, but model does not have one") + exit(1) + print("received lm head") + lm_head_dense_weight = msg.pop("dense weight") + lm_head_dense_bias = msg.pop("dense bias") + lm_head_norm_weight = msg.pop("norm weight") + if md.norm_has_bias: + lm_head_norm_bias = msg.pop("norm bias") + for model in pp_local_models: + setter.set_lm_head( + model=model, + dense_weight=lm_head_dense_weight, + dense_bias=lm_head_dense_bias, + norm_weight=lm_head_norm_weight, + norm_bias=lm_head_norm_bias if md.norm_has_bias else None, + ) + check_message(msg) + msg = queue_get() + + if msg != "done" and msg["name"] == "binary head": + if not hasattr(models[pp_rank][0][0], 'binary_head'): + print("ERROR: got a binary head, but model does not have one") + exit(1) + print("received binary head") + binary_head_weight = msg.pop("weight") + binary_head_bias = msg.pop("bias") + for model in pp_local_models: + setter.set_binary_head( + model=model, + weight=binary_head_weight, + bias=binary_head_bias, + ) + check_message(msg) + msg = queue_get() + + # TODO: delete weight when not used + if msg != "done": + print("ERROR: got some more data but was expecting to be done") + + for ep_rank in range(args.target_expert_parallel_size): + for tp_rank in range(args.target_tensor_parallel_size): + save_checkpoint(md.iteration, [get_local_model(pp_rank, ep_rank, tp_rank)], None, None, num_floating_point_operations_so_far=0, + 
pipeline_rank=pp_rank, pipeline_parallel=args.target_pipeline_parallel_size > 1, + expert_rank=ep_rank, expert_parallel=args.target_expert_parallel_size > 1, + tensor_rank=tp_rank) + # release the uselese model parts + models[pp_rank][ep_rank][tp_rank] = None + + print("Done!") diff --git a/tools/checkpoint_saver_megatron.py b/tools/checkpoint/saver_megatron.py similarity index 81% rename from tools/checkpoint_saver_megatron.py rename to tools/checkpoint/saver_megatron.py index 0ff8c55b1f..b017c9ed97 100644 --- a/tools/checkpoint_saver_megatron.py +++ b/tools/checkpoint/saver_megatron.py @@ -1,11 +1,10 @@ -import argparse -from collections.abc import Mapping -import concurrent.futures +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + import os import sys - import torch + def add_arguments(parser): group = parser.add_argument_group(title='Megatron saver') @@ -14,27 +13,30 @@ def add_arguments(parser): group.add_argument('--target-tensor-parallel-size', type=int, help='Target tensor model parallel size, defaults to the tensor parallel size ' - 'in the input checkpoint if provided by the loader, otherwise to 1') + 'in the input checkpoint if provided by the loader, otherwise to 1') group.add_argument('--target-pipeline-parallel-size', type=int, help='Target tensor model parallel size, default to the pipeline parall size ' 'in the input checkpoint if provided by the loader, otherwise to 1') + group.add_argument('--saver-transformer-impl', default='local', + choices=['local', 'transformer_engine'], + help='Which Transformer implementation to use.') def save_checkpoint(queue, args): - # Search in directory above this sys.path.append(os.path.abspath( os.path.join(os.path.dirname(__file__), + os.path.pardir, os.path.pardir))) if args.megatron_path is not None: sys.path.insert(0, args.megatron_path) try: - from megatron.arguments import (parse_args, validate_args) - from megatron.checkpointing import save_checkpoint - from megatron.global_vars import set_global_variables, get_args + from megatron.training.arguments import (parse_args, validate_args) + from megatron.training.checkpointing import save_checkpoint + from megatron.training.global_vars import set_global_variables, get_args from megatron.core.enums import ModelType - from megatron.tokenizer.tokenizer import _vocab_size_with_padding - from megatron import fused_kernels + from megatron.training.tokenizer.tokenizer import _vocab_size_with_padding + from megatron.legacy import fused_kernels from megatron.core import mpu except ModuleNotFoundError: print("Unable to import Megatron, please specify the path to Megatron using --megatron-path. Exiting.") @@ -64,26 +66,26 @@ def check_message(msg): print(f"Exiting. If you want to ignore this, use the argument --no-checking.") exit(1) - md = queue_get() if args.target_tensor_parallel_size is None: if hasattr(md, 'previous_tensor_parallel_size'): args.target_tensor_parallel_size = md.previous_tensor_parallel_size else: - print("loader did not provide a tensor parallel size and --target-tensor-parallel-size not provided on command line. " - "Default to 1.") + print( + "loader did not provide a tensor parallel size and --target-tensor-parallel-size not provided on command line. 
" + "Default to 1.") args.target_tensor_parallel_size = 1 if args.target_pipeline_parallel_size is None: if hasattr(md, 'previous_pipeline_parallel_size'): args.target_pipeline_parallel_size = md.previous_pipeline_parallel_size else: - print("loader did not provide a pipeline parallel size and --target-pipeline-parallel-size not provided on command line. " - "Default to 1.") + print( + "loader did not provide a pipeline parallel size and --target-pipeline-parallel-size not provided on command line. " + "Default to 1.") args.target_pipeline_parallel_size = 1 - # Arguments do sanity checks on the world size, but we don't care, # so trick it into thinking we are plenty of processes if args.target_tensor_parallel_size is not None and args.target_pipeline_parallel_size is not None: @@ -96,6 +98,7 @@ def check_message(msg): '--seq-length', str(md.seq_length), '--num-attention-heads', str(md.num_attention_heads), '--max-position-embeddings', str(md.max_position_embeddings), + '--position-embedding-type', str(md.position_embedding_type), '--tokenizer-type', str(md.tokenizer_type), '--tensor-model-parallel-size', str(args.target_tensor_parallel_size), '--pipeline-model-parallel-size', str(args.target_pipeline_parallel_size), @@ -111,7 +114,8 @@ def check_message(msg): '--no-save-rng', '--no-initialization', '--save-interval', '1', - '--save', args.save_dir + '--save', args.save_dir, + '--ckpt-format', 'torch', # only 'torch' supported for conversion ] if md.make_vocab_size_divisible_by is not None: @@ -123,8 +127,6 @@ def check_message(msg): if md.output_layer: sys.argv.append('--untie-embeddings-and-output-weights') - if not md.position_embeddings: - sys.argv.append('--no-position-embedding') if not md.linear_bias: sys.argv.append('--disable-bias-linear') @@ -133,11 +135,10 @@ def check_message(msg): margs = parse_args() - - if hasattr (md, 'checkpoint_args'): + if hasattr(md, 'checkpoint_args'): # These are arguments that we are either changing, or cause problems for validation if they are set # Note that some of these deal with T5 so will need to be changed if we support T5. - args_to_keep = ['tensor_model_parallel_size', 'pipeline_model_parallel_size', 'params_dtype', + args_to_keep = ['tensor_model_parallel_size', 'pipeline_model_parallel_size', 'world_size', 'params_dtype', 'num_layers_per_virtual_pipeline_stage', 'virtual_pipeline_model_parallel_size', 'masked_softmax_fusion', 'bias_gelu_fusion', 'bias_dropout_fusion', 'sequence_parallel', 'async_tensor_model_parallel_allreduce', @@ -145,11 +146,13 @@ def check_message(msg): 'vocab_file', 'tokenizer_model', 'save_interval', 'save', 'perform_initialization', 'use_cpu_initialization', + 'recompute_granularity', 'recompute_num_layers', 'recompute_method', 'encoder_num_layers', 'encoder_seq_length', 'distribute_saved_activations', 'train_iters', 'lr_decay_iters', 'lr_warmup_iters', 'lr_warmup_fraction', - 'start_weight_decay', 'end_weight_decay'] - + 'start_weight_decay', 'end_weight_decay', 'bf16', 'fp16', + 'ckpt_format', + ] for arg, value in vars(md.checkpoint_args).items(): if arg in args_to_keep: @@ -163,7 +166,14 @@ def check_message(msg): validate_args(margs) - set_global_variables(margs) + # Use MLM models. 
+ margs.use_legacy_models = True + margs.transformer_impl = args.saver_transformer_impl + + # Do not instantiate Tensorboard + margs.tensorboard_dir = None + + set_global_variables(margs, build_tokenizer=False) # margs = megatron args margs = get_args() @@ -198,10 +208,11 @@ def get_models(count, dtype, pre_process, post_process): fused_kernels.load(margs) # Embeddings - #----------- + # ----------- embeddings_msg = queue_get("embeddings") - if md.position_embeddings: + pos_embed = None + if md.position_embedding_type == 'learned_absolute': pos_embed = embeddings_msg.pop("position embeddings") orig_word_embed = embeddings_msg.pop("word embeddings") check_message(embeddings_msg) @@ -214,7 +225,7 @@ def get_models(count, dtype, pre_process, post_process): # Cut out extra padding we don't need if orig_vocab_size > margs.padded_vocab_size: - full_word_embed = orig_word_embed[0:margs.padded_vocab_size,:] + full_word_embed = orig_word_embed[0:margs.padded_vocab_size, :] # Expanding embedding to larger size by replicating final entry elif orig_vocab_size < margs.padded_vocab_size: @@ -242,13 +253,13 @@ def get_models(count, dtype, pre_process, post_process): models = get_models(args.target_tensor_parallel_size, md.params_dtype, True, post_process) for tp_rank, model in enumerate(models): model.language_model.embedding.word_embeddings.weight.data.copy_(out_word_embed[tp_rank]) - if md.position_embeddings: + if pos_embed is not None: model.language_model.embedding.position_embeddings.weight.data.copy_(pos_embed) else: assert not hasattr(model.language_model.embedding, "position_embeddings") # Transformer layers - #------------------- + # ------------------- total_layer_num = 0 for pp_rank in range(args.target_pipeline_parallel_size): # For later pipeline parallel ranks, make the new models @@ -261,10 +272,12 @@ def get_models(count, dtype, pre_process, post_process): msg = queue_get(f"transformer layer {total_layer_num}") # duplicated tensors - input_layernorm_weight = msg.pop("input layernorm weight") - input_layernorm_bias = msg.pop("input layernorm bias") - post_layernorm_weight = msg.pop("post layernorm weight") - post_layernorm_bias = msg.pop("post layernorm bias") + input_norm_weight = msg.pop("input norm weight") + if md.norm_has_bias: + input_norm_bias = msg.pop("input norm bias") + post_norm_weight = msg.pop("post norm weight") + if md.norm_has_bias: + post_norm_bias = msg.pop("post norm bias") if md.linear_bias: dense_bias = msg.pop("dense bias") mlp_l1_bias = msg.pop("mlp l1 bias") @@ -294,12 +307,14 @@ def get_models(count, dtype, pre_process, post_process): # Save them to the model for tp_rank in range(args.target_tensor_parallel_size): l = models[tp_rank].language_model.encoder.layers[layer] - l.input_layernorm.weight.data.copy_(input_layernorm_weight) - l.input_layernorm.bias.data.copy_(input_layernorm_bias) + l.input_norm.weight.data.copy_(input_norm_weight) + if md.norm_has_bias: + l.input_norm.bias.data.copy_(input_norm_bias) l.self_attention.query_key_value.weight.data.copy_(qkv_weight[tp_rank]) l.self_attention.dense.weight.data.copy_(dense_weight[tp_rank]) - l.post_attention_layernorm.weight.data.copy_(post_layernorm_weight) - l.post_attention_layernorm.bias.data.copy_(post_layernorm_bias) + l.post_attention_norm.weight.data.copy_(post_norm_weight) + if md.norm_has_bias: + l.post_attention_norm.bias.data.copy_(post_norm_bias) l.mlp.dense_h_to_4h.weight.data.copy_(mlp_l0_weight[tp_rank]) l.mlp.dense_4h_to_h.weight.data.copy_(mlp_l1_weight[tp_rank]) if md.linear_bias: @@ -311,19 
+326,21 @@ def get_models(count, dtype, pre_process, post_process): total_layer_num = total_layer_num + 1 check_message(msg) - if post_process: - msg = queue_get("final layernorm") - final_layernorm_weight = msg.pop("weight") - final_layernorm_bias = msg.pop("bias") + msg = queue_get("final norm") + final_norm_weight = msg.pop("weight") + if md.norm_has_bias: + final_norm_bias = msg.pop("bias") for tp_rank in range(args.target_tensor_parallel_size): - models[tp_rank].language_model.encoder.final_layernorm.weight.data.copy_(final_layernorm_weight) - models[tp_rank].language_model.encoder.final_layernorm.bias.data.copy_(final_layernorm_bias) + models[tp_rank].language_model.encoder.final_norm.weight.data.copy_(final_norm_weight) + if md.norm_has_bias: + models[tp_rank].language_model.encoder.final_norm.bias.data.copy_(final_norm_bias) if pp_rank != 0 and not md.output_layer: # Copy word embeddings to final pipeline rank models[tp_rank].word_embeddings.weight.data.copy_(out_word_embed[tp_rank]) - del final_layernorm_weight - del final_layernorm_bias + del final_norm_weight + if md.norm_has_bias: + del final_norm_bias check_message(msg) if md.output_layer: @@ -360,13 +377,15 @@ def get_models(count, dtype, pre_process, post_process): print("received lm head") lm_head_dense_weight = msg.pop("dense weight") lm_head_dense_bias = msg.pop("dense bias") - lm_head_layernorm_weight = msg.pop("layernorm weight") - lm_head_layernorm_bias = msg.pop("layernorm bias") + lm_head_norm_weight = msg.pop("norm weight") + if md.norm_has_bias: + lm_head_norm_bias = msg.pop("norm bias") for tp_rank in range(args.target_tensor_parallel_size): models[tp_rank].lm_head.dense.weight.data.copy_(lm_head_dense_weight) models[tp_rank].lm_head.dense.bias.data.copy_(lm_head_dense_bias) - models[tp_rank].lm_head.layernorm.weight.data.copy_(lm_head_layernorm_weight) - models[tp_rank].lm_head.layernorm.bias.data.copy_(lm_head_layernorm_bias) + models[tp_rank].lm_head.norm.weight.data.copy_(lm_head_norm_weight) + if md.norm_has_bias: + models[tp_rank].lm_head.norm.bias.data.copy_(lm_head_norm_bias) check_message(msg) msg = queue_get() @@ -388,5 +407,6 @@ def get_models(count, dtype, pre_process, post_process): for tp_rank in range(args.target_tensor_parallel_size): mpu.set_tensor_model_parallel_rank(tp_rank) - save_checkpoint(md.iteration, [models[tp_rank]], None, None) + save_checkpoint(md.iteration, [models[tp_rank]], None, None, + num_floating_point_operations_so_far=0) print("Done!") diff --git a/tools/checkpoint/setter.py b/tools/checkpoint/setter.py new file mode 100644 index 0000000000..5e84cff958 --- /dev/null +++ b/tools/checkpoint/setter.py @@ -0,0 +1,113 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + + +class ModelSetter: + '''Model parameter setter. + + See convert.py for a full list of supported parameters and their names. + ''' + + @classmethod + def set_tensor(cls, dst, src): + '''Copy (in-place) src tensor to dst tensor.''' + if src is not None: + dst.data.copy_(src) + + @classmethod + def has_position_embeddings(cls, model): + ''' + Return True if learned parameters exist for position embeddings (e.g., + learned absolute), and False otherwise (e.g., RoPE). 
+ ''' + raise NotImplementedError + + @classmethod + def set_embeddings( + cls, + model, + word=None, + pos=None, + ): + '''Set word and position embeddings.''' + raise NotImplementedError + + @classmethod + def set_output_word_embeddings( + cls, + model, + emb=None, + ): + '''Set output word embeddings for final pipeline stage.''' + raise NotImplementedError + + @classmethod + def set_layer( + cls, + model, + layer_idx, + self_attn_norm_weight=None, + self_attn_norm_bias=None, + self_attn_qkv_weight=None, + self_attn_qkv_bias=None, + self_attn_proj_weight=None, + self_attn_proj_bias=None, + mlp_norm_weight=None, + mlp_norm_bias=None, + mlp_fc1_weight=None, + mlp_fc1_bias=None, + mlp_fc2_weight=None, + mlp_fc2_bias=None, + ): + '''Set layer parameters.''' + raise NotImplementedError + + @classmethod + def set_final_norm( + cls, + model, + weight=None, + bias=None, + ): + '''Set final norm parameters (i.e., after last transformer layer).''' + raise NotImplementedError + + @classmethod + def set_output_layer( + cls, + model, + weight=None, + ): + '''Set output (i.e., 'dense') weights.''' + raise NotImplementedError + + @classmethod + def set_pooler( + cls, + model, + weight=None, + bias=None, + ): + '''Set pooler parameters (e.g., for Bert).''' + raise NotImplementedError + + @classmethod + def set_lm_head( + cls, + model, + dense_weight=None, + dense_bias=None, + norm_weight=None, + norm_bias=None, + ): + '''Set LM head parameters.''' + raise NotImplementedError + + @classmethod + def set_binary_head( + cls, + model, + weight=None, + bias=None, + ): + '''Set binary head parameters.''' + raise NotImplementedError diff --git a/tools/checkpoint/utils.py b/tools/checkpoint/utils.py new file mode 100644 index 0000000000..a604619418 --- /dev/null +++ b/tools/checkpoint/utils.py @@ -0,0 +1,23 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +import psutil + + +def print_memory_usage(key, rank, num_ranks): + '''Print memory usage.''' + process = psutil.Process() + mem_info = process.memory_info() + print("> memory usage: '%s', rank %d / %d, mem %.1f/%.1f gb." % ( + key, + rank, + num_ranks, + mem_info.rss / 1024**3, + 100 * mem_info.rss / process.memory_percent() / 1024**3, + )) + + +def get_mcore_transformer_block_key(model_key): + return { + "GPT" : "decoder", + "BERT" : "encoder", + }[model_key] diff --git a/tools/copyright.sh b/tools/copyright.sh new file mode 100644 index 0000000000..66098f84d2 --- /dev/null +++ b/tools/copyright.sh @@ -0,0 +1,34 @@ +#!/bin/bash + +# Files ending with .py should have Copyright notice in the first line. +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) + +# Move to the project root +cd $SCRIPT_DIR/.. +find_files_with_missing_copyright() { +find ./megatron/ -type f -name '*.py' | while read path; do + echo -en $path"\t" + head -2 $path | grep -iv 'coding=' | head -1 +done \ + | egrep -iv 'Copyright.*NVIDIA CORPORATION.*All rights reserved.' \ + | grep -iv 'BSD 3-Clause License' \ + | grep -iv 'Copyright.*Microsoft' \ + | grep -iv 'Copyright.*The Open AI Team' \ + | grep -iv 'Copyright.*The Google AI' \ + | grep -iv 'Copyright.*Facebook' | while read line; do + echo $line | cut -d' ' -f1 + done +} + + +declare RESULT=($(find_files_with_missing_copyright)) # (..) 
= array + +if [ "${#RESULT[@]}" -gt 0 ]; then + echo "Error: Found files with missing copyright:" + for (( i=0; i<"${#RESULT[@]}"; i++ )); do + echo "path= ${RESULT[$i]}" + done + exit 1; +else + echo "Ok: All files start with copyright notice" +fi diff --git a/tools/merge_datasets.py b/tools/merge_datasets.py index e6e2900168..c615558a94 100644 --- a/tools/merge_datasets.py +++ b/tools/merge_datasets.py @@ -2,13 +2,60 @@ import sys import json import argparse -sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), - os.path.pardir))) -from megatron.data import indexed_dataset +sys.path.append( + os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir)) +) +from megatron.core.datasets.indexed_dataset import ( + IndexedDataset, + IndexedDatasetBuilder, + get_bin_path, + get_idx_path, +) -def main(args): + +def get_args(): + parser = argparse.ArgumentParser() + + group = parser.add_argument_group(title="input data") + group.add_argument( + "--input", + type=str, + required=True, + help="Path to directory containing all document files to merge", + ) + + group = parser.add_argument_group(title="output data") + group.add_argument( + "--output-prefix", + type=str, + required=True, + help="Path to binary output file without suffix", + ) + + group = parser.add_argument_group(title="miscellaneous") + group.add_argument( + "--multimodal", + action="store_true", + help="Whether the datasets are assumed to be multimodal" + ) + + args = parser.parse_args() + + assert os.path.isdir( + args.input + ), f"ERROR: {args.input} is not a directory or does not exist" + + assert os.path.isdir( + os.path.dirname(args.output_prefix) + ), f"ERROR: {os.path.dirname(args.output_prefix)} is not a directory or does not exist" + + return args + + +def main(): + args = get_args() prefixes = set() for basename in os.listdir(args.input): @@ -20,47 +67,27 @@ def main(args): if not os.path.isfile(os.path.join(args.input, basename)): continue - ext_pair = '.bin' if ext == '.idx' else '.idx' - assert os.path.isfile(os.path.join(args.input, prefix) + ext_pair), \ - f'ERROR: {ext_pair} file not provided for {os.path.join(args.input, prefix)}' + ext_pair = ".bin" if ext == ".idx" else ".idx" + assert os.path.isfile( + os.path.join(args.input, prefix) + ext_pair + ), f"ERROR: {ext_pair} file not provided for {os.path.join(args.input, prefix)}" prefixes.add(prefix) builder = None for prefix in sorted(prefixes): if builder is None: - dataset = indexed_dataset.make_dataset(os.path.join(args.input, prefix), 'infer') - - if isinstance(dataset, indexed_dataset.MMapIndexedDataset): - builder = indexed_dataset.MMapIndexedDatasetBuilder(args.output_prefix + '.bin', dtype=dataset._index.dtype) - else: - builder = indexed_dataset.IndexedDatasetBuilder(args.output_prefix + '.bin') - + dataset = IndexedDataset(os.path.join(args.input, prefix), multimodal=args.multimodal) + builder = IndexedDatasetBuilder( + get_bin_path(args.output_prefix), dtype=dataset.index.dtype, multimodal=args.multimodal + ) del dataset - builder.merge_file_(os.path.join(args.input, prefix)) + builder.add_index(os.path.join(args.input, prefix)) - builder.finalize(args.output_prefix + '.idx') + builder.finalize(get_idx_path(args.output_prefix)) if __name__ == '__main__': - parser = argparse.ArgumentParser() - - group = parser.add_argument_group(title='input data') - group.add_argument('--input', type=str, required=True, - help='Path to directory containing all document files to merge') - - group = parser.add_argument_group(title='output 
data') - group.add_argument('--output-prefix', type=str, required=True, - help='Path to binary output file without suffix') - - args = parser.parse_args() - - assert os.path.isdir(args.input), \ - f'ERROR: {args.input} is not a directory or does not exist' - - assert os.path.isdir(os.path.dirname(args.output_prefix)), \ - f'ERROR: {os.path.dirname(args.output_prefix)} is not a directory or does not exist' - - main(args) + main() diff --git a/tools/openwebtext/README.md b/tools/openwebtext/README.md index 7e6f10a0a7..d7707c6d95 100644 --- a/tools/openwebtext/README.md +++ b/tools/openwebtext/README.md @@ -14,7 +14,7 @@ The following steps show how to prepare training dataset to train the mode. 1. Download the deduplicated URLs from [jcpeterson](https://mega.nz/#F!EZZD0YwJ!9_PlEQzdMVLaNdKv_ICNVQ!cc4RgQQZ) 2. Remove blacklisted URLs. ``` -python blacklist_urls.py +python blacklist_urls.py ``` 3. Download the content from the clean urls with [openwebtext's utilities](https://github.com/eukaryote31/openwebtext/blob/master/download.py). @@ -37,7 +37,7 @@ python group_duplicate_urls.py +python remove_group_duplicates.py ``` 5. Shuffle the dataset. diff --git a/tools/openwebtext/blacklist_urls.py b/tools/openwebtext/blacklist_urls.py index bf68840b6d..f54f6617a9 100644 --- a/tools/openwebtext/blacklist_urls.py +++ b/tools/openwebtext/blacklist_urls.py @@ -1,4 +1,6 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +# WARNING! This file contains a blacklist of known malicious sites and thus contains some NSFW language. import glob @@ -47,6 +49,7 @@ 'google', 'gunprime', 'gyazo', + 'horsefucker', 'hotdealstar', 'imagefap', 'imageshack', diff --git a/tools/preprocess_data.py b/tools/preprocess_data.py index 35781a78e7..a81fe8ca7e 100644 --- a/tools/preprocess_data.py +++ b/tools/preprocess_data.py @@ -1,29 +1,33 @@ # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
-"""Processing data for pretraining.""" - +"""Processing large data for pretraining.""" import argparse +import math import json -import multiprocessing import os import sys sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir))) import time - +import gzip +import glob import torch +import numpy as np +import multiprocessing try: import nltk + from nltk.tokenize.punkt import PunktLanguageVars nltk_available = True except ImportError: + PunktLanguageVars = object # Fallback to the built-in object class nltk_available = False -from megatron.tokenizer import build_tokenizer -from megatron.data import indexed_dataset +from megatron.training.tokenizer import build_tokenizer +from megatron.core.datasets import indexed_dataset # https://stackoverflow.com/questions/33139531/preserve-empty-lines-with-nltks-punkt-tokenizer -class CustomLanguageVars(nltk.tokenize.punkt.PunktLanguageVars): +class CustomLanguageVars(PunktLanguageVars): _period_context_fmt = r""" \S* # some word material @@ -39,6 +43,7 @@ class IdentitySplitter(object): def tokenize(self, *text): return text + class Encoder(object): def __init__(self, args): self.args = args @@ -50,34 +55,136 @@ def initializer(self): if not nltk_available: print("NLTK is not available to split sentences.") exit() - library = "tokenizers/punkt/{}.pickle".format(self.args.lang) - print("loading: " + library) - splitter = nltk.load(library) + if os.environ.get("NLTK_DATA"): + library = os.path.join(os.environ.get("NLTK_DATA"), "tokenizers", "punkt", f"{self.args.lang}.pickle") + url = f"file:{library}" + else: + library = os.path.join("tokenizers", "punkt", f"{self.args.lang}.pickle") + url = f"nltk:{library}" + splitter = nltk.load(url) if self.args.keep_newlines: # this prevents punkt from eating newlines after sentences Encoder.splitter = nltk.tokenize.punkt.PunktSentenceTokenizer( - train_text=splitter._params, - lang_vars=CustomLanguageVars()) + train_text = splitter._params, + lang_vars = CustomLanguageVars()) else: Encoder.splitter = splitter else: Encoder.splitter = IdentitySplitter() + def split(self, json_line): + data = json.loads(json_line) + output = {} + for key in self.args.json_keys: + text = data[key] + max_len = 1000000 + tokens_list = [Encoder.splitter.tokenize(text[i:i+max_len]) for i in range(0, len(text), max_len)] + output[key] = [tokens for partial in tokens_list for tokens in partial] + return json.dumps(output), len(json_line) + def encode(self, json_line): data = json.loads(json_line) ids = {} + lens = {} for key in self.args.json_keys: text = data[key] + if isinstance(text, list): + sentences = text + else: + sentences = [text] doc_ids = [] - for sentence in Encoder.splitter.tokenize(text): + sentence_lens = [] + for sentence in sentences: sentence_ids = Encoder.tokenizer.tokenize(sentence) if len(sentence_ids) > 0: - doc_ids.append(sentence_ids) + doc_ids.extend(sentence_ids) + sentence_lens.append(len(sentence_ids)) if len(doc_ids) > 0 and self.args.append_eod: - doc_ids[-1].append(Encoder.tokenizer.eod) + doc_ids.append(Encoder.tokenizer.eod) + sentence_lens[-1] += 1 ids[key] = doc_ids - return ids, len(json_line) + lens[key] = sentence_lens + return ids, lens, len(json_line) + + +class Partition(object): + def __init__(self, args, workers): + self.args = args + self.workers = workers + + def print_processing_stats(self, count, proc_start, total_bytes_processed): + if count % self.args.log_interval == 0: + current = time.time() + elapsed = current - proc_start + mbs = 
total_bytes_processed/elapsed/1024/1024 + print(f"Processed {count} documents", + f"({count/elapsed} docs/s, {mbs} MB/s).", + file=sys.stderr) + + def split_sentences(self, file_name): + input_file_name, output_file_name = file_name + print("Opening", input_file_name) + fin = open(input_file_name, 'r', encoding='utf-8') + fout = open(output_file_name, 'w') + + encoder = Encoder(self.args) + pool = multiprocessing.Pool(self.workers, initializer=encoder.initializer) + split_docs = pool.imap(encoder.split, fin, 32) + + proc_start = time.time() + total_bytes_processed = 0 + for i, (doc, bytes_processed) in enumerate(split_docs, start=1): + total_bytes_processed += bytes_processed + fout.write(doc + "\n") + self.print_processing_stats(i, proc_start, total_bytes_processed) + + fin.close() + fout.close() + + + def process_json_file(self, file_name): + input_file_name, output_prefix = file_name + print("Opening", input_file_name) + fin = open(input_file_name, 'r', encoding='utf-8') + + startup_start = time.time() + encoder = Encoder(self.args) + tokenizer = build_tokenizer(self.args) + pool = multiprocessing.Pool(self.workers, initializer=encoder.initializer) + encoded_docs = pool.imap(encoder.encode, fin, 32) + + level = "document" + if self.args.split_sentences: + level = "sentence" + + output_bin_files = {} + output_idx_files = {} + builders = {} + + for key in self.args.json_keys: + output_bin_files[key] = "{}_{}_{}.bin".format(output_prefix, + key, level) + output_idx_files[key] = "{}_{}_{}.idx".format(output_prefix, + key, level) + builders[key] = indexed_dataset.IndexedDatasetBuilder( + output_bin_files[key], + dtype=indexed_dataset.DType.optimal_dtype(tokenizer.vocab_size), + ) + + startup_end = time.time() + proc_start = time.time() + total_bytes_processed = 0 + print("Time to startup:", startup_end - startup_start) + for i, (doc, sentence_lens, bytes_processed) in enumerate(encoded_docs, start=1): + total_bytes_processed += bytes_processed + for key in doc.keys(): + builders[key].add_document(doc[key], sentence_lens[key]) + self.print_processing_stats(i, proc_start, total_bytes_processed) + + fin.close() + builders[key].finalize(output_idx_files[key]) + def get_args(): parser = argparse.ArgumentParser() @@ -94,109 +201,211 @@ def get_args(): group = parser.add_argument_group(title='tokenizer') group.add_argument('--tokenizer-type', type=str, required=True, choices=['BertWordPieceLowerCase','BertWordPieceCase', - 'GPT2BPETokenizer', 'SentencePieceTokenizer', - 'GPTSentencePieceTokenizer', 'NullTokenizer'], + 'GPT2BPETokenizer', 'SentencePieceTokenizer', + 'GPTSentencePieceTokenizer', 'Llama2Tokenizer', + 'Llama3Tokenizer', 'MistralTokenizer', 'NullTokenizer'], help='What type of tokenizer to use.') + group.add_argument('--tokenizer-model', type=str, default=None, + help='YTTM tokenizer model.') group.add_argument('--vocab-file', type=str, default=None, help='Path to the vocab file') + group.add_argument('--vocab-size', default=786, + help='size of vocab for use with NullTokenizer') group.add_argument('--merge-file', type=str, default=None, help='Path to the BPE merge file (if necessary).') group.add_argument('--append-eod', action='store_true', help='Append an token to the end of a document.') group.add_argument('--lang', type=str, default='english', help='Language to use for NLTK-powered sentence splitting.') - group.add_argument('--tokenizer-model', type=str, default=None, - help='sentencepeice tokenizer model.') - group.add_argument('--vocab-size', default=786, - help='size of vocab 
for use with NullTokenizer') - - group = parser.add_argument_group(title='output data') group.add_argument('--output-prefix', type=str, required=True, help='Path to binary output file without suffix') - group.add_argument('--dataset-impl', type=str, default='mmap', - choices=['lazy', 'cached', 'mmap']) group = parser.add_argument_group(title='runtime') group.add_argument('--workers', type=int, required=True, - help='Number of worker processes to launch') - group.add_argument('--chunk-size', type=int, required=True, - help='Chunk size assigned to each worker process') - group.add_argument('--log-interval', type=int, default=100, + help=('Number of worker processes to launch.' + 'A good default for fast pre-processing ' + 'is: (workers * partitions) = available CPU cores.')) + group.add_argument('--partitions', type=int, default=1, + help='Number of file partitions') + group.add_argument('--log-interval', type=int, default=1000, help='Interval between progress updates') + group.add_argument('--keep-sequential-samples', action='store_true', + help='Ensure ordering of samples in .jsonl files is ' + 'preserved when using partitions>1.') args = parser.parse_args() args.keep_empty = False - if args.tokenizer_type.lower().startswith('bert'): - if not args.split_sentences: - print("Bert tokenizer detected, are you sure you don't want to split sentences?") + if args.tokenizer_type.lower().startswith('bert') and not args.split_sentences: + print("Are you sure you don't want to split sentences?") # some default/dummy values for the tokenizer - args.rank = 0 + args.rank = 1 args.make_vocab_size_divisible_by = 128 args.tensor_model_parallel_size = 1 args.vocab_extra_ids = 0 return args + +def get_file_name(args, file_id): + file_name, extension = os.path.splitext(args.input) + input_file_name = file_name + "_" + str(file_id) + extension + sentence_split_file = file_name + "_ss_" + str(file_id) + extension + output_prefix = args.output_prefix + "_" + str(file_id) + file_names = { + 'partition': input_file_name, + 'sentence_split': sentence_split_file, + 'output_prefix': output_prefix} + return file_names + + +def check_files_exist(in_ss_out_names, key, num_partitions): + for i in range(num_partitions): + if not os.path.exists(in_ss_out_names[i][key]): + return False + return True + + def main(): args = get_args() - startup_start = time.time() - print("Opening", args.input) - fin = open(args.input, 'r', encoding='utf-8') + if args.split_sentences: + if nltk_available: + nltk.download("punkt", quiet=True, download_dir=os.environ.get("NLTK_DATA")) + else: + raise Exception( + "nltk library required for sentence splitting is not available.") + + in_ss_out_names = [] + if args.partitions == 1: + file_name, extension = os.path.splitext(args.input) + sentence_split_file = file_name + "_ss" + extension + file_names = { + 'partition': args.input, + 'sentence_split': sentence_split_file, + 'output_prefix': args.output_prefix} + in_ss_out_names.append(file_names) + else: + in_file_names = glob.glob(args.input) - if nltk_available and args.split_sentences: - nltk.download("punkt", quiet=True) + # Count total number of lines across .jsonl files + if args.keep_sequential_samples: + total_sample_count = 0 + for filename in in_file_names: + with open(filename, "r") as fin: + for fc, _ in enumerate(fin): + pass + total_sample_count += (fc + 1) + partition_size = math.ceil(total_sample_count / args.partitions) - encoder = Encoder(args) - tokenizer = build_tokenizer(args) - pool = multiprocessing.Pool(args.workers, 
initializer=encoder.initializer) - encoded_docs = pool.imap(encoder.encode, fin, args.chunk_size) - #encoded_docs = map(encoder.encode, fin) + # create .jsonl parition files + for idx in range(args.partitions): + in_ss_out_name = get_file_name(args, idx) + in_ss_out_names.append(in_ss_out_name) + + # check to see if paritions were already created + partitions_present = check_files_exist(in_ss_out_names, 'partition', args.partitions) + + # check to see if paritions with split sentences already created + split_sentences_present = check_files_exist(in_ss_out_names, 'sentence_split', args.partitions) + + if not partitions_present and not split_sentences_present: + # populate .jsonl partition files from parent files + partitioned_input_files = [] + for idx in range(args.partitions): + partitioned_input_file = open(in_ss_out_names[idx]['partition'], 'w') + partitioned_input_files.append(partitioned_input_file) + + index = 0 + if args.keep_sequential_samples: line_count = 0 + for in_file_name in in_file_names: + # support for gzip files + if in_file_name.endswith(".gz"): + fin = gzip.open(in_file_name, 'rt') + else: + fin = open(in_file_name, 'r', encoding='utf-8') + + for line in fin: + partitioned_input_files[index].write(line) + if args.keep_sequential_samples: + line_count += 1 + if line_count % partition_size == 0: + index += 1 + else: + index = (index + 1)%args.partitions + + fin.close() + + for idx in range(args.partitions): + partitioned_input_files[idx].close() + assert args.workers % args.partitions == 0 + partition = Partition(args, args.workers//args.partitions) + + # check to see if paritions with split sentences already created + split_sentences_present = check_files_exist(in_ss_out_names, 'sentence_split', args.partitions) + + # split sentences in partition files + if args.split_sentences and not split_sentences_present: + processes = [] + for name in in_ss_out_names: + p = multiprocessing.Process(target=partition.split_sentences, + args=((name['partition'], name['sentence_split']),)) + p.start() + processes.append(p) + + for p in processes: + p.join() + + if args.partitions == 1: + return + + + # encode partition files in parallel + processes = [] + input_key = 'sentence_split' if args.split_sentences else 'partition' + for name in in_ss_out_names: + p = multiprocessing.Process(target=partition.process_json_file, + args=((name[input_key], name['output_prefix']),)) + p.start() + processes.append(p) + + for p in processes: + p.join() + + if args.partitions == 1: + return + + # merge bin/idx partitions level = "document" if args.split_sentences: level = "sentence" - print(f"Vocab size: {tokenizer.vocab_size}") - print(f"Output prefix: {args.output_prefix}") output_bin_files = {} output_idx_files = {} builders = {} + tokenizer = build_tokenizer(args) + for key in args.json_keys: output_bin_files[key] = "{}_{}_{}.bin".format(args.output_prefix, key, level) output_idx_files[key] = "{}_{}_{}.idx".format(args.output_prefix, key, level) - builders[key] = indexed_dataset.make_builder(output_bin_files[key], - impl=args.dataset_impl, - vocab_size=tokenizer.vocab_size) - - startup_end = time.time() - proc_start = time.time() - total_bytes_processed = 0 - print("Time to startup:", startup_end - startup_start) - - for i, (doc, bytes_processed) in enumerate(encoded_docs, start=1): - total_bytes_processed += bytes_processed - for key, sentences in doc.items(): - if len(sentences) == 0: - continue - for sentence in sentences: - builders[key].add_item(torch.IntTensor(sentence)) - 
builders[key].end_document() - if i % args.log_interval == 0: - current = time.time() - elapsed = current - proc_start - mbs = total_bytes_processed/elapsed/1024/1024 - print(f"Processed {i} documents", - f"({i/elapsed} docs/s, {mbs} MB/s).", - file=sys.stderr) - print("Done! Now finalizing.") + builders[key] = indexed_dataset.IndexedDatasetBuilder( + output_bin_files[key], + dtype=indexed_dataset.DType.optimal_dtype(tokenizer.vocab_size), + ) - for key in args.json_keys: + for name in in_ss_out_names: + parition_output_prefix = name['output_prefix'] + full_partition_output_prefix = "{}_{}_{}".format(parition_output_prefix, + key, level) + builders[key].add_index(full_partition_output_prefix) builders[key].finalize(output_idx_files[key]) + if __name__ == '__main__': + main() + diff --git a/tools/preprocess_data_nmt.py b/tools/preprocess_data_nmt.py index 2505c1e16d..13a04f6ee2 100644 --- a/tools/preprocess_data_nmt.py +++ b/tools/preprocess_data_nmt.py @@ -11,8 +11,8 @@ os.path.pardir))) import time import torch -from megatron.tokenizer import build_tokenizer -from megatron.data import indexed_dataset +from megatron.training.tokenizer import build_tokenizer +from megatron.core.datasets import indexed_dataset class Encoder(object): @@ -49,8 +49,6 @@ def get_args(): group = parser.add_argument_group(title='output data') group.add_argument('--output-prefix', type=str, required=True, help='Path to binary output file without suffix') - group.add_argument('--dataset-impl', type=str, default='mmap', - choices=['lazy', 'cached', 'mmap']) group = parser.add_argument_group(title='runtime') group.add_argument('--workers', type=int, default=1, @@ -84,9 +82,9 @@ def main(): print(f"Output prefix: {args.output_prefix}") output_bin_file = "{}.bin".format(args.output_prefix) output_idx_file = "{}.idx".format(args.output_prefix) - builder = indexed_dataset.make_builder(output_bin_file, - impl=args.dataset_impl, - vocab_size=tokenizer.vocab_size) + builder = indexed_dataset.IndexedDatasetBuilder( + output_bin_file, dtype=indexed_dataset.DType.optimal_dtype(tokenizer.vocab_size) + ) startup_end = time.time() proc_start = time.time() diff --git a/tools/preprocess_data_partitions.py b/tools/preprocess_data_partitions.py deleted file mode 100644 index 306ad3e4cd..0000000000 --- a/tools/preprocess_data_partitions.py +++ /dev/null @@ -1,373 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
- -"""Processing large data for pretraining.""" -import argparse -import math -import json -import os -import sys -sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), - os.path.pardir))) -import time -import gzip -import glob -import torch -import numpy as np -import multiprocessing -try: - import nltk - nltk_available = True -except ImportError: - nltk_available = False - -from megatron.tokenizer import build_tokenizer -from megatron.data import indexed_dataset - - -# https://stackoverflow.com/questions/33139531/preserve-empty-lines-with-nltks-punkt-tokenizer -class CustomLanguageVars(nltk.tokenize.punkt.PunktLanguageVars): - - _period_context_fmt = r""" - \S* # some word material - %(SentEndChars)s # a potential sentence ending - \s* # <-- THIS is what I changed - (?=(?P - %(NonWord)s # either other punctuation - | - (?P\S+) # <-- Normally you would have \s+ here - ))""" - -class IdentitySplitter(object): - def tokenize(self, *text): - return text - - -class Encoder(object): - def __init__(self, args): - self.args = args - - def initializer(self): - # Use Encoder class as a container for global data - Encoder.tokenizer = build_tokenizer(self.args) - if self.args.split_sentences: - if not nltk_available: - print("NLTK is not available to split sentences.") - exit() - library = "tokenizers/punkt/{}.pickle".format(self.args.lang) - splitter = nltk.load(library) - if self.args.keep_newlines: - # this prevents punkt from eating newlines after sentences - Encoder.splitter = nltk.tokenize.punkt.PunktSentenceTokenizer( - train_text = splitter._params, - lang_vars = CustomLanguageVars()) - else: - Encoder.splitter = splitter - - else: - Encoder.splitter = IdentitySplitter() - - def split(self, json_line): - data = json.loads(json_line) - output = {} - for key in self.args.json_keys: - text = data[key] - max_len = 1000000 - tokens_list = [Encoder.splitter.tokenize(text[i:i+max_len]) for i in range(0, len(text), max_len)] - output[key] = [tokens for partial in tokens_list for tokens in partial] - return json.dumps(output), len(json_line) - - def encode(self, json_line): - data = json.loads(json_line) - ids = {} - lens = {} - for key in self.args.json_keys: - text = data[key] - if isinstance(text, list): - sentences = text - else: - sentences = [text] - doc_ids = [] - sentence_lens = [] - for sentence in sentences: - sentence_ids = Encoder.tokenizer.tokenize(sentence) - if len(sentence_ids) > 0: - doc_ids.extend(sentence_ids) - sentence_lens.append(len(sentence_ids)) - if len(doc_ids) > 0 and self.args.append_eod: - doc_ids.append(Encoder.tokenizer.eod) - ids[key] = doc_ids - lens[key] = sentence_lens - return ids, lens, len(json_line) - - -class Partition(object): - def __init__(self, args, workers): - self.args = args - self.workers = workers - - def print_processing_stats(self, count, proc_start, total_bytes_processed): - if count % self.args.log_interval == 0: - current = time.time() - elapsed = current - proc_start - mbs = total_bytes_processed/elapsed/1024/1024 - print(f"Processed {count} documents", - f"({count/elapsed} docs/s, {mbs} MB/s).", - file=sys.stderr) - - def split_sentences(self, file_name): - input_file_name, output_file_name = file_name - print("Opening", input_file_name) - fin = open(input_file_name, 'r', encoding='utf-8') - fout = open(output_file_name, 'w') - - encoder = Encoder(self.args) - pool = multiprocessing.Pool(self.workers, initializer=encoder.initializer) - split_docs = pool.imap(encoder.split, fin, 32) - - proc_start = time.time() - 
total_bytes_processed = 0 - for i, (doc, bytes_processed) in enumerate(split_docs, start=1): - total_bytes_processed += bytes_processed - fout.write(doc + "\n") - self.print_processing_stats(i, proc_start, total_bytes_processed) - - fin.close() - fout.close() - - - def process_json_file(self, file_name): - input_file_name, output_prefix = file_name - print("Opening", input_file_name) - fin = open(input_file_name, 'r', encoding='utf-8') - - startup_start = time.time() - encoder = Encoder(self.args) - tokenizer = build_tokenizer(self.args) - pool = multiprocessing.Pool(self.workers, initializer=encoder.initializer) - encoded_docs = pool.imap(encoder.encode, fin, 32) - - level = "document" - if self.args.split_sentences: - level = "sentence" - - output_bin_files = {} - output_idx_files = {} - builders = {} - - for key in self.args.json_keys: - output_bin_files[key] = "{}_{}_{}.bin".format(output_prefix, - key, level) - output_idx_files[key] = "{}_{}_{}.idx".format(output_prefix, - key, level) - builders[key] = indexed_dataset.make_builder(output_bin_files[key], - impl=self.args.dataset_impl, - vocab_size=tokenizer.vocab_size) - - startup_end = time.time() - proc_start = time.time() - total_bytes_processed = 0 - print("Time to startup:", startup_end - startup_start) - for i, (doc, sentence_lens, bytes_processed) in enumerate(encoded_docs, start=1): - total_bytes_processed += bytes_processed - for key in doc.keys(): - builders[key].add_doc(doc[key], sentence_lens[key]) - self.print_processing_stats(i, proc_start, total_bytes_processed) - - fin.close() - builders[key].finalize(output_idx_files[key]) - - -def get_args(): - parser = argparse.ArgumentParser() - group = parser.add_argument_group(title='input data') - group.add_argument('--input', type=str, required=True, - help='Path to input JSON') - group.add_argument('--json-keys', nargs='+', default=['text'], - help='space separate listed of keys to extract from json') - group.add_argument('--split-sentences', action='store_true', - help='Split documents into sentences.') - group.add_argument('--keep-newlines', action='store_true', - help='Keep newlines between sentences when splitting.') - - group = parser.add_argument_group(title='tokenizer') - group.add_argument('--tokenizer-type', type=str, required=True, - choices=['BertWordPieceLowerCase','BertWordPieceCase', - 'GPT2BPETokenizer', 'SentencePieceTokenizer', 'GPTSentencePieceTokenizer'], - help='What type of tokenizer to use.') - group.add_argument('--tokenizer-model', type=str, default=None, - help='YTTM tokenizer model.') - group.add_argument('--vocab-file', type=str, default=None, - help='Path to the vocab file') - group.add_argument('--merge-file', type=str, default=None, - help='Path to the BPE merge file (if necessary).') - group.add_argument('--append-eod', action='store_true', - help='Append an token to the end of a document.') - group.add_argument('--lang', type=str, default='english', - help='Language to use for NLTK-powered sentence splitting.') - group = parser.add_argument_group(title='output data') - group.add_argument('--output-prefix', type=str, required=True, - help='Path to binary output file without suffix') - group.add_argument('--dataset-impl', type=str, default='mmap', - choices=['lazy', 'cached', 'mmap']) - - group = parser.add_argument_group(title='runtime') - group.add_argument('--workers', type=int, default=1, - help='Number of worker processes to launch') - group.add_argument('--partitions', type=int, default=1, - help='Number of file partitions') - 
group.add_argument('--log-interval', type=int, default=1000, - help='Interval between progress updates') - args = parser.parse_args() - args.keep_empty = False - - if args.tokenizer_type.lower().startswith('bert') and not args.split_sentences: - print("Are you sure you don't want to split sentences?") - - # some default/dummy values for the tokenizer - args.rank = 1 - args.make_vocab_size_divisible_by = 128 - args.tensor_model_parallel_size = 1 - args.vocab_extra_ids = 0 - - return args - - -def get_file_name(args, file_id): - file_name, extension = os.path.splitext(args.input) - input_file_name = file_name + "_" + str(file_id) + extension - sentence_split_file = file_name + "_ss_" + str(file_id) + extension - output_prefix = args.output_prefix + "_" + str(file_id) - file_names = { - 'partition': input_file_name, - 'sentence_split': sentence_split_file, - 'output_prefix': output_prefix} - return file_names - - -def check_files_exist(in_ss_out_names, key, num_partitions): - for i in range(num_partitions): - if not os.path.exists(in_ss_out_names[i][key]): - return False - return True - - -def main(): - args = get_args() - - if args.split_sentences: - if nltk_available: - nltk.download("punkt", quiet=True) - else: - raise Exception( - "nltk library required for sentence splitting is not available.") - - in_ss_out_names = [] - if args.partitions == 1: - file_name, extension = os.path.splitext(args.input) - sentence_split_file = file_name + "_ss" + extension - file_names = { - 'partition': args.input, - 'sentence_split': sentence_split_file, - 'output_prefix': args.output_prefix} - in_ss_out_names.append(file_names) - else: - in_file_names = glob.glob(args.input) - - # create .jsonl parition files - for idx in range(args.partitions): - in_ss_out_name = get_file_name(args, idx) - in_ss_out_names.append(in_ss_out_name) - - # check to see if paritions were already created - partitions_present = check_files_exist(in_ss_out_names, 'partition', args.partitions) - - # check to see if paritions with split sentences already created - split_sentences_present = check_files_exist(in_ss_out_names, 'sentence_split', args.partitions) - - if not partitions_present and not split_sentences_present: - # populate .jsonl partition files from parent files - partitioned_input_files = [] - for idx in range(args.partitions): - partitioned_input_file = open(in_ss_out_names[idx]['partition'], 'w') - partitioned_input_files.append(partitioned_input_file) - - index = 0 - for in_file_name in in_file_names: - # support for gzip files - if in_file_name.endswith(".gz"): - fin = gzip.open(in_file_name, 'rt') - else: - fin = open(in_file_name, 'r', encoding='utf-8') - - for line in fin: - partitioned_input_files[index].write(line) - index = (index + 1)%args.partitions - - fin.close() - - for idx in range(args.partitions): - partitioned_input_files[idx].close() - - assert args.workers % args.partitions == 0 - partition = Partition(args, args.workers//args.partitions) - - # check to see if paritions with split sentences already created - split_sentences_present = check_files_exist(in_ss_out_names, 'sentence_split', args.partitions) - - # split sentences in partition files - if args.split_sentences and not split_sentences_present: - processes = [] - for name in in_ss_out_names: - p = multiprocessing.Process(target=partition.split_sentences, - args=((name['partition'], name['sentence_split']),)) - p.start() - processes.append(p) - - for p in processes: - p.join() - - if args.partitions == 1: - return - - - # encode partition files 
in parallel - processes = [] - input_key = 'sentence_split' if args.split_sentences else 'partition' - for name in in_ss_out_names: - p = multiprocessing.Process(target=partition.process_json_file, - args=((name[input_key], name['output_prefix']),)) - p.start() - processes.append(p) - - for p in processes: - p.join() - - # merge bin/idx partitions - level = "document" - if args.split_sentences: - level = "sentence" - - output_bin_files = {} - output_idx_files = {} - builders = {} - tokenizer = build_tokenizer(args) - - for key in args.json_keys: - output_bin_files[key] = "{}_{}_{}.bin".format(args.output_prefix, - key, level) - output_idx_files[key] = "{}_{}_{}.idx".format(args.output_prefix, - key, level) - builders[key] = indexed_dataset.make_builder(output_bin_files[key], - impl=args.dataset_impl, - vocab_size=tokenizer.vocab_size) - for name in in_ss_out_names: - parition_output_prefix = name['output_prefix'] - full_partition_output_prefix = "{}_{}_{}".format(parition_output_prefix, - key, level) - builders[key].merge_file_(full_partition_output_prefix) - builders[key].finalize(output_idx_files[key]) - - -if __name__ == '__main__': - main() - diff --git a/tools/preprocess_mmdata.py b/tools/preprocess_mmdata.py new file mode 100755 index 0000000000..8ab2c2b867 --- /dev/null +++ b/tools/preprocess_mmdata.py @@ -0,0 +1,169 @@ +# coding=utf-8 +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +"""Processing text modality data for MultiModal pretraining.""" + +import argparse +import json +import multiprocessing +import os +import sys +import numpy as np +from torchvision.transforms import ToTensor +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), + os.path.pardir))) +import time + +import torch +try: + from nltk.tokenize.punkt import PunktLanguageVars +except ImportError: + PunktLanguageVars = object # Fallback to the built-in object class + +from megatron.training.tokenizer import build_tokenizer +from megatron.core.datasets.indexed_dataset import IndexedDatasetBuilder + + +# https://stackoverflow.com/questions/33139531/preserve-empty-lines-with-nltks-punkt-tokenizer +class CustomLanguageVars(PunktLanguageVars): + + _period_context_fmt = r""" + \S* # some word material + %(SentEndChars)s # a potential sentence ending + \s* # <-- THIS is what I changed + (?=(?P + %(NonWord)s # either other punctuation + | + (?P\S+) # <-- Normally you would have \s+ here + ))""" + +class IdentitySplitter(object): + def tokenize(self, *text): + return text + +class Encoder(object): + def __init__(self, args): + self.args = args + + def initializer(self): + # Use Encoder class as a container for global data + Encoder.tokenizer = build_tokenizer(self.args) + + def encode(self, input_pair): + json_line, img_path = input_pair + data = json.loads(json_line) + key = "text" + text = data[key] + sentence_ids = Encoder.tokenizer.tokenize(text) + pad_len = self.args.pad_length + if len(sentence_ids) > 0 and self.args.append_eod: + sentence_ids = sentence_ids[:pad_len] + current_length = len(sentence_ids) + sentence_ids.extend([Encoder.tokenizer.eod for _ in range(max(0,pad_len-current_length))]) + + with open(img_path, "rb") as tf: + xs = bytearray(tf.read()) + img_pad = (4 - len(xs) % 4) % 4 + xs.extend([0 for _ in range(img_pad)]) + img_raw = np.frombuffer(xs, dtype=np.int32) + img_raw = np.insert(img_raw, 0, img_pad) + + return sentence_ids, img_raw, len(json_line) + +def get_args(): + parser = argparse.ArgumentParser() + group = parser.add_argument_group(title='input data') + 
group.add_argument('--input', type=str, required=True, + help='Path to input JSON') + group.add_argument('--input-image', type=str, required=True, + help='Path to input image folder') + + group.add_argument('--pad-length', type=int, required=True, + help='Pad length of preprocessed text') + + group.add_argument('--split-sentences', action='store_true', + help='Split documents into sentences.') + group.add_argument('--keep-newlines', action='store_true', + help='Keep newlines between sentences when splitting.') + + group = parser.add_argument_group(title='tokenizer') + group.add_argument('--tokenizer-type', type=str, required=True, + choices=['BertWordPieceLowerCase','BertWordPieceCase', + 'GPT2BPETokenizer', 'SentencePieceTokenizer', 'GPTSentencePieceTokenizer'], + help='What type of tokenizer to use.') + group.add_argument('--vocab-file', type=str, default=None, + help='Path to the vocab file') + group.add_argument('--merge-file', type=str, default=None, + help='Path to the BPE merge file (if necessary).') + group.add_argument('--append-eod', action='store_true', + help='Append an token to the end of a document.') + group.add_argument('--lang', type=str, default='english', + help='Language to use for NLTK-powered sentence splitting.') + group.add_argument('--tokenizer-model', type=str, default=None, + help='sentencepeice tokenizer model.') + + group = parser.add_argument_group(title='output data') + group.add_argument('--output-prefix', type=str, required=True, + help='Path to binary output file without suffix') + group = parser.add_argument_group(title='runtime') + group.add_argument('--workers', type=int, default=1, + help='Number of worker processes to launch') + group.add_argument('--log-interval', type=int, default=100, + help='Interval between progress updates') + args = parser.parse_args() + args.keep_empty = False + + # some default/dummy values for the tokenizer + args.rank = 0 + args.make_vocab_size_divisible_by = 128 + args.tensor_model_parallel_size = 1 + args.vocab_extra_ids = 0 + + return args + +def main(): + args = get_args() + startup_start = time.time() + + encoder = Encoder(args) + tokenizer = build_tokenizer(args) + pool = multiprocessing.Pool(args.workers, initializer=encoder.initializer) + + fin = open(args.input, 'r', encoding='utf-8') + img_paths = [os.path.join(args.input_image, basename) for basename in os.listdir(args.input_image)] + + encoded_docs = pool.imap(encoder.encode, zip(fin, img_paths), 25) + + print(f"Vocab size: {tokenizer.vocab_size}") + print(f"Output prefix: {args.output_prefix}") + + output_bin_files = "{}.bin".format(args.output_prefix) + output_idx_files = "{}.idx".format(args.output_prefix) + + builders = IndexedDatasetBuilder(output_bin_files, dtype=np.int32, multimodal=True) + + startup_end = time.time() + proc_start = time.time() + total_bytes_processed = 0 + + print("Time to startup:", startup_end - startup_start) + + for i, (sentence, img_raw, bytes_processed) in enumerate(encoded_docs, start=1): + total_bytes_processed += bytes_processed + builders.add_item(torch.IntTensor(sentence)) + builders.add_item(torch.from_numpy(img_raw), 1) + builders.end_document() + if i % args.log_interval == 0: + current = time.time() + elapsed = current - proc_start + mbs = total_bytes_processed/elapsed/1024/1024 + print(f"Processed {i} documents", + f"({i/elapsed} docs/s, {mbs} MB/s).", + file=sys.stderr) + + builders.finalize(output_idx_files) + + +if __name__ == '__main__': + main() + diff --git a/tools/report_theoretical_memory.py 
b/tools/report_theoretical_memory.py new file mode 100644 index 0000000000..79b483dd5d --- /dev/null +++ b/tools/report_theoretical_memory.py @@ -0,0 +1,14 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +"""Computes theoretical memory footprint for model training without instantiating +a model and running training iterations on GPU(s).""" + +from megatron.training import get_args +from megatron.training.initialize import initialize_megatron +from megatron.training.theoretical_memory_usage import report_theoretical_memory + +if __name__ == "__main__": + initialize_megatron(allow_no_cuda=True, skip_mpu_initialization=True) + args = get_args() + + report_theoretical_memory(args, verbose=True) diff --git a/tools/retro/README.md b/tools/retro/README.md index 54c6854098..395005e73b 100644 --- a/tools/retro/README.md +++ b/tools/retro/README.md @@ -1,226 +1,256 @@ -This directory contains a collection of tools for building the retrieval database and pretraining neighbors for Retro. This preprocessing pipeline is broken into 3 main stages: +# Retro and InstructRetro + +Retro [(Borgeaud et al., 2022)](https://arxiv.org/abs/2112.04426) is an autoregressive decoder-only language model (LM) +pretrained with retrieval-augmentation. +Retro features practical scalability to support large-scale pretraining from scratch by retrieving from trillions of +tokens. +Pretraining with retrieval provides a more efficient storage mechanism of factual knowledge, when compared to storing +factual knowledge implicitly within the network's parameters, thus largely reducing model parameters while achieving +lower perplexity than standard GPT. +Retro also provides the flexibility to update the +knowledge stored in LMs [(Wang et al., 2023a)](https://arxiv.org/abs/2304.06762) +by updating the retrieval database without training LMs again. + +InstructRetro [(Wang et al., 2023b)](https://arxiv.org/abs/2310.07713) further scales up the size of Retro to 48B, +featuring the largest LLM pretrained with retrieval (as of December 2023). +The obtained foundation model, Retro 48B, largely outperforms the GPT counterpart in terms of perplexity. +With instruction tuning on Retro, InstructRetro demonstrates significant improvement over the instruction tuned GPT on +downstream tasks in the zero-shot setting. Specifically, the average improvement of InstructRetro is 7% over its GPT +counterpart across 8 short-form QA tasks, 10% over GPT across 4 challenging long-form QA tasks, and 16% over GPT across +3 summarization tasks. We also find that one can ablate the encoder from InstructRetro architecture and directly use the +InstructRetro decoder backbone as GPT, while achieving comparable results. + +This README provides an end-to-end tutorial to reproduce Retro and InstructRetro. -1. **Build retrieval chunk database** : Used for retrieving neighbors and continuation chunks, which are then passed through the retrieval encoder. -2. **Build index for similarity search** : Train and build a search index for querying chunk neighbors. -3. **Query pretraining neighbors** : For matching pretraining samples to database chunks. Neighbors are generated separately for training, validation, and test datasets. - -The following overview goes into more detail on the pipeline, code structure, usage, and pretraining. - - # Contents - * [Quick start](#quick-start) - * [Stages](#stages) - * [Code structure](#code-structure) - * [Arguments](#arguments) - - - -# Quick start - -See `examples/get_preprocess_cmd.sh` for example arguments. 
- -Key files: - -- `main.py` : Entry point. -- `examples/get_preprocess_cmd.sh` : Build preprocessing command (for `main.py`). -- `examples/preprocess_data.sh` : Run preprocessing (calls `get_preprocess_cmd.sh`, `main.py`). - -Use `--retro-tasks` to move through the preprocessing pipeline. - -- Simplest setup (builds everything): `--retro-tasks build` -- Alternatively, for tuning compute resources, run stages independently: - - Build retrieval database: `--retro-tasks db-build` - - Build search index: `--retro-tasks index-build` - - Query neighbors: `--retro-tasks pretraining-query-neighbors` - -Sample code flow: - -- `main.py` : Entry point (e.g., using `--retro-tasks X`). -- `db/build.py` : Build retrieval database. -- `index/build.py` : Build search index. Calls the following two files: - - `index/train.py` : Train index on subset of database. - - `index/add.py` : Add database chunks to index. -- `pretraining/query.py` : Query pretraining samples for database neighbors (saved to disk and used during pretraining). - - -# Stages - -### Build retrieval chunk database - -This *database* (stored as a 2-D array, NOT a relational database) consists of a list of chunks (traditionally length 64) extracted from the original GPT token dataset. This is simply a consecutive, non-overlapping chunking of the token dataset. Chunking only takes place within a document, and therefore the final chunk of each document has length: 1 <= chunk_length <= max_chunk_length. - -We discard chunks that would convert to an empty Bert sequence (rare case, happens ~1/100,000 chunks in our case), since we use Bert embeddings for building our index. Thus, the total number of chunks in the database will be slightly less than a naive calculation. - -### Build index for similarity search +* [Checkpoints](#checkpoints) +* [End-to-end Reproduction Guide](#end-to-end-reproduction-guide) + * [Step 0: Prepare the environment](#step-0-prepare-the-environment) + * [Docker image](#docker-image) + * [Install dependencies](#install-dependencies) + * [Step 1: Build retrieval database](#step-1-build-retrieval-database) + * [Step 2: Pretraining](#step-2-pretraining) + * [Step 3: Perplexity evaluation](#step-3-perplexity-evaluation) + * [Step 4: Instruction tuning](#step-4-instruction-tuning) + * [Step 5: Downstream task evaluation](#step-5-downstream-task-evaluation) +* [Citations](#citations) -To match pretraining chunks to database chunks, a search index must be built to perform this querying. We use Faiss (https://github.com/facebookresearch/faiss) for training and building this index. Generally, the index is trained on a subset of all chunks in the database (specified via `--retro-nchunks-sampled`). After training, all chunks are added into the index, to be available during querying. +# Checkpoints -Indexes only accept 1-D floating point vectors for training and adding, so each chunk must first be embedded before passing to the index for either training or adding. We use Bert embeddings for this purpose, and the embeddings are generated automatically within the pipeline. +We provide the pretrained checkpoints of Retro and InstructRetro in the following table. 
The checkpoints are available +to download through the following links: -### Query pretraining neighbors +| Model | Size | Instruction Tuning | Download Link 1 | Download Link 2 | Download Link 3 | +|-------------------------|------|--------------------|--------------------------------------------------------------------|--------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------| +| `retro-8b-base-4k` | 8b | | [Huggingface](https://huggingface.co/nvidia/retro-8b-base-4k) | [NGC](https://catalog.ngc.nvidia.com/orgs/nvidia/models/retro-8b-base-4k) | [Google Drive](https://drive.google.com/drive/folders/1uSQ5DAsuvx_8XcbtnVfs_MGvEOcx0uK_?usp=sharing) | +| `retro-8b-instruct-4k` | 8b | ✅ | [Huggingface](https://huggingface.co/nvidia/retro-8b-instruct-4k) | [NGC](https://catalog.ngc.nvidia.com/orgs/nvidia/models/retro-8b-instruct-4k) | [Google Drive](https://drive.google.com/drive/folders/1v5dKaSN0cm2lwyAWpFaJtlTrLhtMZXsI?usp=sharing) | +| `retro-48b-base-4k` | 48b | | [Huggingface](https://huggingface.co/nvidia/retro-48b-base-4k) | [NGC](https://catalog.ngc.nvidia.com/orgs/nvidia/models/retro-48b-base-4k) | [Google Drive](https://drive.google.com/drive/folders/1rtNpf0CiLElSHQcr3aLI3zgfI3teGTP5?usp=sharing) | +| `retro-48b-instruct-4k` | 48b | ✅ | [Huggingface](https://huggingface.co/nvidia/retro-48b-instruct-4k) | [NGC](https://catalog.ngc.nvidia.com/orgs/nvidia/models/retro-48b-instruct-4k) | [Google Drive](https://drive.google.com/drive/folders/1qdb0AQjSsAPGlWaIu3wgHPjf_nwLeY5h?usp=sharing) | -To ensure fast Retro pretraining, the database neighbors for pretraining samples are pre-computed and saved to disk, for efficient access within the Retro dataset. In this stage, the pretraining datasets (training, validation, and test) are iterated, each sample is broken into chunks, and the chunks are used for querying the index. Similar to when building the index, each chunk is embedded (via Bert) before querying the index. +# End-to-end Reproduction Guide -The saved neighbors are labeled with unique dataset properties (i.e., seed, sequence length, number of samples, etc.) to ensure the neighbors generated during preprocessing match the neighbors requested during pretraining. +In this README, we provide an end-to-end reproduction guide for InstructRetro, covering from large-scale retrieval +construction, pretraining, perplexity evaluation, instruction tuning, to downstream task evaluation. - -# Code structure +If you are interested in evaluation only, we also [open-sourced our checkpoints](#checkpoints) and you can directly go +to [Step 5](#step-5-downstream-task-evaluation) to evaluate the checkpoints on downstream tasks. -### `tools/retro/main.py` +## Step 0: Prepare the environment -This is the main entry point for Retro preprocessing. Call `main.py --help` to see arguments. Additionally, some Retro arguments are in Megatron's core arguments, so also see `add_retro_args()` section of `megatron/arguments.py` for additional arguments. Two of the most important arguments to customize are `--retro-workdir` and `--retro-tasks`. +We recommend using docker environment to run the code. -- **`--retro-workdir`** : Set the directory in which the preprocessing pipeline saves its datasets and configuration files. This argument should remain consistent for a full pass through the pipeline, and for pretraining. 
+### Docker image -- **`--retro-tasks`** : Set the stages of preprocessing to perform. As mentioned previously, the three high-level stages are: 1) build retrieval database, 2) build search index, and 3) query pretraining neighbors. `--retro-tasks` can be used to either run the full pipeline, or run each of these stages in isolation. The latter case is useful for tuning compute resources for each stage. For example, index training utilizes GPUs and requires relatively less time, while querying neighbors uses the CPU and is a relatively slow process. Example tasks include: +We provide a docker build file in [tools/retro/examples/Dockerfile](examples/Dockerfile) for the reproduction. The +docker image is based on the [NGC docker](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/pytorch/tags) `nvcr.io/nvidia/pytorch:23.09-py3`. - - **`--retro-tasks build`** : Run entire preprocessing pipeline. - - **`--retro-tasks db-build`** : Build retrieval database. - - **`--retro-tasks index-build`** : Train and build search index. - - **`--retro-tasks pretraining-query-neighbors`** : Query pretraining neighbors. +### Install dependencies -Multiple tasks can be specified by separating with commas (e.g., `--retro-tasks db-build,index-build`). Additionally, various 'miscellaneous' tasks are currently including, primarily for validating data for each stage; these task names can be seen in `main.py`. +Clone the Megatron repo: -### `tools/retro/examples` - -Example scripts for setting arguments and launch Retro preprocessing. The key files here are: - -- **`get_preprocess_cmd.sh`** : Sets up arguments and command for preprocessing. **Important note**: this script assumes a few environment variables are already set before it is called. Please see the `Environment vars.` section at the top of this file. Generally, environment variables must be set to determine the location of Retro workdirs, input datasets, and GPT and Bert model information. -- **`preprocess_data.sh`** : Calls `get_preprocess_cmd.sh` to get arguments, and then calls `main.py` to launch preprocessing. -- **`pretrain_model.sh`** : Example script for pretraining on Wikipedia data, after preprocessing is complete. - -### `tools/retro/db` +```bash +git clone --branch InstructRetro https://github.com/NVIDIA/Megatron-LM.git +``` -Build the retrieval chunk database. The key files here are: +If docker is not available, we recommend starting from a clean conda environment with the following runtime +dependencies: + +- Python 3.10 +- NVIDIA CUDA® 12.2.1 +- NVIDIA cuBLAS 12.2.5.6 +- NVIDIA cuDNN 8.9.5 +- NVIDIA NCCL 2.18.5 +- PyTorch 2.1.0a0+32f93b1 + +Then install Retro-specific dependencies, including: + +```bash +pip install -U faiss-gpu +pip install -U transformers +pip install -U sentencepiece +pip install -U h5py +pip install -U nltk +pip install -U einops +``` -- **`build.py`** : Entry point for building the database. This code is responsible for iterating the input datasets (i.e., `--data-path`), parsing each dataset into consecutive chunks, checking for empty Bert (Wordpiece) conversions, and storing this information to disk. Two databases are created: 1) the retrieval database, and 2) a sampled database used for training the search index. -- **`dataset.py`** : Defines database class, for iterating or accessing chunks in the database. Each chunk contains its tokens, Bert conversion length, and dataset index. 
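Before moving on to Step 1, it can be useful to confirm that the Retro-specific dependencies listed above import cleanly inside the container or conda environment. The following optional sanity check is a sketch added for this tutorial (it is not part of the official setup scripts); the package names are taken from the pip commands above:

```python
# Optional sanity check: verify the Retro-specific dependencies import cleanly.
# Package list taken from the pip commands above; torch comes from the base image.
import importlib

for name in ("torch", "faiss", "transformers", "sentencepiece", "h5py", "nltk", "einops"):
    module = importlib.import_module(name)
    print(f"{name}: {getattr(module, '__version__', 'unknown version')}")
```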
+## Step 1: Build retrieval database -Input data: +In this step, we build a large-scale retrieval database for InstructRetro +through [Faiss](https://github.com/facebookresearch/faiss) to retrieve from trillions of tokens, and preprocess (and +save) the retrieval neighbors for the pretraining step. - -- Token datasets, as loaded by `gpt_dataset.py`. Multiple datasets can be specified by using a blended configuration (see `--data-path` in `megatron/arguments.py`). +Please refer to [tools/retro/build_db.md](build_db.md) for more details. -Output data: +## Step 2: Pretraining -- **`/db/merged/train.hdf5`** : The main retrieval database. (*Database* here is used to denote a list of indexed chunks, rather than a *relational database*.) The chunks in this database are added to the search index, and are used for retrieval during pretraining. This file contains a single dataset `'chunks'`, which contains 5 columns: +*Please strictly follow Step 1 to build the retrieval database before pretraining to make sure the preprocessed +retrieval neighbors match the pretraining corpus.* - - `dataset_idx` : Dataset index, from list of blended indexed datasets. - - `document_idx` : Document index within dataset. - - `chunk_start_idx` : Chunk's starting token index within document. - - `chunk_end_idx` : Chunk's ending token index (exclusive) within document. - - `bert_chunk_length` : Length of Bert token sequence, after converting from GPT. +In the pretraining step, we support both pretraining from scratch and continued pretraining from a pretrained GPT model. -- **`/db/merged/sampled.hdf5`** : Subset of training database that is used for training the search index. This file has the same structure as detailed above. In general, this database is significanly smaller than the `train.hdf5` database, since the search index only needs a relatively small number of samples to understand the data's structure. After training, all chunks in the main database (`train.hdf5`) are *added* to the search index. +We provide a template pretraining script to pretrain 843M Retro from scratch. Prepare your own arguments and update our +templates in [tools/retro/examples/pretrain_model.sh](examples/pretrain_model.sh). Please note that the data path should +be exactly matching the one used in Step 1 to make sure the preprocessed retrieval neighbors match the pretraining +corpus. -### `tools/retro/index` +[//]: # (Take the example of the Wikipedia corpus) -Build the search index. The key files here are: +```bash +bash tools/retro/examples/pretrain_model.sh +``` -- `build.py` : Entry point for building the search index. First, the index is trained on the sampled chunk database (see above) by calling `train.py`, and then all chunks for the full database are added to the index by calling `add.py`. Note that training requires first embedding (using Bert) all chunks (a parallel operation), and then loading these embeddings and training the index (a sequential operation), so it's best to change one's compute setup after all chunks have been embedded and saved to disk. -- `indexes/faiss_base.py` : Wrapper class for building a Faiss index, following the standard `train()` and `add()` operations. -- `indexes/faiss_par_add.py` : Similar to above, except it uses an embarrassingly parallel (multi-node, multi-process) `add()` operation. Vectors are first added to separate index copies, and then merged together. +After pretraining, the model checkpoints will be saved in the `--save` directory if you specified the arg +in `pretrain_model.sh`. 
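+
+For orientation, below is a hedged sketch (not a complete launch command) of the Retro-specific options discussed in this guide that are set on top of the usual GPT pretraining arguments; the paths are placeholders and `pretrain_model.sh` remains the authoritative reference:
+
+```bash
+# Hypothetical excerpt: Retro-specific options appended to the standard GPT
+# pretraining arguments (all paths are placeholders).
+RETRO_WORKDIR=/path/to/retro/workdir   # must match the workdir used in Step 1
+
+RETRO_ARGS=" \
+    --retro-add-retriever \
+    --retro-workdir ${RETRO_WORKDIR} \
+    --retro-num-neighbors 2 \
+    --retro-num-retrieved-chunks 2 \
+"
+```
+
+The usual `--save` and `--load` arguments control where checkpoints are written and read, as in standard Megatron pretraining.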
-Input data: +To continue pretraining with retrieval from a pretrained GPT model, please specify `--load` in `pretrain_model.sh` to +load the pretrained GPT model checkpoint (the architecture of GPT, including hidden size, number of layers, and +activation methods, should be exactly the same as the one used for Retro). You should also +specify `--no-load-optim --finetune` to make sure the optimizer state is not loaded from the pretrained GPT model and +the continued pretraining with retrieval is from a clean start. After the first job / the first run, you will continue +pretraining with retrieval from your last checkpoint. In the follow-up jobs, you should launch the pretraining without +the flags `--no-load-optim --finetune` to make sure the optimizer state is correctly loaded from your last job. -- **`/db/merged/sampled.hdf5`** : Chunks used for training the search index. -- **`/db/merged/train.hdf5`** : Chunks used for adding to the *trained* search index. +## Step 3: Perplexity evaluation -Output data: +During pretraining, we will automatically evaluate the model perplexity on the specified validation corpus +every `--eval-interval` steps. The validation corpus should be exactly the same as the one used in Step 1 to make sure +the preprocessed retrieval neighbors match the pretraining corpus. -- **`/index///added.faissindex`** : The final index, which has been trained and has had all database chunks added to it. This index is ready for querying neighbors. Here, `RETRO_INDEX_TYPE` and `RETRO_INDEX_STR` correspond to the same-name arguments `--retro-index-type` (e.g., `faiss-par-add`) and `--retro-index-str` (e.g., `OPQ32_256,IVF4194304_HNSW32,PQ32`). -- **`/index///empty.faissindex`** : Generally can be discarded once `added.faissindex` has been built, but this file contains the *post-training*, *pre-adding* index. Useful for debugging or building other indexes. +To evaluate the perplexity of a pretrained model, please add `--skip-train` in `pretrain_model.sh` to skip the +pretraining step and only evaluate the perplexity of the model specified in `--load` on the validation corpus. Run the +above command again to evaluate the perplexity of a pretrained model: -### `tools/retro/pretraining` +```bash +bash tools/retro/examples/pretrain_model.sh +``` -Query the pretraining datasets (training, validation, test) for their neighbors within the database. Neighbors are queried during preprocessing -- rather than during pretraining -- because querying is a fairly slow operation, so it would be a bottleneck if performed during pretraining. Queried neighbors are tagged with their unique identifying information (e.g., `train_indexmap_27662746ns_2048sl_1234s`), so as to avoid incorrect references during pretraining. The key files here are: +## Step 4: Instruction tuning -- **`query.py`** : Entry point for querying. The pretraining datasets are iterated, and each chunk within each sample is queried using the search index. These neighbors are filtered by discarding any database chunks that fall within the same document as any chunk within a pretraining sample. -- **`chunk_dataset.py`** : This creates an iterable 'chunk' dataset form of a pretraining dataset. This is just a light wrapper, but makes it easier to deterministically iterate and assign IDs to each chunk in a sample dataset. -- **`retro_dataset.py`** : The Retro dataset used for pretraining (not used in preprocessing). Each sample returns the sample tokens, along with neighbor tokens for each chunk within the sample. 
+In this step, we fine-tune the pretrained model on the downstream task with instructions. We provide a template +instruction tuning script to fine-tune 843M Retro. -Input data: +We also provide an open-source blend of instruction tuning datasets. The dataset is available to download +through [here](https://drive.google.com/file/d/1nzKwwYf8lYb9gN3P4YO8pFNU_B2nMYe1/view?usp=sharing). The blendable +dataset consists of the following open-source instruction tuning datasets: -- Token datasets, as loaded by `gpt_dataset.py`. -- **`/index///added.faissindex`** : The trained index, with all database chunks added to it (see previous section for details). +### Instruction Tuning Dataset Breakdown -Output data: +| Dataset | Samples | Epochs | Sampling Prob | +|------------------------------------------------------------|--------:|-------:|--------------:| +| [soda](https://arxiv.org/abs/2212.10465) | 2560 | 0.005 | 0.020 | +| [eli5](https://arxiv.org/abs/1907.09190) | 2561 | 0.055 | 0.020 | +| [self_instruct_short](https://arxiv.org/abs/2212.10560) | 1280 | 0.043 | 0.010 | +| [self_instruct_long](https://arxiv.org/abs/2212.10560) | 2560 | 0.333 | 0.020 | +| [unnatural-instructions](https://arxiv.org/abs/2212.09689) | 2560 | 0.024 | 0.020 | +| [flan_cot](https://arxiv.org/abs/2210.11416) | 1280 | 0.093 | 0.010 | +| [dolly](https://arxiv.org/abs/2305.13735) | 6400 | 0.938 | 0.050 | +| [oasst-skip-noncode](https://open-assistant.io/) | 104558 | 1.839 | 0.817 | +| [oasst-skip-code](https://open-assistant.io/) | 4243 | 1.839 | 0.033 | -- **`/{train,valid,test}_XXns_YYsl_ZZs/WW.hdf5`** : These directories/files contain the indexes of neighbors for each chunk within each sample of the pretraining datasets. Each directory (e.g., `train_indexmap_2047435ns_2048sl_1234s`) contains a list of HDF5 files (e.g., one file might be called `0075700000-0075800000.hdf5`). Each HDF5 file contains a consecutive subset of neighbor IDs for a given chunk, for indexing into the main retrieval database. All HDF5 files taken together within a given directory, represent the entire set of neighbors for a dataset. The size of these HDF5 files is determined by the argument `--retro-block-size`. The `XX`, `YY`, `ZZ`, `WW` notation above denotes the dataset properties that are used for uniquely tagging the neighbor files, to ensure compatibility during model pretraining. These neighbor files are ultimated used by `retro_dataset.py` during pretraining, for building Retro samples. +Refer to the paper links above for more details about each instruction tuning dataset. -### `tools/retro/cli` +*We note that the provided instruction tuning dataset is all from open-source instruction tuning datasets. It is +slightly different from what we use in [InstructRetro](https://arxiv.org/abs/2310.07713), which contains private and +proprietary datasets. Thus a 1-2% accuracy difference in downstream tasks may be expected.* -Inspect preprocessed data. To use the CLI, open a Python terminal via the `python` command, and then load a Retro workdir with the following: +### Instruction tuning script -``` -from tools.retro.cli import retro -retro.init("/path/to/retro/workdir") -``` +Download +the [blended instruction tuning dataset](https://drive.google.com/file/d/1nzKwwYf8lYb9gN3P4YO8pFNU_B2nMYe1/view?usp=sharing) +in your data home directory `$DATA_HOME` and update our templates +in [tools/retro/sft/sft_retro_lm.sh](sft/sft_retro_lm.sh). -This initializes Megatron, and prepares the Retro data for inspection. See the printed usage for available functions. 
Several routines are included for viewing data in the retrieval database and viewing pretraining samples and neighbors. For example: +An example command to run instruction tuning on 843M Retro is as follows: -```python -retro.get_db_num_indexed_datasets() # 15 -retro.get_db_chunk_text(92874113) # 'research project at ... and philosophy' -retro.get_pt_sample('train', 62005) # '[16084, 26158, 25387 ..., 6898, 9568]' +```bash + [blend-dataset-name] [model-size] [batch-size] [lr] [checkpoints] +bash tools/retro/sft/sft_retro_lm.sh open_inst 843m 128 5e-6 ``` -Most methods within the CLI are prefixed to denote the data being inspected: - -- **'db'** : Retrieval database (i.e., chunk tokens, document IDs, and dataset IDs) -- **'pt'** : Pretraining datasets (i.e., sample tokens and neighbor tokens) - -### `tools/retro/utils.py` +The `blend_dataset_name` argument will blend all the datasets within the `$DATA_HOME` following the weights and +configurations specified in the `${blend_dataset_name}.sh` ([open_inst.sh](sft/open_inst.sh) in the example above). +The checkpoints will be saved in the `--save` directory. For example, it will be saved to +`/checkpoints/applications/retro-sft_pp1_same_format_ctx1_843m_128_5e-6`. -A collection of utility methods. Most importantly, this contains: +## Step 5: Downstream task evaluation -- **`def get_gpt_tokenizer()`** : Get the GPT tokenizer. -- **`def get_bert_tokenizer()`** : Get the Bert tokenizer. -- **`class GPTToTextDataset`** : Wrapper class that converts GPT (BPE) samples to raw text. +In this step, we demonstrate how to run InstructRetro for zero-shot evaluation on downstream question answering (QA) +tasks. We provide the pre-processed open-source evaluation datasets with a unified format for different tasks. The +evaluation datasets used in our paper are available to download +through [here](https://drive.google.com/drive/folders/1xw-N0LJR_lIWnH6BKzHIb49quVCS_V72?usp=sharing). Please stick to +the same retro workdir used in Step 0-4 to make sure the preprocessed retrieval neighbors match the pretraining corpus. +If you directly come to Step 5, an example retro workdir with `args.json` for 800M Retro is +provided [here](https://drive.google.com/file/d/121GqAdMvf8bJEBZRt-SD4uhW-SRWgI3s/view?usp=sharing). Note that the args +in the json can be overwritten through the command line. -### `tools/bert_embedding` +We present an example command to run retro generation given the InstructRetro checkpoints and the Natural Question (NQ) +task. The example command is for the 843m InstructRetro obtained in Step 4. Please specify the directory for the NQ +dataset and update the command accordingly for other checkpoints. -Generate Bert embeddings. The main files here are: +```bash +bash tools/retro/text_generation/retro_generate.sh nq 843m greedy test 0 20000 1000 5 pp1 /checkpoints/applications/retro-sft_pp1_same_format_ctx1_843m_128_5e-6 2 +``` -- **`embed.py`** : Entry point for generating embeddings, and contains the two main embedding classes, `BertEmbedder` and `DiskDataParallelBertEmbedder` (more below). This file contains code for generating Megatron embeddings, while the file below contains code for Huggingface embeddings. -- **`huggingface.py`** : Used by `embed.py` when the embedder is configured (see below) to output Huggingface embeddings. -- **`dataset.py`** : Wrapper class for converting a raw-text dataset to Bert (Wordpiece) tokens. +The generated responses will be saved in the corresponding checkpoint directory. 
For example, for the 843m +InstructRetro, it will be saved to +`/checkpoints/applications/retro-sft_pp1_same_format_ctx1_843m_128_5e-6/retro-generate-nq_5_2_843m_test_greedy_0_20000_1000.txt`. -The Bert embeddings can be configured along two axes. The first axis is the output type: +To evaluate the F1 / Exact Match (EM) scores of the generated responses, we provide an example script to run the +evaluation on the NQ dataset. Please specify the directory for the NQ dataset and update the command accordingly for +other checkpoints and downstream tasks. -- **`class BertEmbedder`** : This class takes a raw-text dataset as input, generates its embeddings, and returns a Numpy array. The main functions are `embed_text_dataset` (accepts a raw-text dataset) and `embed_text` (accepts a string). -- **`class DiskDataParallelBertEmbedder`** : This class wraps `BertEmbedder`, and rather than returning a Numpy array, it saves the embeddings to disk. Additionally, this class automatically splits data across data parallel ranks (using interleaving), and also processes data in a specified `block_size` (e.g., 1,000,000). +```bash +python3 tools/retro/text_generation/evaluate.py +``` -The second axis is the type of embedding model to use, controlled by the argument `--bert-embedder-type`: +# Citations -- **`--bert-embedder-type megatron`** : Use Megatron's Bert model. The specific model used is dependent on the loaded checkpoint, vocab file, and tokenizer. -- **`--bert-embedder-type huggingface`** : Use Huggingface's `bert-large-cased`. (*Note*: Huggingface's inclusion is likely to be deprecated; and there is no ability to configure cased/uncased.) +See more details from our papers: -### Pretraining +[Shall we Pretrain Autoregressive Language Models with Retrieval? A Comprehensive Study.](https://arxiv.org/abs/2304.06762) -- **`pretrain_retro.py`** : Launch script for pretraining Retro. Similar to `pretrain_gpt.py`, except this script handles loading neighbor tokens and setting up the neighbor attention mask. - -- **`megatron/model/retro_transformer.py`** : Implementation of Retro model, including the main transformer, the retrieval encoder, and chunked cross-attention layers. Note that currently, `retro_transformer.py` contains several classes that are nearly identical to `transformer.py`, except for 1 or 2 lines, due to code changes that are yet to be integrated. -- **`tools/retro/pretraining/retro_dataset.py`** : The Retro dataset used for pretraining (not used in preprocessing). Each sample returns the sample tokens, along with neighbor tokens for each chunk within the sample. +_Boxin Wang, Wei Ping, Peng Xu, Lawrence McAfee, Zihan Liu, Mohammad Shoeybi, Yi Dong, Oleksii Kuchaiev, Bo Li, Chaowei +Xiao, Anima Anandkumar, Bryan Catanzaro._ (EMNLP 2023) +[InstructRetro: Instruction Tuning post Retrieval-Augmented Pretraining.](https://arxiv.org/abs/2310.07713) - -# Arguments +_Boxin Wang, Wei Ping, Lawrence McAfee, Peng Xu, Bo Li, Mohammad Shoeybi, Bryan Catanzaro._ -See `tools/retro/main.py`'s `add_retro_args()` and `megatron/arguments.py`'s `_add_retro_args()` for details and descriptions. Here we list some particularly important arguments: +Please cite the papers as follows if you use the data or code from this repo: -- `--retro-workdir` : Mentioned previously, this argument determines the directory in which a set of Retro data is stored (during preprocessing) and loaded (during pretraining). 
Any change in this directory during preprocessing may result in preprocessing starting over from scratch, and any change before pretraining will result in pretraining throwing an error. -- Preprocessing - - `--retro-gpt-chunk-length` : Retro chunk length (e.g., 64 in original paper). - - `--retro-tasks` : Comma-separated list of preprocessing tasks. Generally, the `build` task is the simplest way to run the preprocessing pipeline. For finer control, individual stages can be run by using tasks (in order): `db-build`, `index-build`, and `pretraining-query-neighbors`. - - `--retro-index-str` : Faiss index string that defines the index configuration. This will vary based on data size, compute/disk setup, and user needs. For example, this string looks something like `IVF262144_HNSW32,Flat` or `OPQ32_256,IVF4194304_HNSW32,PQ32`. -- Pretraining - - `--retro-add-retriever` : Must be used to select Retro model. - - `--retro-num-neighbors` : Number of neighbors to retrieve from the retrieval database (defaults to 2). - - `--retro-num-retrieved-chunks` : For each neighbor, the number consecutive chunks to retrieve, including the initial neighbor (defaults to 2). +```bibtex +@inproceedings{wang2023shall, + title = {Shall We Pretrain Autoregressive Language Models with Retrieval? A Comprehensive Study}, + author = {Boxin Wang and Wei Ping and Peng Xu and Lawrence McAfee and Zihan Liu and Mohammad Shoeybi and Yi Dong and Oleksii Kuchaiev and Bo Li and Chaowei Xiao and Anima Anandkumar and Bryan Catanzaro}, + journal = {The 2023 Conference on Empirical Methods in Natural Language Processing}, + year = {2023} +} - - - - +@article{wang2023instructretro, + title = {InstructRetro: Instruction Tuning post Retrieval-Augmented Pretraining}, + author = {Boxin Wang and Wei Ping and Lawrence McAfee and Peng Xu and Bo Li and Mohammad Shoeybi and Bryan Catanzaro}, + year = {2023}, + journal = {arXiv preprint arXiv: 2310.07713} +} +``` diff --git a/tools/retro/build_db.md b/tools/retro/build_db.md new file mode 100644 index 0000000000..c99952485a --- /dev/null +++ b/tools/retro/build_db.md @@ -0,0 +1,421 @@ +This directory contains a collection of tools for building the retrieval database and pretraining neighbors for Retro. This preprocessing pipeline is broken into 3 main stages: + +1. **Build retrieval chunk database** : Used for retrieving neighbors and continuation chunks, which are then passed through the retrieval encoder. +2. **Build index for similarity search** : Train and build a search index for querying chunk neighbors. +3. **Query pretraining neighbors** : For matching pretraining samples to database chunks. Neighbors are generated separately for training, validation, and test datasets. + +The following overview goes into more detail on the pipeline, code structure, usage, and pretraining. + + +# Contents + + * [Quick start](#quick-start) + * [Tutorial](#tutorial) + * [Code structure](#code-structure) + * [Arguments](#arguments) + + + + +# Quick Start +Key files: + +- `main.py` : Entry point for processing. +- `examples/preprocess_data.sh` : Example preprocessing launch (calls `main.py`). +- `examples/pretrain_data.sh` : Example pretraining launch (calls `pretrain_retro.py`). + +Use `--retro-tasks` to move through the preprocessing pipeline. 
+ +- Simplest setup (builds everything): `--retro-tasks build` +- Alternatively, for tuning compute resources, run stages independently: + - Build retrieval database: `--retro-tasks db-build` + - Build search index: `--retro-tasks index-build` + - Query neighbors: `--retro-tasks pretraining-query-neighbors` + +Sample code flow: + +- `main.py` : Entry point (e.g., using `--retro-tasks X`). +- `db/build.py` : Build retrieval database. +- `index/build.py` : Build search index. Calls the following two files: + - `index/train.py` : Train index on subset of database. + - `index/add.py` : Add database chunks to index. +- `pretraining/query.py` : Query pretraining samples for database neighbors (saved to disk and used during pretraining). + + + +# Tutorial + +In this tutorial example, we use the Wikipedia corpus to demonstrate how we build a retrieval database and index for this corpus, and then query the pretraining datasets for their neighbors. + +## Step 1: Prepare your retrieval text corpus + +The format of text corpus follows the same format as in Megatron training. See [data precessing](../../README.md#data-preprocessing) for more details on how to convert your json dataset into the mmap format. + +Assume we have the Wikipedia corpus in the following format: + +``` +/Wikipedia_shuf_text_document.bin +/Wikipedia_shuf_text_document.idx +``` + +We note that the retrieval database can also be a blend of multiple text corpus. + +## Step 2: Build retrieval chunk database + +This *database* (stored as a 2-D array, NOT a relational database) consists of a list of chunks (traditionally length 64) extracted from the original GPT token dataset. This is simply a consecutive, non-overlapping chunking of the token dataset. Chunking only takes place within a document, and therefore the final chunk of each document has length: 1 <= chunk_length <= max_chunk_length. + +We discard chunks that would convert to an empty Bert sequence (rare case, happens ~1/100,000 chunks in our case), since we use Bert embeddings for building our index. Thus, the total number of chunks in the database will be slightly less than a naive calculation. + +Take the Wikipedia corpus as an example to build the retrieval chunk database: + +Prepare the following arguments and update our templates in [tools/retro/examples/preprocess_data.sh](examples/preprocess_data.sh): +- `--retro-workdir`: The directory in which the preprocessing pipeline saves its datasets and configuration files. + **This argument should remain consistent for a full pass through the pipeline, and for pretraining.** +- `--data-path`: text corpus path to build retrieval database. In the case of Wikipedia corpus, it could be +```bash +WIK="${DATA_HOME}/Wikipedia_shuf_text_document" + +DATA_BLEND=" \ + 1 ${WIK} \ +" +``` +- `--load`: bert path to load bert embedder +- `--vocab-file` and `--retro-bert-vocab-file`: bert vocab file +- `--retro-gpt-tokenizer-model`: gpt tokenizer model file + +Then launch the script: +```bash +bash tools/retro/examples/preprocess_data.sh db-build +``` + +After the `db-build` is finished, the output includes: +- The launching args will be saved in your `/args.json` for the following steps. +- The retrieval chunk database will be saved in your `/db/` with your dataset information in `/db/indexed_dataset_infos.json`. + +## Step 3: Build index for similarity search + +To match pretraining chunks to database chunks, a search index must be built to perform this querying. 
We use Faiss (https://github.com/facebookresearch/faiss) for training and building this index. Generally, the index is trained on a subset of all chunks in the database (specified via `--retro-index-ntrain`). After training, all chunks are added into the index, to be available during querying.
+
+Indexes only accept 1-D floating point vectors for training and adding, so each chunk must first be embedded before being passed to the index for either training or adding. We use Bert embeddings for this purpose, and the embeddings are generated automatically within the pipeline.
+
+Take the Wikipedia corpus as an example to train the index:
+
+```bash
+bash tools/retro/examples/preprocess_data.sh index-train
+```
+The `index-train` step is expected to take less than 4 hours on a single DGX-A100 node given the template index configuration.
+To scale up to a larger retrieval database, please carefully tune the Faiss hyper-parameters specified in `--retro-index-str`. Please refer to [Faiss](https://github.com/facebookresearch/faiss/wiki/The-index-factory) to learn more about the index configuration.
+
+After the index is trained, the centroids, HNSW graph, and product quantizer are determined. However, the index is still empty, as no chunks have been added yet.
+
+Taking the Wikipedia corpus with the default template as an example, the output of `index-train` includes:
+- The Bert embeddings of the sampled chunks used for `index-train` are saved in `/index/train_emb/`.
+- The empty index is saved in `/index/faiss-par-add/OPQ32_64,IVF65536_HNSW8,PQ32/empty_0.970.faissindex`.
+
+Then we add all chunks in the retrieval database into the index so that we can perform fast queries over the whole retrieval database:
+```bash
+bash tools/retro/examples/preprocess_data.sh index-add
+```
+
+We note that this step can be time-consuming, as it goes through the whole retrieval database, embeds chunk tokens into Bert embeddings, and adds them into the index. Please make sure you have successfully added the whole retrieval database before moving on to the next stage.
+
+*In case your job is interrupted in the middle, you can simply run the script again; it will automatically skip the chunks that have already been added to the index and resume from the chunk where it was interrupted.*
+
+Following the Wikipedia configuration, an example output of the `index-add` step includes:
+- The index with the retrieval database chunks added is saved in `/index/faiss-par-add/OPQ32_64,IVF65536_HNSW8,PQ32/added_0.970_0.950.faissindex`, which can be used to query the neighbors for pretraining.
+
+## Step 4: Query pretraining neighbors
+
+To ensure fast Retro pretraining, the database neighbors for pretraining samples are pre-computed and saved to disk, for efficient access within the Retro dataset. In this stage, the pretraining datasets (training, validation, and test) are iterated, each sample is broken into chunks, and the chunks are used for querying the index. Similar to when building the index, each chunk is embedded (via Bert) before querying the index.
+
+The saved neighbors are labeled with unique dataset properties (i.e., seed, sequence length, number of samples, etc.) to ensure the neighbors generated during preprocessing match the neighbors requested during pretraining. Please also make sure the pretraining configuration is the same as in this step so that the neighbors are aligned.
+
+There are query-time hyper-parameters that can be tuned to improve the quality of the neighbors.
These are specified in `RETRO_QUERY_EF_SEARCH` and `RETRO_QUERY_NPROBE`. The most important parameter is `RETRO_QUERY_NPROBE`, which controls the number of clusters to search during querying. This parameter can be tuned to improve the quality of the neighbors, but will also increase the query time. +We recommend following the tutorial of [faiss](https://github.com/facebookresearch/faiss/wiki/Index-IO,-cloning-and-hyper-parameter-tuning) to tune the hyper-parameters for your own retrieval database. + +Take the Wikipedia corpus as an example to query the neighbors in the retrieval database: + +```bash +bash tools/retro/examples/preprocess_data.sh query-pretraining-neighbors +``` + +The output of `query-pretraining-neighbors` on the Wikipedia corpus includes: +- `/wiki/query/train_855ab50e05151610301e2a74c4030fbc`, which contains the pre-retrieved neighbors for the pretraining dataset. +- `/wiki/query/valid_40bc7330318d64accec28e1e63c59bad`, which contains the pre-retrieved neighbors for the validation set of the pretraining corpus. + +## Step 5: Visualization of retrieval neighbors + +We also provide cli tools to help visualize and inspect the quality of your retrieved neighbors. + +To use the CLI, open a Python terminal via the `python` command, and then load a Retro workdir with the following: + +``` +from tools.retro.cli import retro +retro.init("/path/to/retro/workdir") +``` + +This initializes Megatron, and prepares the Retro data for inspection. We also print out some example commands to help you get familiar with the command lines. + +An example output for the Wikipedia Corpus: + +```text +setting number of micro-batches to constant 32 +> building BertWordPieceLowerCase tokenizer ... +> initializing torch distributed ... +> initialized tensor model parallel with size 1 +> initialized pipeline model parallel with size 1 +> compiling dataset index builder ... +... +... + > sample ratios: + dataset 0, input: 1, achieved: 1 +> size of blendable dataset: 201000 samples +> elapsed time for building blendable dataset indices: 0.00 (sec) +> building indices for blendable datasets ... + > sample ratios: + dataset 0, input: 1, achieved: 1 +> size of blendable dataset: 12864 samples +> finished creating pretrained GPT datasets ... + ++++++++++++++++++++++++++++++++++++++++++++++++++++ +examples ... [ *note*: 'db' = chunk db; 'pt' = pretraining corpus. ] ++++++++++++++++++++++++++++++++++++++++++++++++++++ + +~~~~ indexed datasets ~~~~ +retro.get_db_num_indexed_datasets() : 1 +retro.get_db_indexed_dataset_infos() : + [(1.000000, Wikipedia_shuf_text_document)] + +~~~~ counts ~~~~ +retro.get_db_num_chunks : 68104992. + +retro.get_pt_num_samples('train') : 201000. +retro.get_pt_num_samples('valid') : 12864. +retro.get_pt_num_chunks('train') : 1608000. +retro.get_pt_num_chunks('valid') : 102912. + +~~~~ tokens, text ~~~~ +retro.get_db_chunk_gpt(chunk_id) : [46809, 218340, 716, 647, ... , 251525, 872, 692, 4042] +retro.get_db_chunk_bert(chunk_id) : [10680, 16216, 4313, 1745 ... , 8117, 1007, 1012, 1997] +retro.get_db_chunk_text(chunk_id) : Jonas Geirnaert\n\nJonas ... ort Flatlife (11 min). Of +retro.get_db_chunk_and_continuation_text(chunk_id) : + ['Jonas Geirnaert Jonas Ge ... ort Flatlife (11 min). Of', + 'the copy he sent in for s ... abet, clearly has one. On'] + +retro.get_pt_sample('train', sample_id) : + { + 'dataset_idx' : 0 + 'text' : [ 676 14 40656 184 ... 4\n 276 17361 251542] + 'doc_ids' : [1246422 1596948 2403969] + 'neighbor_chunks' : [[[ 657380 657381]\n ... 
\n [34108760 34108761]]] + 'neighbor_tokens' : [[[ 276 9596 251511 . ... . 889 646 1723]]] + } + +(e.g., sample = retro.get_pt_sample(...)) + + sample['text'].shape : (513,) + sample['neighbor_tokens'].shape : (8, 20, 128) + sample['text'] : [ 676 14 40656 184 ... 4\n 276 17361 251542] + sample['neighbor_tokens'][17][1] : [ 14 14 30291 1 ... 682 328 379 251527] + retro.gpt_to_text(sample['text']) : also\nLatgalians (modern) ... ission criticised the AVN + retro.gpt_to_text(sample['neighbor_tokens']) : \n\nHis second marriage o ... Augusta Eardley-Wilmot (2 ++++++++++++++++++++++++++++++++++++++++++++++++++++ +``` + +We can also directly call the function `retro.print_neighbor_texts(sample_id, chunk_id)` to inspect the retrieval neighbors for a specific sample and chunk within the pretraining corpus. For example, + +```text +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +PRETRAINING CHUNK: + - also\nLatgalians (modern)\n\nReferences\n\nCategory:Defunct political parti ... e.\n\nAbout \nThe company was established established in 1997. It is listed +NEIGHBOR_CHUNKS: + - the sides.\n\nNotes\n\nReferences\n\nCategory:Obaku Zen\n*\nCategory:Japane ... 2, 2008. It was founded by Anand Jagannathan, CEO of parent company Kriyari + - 2007).\n\nSee also\n Satellite Communications\n Tonga\n\nReferences\n\nExte ... y Procter & Gamble (P&G) in 1985 in order for P&G to compete in the "beauty + - Japan\nCategory:Fish of Russia\nCategory:Fish described in 1845 Mareco Inde ... lic Opinion (WAPOR)\n European Society for Opinion and Marketing Research ( + - The current director of the company is Albert Bosch.\n\nSee also\n Coupon\n ... some articles in Basque. Deia is the main product of the Editorial Iparrag + - A.Ş have been traded on the Istanbul Stock Exchange since 2000.\n\nReferenc ... with stores in California, New York City, and London.\n\nHistory \nSnapette + - \nCategory:Hawaiian mythology\nCategory:Hawaiian religion\nCategory:Religio ... crative state contracts. In 2008 Prokom became a part of the Asseco capital + - , and the Baltic countries, as well as an online store.\n\nReferences\n\nEx ... nd are involved in intracellular trafficking. This protein does not contain + - juice producer\nFood industry of Russia\n\nReferences\n\nExternal links\nWi ... panies formerly listed on the New York Stock Exchange General Grant's March + - is in private ownership.\n\nReferences\n\nExternal links\n\nCategory:Online ... ten and directed by Brent Hodge. The film stars Aubrey Plaza, Molly Hawkey, + - company's display technology to manufacture and sell display-only engines.\ ... for a group of naval vessels (a division in naval usage).\n\nUsage\n Russia + - .\n\nCarrols also operated a chain of outlets in neighbouring Estonia from ... rama film directed by Raajeev Walia. It is produced by Aman Mehta and Bijal + - \n\nExternal links\nHightail website\nThe Next Web on YouSendIt rebrand to ... eptember 2014, sitting mainly in the criminal division of that court.\n\nBe + - American television seasons\nCategory:2014 American television seasons\nCat ... Canada and larger European cities.\n\nIn 2010, advertising in New Zealand, + - .\n\nNotes\n\nCategory:Trade unions\nCategory:Industrial Workers of the Wor ... x people, some of whom may have been working on a part-time basis. Its head + - \n List of podcasting companies\n\nReferences\n\nExternal links\n \n\nCateg ... ct.\n\nCategory:Populated places in the Ashanti Region Nkeirouka Ezekh\n\nN + - \n\nReferences\n\nExternal links\n ADESE official website\n\nCategory:Compa ... 
State Street, and UBS Warburg. Its first CEO was Ian M. Drachman. The firm + - Hotel\n Sulake Corporation\n Sulake Press Room\n Habbo Hotel - Blog\n\nCate ... l: 김진태; born December 19, 1980), better known by his stage name Verbal Jint + - hockey player\n Ruutu.fi, a Finnish television streaming service operated b ... from the bottom, a BDSM term\n Topping cycle, a cycle used in power plants + - of Surakarta\nCategory:Indonesian names\nCategory:Indonesian families\nCate ... mber 13, 2013 in Izhevsk on Universitetskaya Street (later it was given the + - facilities are also in Ankara and the company HQ is in Istanbul.\n\nReferen ... is currently a World Wide Web Consortium Working Draft.\n\nSee also\n Voice +``` + +The code snippet for the above example is also equivalent to +```python +tokens = retro.get_pt_sample('train', 0) +for token_ids in tokens["neighbor_tokens"][0]: + print("- %s" % (retro.gpt_to_text(token_ids))) + print("-" * 20) +``` + +# Code structure + +### `tools/retro/main.py` + +This is the main entry point for Retro preprocessing. Call `main.py --help` to see arguments. Additionally, some Retro arguments are in Megatron's core arguments, so also see `add_retro_args()` section of `megatron/arguments.py` for additional arguments. Two of the most important arguments to customize are `--retro-workdir` and `--retro-tasks`. + +- **`--retro-workdir`** : Set the directory in which the preprocessing pipeline saves its datasets and configuration files. This argument should remain consistent for a full pass through the pipeline, and for pretraining. + +- **`--retro-tasks`** : Set the stages of preprocessing to perform. As mentioned previously, the three high-level stages are: 1) build retrieval database, 2) build search index, and 3) query pretraining neighbors. `--retro-tasks` can be used to either run the full pipeline, or run each of these stages in isolation. The latter case is useful for tuning compute resources for each stage. For example, index training utilizes GPUs and requires relatively less time, while querying neighbors uses the CPU and is a relatively slow process. Example tasks include: + + - **`--retro-tasks build`** : Run entire preprocessing pipeline. + - **`--retro-tasks db-build`** : Build retrieval database. + - **`--retro-tasks index-build`** : Train and build search index. + - **`--retro-tasks pretraining-query-neighbors`** : Query pretraining neighbors. + +Multiple tasks can be specified by separating with commas (e.g., `--retro-tasks db-build,index-build`). Additionally, various 'miscellaneous' tasks are currently including, primarily for validating data for each stage; these task names can be seen in `main.py`. + +### `tools/retro/examples` + +Example scripts for setting arguments and launch Retro preprocessing. The key files here are: + +- **`preprocess_data.sh`** : Example launch script for preprocessing retro data. +- **`pretrain_model.sh`** : Example launch script for pretraining a retro model. + +### `tools/retro/db` + +Build the retrieval chunk database. The key files here are: + +- **`build.py`** : Entry point for building the database. This code is responsible for iterating the input datasets (i.e., `--data-path`), parsing each dataset into consecutive chunks, checking for empty Bert (Wordpiece) conversions, and storing this information to disk. Two databases are created: 1) the retrieval database, and 2) a sampled database used for training the search index. 
+- **`dataset.py`** : Defines database class, for iterating or accessing chunks in the database. Each chunk contains its tokens, Bert conversion length, and dataset index. + +Input data: + + +- Token datasets, as loaded by `gpt_dataset.py`. Multiple datasets can be specified by using a blended configuration (see `--data-path` in `megatron/arguments.py`). + +Output data: + +- **`/db/merged/train.hdf5`** : The main retrieval database. (*Database* here is used to denote a list of indexed chunks, rather than a *relational database*.) The chunks in this database are added to the search index, and are used for retrieval during pretraining. This file contains a single dataset `'chunks'`, which contains 5 columns: + + - `dataset_idx` : Dataset index, from list of blended indexed datasets. + - `document_idx` : Document index within dataset. + - `chunk_start_idx` : Chunk's starting token index within document. + - `chunk_end_idx` : Chunk's ending token index (exclusive) within document. + - `bert_chunk_length` : Length of Bert token sequence, after converting from GPT. + +- **`/db/merged/sampled.hdf5`** : Subset of training database that is used for training the search index. This file has the same structure as detailed above. In general, this database is significanly smaller than the `train.hdf5` database, since the search index only needs a relatively small number of samples to understand the data's structure. After training, all chunks in the main database (`train.hdf5`) are *added* to the search index. + +### `tools/retro/index` + +Build the search index. The key files here are: + +- `build.py` : Entry point for building the search index. First, the index is trained on the sampled chunk database (see above) by calling `train.py`, and then all chunks for the full database are added to the index by calling `add.py`. Note that training requires first embedding (using Bert) all chunks (a parallel operation), and then loading these embeddings and training the index (a sequential operation), so it's best to change one's compute setup after all chunks have been embedded and saved to disk. +- `indexes/faiss_base.py` : Wrapper class for building a Faiss index, following the standard `train()` and `add()` operations. +- `indexes/faiss_par_add.py` : Similar to above, except it uses an embarrassingly parallel (multi-node, multi-process) `add()` operation. Vectors are first added to separate index copies, and then merged together. + +Input data: + +- **`/db/merged/sampled.hdf5`** : Chunks used for training the search index. +- **`/db/merged/train.hdf5`** : Chunks used for adding to the *trained* search index. + +Output data: + +- **`/index///added.faissindex`** : The final index, which has been trained and has had all database chunks added to it. This index is ready for querying neighbors. Here, `RETRO_INDEX_TYPE` and `RETRO_INDEX_STR` correspond to the same-name arguments `--retro-index-type` (e.g., `faiss-par-add`) and `--retro-index-str` (e.g., `OPQ32_256,IVF4194304_HNSW32,PQ32`). +- **`/index///empty.faissindex`** : Generally can be discarded once `added.faissindex` has been built, but this file contains the *post-training*, *pre-adding* index. Useful for debugging or building other indexes. + +### `tools/retro/pretraining` + +Query the pretraining datasets (training, validation, test) for their neighbors within the database. 
Neighbors are queried during preprocessing -- rather than during pretraining -- because querying is a fairly slow operation, so it would be a bottleneck if performed during pretraining. Queried neighbors are tagged with their unique identifying information (e.g., `train_indexmap_27662746ns_2048sl_1234s`), so as to avoid incorrect references during pretraining. The key files here are: + +- **`query.py`** : Entry point for querying. The pretraining datasets are iterated, and each chunk within each sample is queried using the search index. These neighbors are filtered by discarding any database chunks that fall within the same document as any chunk within a pretraining sample. +- **`chunk_dataset.py`** : This creates an iterable 'chunk' dataset form of a pretraining dataset. This is just a light wrapper, but makes it easier to deterministically iterate and assign IDs to each chunk in a sample dataset. +- **`retro_dataset.py`** : The Retro dataset used for pretraining (not used in preprocessing). Each sample returns the sample tokens, along with neighbor tokens for each chunk within the sample. + +Input data: + +- Token datasets, as loaded by `gpt_dataset.py`. +- **`/index///added.faissindex`** : The trained index, with all database chunks added to it (see previous section for details). + +Output data: + +- **`/{train,valid,test}_XXns_YYsl_ZZs/WW.hdf5`** : These directories/files contain the indexes of neighbors for each chunk within each sample of the pretraining datasets. Each directory (e.g., `train_indexmap_2047435ns_2048sl_1234s`) contains a list of HDF5 files (e.g., one file might be called `0075700000-0075800000.hdf5`). Each HDF5 file contains a consecutive subset of neighbor IDs for a given chunk, for indexing into the main retrieval database. All HDF5 files taken together within a given directory, represent the entire set of neighbors for a dataset. The size of these HDF5 files is determined by the argument `--retro-block-size`. The `XX`, `YY`, `ZZ`, `WW` notation above denotes the dataset properties that are used for uniquely tagging the neighbor files, to ensure compatibility during model pretraining. These neighbor files are ultimated used by `retro_dataset.py` during pretraining, for building Retro samples. + +### `tools/retro/cli` + +Inspect preprocessed data. To use the CLI, open a Python terminal via the `python` command, and then load a Retro workdir with the following: + +``` +from tools.retro.cli import retro +retro.init("/path/to/retro/workdir") +``` + +This initializes Megatron, and prepares the Retro data for inspection. See the printed usage for available functions. Several routines are included for viewing data in the retrieval database and viewing pretraining samples and neighbors. For example: + +```python +retro.get_db_num_indexed_datasets() # 15 +retro.get_db_chunk_text(92874113) # 'research project at ... and philosophy' +retro.get_pt_sample('train', 62005) # '[16084, 26158, 25387 ..., 6898, 9568]' +``` + +Most methods within the CLI are prefixed to denote the data being inspected: + +- **'db'** : Retrieval database (i.e., chunk tokens, document IDs, and dataset IDs) +- **'pt'** : Pretraining datasets (i.e., sample tokens and neighbor tokens) + +### `tools/retro/utils.py` + +A collection of utility methods. Most importantly, this contains: + +- **`def get_gpt_tokenizer()`** : Get the GPT tokenizer. +- **`def get_bert_tokenizer()`** : Get the Bert tokenizer. +- **`class GPTToTextDataset`** : Wrapper class that converts GPT (BPE) samples to raw text. 
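+
+Besides the CLI, the neighbor files described above can also be opened directly; below is a hedged sketch using `h5py` (already listed among the dependencies), where the file path is a placeholder following the naming pattern given in the output-data description:
+
+```bash
+# Hypothetical example: list the datasets stored in one neighbor block file.
+# The path is a placeholder; substitute a real train_*ns_*sl_*s/*.hdf5 file.
+python -c "
+import h5py
+with h5py.File('/path/to/workdir/train_indexmap_2047435ns_2048sl_1234s/0075700000-0075800000.hdf5', 'r') as f:
+    for name, dset in f.items():
+        print(name, dset.shape, dset.dtype)
+"
+```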
+ +### `tools/bert_embedding` + +Generate Bert embeddings. The main files here are: + +- **`embed.py`** : Entry point for generating embeddings, and contains the two main embedding classes, `BertEmbedder` and `DiskDataParallelBertEmbedder` (more below). This file contains code for generating Megatron embeddings, while the file below contains code for Huggingface embeddings. +- **`huggingface.py`** : Used by `embed.py` when the embedder is configured (see below) to output Huggingface embeddings. +- **`dataset.py`** : Wrapper class for converting a raw-text dataset to Bert (Wordpiece) tokens. + +The Bert embeddings can be configured along two axes. The first axis is the output type: + +- **`class BertEmbedder`** : This class takes a raw-text dataset as input, generates its embeddings, and returns a Numpy array. The main functions are `embed_text_dataset` (accepts a raw-text dataset) and `embed_text` (accepts a string). +- **`class DiskDataParallelBertEmbedder`** : This class wraps `BertEmbedder`, and rather than returning a Numpy array, it saves the embeddings to disk. Additionally, this class automatically splits data across data parallel ranks (using interleaving), and also processes data in a specified `block_size` (e.g., 1,000,000). + +The second axis is the type of embedding model to use, controlled by the argument `--bert-embedder-type`: + +- **`--bert-embedder-type megatron`** : Use Megatron's Bert model. The specific model used is dependent on the loaded checkpoint, vocab file, and tokenizer. +- **`--bert-embedder-type huggingface`** : Use Huggingface's `bert-large-cased`. (*Note*: Huggingface's inclusion is likely to be deprecated; and there is no ability to configure cased/uncased.) + +### Pretraining + +- **`pretrain_retro.py`** : Launch script for pretraining Retro. Similar to `pretrain_gpt.py`, except this script handles loading neighbor tokens and setting up the neighbor attention mask. + +- **`megatron/model/retro_transformer.py`** : Implementation of Retro model, including the main transformer, the retrieval encoder, and chunked cross-attention layers. Note that currently, `retro_transformer.py` contains several classes that are nearly identical to `transformer.py`, except for 1 or 2 lines, due to code changes that are yet to be integrated. +- **`tools/retro/pretraining/retro_dataset.py`** : The Retro dataset used for pretraining (not used in preprocessing). Each sample returns the sample tokens, along with neighbor tokens for each chunk within the sample. + + + +# Arguments + +See `tools/retro/main.py`'s `add_retro_args()` and `megatron/arguments.py`'s `_add_retro_args()` for details and descriptions. Here we list some particularly important arguments: + +- `--retro-workdir` : Mentioned previously, this argument determines the directory in which a set of Retro data is stored (during preprocessing) and loaded (during pretraining). Any change in this directory during preprocessing may result in preprocessing starting over from scratch, and any change before pretraining will result in pretraining throwing an error. +- Preprocessing + - `--retro-gpt-chunk-length` : Retro chunk length (e.g., 64 in original paper). + - `--retro-tasks` : Comma-separated list of preprocessing tasks. Generally, the `build` task is the simplest way to run the preprocessing pipeline. For finer control, individual stages can be run by using tasks (in order): `db-build`, `index-build`, and `pretraining-query-neighbors`. + - `--retro-index-str` : Faiss index string that defines the index configuration. 
This will vary based on data size, compute/disk setup, and user needs. For example, this string looks something like `IVF262144_HNSW32,Flat` or `OPQ32_256,IVF4194304_HNSW32,PQ32`. +- Pretraining + - `--retro-add-retriever` : Must be used to select Retro model. + - `--retro-num-neighbors` : Number of neighbors to retrieve from the retrieval database (defaults to 2). + - `--retro-num-retrieved-chunks` : For each neighbor, the number consecutive chunks to retrieve, including the initial neighbor (defaults to 2). + - `--retro-attention-gate` : Gated mechanism to incorporate information of cross attention from retrieved neighbor (defaults to 1 during pretraining). + + + + + diff --git a/tools/retro/cli/__init__.py b/tools/retro/cli/__init__.py index 2b607770ad..2531017a28 100644 --- a/tools/retro/cli/__init__.py +++ b/tools/retro/cli/__init__.py @@ -1,3 +1,3 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. from .cli import retro diff --git a/tools/retro/cli/__main__.py b/tools/retro/cli/__main__.py index f5973d0a67..37d096a953 100644 --- a/tools/retro/cli/__main__.py +++ b/tools/retro/cli/__main__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. import os @@ -6,4 +6,4 @@ if __name__ == "__main__": - retro.init(os.environ["RETRO_WORKDIR"]) + retro.init(os.environ["RETRO_PROJECT_DIR"]) diff --git a/tools/retro/cli/cli.py b/tools/retro/cli/cli.py index 0f3c432f3f..a5d953d2f7 100644 --- a/tools/retro/cli/cli.py +++ b/tools/retro/cli/cli.py @@ -1,94 +1,74 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. import json import numpy as np import os -import torch -import types - -from megatron.global_vars import set_global_variables, set_retro_args -from megatron.initialize import ( - initialize_megatron, - _initialize_distributed, - _set_random_seed, - _compile_dependencies, -) -from tools.retro.db.utils import ( +import typing as T +from types import SimpleNamespace + +from megatron.training.arguments import load_retro_config, parse_args, validate_args +from megatron.core.datasets.retro.db.dataset import DBDataset +from megatron.core.datasets.retro.db.utils import ( get_indexed_dataset_infos as get_db_indexed_dataset_infos, get_merged_train_dataset as get_db_dataset, ) -from tools.retro.main import add_retro_args -from tools.retro.query.retro_dataset import get_retro_datasets -from tools.retro.utils import get_args_path, get_bert_tokenizer, get_gpt_tokenizer +from megatron.core.datasets.retro.query.retro_dataset import get_retro_datasets, RetroDataset +from megatron.training.global_vars import set_global_variables +from megatron.training.training import build_train_valid_test_datasets, update_train_iters +from pretrain_retro import train_valid_test_datasets_provider +from tools.retro.preprocess_data import get_tokenizers -def shorten_str(s, n): +def shorten_str(s: str, n: int) -> str: s = "\\n".join(s.splitlines()) - return s if len(s) <= n else "%s ... %s" % (s[:n//2], s[-n//2:]) + return s if len(s) <= n else "%s ... %s" % (s[: n // 2], s[-n // 2 :]) class retro: - args = None + config = None ############################################## # initialize. 
############################################## @classmethod - def parse_dtype_str(cls, dtype_str): - return { - "torch.float16" : torch.float16, - "torch.float32" : torch.float32, - "torch.bfloat16" : torch.bfloat16, - }[dtype_str] - - @classmethod - def init_megatron(cls, workdir): - '''Custom initialization of Megatron.''' - - # Load args. - args_path = get_args_path(workdir) - assert os.path.exists(args_path), "args.json not found in workdir." - with open(args_path) as f: - cls.args = types.SimpleNamespace(**json.load(f)) - cls.args.retro_workdir = workdir # just in case workdir moved - cls.args.rank = 0 # override env - cls.args.world_size = 1 # override env - cls.args.params_dtype = cls.parse_dtype_str(cls.args.params_dtype) - - set_global_variables(cls.args) - set_retro_args(cls.args) - _initialize_distributed() - _set_random_seed(cls.args.seed, cls.args.data_parallel_random_init) - _compile_dependencies() - - @classmethod - def init(cls, workdir): + def init(cls, project_dir: str) -> None: '''Initialize Megatron, tokenizers, and datasets.''' - # Load args. - cls.init_megatron(workdir) - - cls.tokenizers = types.SimpleNamespace( - gpt=get_gpt_tokenizer(), - bert=get_bert_tokenizer(), - ) - - # Load data. - cls.db_indexed_dataset_infos = get_db_indexed_dataset_infos() - cls.db_dataset = get_db_dataset() - pt_train_ds, pt_valid_ds, _ = get_retro_datasets(verify_sizes=False) - cls.pt_datasets = types.SimpleNamespace( + # Megatron args. + args = parse_args(extra_args_provider=None, ignore_unknown_args=False) + args.retro_project_dir = project_dir + args.micro_batch_size = 1 + args.num_layers = 1 + args.hidden_size = 1 + args.num_attention_heads = 1 + args.async_tensor_model_parallel_allreduce = False + args.retro_add_retriever = True # for building RetroDataset + validate_args(args) + set_global_variables(args) + update_train_iters(args) + + # Retro config. + cls.config = load_retro_config(project_dir) + cls.config.retro_project_dir = project_dir + cls.config.retro_tokenizers = get_tokenizers(cls.config) + + # Chunk database dataset. + cls.db_indexed_dataset_infos = get_db_indexed_dataset_infos(project_dir) + cls.db_dataset = get_db_dataset(project_dir, + cls.config.retro_gpt_chunk_length, + cls.config.retro_tokenizers.gpt.eod) + + # Pretraining datasets. + pt_train_ds, pt_valid_ds, pt_test_ds = build_train_valid_test_datasets( + train_valid_test_datasets_provider) + cls.pt_datasets = SimpleNamespace( train=pt_train_ds, valid=pt_valid_ds, + test=pt_test_ds, ) - # Retrieve max saved neighbors. - for key in vars(cls.pt_datasets): - getattr(cls.pt_datasets, key).num_neighbors = \ - cls.args.retro_query_num_neighbors_save - # Print usage. cls.print_usage() @@ -97,58 +77,57 @@ def init(cls, workdir): ############################################## @classmethod - def gpt_to_text(cls, token_ids): + def gpt_to_text(cls, token_ids: np.ndarray) -> str: '''GPT tokens to text.''' - return cls.tokenizers.gpt.detokenize(token_ids.tolist() - if isinstance(token_ids, np.ndarray) - else token_ids) + return cls.config.retro_tokenizers.gpt.detokenize( + token_ids.tolist() if isinstance(token_ids, np.ndarray) else token_ids + ) @classmethod - def text_to_bert(cls, text): + def text_to_bert(cls, text: str) -> np.ndarray: '''Text to Bert tokens.''' - return cls.tokenizers.bert.tokenize(text) + return cls.config.retro_tokenizers.bert.tokenize(text) ############################################## # chunk db. 
############################################## @classmethod - def get_db_num_indexed_datasets(cls): - '''Number of indexed datasets within blendable dataset.''' + def get_db_num_indexed_datasets(cls) -> int: + '''Number of indexed datasets within blended dataset.''' return len(cls.db_indexed_dataset_infos) @classmethod - def get_db_indexed_dataset_infos(cls): + def get_db_indexed_dataset_infos(cls) -> T.List[T.Tuple[float, str]]: '''Dataset infos, including number of training & sampled sets.''' - return [(info["ratio"], info["name"]) - for info in cls.db_indexed_dataset_infos] + return [(info["ratio"], info["prefix"]) for info in cls.db_indexed_dataset_infos] @classmethod - def get_db_dataset(cls): + def get_db_dataset(cls) -> DBDataset: return cls.db_dataset @classmethod - def get_db_num_chunks(cls): + def get_db_num_chunks(cls) -> int: '''Number of DB chunks.''' return len(cls.get_db_dataset()) @classmethod - def get_db_chunk_gpt(cls, idx): + def get_db_chunk_gpt(cls, idx: int) -> T.List[int]: '''Get DB chunk as GPT token ids.''' return cls.get_db_dataset()[idx]["text"].tolist() @classmethod - def get_db_chunk_bert(cls, idx): + def get_db_chunk_bert(cls, idx: int) -> T.List[int]: '''Get DB chunk as Bert token ids.''' return cls.text_to_bert(cls.get_db_chunk_text(idx)) @classmethod - def get_db_chunk_text(cls, idx): + def get_db_chunk_text(cls, idx: int) -> str: '''Get DB chunk as text.''' return cls.gpt_to_text(cls.get_db_chunk_gpt(idx)) @classmethod - def get_db_chunk_and_continuation_text(cls, idx): + def get_db_chunk_and_continuation_text(cls, idx: int) -> T.List[str]: '''Get DB chunk along with continuation, as text.''' # Modulus used here to match original implementation (i.e., last @@ -163,11 +142,12 @@ def get_db_chunk_and_continuation_text(cls, idx): ############################################## @classmethod - def get_pt_num_samples_and_chunks(cls, data_key): + def get_pt_num_samples_and_chunks(cls, data_key: str) -> T.Tuple[int, int]: '''Number of samples & chunks (e.g., 32*n_samples) in corpus.''' - assert hasattr(cls.pt_datasets, data_key), \ - "pretraining set '%s' not found (choices: %s)." % ( - data_key, ", ".join(vars(cls.pt_datasets).keys())) + assert hasattr(cls.pt_datasets, data_key), ( + "pretraining set '%s' not found (choices: %s)." 
+ % (data_key, ", ".join(vars(cls.pt_datasets).keys())) + ) chunk_dataset = getattr(cls.pt_datasets, data_key).chunk_dataset return ( len(chunk_dataset.sample_dataset), @@ -175,44 +155,43 @@ def get_pt_num_samples_and_chunks(cls, data_key): ) @classmethod - def get_pt_num_samples(cls, data_key): + def get_pt_num_samples(cls, data_key: str) -> int: '''Number of pretraining samples.''' return cls.get_pt_num_samples_and_chunks(data_key)[0] @classmethod - def get_pt_num_chunks(cls, data_key): + def get_pt_num_chunks(cls, data_key: str) -> int: '''Number of pretraining chunks (e.g., 32*n_samples).''' return cls.get_pt_num_samples_and_chunks(data_key)[1] @classmethod - def get_pt_dataset(cls, data_key): + def get_pt_dataset(cls, data_key: str) -> RetroDataset: return getattr(cls.pt_datasets, data_key) @classmethod - def get_pt_sample(cls, data_key, idx): + def get_pt_sample(cls, data_key: str, idx: int) -> dict: return getattr(cls.pt_datasets, data_key)[idx] @classmethod - def get_neighbor_tokens(cls, sample_id, chunk_id, data_key="train"): + def get_neighbor_tokens(cls, sample_id: int, chunk_id: int, data_key: str="train") -> T.Optional[dict]: try: sample = cls.get_pt_sample(data_key, sample_id) sample_token_ids = sample["text"] chunk_length = cls.args.retro_gpt_chunk_length chunk_start_idx = chunk_id * chunk_length - chunk_end_idx = min(sample_token_ids.shape[0], - chunk_start_idx + chunk_length) + chunk_end_idx = min(sample_token_ids.shape[0], chunk_start_idx + chunk_length) chunk_token_ids = sample_token_ids[chunk_start_idx:chunk_end_idx] neighbor_token_ids = sample["neighbor_tokens"][chunk_id] return { - "chunk_tokens" : chunk_token_ids, - "neighbor_tokens" : neighbor_token_ids, + "chunk_tokens": chunk_token_ids, + "neighbor_tokens": neighbor_token_ids, } - except: + except Exception: return None @classmethod - def print_neighbor_texts(cls, sample_id, chunk_id, data_key="train"): - tokens = cls.get_neighbor_tokens(sample_id, chunk_id, data_key) + def print_neighbor_texts(cls, sample_id: int, chunk_id: int, data_key: str="train") -> None: + tokens: dict = cls.get_neighbor_tokens(sample_id, chunk_id, data_key) print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~") try: print("PRETRAINING CHUNK:") @@ -220,7 +199,7 @@ def print_neighbor_texts(cls, sample_id, chunk_id, data_key="train"): print("NEIGHBOR_CHUNKS:") for token_ids in tokens["neighbor_tokens"]: print(" - %s" % shorten_str(cls.gpt_to_text(token_ids), 150)) - except: + except Exception: print("" % sample_id) ############################################## @@ -228,7 +207,7 @@ def print_neighbor_texts(cls, sample_id, chunk_id, data_key="train"): ############################################## @classmethod - def print_usage(cls): + def print_usage(cls) -> None: '''Print usage.''' print() @@ -238,16 +217,18 @@ def print_usage(cls): print() print("~~~~ indexed datasets ~~~~") - print("retro.get_db_num_indexed_datasets() : %s" % - cls.get_db_num_indexed_datasets()) + print("retro.get_db_num_indexed_datasets() : %s" % cls.get_db_num_indexed_datasets()) print("retro.get_db_indexed_dataset_infos() :") - for i, (ratio,prefix) in enumerate(cls.get_db_indexed_dataset_infos()): - print(" %s(%f, %s)%s" % ( - "[" if i == 0 else " ", - ratio, - prefix, - "]" if i == len(cls.db_indexed_dataset_infos) - 1 else ",", - )) + for i, (ratio, prefix) in enumerate(cls.get_db_indexed_dataset_infos()): + print( + " %s(%f, %s)%s" + % ( + "[" if i == 0 else " ", + ratio, + prefix, + "]" if i == len(cls.db_indexed_dataset_infos) - 1 else ",", + ) + ) print() print("~~~~ 
counts ~~~~") @@ -255,26 +236,36 @@ def print_usage(cls): print() for sq_key in ("sample", "chunk"): - for data_key in ("train", "valid"): # test? - print("retro.get_pt_num_%ss('%s') : %d." % ( - sq_key, data_key, - getattr(cls, f"get_pt_num_{sq_key}s")(data_key))) + for data_key in ("train", "valid"): # test? + print( + "retro.get_pt_num_%ss('%s') : %d." + % (sq_key, data_key, getattr(cls, f"get_pt_num_{sq_key}s")(data_key)) + ) print() print("~~~~ tokens, text ~~~~") - print("retro.get_db_chunk_gpt(chunk_id) : %s" % - shorten_str(str(retro.get_db_chunk_gpt(0)), 50)) - print("retro.get_db_chunk_bert(chunk_id) : %s" % - shorten_str(str(retro.get_db_chunk_bert(0)), 50)) - print("retro.get_db_chunk_text(chunk_id) : %s" % - shorten_str(retro.get_db_chunk_text(0).strip(), 50)) + print( + "retro.get_db_chunk_gpt(chunk_id) : %s" + % shorten_str(str(retro.get_db_chunk_gpt(0)), 50) + ) + print( + "retro.get_db_chunk_bert(chunk_id) : %s" + % shorten_str(str(retro.get_db_chunk_bert(0)), 50) + ) + print( + "retro.get_db_chunk_text(chunk_id) : %s" + % shorten_str(retro.get_db_chunk_text(0).strip(), 50) + ) print("retro.get_db_chunk_and_continuation_text(chunk_id) :") for i, t in enumerate(retro.get_db_chunk_and_continuation_text(0)): - print(" %s'%s'%s" % ( - "[" if i == 0 else " ", - shorten_str(t.strip().replace("\n", " "), 50), - "]" if i == 1 else ",", - )) + print( + " %s'%s'%s" + % ( + "[" if i == 0 else " ", + shorten_str(t.strip().replace("\n", " "), 50), + "]" if i == 1 else ",", + ) + ) sample = cls.get_pt_sample("train", 0) sample_chunk_id = sample["neighbor_tokens"].shape[0] // 2 @@ -292,8 +283,19 @@ def print_usage(cls): print(" sample['text'].shape : %s" % str(sample["text"].shape)) print(" sample['neighbor_tokens'].shape : %s" % str(sample["neighbor_tokens"].shape)) print(" sample['text'] : %s" % shorten_str(str(sample["text"]), 50)) - print(" sample['neighbor_tokens'][17][1] : %s" % shorten_str(str(sample["neighbor_tokens"][sample_chunk_id][sample_neighbor_id]), 50)) - print(" retro.gpt_to_text(sample['text']) : %s" % shorten_str(cls.gpt_to_text(sample["text"]), 50)) - print(" retro.gpt_to_text(sample['neighbor_tokens']) : %s" % shorten_str(cls.gpt_to_text(sample["neighbor_tokens"][sample_chunk_id][sample_neighbor_id]), 50)) + print( + " sample['neighbor_tokens'][17][1] : %s" + % shorten_str(str(sample["neighbor_tokens"][sample_chunk_id][sample_neighbor_id]), 50) + ) + print( + " retro.gpt_to_text(sample['text']) : %s" + % shorten_str(cls.gpt_to_text(sample["text"]), 50) + ) + print( + " retro.gpt_to_text(sample['neighbor_tokens']) : %s" + % shorten_str( + cls.gpt_to_text(sample["neighbor_tokens"][sample_chunk_id][sample_neighbor_id]), 50 + ) + ) print("+++++++++++++++++++++++++++++++++++++++++++++++++++") diff --git a/tools/retro/config_utils.py b/tools/retro/config_utils.py new file mode 100644 index 0000000000..00676c66ff --- /dev/null +++ b/tools/retro/config_utils.py @@ -0,0 +1,632 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+ +"""Config utils.""" + +import argparse +from collections import namedtuple, OrderedDict +import dataclasses +import enum +import inspect +import os +import re +import types +import typing as T + + +PARAM_KEYWORDS = { + "param", + "parameter", + "arg", + "argument", + "attribute", + "key", + "keyword", +} +RAISES_KEYWORDS = {"raises", "raise", "except", "exception"} +DEPRECATION_KEYWORDS = {"deprecation", "deprecated"} +RETURNS_KEYWORDS = {"return", "returns"} +YIELDS_KEYWORDS = {"yield", "yields"} +EXAMPLES_KEYWORDS = {"example", "examples"} + + +class ParseError(RuntimeError): + """Base class for all parsing related errors.""" + + +class DocstringStyle(enum.Enum): + """Docstring style.""" + + REST = 1 + GOOGLE = 2 + NUMPYDOC = 3 + EPYDOC = 4 + AUTO = 255 + + +class RenderingStyle(enum.Enum): + """Rendering style when unparsing parsed docstrings.""" + + COMPACT = 1 + CLEAN = 2 + EXPANDED = 3 + + +class DocstringMeta: + """Docstring meta information. + + Symbolizes lines in form of + + :param arg: description + :raises ValueError: if something happens + """ + + def __init__( + self, args: T.List[str], description: T.Optional[str] + ) -> None: + """Initialize self. + + :param args: list of arguments. The exact content of this variable is + dependent on the kind of docstring; it's used to distinguish + between custom docstring meta information items. + :param description: associated docstring description. + """ + self.args = args + self.description = description + + +class DocstringParam(DocstringMeta): + """DocstringMeta symbolizing :param metadata.""" + + def __init__( + self, + args: T.List[str], + description: T.Optional[str], + arg_name: str, + type_name: T.Optional[str], + is_optional: T.Optional[bool], + default: T.Optional[str], + ) -> None: + """Initialize self.""" + super().__init__(args, description) + self.arg_name = arg_name + self.type_name = type_name + self.is_optional = is_optional + self.default = default + + +class DocstringReturns(DocstringMeta): + """DocstringMeta symbolizing :returns or :yields metadata.""" + + def __init__( + self, + args: T.List[str], + description: T.Optional[str], + type_name: T.Optional[str], + is_generator: bool, + return_name: T.Optional[str] = None, + ) -> None: + """Initialize self.""" + super().__init__(args, description) + self.type_name = type_name + self.is_generator = is_generator + self.return_name = return_name + + +class DocstringRaises(DocstringMeta): + """DocstringMeta symbolizing :raises metadata.""" + + def __init__( + self, + args: T.List[str], + description: T.Optional[str], + type_name: T.Optional[str], + ) -> None: + """Initialize self.""" + super().__init__(args, description) + self.type_name = type_name + self.description = description + + +class DocstringDeprecated(DocstringMeta): + """DocstringMeta symbolizing deprecation metadata.""" + + def __init__( + self, + args: T.List[str], + description: T.Optional[str], + version: T.Optional[str], + ) -> None: + """Initialize self.""" + super().__init__(args, description) + self.version = version + self.description = description + + +class DocstringExample(DocstringMeta): + """DocstringMeta symbolizing example metadata.""" + + def __init__( + self, + args: T.List[str], + snippet: T.Optional[str], + description: T.Optional[str], + ) -> None: + """Initialize self.""" + super().__init__(args, description) + self.snippet = snippet + self.description = description + + +class Docstring: + """Docstring object representation.""" + + def __init__( + self, + style=None, # type: 
T.Optional[DocstringStyle] + ) -> None: + """Initialize self.""" + self.short_description = None # type: T.Optional[str] + self.long_description = None # type: T.Optional[str] + self.blank_after_short_description = False + self.blank_after_long_description = False + self.meta = [] # type: T.List[DocstringMeta] + self.style = style # type: T.Optional[DocstringStyle] + + @property + def params(self) -> T.List[DocstringParam]: + """Return a list of information on function params.""" + return {m.arg_name:m for m in self.meta if isinstance(m, DocstringParam)} + + @property + def raises(self) -> T.List[DocstringRaises]: + """Return a list of information on the exceptions that the function + may raise. + """ + return [ + item for item in self.meta if isinstance(item, DocstringRaises) + ] + + @property + def returns(self) -> T.Optional[DocstringReturns]: + """Return a single information on function return. + + Takes the first return information. + """ + for item in self.meta: + if isinstance(item, DocstringReturns): + return item + return None + + @property + def many_returns(self) -> T.List[DocstringReturns]: + """Return a list of information on function return.""" + return [ + item for item in self.meta if isinstance(item, DocstringReturns) + ] + + @property + def deprecation(self) -> T.Optional[DocstringDeprecated]: + """Return a single information on function deprecation notes.""" + for item in self.meta: + if isinstance(item, DocstringDeprecated): + return item + return None + + @property + def examples(self) -> T.List[DocstringExample]: + """Return a list of information on function examples.""" + return [ + item for item in self.meta if isinstance(item, DocstringExample) + ] + + +class SectionType(enum.IntEnum): + """Types of sections.""" + + SINGULAR = 0 + """For sections like examples.""" + + MULTIPLE = 1 + """For sections like params.""" + + SINGULAR_OR_MULTIPLE = 2 + """For sections like returns or yields.""" + + +class Section(namedtuple("SectionBase", "title key type")): + """A docstring section.""" + + +GOOGLE_TYPED_ARG_REGEX = re.compile(r"\s*(.+?)\s*\(\s*(.*[^\s]+)\s*\)") +GOOGLE_ARG_DESC_REGEX = re.compile(r".*\. Defaults to (.+)\.") +MULTIPLE_PATTERN = re.compile(r"(\s*[^:\s]+:)|([^:]*\]:.*)") + +DEFAULT_SECTIONS = [ + Section("Arguments", "param", SectionType.MULTIPLE), + Section("Args", "param", SectionType.MULTIPLE), + Section("Parameters", "param", SectionType.MULTIPLE), + Section("Params", "param", SectionType.MULTIPLE), + Section("Raises", "raises", SectionType.MULTIPLE), + Section("Exceptions", "raises", SectionType.MULTIPLE), + Section("Except", "raises", SectionType.MULTIPLE), + Section("Attributes", "attribute", SectionType.MULTIPLE), + Section("Example", "examples", SectionType.SINGULAR), + Section("Examples", "examples", SectionType.SINGULAR), + Section("Returns", "returns", SectionType.SINGULAR_OR_MULTIPLE), + Section("Yields", "yields", SectionType.SINGULAR_OR_MULTIPLE), +] + + +class GoogleDocstringParser: + """Parser for Google-style docstrings.""" + + def __init__( + self, sections: T.Optional[T.List[Section]] = None, title_colon=True + ): + """Setup sections. + + :param sections: Recognized sections or None to defaults. + :param title_colon: require colon after section title. 
+ """ + if not sections: + sections = DEFAULT_SECTIONS + self.sections = {s.title: s for s in sections} + self.title_colon = title_colon + self._setup() + + def _setup(self): + if self.title_colon: + colon = ":" + else: + colon = "" + self.titles_re = re.compile( + "^(" + + "|".join(f"({t})" for t in self.sections) + + ")" + + colon + + "[ \t\r\f\v]*$", + flags=re.M, + ) + + def _build_meta(self, text: str, title: str) -> DocstringMeta: + """Build docstring element. + + :param text: docstring element text + :param title: title of section containing element + :return: + """ + + section = self.sections[title] + + if ( + section.type == SectionType.SINGULAR_OR_MULTIPLE + and not MULTIPLE_PATTERN.match(text) + ) or section.type == SectionType.SINGULAR: + return self._build_single_meta(section, text) + + if ":" not in text: + # raise ParseError(f"Expected a colon in {text!r}.") + return None + + # Split spec and description + before, desc = text.split(":", 1) + if desc: + desc = desc[1:] if desc[0] == " " else desc + if "\n" in desc: + first_line, rest = desc.split("\n", 1) + desc = first_line + "\n" + inspect.cleandoc(rest) + desc = desc.strip("\n") + + return self._build_multi_meta(section, before, desc) + + @staticmethod + def _build_single_meta(section: Section, desc: str) -> DocstringMeta: + if section.key in RETURNS_KEYWORDS | YIELDS_KEYWORDS: + return DocstringReturns( + args=[section.key], + description=desc, + type_name=None, + is_generator=section.key in YIELDS_KEYWORDS, + ) + if section.key in RAISES_KEYWORDS: + return DocstringRaises( + args=[section.key], description=desc, type_name=None + ) + if section.key in EXAMPLES_KEYWORDS: + return DocstringExample( + args=[section.key], snippet=None, description=desc + ) + if section.key in PARAM_KEYWORDS: + raise ParseError("Expected paramenter name.") + return DocstringMeta(args=[section.key], description=desc) + + @staticmethod + def _build_multi_meta( + section: Section, before: str, desc: str + ) -> DocstringMeta: + if section.key in PARAM_KEYWORDS: + match = GOOGLE_TYPED_ARG_REGEX.match(before) + if match: + arg_name, type_name = match.group(1, 2) + if type_name.endswith(", optional"): + is_optional = True + type_name = type_name[:-10] + elif type_name.endswith("?"): + is_optional = True + type_name = type_name[:-1] + else: + is_optional = False + else: + arg_name, type_name = before, None + is_optional = None + + match = GOOGLE_ARG_DESC_REGEX.match(desc) + default = match.group(1) if match else None + + return DocstringParam( + args=[section.key, before], + description=desc, + arg_name=arg_name, + type_name=type_name, + is_optional=is_optional, + default=default, + ) + if section.key in RETURNS_KEYWORDS | YIELDS_KEYWORDS: + return DocstringReturns( + args=[section.key, before], + description=desc, + type_name=before, + is_generator=section.key in YIELDS_KEYWORDS, + ) + if section.key in RAISES_KEYWORDS: + return DocstringRaises( + args=[section.key, before], description=desc, type_name=before + ) + return DocstringMeta(args=[section.key, before], description=desc) + + def add_section(self, section: Section): + """Add or replace a section. + + :param section: The new section. + """ + + self.sections[section.title] = section + self._setup() + + def parse(self, text: str) -> Docstring: + """Parse the Google-style docstring into its components. 
+ + :returns: parsed docstring + """ + ret = Docstring(style=DocstringStyle.GOOGLE) + if not text: + return ret + + # Clean according to PEP-0257 + text = inspect.cleandoc(text) + + # Find first title and split on its position + match = self.titles_re.search(text) + if match: + desc_chunk = text[: match.start()] + meta_chunk = text[match.start() :] + else: + desc_chunk = text + meta_chunk = "" + + # Break description into short and long parts + parts = desc_chunk.split("\n", 1) + ret.short_description = parts[0] or None + if len(parts) > 1: + long_desc_chunk = parts[1] or "" + ret.blank_after_short_description = long_desc_chunk.startswith( + "\n" + ) + ret.blank_after_long_description = long_desc_chunk.endswith("\n\n") + ret.long_description = long_desc_chunk.strip() or None + + # Split by sections determined by titles + matches = list(self.titles_re.finditer(meta_chunk)) + if not matches: + return ret + splits = [] + for j in range(len(matches) - 1): + splits.append((matches[j].end(), matches[j + 1].start())) + splits.append((matches[-1].end(), len(meta_chunk))) + + chunks = OrderedDict() # type: T.Mapping[str,str] + for j, (start, end) in enumerate(splits): + title = matches[j].group(1) + if title not in self.sections: + continue + + # Clear Any Unknown Meta + # Ref: https://github.com/rr-/docstring_parser/issues/29 + meta_details = meta_chunk[start:end] + unknown_meta = re.search(r"\n\S", meta_details) + if unknown_meta is not None: + meta_details = meta_details[: unknown_meta.start()] + + chunks[title] = meta_details.strip("\n") + if not chunks: + return ret + + # Add elements from each chunk + for title, chunk in chunks.items(): + # Determine indent + indent_match = re.search(r"^\s*", chunk) + if not indent_match: + raise ParseError(f'Can\'t infer indent from "{chunk}"') + indent = indent_match.group() + + # Check for singular elements + if self.sections[title].type in [ + SectionType.SINGULAR, + SectionType.SINGULAR_OR_MULTIPLE, + ]: + part = inspect.cleandoc(chunk) + ret.meta.append(self._build_meta(part, title)) + continue + + # Split based on lines which have exactly that indent + _re = "^" + indent + r"(?=\S)" + c_matches = list(re.finditer(_re, chunk, flags=re.M)) + if not c_matches: + raise ParseError(f'No specification for "{title}": "{chunk}"') + c_splits = [] + for j in range(len(c_matches) - 1): + c_splits.append((c_matches[j].end(), c_matches[j + 1].start())) + c_splits.append((c_matches[-1].end(), len(chunk))) + for j, (start, end) in enumerate(c_splits): + part = chunk[start:end].strip("\n") + ret.meta.append(self._build_meta(part, title)) + + return ret + + +def verify_and_get_config_attr_descs(config_cls, strict_docstring_match=True): + + assert dataclasses.is_dataclass(config_cls), f"uh oh <{config_cls.__name__}>." + + # Parse docstring. + try: + docstring = GoogleDocstringParser().parse(config_cls.__doc__) + except Exception as e: + raise Exception(f"error parsing {config_cls.__name__} docstring.") + + # Get attributes and types. + config_attrs = docstring.params + config_types = config_cls.__annotations__ + + # Verify attribute names. 
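For reference, a minimal sketch of how the Google-style parser above is exercised, mirroring the GoogleDocstringParser().parse(config_cls.__doc__) call in verify_and_get_config_attr_descs; the ExampleConfig dataclass and its fields are illustrative placeholders, not part of Megatron:

    from dataclasses import dataclass

    from tools.retro.config_utils import GoogleDocstringParser

    @dataclass
    class ExampleConfig:
        """Example config.

        Args:
            chunk_length (int): Number of GPT tokens per chunk.
            index_str (str): Faiss index factory string.
        """
        chunk_length: int = 64
        index_str: str = "IVF262144_HNSW32,Flat"

    # 'params' maps each documented attribute name to its DocstringParam.
    parsed = GoogleDocstringParser().parse(ExampleConfig.__doc__)
    for name, param in parsed.params.items():
        print(name, param.type_name, param.description)
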
+ config_attr_keys = set(config_attrs.keys()) + config_type_keys = set(config_types.keys()) + missing_attr_keys = config_type_keys - config_attr_keys + extra_attr_keys = config_attr_keys - config_type_keys + if strict_docstring_match: + assert not missing_attr_keys and not extra_attr_keys, f"{config_cls.__name__} docstring is either missing attributes ({', '.join(missing_attr_keys) if missing_attr_keys else '--'}) or contains extra attributes ({', '.join(extra_attr_keys) if extra_attr_keys else '--'})." + + # @todo + # Verify attribute type names. + # for key in config_attr_keys: + # ... todo ... + + # Verify base class attributes. + attrs = {k:v for base_cls in config_cls.__bases__ if dataclasses.is_dataclass(base_cls) for k,v in verify_and_get_config_attr_descs(base_cls, strict_docstring_match=strict_docstring_match).items()} + for key in config_attr_keys: + if key in config_types: + attrs[key] = { + "desc" : config_attrs[key].description, + "type" : config_types[key], + } + + return attrs + + +def add_config_args(parser, config_cls): + attrs = verify_and_get_config_attr_descs(config_cls, strict_docstring_match=False) + for key, attr in attrs.items(): + _type = attr["type"] + if dataclasses.is_dataclass(_type): + group = parser.add_argument_group(title=attr["desc"]) + add_config_args(group, _type) + else: + + default_value = getattr(config_cls, key) + args = { + "help" : attr["desc"], + "default" : default_value, + } + + if _type == bool: + assert isinstance(args["default"], (bool, type(None))), \ + f"boolean attribute '{key}' of {config_cls.__name__} " \ + "has non-boolean default value." + + # When default=True, add 'no-{key}' arg. + if default_value: + args["action"] = "store_false" + args["dest"] = key + key = "no-" + key + else: + args["action"] = "store_true" + + elif _type in (int, float): + args["type"] = _type + + elif _type == list: + args["nargs"] = "*" + + # else: ....... treat as string arg + # raise Exception(f"specialize action for '{key}', type <{_type}>.") + + try: + parser.add_argument(f"--{key.replace('_', '-')}", **args) + except argparse.ArgumentError as e: + pass + + +def get_config_leaf_field_names(config_cls): + names = set() + for field in dataclasses.fields(config_cls): + if dataclasses.is_dataclass(field.type): + names.update(get_config_leaf_field_names(field.type)) + else: + names.add(field.name) + return names + + +def config_from_args(args, config_cls, add_custom_args=False): + + # Collect config data in a dict. + data = {} + for field in dataclasses.fields(config_cls): + if dataclasses.is_dataclass(field.type): + data[field.name] = config_from_args(args, field.type) + else: + data[field.name] = getattr(args, field.name) + + # Add custom args. (e.g., for tools, tasks) + if add_custom_args: + + config_keys = get_config_leaf_field_names(config_cls) + arg_keys = set(vars(args).keys()) + custom_keys = arg_keys - config_keys + + custom_data = {k:v for k, v in vars(args).items() if k in custom_keys} + custom_config_cls = dataclasses.make_dataclass( + "CustomConfig", + [(k, type(v)) for k, v in custom_data.items()]) + custom_config = custom_config_cls(**custom_data) + data["custom"] = custom_config + + # Create config. [ todo: programmatically create dataclass that inherits + # TransformerConfig. ] + config = config_cls(**data) + + return config + + +def flatten_config(config, base_config_cls=None): + + # Lift sub-config data. 
+ flat_config = {} + for field in dataclasses.fields(config): + value = getattr(config, field.name) + if dataclasses.is_dataclass(value): + flat_config = { **flat_config, **flatten_config(value) } + else: + flat_config[field.name] = value + + # Convert to dataclass. + if base_config_cls: + base_keys = set(field.name for field in dataclasses.fields(base_config_cls)) + flat_config_cls = dataclasses.make_dataclass( + cls_name="FlatMegatronConfig", + fields=[(k, T.Any, dataclasses.field(default=None)) + for k, v in flat_config.items() + if k not in base_keys], + bases=(base_config_cls,)) + flat_config = flat_config_cls(**flat_config) + + return flat_config diff --git a/tools/retro/db/__init__.py b/tools/retro/db/__init__.py deleted file mode 100644 index d1bf23d966..0000000000 --- a/tools/retro/db/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. - -from .build import build_db diff --git a/tools/retro/db/build.py b/tools/retro/db/build.py deleted file mode 100644 index 0b04a84e84..0000000000 --- a/tools/retro/db/build.py +++ /dev/null @@ -1,497 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. - -from collections import defaultdict -from concurrent.futures import as_completed, ProcessPoolExecutor -from functools import reduce -import glob -import json -import numpy as np -import os -from pathlib import Path -import threading -import torch -from tqdm import tqdm -import types - -from megatron import get_retro_args, print_rank_0 -from megatron.data.indexed_dataset import make_dataset as make_indexed_dataset -from megatron.tokenizer.tokenizer import ( - _BertWordPieceTokenizer, - _GPT2BPETokenizer, -) -from tools.bert_embedding.utils import get_missing_blocks_by_rank -from tools.retro.external_libs import h5py -from tools.retro.utils import get_gpt_tokenizer, get_bert_tokenizer - -from .utils import ( - get_indexed_dataset_infos, - get_indexed_dataset_infos_path, - get_individual_db_dir, - get_individual_chunk_db, - get_individual_doc_offsets, - get_merged_dataset, - get_merged_db_path_map, - save_indexed_dataset_infos, -) - - -def init_indexed_dataset_infos(): - '''Gather meta-info about each indexed dataset. - - The returned info array allows for easy access to the configuration, and - helps remove ambiguity. - ''' - - args = get_retro_args() - - assert len(args.data_path) % 2 == 0, \ - "currently, only blendable dataset is supported." - - # Dataset infos. - infos = [] - for i in range(0, len(args.data_path), 2): - ratio = float(args.data_path[i]) - prefix = args.data_path[i + 1] - path = prefix + ".bin" - name = os.path.basename(prefix) - assert os.path.exists(path), "couldn't find '%s'." % path - infos.append({ - "ratio" : ratio, - "prefix" : prefix, - "path" : path, - "name" : name, - "db_dir" : get_individual_db_dir(name), - "dataset" : make_indexed_dataset(prefix, "mmap", True), - }) - - return infos - - -def build_partial_db( - dataset_idx, - n_datasets, - indexed_dataset, - block_id, - n_blocks, - block, - proc_id, - n_procs, - tokenizers, -): - '''Process a document index range of the indexed dataset. - - The chunk database is built in parallel blocks, since de-tokenizing & - re-tokenizing for Bert-length computation is expensive. This method - iterates each document and extracts sequential 'chunk-length' sequences - from each document. - ''' - - args = get_retro_args() - - # Document start/end indexes. 
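The chunk extraction described in the docstring above reduces to fixed-stride index arithmetic over each document; a standalone sketch with illustrative lengths (doc_len and chunk_length are stand-in values):

    # Split a document of doc_len tokens into sequential chunks of
    # chunk_length tokens; the final chunk may be shorter than chunk_length.
    doc_len = 150
    chunk_length = 64  # i.e., args.retro_gpt_chunk_length
    chunk_start_idxs = list(range(0, doc_len, chunk_length))
    chunk_end_idxs = [min(doc_len, s + chunk_length) for s in chunk_start_idxs]
    print(list(zip(chunk_start_idxs, chunk_end_idxs)))  # [(0, 64), (64, 128), (128, 150)]
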
- doc_range = block["range"] - n_docs = doc_range[1] - doc_range[0] - n_docs_per_proc = int(np.ceil(n_docs / n_procs)) - doc_start_id = doc_range[0] + proc_id * n_docs_per_proc - doc_end_id = min(doc_range[1], doc_start_id + n_docs_per_proc) - - # Print progress. - progress_proc_ids = set(range(n_procs)) \ - if torch.distributed.get_rank() == 0 else set() - if proc_id in progress_proc_ids: - print(" > building partial chunk db, proc %d / %d, docs %d:%d / %d."%( - proc_id, - n_procs, - doc_start_id, - doc_end_id, - n_docs, - )) - - # Progress bars (snapshot of overall progress). - doc_id_iter = range(doc_start_id, doc_end_id) - pbar = tqdm(doc_id_iter) \ - if proc_id in progress_proc_ids else \ - doc_id_iter - - # Iterate documents & parse chunks. - chunk_db_valid = [] - chunk_db_invalid = [] - doc_size_map = {} - for doc_id in pbar: - - # Progress description. - try: - pbar.set_description("ds %d / %d, block %d / %d, proc %d / %d." % ( - dataset_idx, - n_datasets, - block_id, - n_blocks, - proc_id, - n_procs)) - except: - pass - - # Remove EOD token. - doc = indexed_dataset.get(doc_id) - if doc[-1].item() == tokenizers.gpt.eod: - doc = doc[:-1] - doc_len = len(doc) - - # Chunk start/end indexes. - chunk_start_idxs = list(range(0, doc_len, args.retro_gpt_chunk_length)) - chunk_end_idxs = [min(doc_len, s + args.retro_gpt_chunk_length) - for s in chunk_start_idxs] - - # Re-tokenize each chunk to Bert/Wordpiece (empty bert -> 'invalid'). - doc_size_map[doc_id] = 0 - for i, chunk_start_idx in enumerate(chunk_start_idxs): - - # Re-tokenize. - chunk_end_idx = chunk_end_idxs[i] - gpt_token_ids = indexed_dataset.get( - idx=doc_id, - offset=chunk_start_idx, - length=chunk_end_idx - chunk_start_idx, - ) - text = tokenizers.gpt.detokenize(gpt_token_ids.tolist()) - bert_token_ids = tokenizers.bert.tokenize(text) - - # 'Valid' for non-empty Bert chunks; 'invalid' otherwise. - if len(bert_token_ids) == 0: - _chunk_db = chunk_db_invalid - else: - _chunk_db = chunk_db_valid - doc_size_map[doc_id] += 1 - _chunk_db.append(( - doc_id, - chunk_start_idx, - chunk_end_idx, - len(bert_token_ids), - )) - - return proc_id, chunk_db_valid, chunk_db_invalid, doc_size_map - - -def build_individual_db(dataset_idx, n_datasets, dataset_info, tokenizers): - '''Process a single indexed dataset & extract chunks.''' - - args = get_retro_args() - - # Make directory. - db_dir = dataset_info["db_dir"] - os.makedirs(db_dir, exist_ok=True) - - # Indexed dataset. - indexed_dataset = dataset_info["dataset"] - - # Missing db blocks. - n_missing_world, missing_db_blocks = get_missing_blocks_by_rank( - db_dir, - len(indexed_dataset), - args.retro_doc_block_size, - validate=lambda f : f["chunks_valid"].shape == (0,) \ - or f["chunks_valid"].shape[1] == 4) - - # Prevent missing-path-write race condition. - torch.distributed.barrier() - - if not missing_db_blocks: - return - - # Num processes. - if n_missing_world == 1: - n_procs = 128 - elif n_missing_world <= 2: - n_procs = 64 - elif n_missing_world <= 4: - n_procs = 32 - elif n_missing_world <= 8: - n_procs = 16 - else: - n_procs = 8 - - # Process documents in parallel. - with ProcessPoolExecutor(max_workers=n_procs) as executor: - for block_idx, block in enumerate(missing_db_blocks): - - if block is not None: - - db_path = block["path"] - - # Build partial dbs. 
- print_rank_0(' > build partial dbs.') - futures = [] - for proc_id in range(n_procs): # not true process id - futures.append(executor.submit( - build_partial_db, - dataset_idx, - n_datasets, - indexed_dataset, - block_idx, - len(missing_db_blocks), - block, - proc_id, - n_procs, - tokenizers, - )) - partial_chunk_dbs = [] - for future in as_completed(futures): - partial_chunk_dbs.append(future.result()) - - # Concatenate chunks. - partial_chunk_dbs.sort(key=lambda item:item[0]) # sort by proc_id - chunk_db_valid = [item - for partial_chunk_db in partial_chunk_dbs - for item in partial_chunk_db[1]] - chunk_db_invalid = [item - for partial_chunk_db in partial_chunk_dbs - for item in partial_chunk_db[2]] - - # Convert to numpy. - print_rank_0(' > converting chunk db to numpy.') - chunk_db_valid = np.array(chunk_db_valid, dtype="uint32") - chunk_db_invalid = np.array(chunk_db_invalid, dtype="uint32") - - # Document offsets. - doc_sizes = [(d, s) - for partial_chunk_db in partial_chunk_dbs - for d, s in partial_chunk_db[3].items()] - doc_sizes.sort(key = lambda item : item[0]) - doc_offsets = np.cumsum([item[1] for item in doc_sizes]) \ - .astype("uint64") - doc_offsets = np.stack(( - np.array([item[0] for item in doc_sizes], dtype="uint64"), - doc_offsets), axis=1) - - # Save DB. - print_rank_0(" > saving individual db.") - with h5py.File(db_path, "w") as f: - dset = f.create_dataset("chunks_valid", data=chunk_db_valid) - dset = f.create_dataset("chunks_invalid", - data=chunk_db_invalid) - dset = f.create_dataset("doc_offsets", data=doc_offsets) - - # Wait for all ranks to finish block. - print_rank_0(" > waiting for all ranks to finish block.") - torch.distributed.barrier() - - print_rank_0(" > finished saving individual db.") - - -def build_individual_dbs(indexed_dataset_infos): - '''Iterate each indexed dataset & process its chunks.''' - - args = get_retro_args() - - # Tokenizers. - tokenizers = types.SimpleNamespace( - gpt=get_gpt_tokenizer(), - bert=get_bert_tokenizer(), - ) - - # Build individual DBs. - print_rank_0(" > build individual chunk dbs.") - for ds_idx, ds_info in enumerate(indexed_dataset_infos): - - # Progress. - print_rank_0(" > building individual db, dataset %d / %d ... '%s'." % ( - ds_idx, - len(indexed_dataset_infos), - ds_info["name"], - )) - - # Process single dataset. - build_individual_db(ds_idx, len(indexed_dataset_infos), - ds_info, tokenizers) - - -def update_chunk_counts(indexed_dataset_infos): - '''Set n_chunks_train & n_chunks sampled for each individual DB.''' - - args = get_retro_args() - - if torch.distributed.get_rank() != 0: - return - - # Data ratio sum (for setting index training chunks). - data_ratio_sum = sum([ d["ratio"] for d in indexed_dataset_infos ]) - - # Training split size (split at document level). - train_fraction = float(args.split.split(",")[0]) / 100 - assert train_fraction > 0 and train_fraction <= 1 - - # Set n_chunks (including n_chunks_sampled for unambiguity). - print_rank_0(" > compute n_chunks.") - for ds_index, ds_info in enumerate(indexed_dataset_infos): - - db_dir = ds_info["db_dir"] - db_paths = sorted(glob.glob(db_dir + "/*.hdf5")) - - # Update counts. 
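The train_fraction computed above comes from Megatron's standard '--split' argument (e.g., '--split 98,2,0' as used in the example scripts later in this diff); a small illustration of the arithmetic with that value:

    # A "98,2,0" split string -> fraction of each dataset's documents that
    # count toward the training portion of the chunk database.
    split = "98,2,0"
    train_fraction = float(split.split(",")[0]) / 100
    assert 0 < train_fraction <= 1
    print(train_fraction)  # 0.98
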
- ds_info["n_docs"] = len(ds_info["dataset"].doc_idx) - 1 - ds_info["n_docs_train"] = int(train_fraction * ds_info["n_docs"]) - ds_info["n_chunks"] = 0 # previously, 'n_chunks_valid' - ds_info["n_chunks_train"] = 0 - ds_info["n_chunks_invalid"] = 0 - for db_path in tqdm(db_paths, "%d/%d, %s" % ( - ds_index, len(indexed_dataset_infos), ds_info["name"])): - with h5py.File(db_path, "r") as f: - ds_info["n_chunks"] += len(f["chunks_valid"]) - ds_info["n_chunks_invalid"] += len(f["chunks_invalid"]) - ds_info["n_chunks_train"] += \ - (np.copy(f["chunks_valid"][:, 0]) < ds_info["n_docs_train"]) \ - .sum().item() - - ds_info["n_chunks_sampled"] = int(args.retro_index_ntrain * - ds_info["ratio"] / data_ratio_sum) - - # Verify counts. - assert ds_info["n_chunks_train"] <= ds_info["n_chunks"], \ - "n_train (%d) > n_total (%d)." % ( - ds_info["n_chunks_train"], ds_info["n_chunks"]) - assert ds_info["n_chunks_sampled"] <= ds_info["n_chunks_train"], \ - "n_sampled (%d) > n_train (%d)." % ( - ds_info["n_chunks_sampled"], ds_info["n_chunks_train"]) - - -def merge_dbs(indexed_dataset_infos, db_type): - '''Merge individual DBs into single DB.''' - - if torch.distributed.get_rank() != 0: - return - - print(" > build %s chunk db." % db_type) - - # Count chunks. - if db_type == "sampled": - n_chunks_key = "n_chunks_sampled" - n_docs_key = None - elif db_type == "train": - n_chunks_key = "n_chunks_train" - n_docs_key = "n_docs_train" - elif db_type == "valid": - n_docs_key = None - else: - raise Exception("handle db_type '%s'." % db_type) - - if db_type == "valid": - n_chunks = sum(m["n_chunks"] - m["n_chunks_train"] - for m in indexed_dataset_infos) - else: - n_chunks = sum(m[n_chunks_key] for m in indexed_dataset_infos) - n_docs = None if n_docs_key is None else \ - sum(m[n_docs_key] for m in indexed_dataset_infos) - - # DB path. - db_path = get_merged_db_path_map()[db_type] - - # Delete existing chunk db if incorrect size. - if os.path.exists(db_path): - - try: - - f = h5py.File(db_path) - n_alloc = len(f["chunks"]) # total allocated - n_written = f["n_written"][0].item() # total written - f.close() - - if n_chunks != n_alloc or n_chunks != n_written: - os.remove(db_path) - - except Exception as e: - if isinstance(e, OSError): - os.remove(db_path) - elif isinstance(e, KeyError): - f.close() - os.remove(db_path) - else: - raise e - - # Build merged chunk db. - if not os.path.exists(db_path): - - os.makedirs(os.path.dirname(db_path), exist_ok=True) - f = h5py.File(db_path, "w") - - # Initialize output arrays. - merged_chunk_db = \ - f.create_dataset("chunks", (n_chunks, 5), dtype="uint32") - merged_doc_offsets = None if n_docs_key is None else \ - f.create_dataset("doc_offsets", (n_docs, 3), dtype="uint64") - n_written = f.create_dataset("n_written", (1,), dtype="uint64") - n_written[0] = 0 - - # Iterate indexed datasets & collect chunks. - chunk_start_index = 0 - doc_start_index = 0 - doc_start_offset = 0 - for ds_idx, ds_info in enumerate(indexed_dataset_infos): - print(" > merging dbs; '%s', dataset %d / %d ... '%s'." 
% - (db_type, ds_idx, len(indexed_dataset_infos), ds_info["name"])) - individual_chunk_db = get_individual_chunk_db(ds_idx, ds_info) - individual_doc_offsets = None if n_docs_key is None else \ - get_individual_doc_offsets(ds_idx, ds_info) - - if db_type == "valid": - individual_chunk_db = \ - individual_chunk_db[ds_info["n_chunks_train"]:] - if n_docs_key is None: - individual_doc_offsets = None - else: - train_doc_offset = \ - individual_doc_offsets[ds_info["n_docs_train"] - 1, 2] - individual_doc_offsets = \ - np.copy(individual_doc_offsets[ds_info["n_docs_train"]:]) - individual_doc_offsets[:, 2] -= train_doc_offset - - print("~~~") - print(individual_doc_offsets) - print(train_doc_offset) - raise Exception("test me.") - else: - individual_chunk_db = \ - individual_chunk_db[:ds_info[n_chunks_key]] - individual_doc_offsets = None if n_docs_key is None else \ - np.copy(individual_doc_offsets[:ds_info[n_docs_key]]) - - merged_chunk_db[chunk_start_index:chunk_start_index+len(individual_chunk_db)] = individual_chunk_db - chunk_start_index += len(individual_chunk_db) - n_written[0] = chunk_start_index - if n_docs_key is not None: - individual_doc_offsets[:, 2] += doc_start_offset - doc_end_index = doc_start_index + individual_doc_offsets.shape[0] - merged_doc_offsets[doc_start_index:doc_end_index] = \ - individual_doc_offsets - doc_start_index = doc_end_index - doc_start_offset = individual_doc_offsets[-1, 2].item() - - f.close() - - -def build_db(): - '''Extract token chunks from each indexed dataset. - - Iterate each document of each indexed dataset, extract that document's - chunks, and save to a 'DB' (hdf5 file). - ''' - - # Indexed dataset info. - indexed_dataset_infos = init_indexed_dataset_infos() - - # Build dbs. - build_individual_dbs(indexed_dataset_infos) - - # Single-process going forward. - if torch.distributed.get_rank() != 0: - return - - # Update n_chunks & save indexed dataset infos. - if not os.path.exists(get_indexed_dataset_infos_path()): - update_chunk_counts(indexed_dataset_infos) - save_indexed_dataset_infos(indexed_dataset_infos) - indexed_dataset_infos = get_indexed_dataset_infos() - - # Merge dbs. - merge_dbs(indexed_dataset_infos, "sampled") - merge_dbs(indexed_dataset_infos, "train") - merge_dbs(indexed_dataset_infos, "valid") diff --git a/tools/retro/db/dataset.py b/tools/retro/db/dataset.py deleted file mode 100644 index 906f8946ac..0000000000 --- a/tools/retro/db/dataset.py +++ /dev/null @@ -1,74 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. - -import json -import numpy as np -import torch -from tqdm import tqdm - -from megatron import get_args, print_rank_0 -from tools.retro.external_libs import h5py -from tools.retro.utils import get_gpt_tokenizer - - -class DBDataset(torch.utils.data.Dataset): - '''Dataset for iterating chunks. - - Requires: - - List of indexed datasets - - Chunk index array, with format: - [dataset_idx, doc_id, start_idx, end_idx, bert_length]) - ''' - - def __init__(self, db_path, indexed_datasets, chunks, max_chunk_length): - - assert chunks.shape[1] == 5, "expected 5 columns (dataset_idx, " \ - "doc_idx, token_start_idx, token_end_idx, bert_chunk_length); " \ - "found %d columns." 
% chunks.shape[1] - - self.db_path = db_path - self.indexed_datasets = indexed_datasets - self.chunks = chunks - self.doc_chunk_map = None - - self.max_chunk_length = max_chunk_length - self.eod_token_id = get_gpt_tokenizer().eod - - def __len__(self): - return self.chunks.shape[0] - - def __getitem__(self, chunk_id): - - # Chunk start/end indexes. - indexed_dataset_id, doc_id, token_start_idx, token_end_idx, _ = \ - [ value.item() for value in self.chunks[chunk_id] ] - chunk_length = token_end_idx - token_start_idx - indexed_dataset = self.indexed_datasets[indexed_dataset_id] - - # Chunk token ids. - token_ids = indexed_dataset.get(doc_id, - offset=token_start_idx, - length=chunk_length) - - # Extend chunks to max_chunk_length by padding with EOD tokens. - if chunk_length != self.max_chunk_length: - assert chunk_length < self.max_chunk_length, "invalid chunk len." - token_ids = token_ids.tolist() - token_ids += [self.eod_token_id] * \ - (self.max_chunk_length - chunk_length) - - return { - "doc_id" : doc_id, - "text" : np.array(token_ids, dtype=np.int64), - } - - def load_doc_tuples(self): - '''Load the dataset & document ids. - - Load the dataset id & document id of each chunk in the database, to - be used for causality filtering during querying. - ''' - self.doc_tuples = np.zeros(shape=(len(self), 2), dtype="uint32") - block_size = int(1e6) - for start_idx in tqdm(range(0, len(self), block_size)): - end_idx = min(len(self), start_idx + block_size) - self.doc_tuples[start_idx:end_idx]=self.chunks[start_idx:end_idx,:2] diff --git a/tools/retro/db/utils.py b/tools/retro/db/utils.py deleted file mode 100644 index e51f370920..0000000000 --- a/tools/retro/db/utils.py +++ /dev/null @@ -1,143 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. - -from collections import defaultdict -import glob -import json -import numpy as np -import os -from tqdm import tqdm - -from megatron import get_retro_args, print_rank_0 -from megatron.data.indexed_dataset import make_dataset as make_indexed_dataset -from tools.retro.external_libs import h5py - -from .dataset import DBDataset - - -def get_base_db_workdir(): - '''Sub-directory for DB data.''' - args = get_retro_args() - return os.path.join(args.retro_workdir, "db") - - -def get_indexed_dataset_infos_path(): - '''Path to indexed dataset meta-infos.''' - return os.path.join(get_base_db_workdir(), "indexed_dataset_infos.json") - - -def save_indexed_dataset_infos(indexed_dataset_infos): - '''Save dataset order & meta-info.''' - - # Remove 'dataset' field. - clean_infos = [] - for info in indexed_dataset_infos: - info = dict(info) - del info["dataset"] - clean_infos.append(info) - - # Save. - with open(get_indexed_dataset_infos_path(), "w") as f: - json.dump(clean_infos, f, indent=4) - - -def get_indexed_dataset_infos(): - '''Load indexed dataset meta-infos.''' - - # Load json. - path = get_indexed_dataset_infos_path() - with open(path) as f: - infos = json.load(f) - - # Add indexed datasets. - for info in infos: - info["dataset"] = make_indexed_dataset(info["prefix"], "mmap", True) - - return infos - - -def get_individual_db_dir(name): - '''Individual DB's directory.''' - return os.path.join(get_base_db_workdir(), "individual", name) - - -def get_individual_chunk_db(ds_id, ds_info): - '''Load individual dataset's chunk DB.''' - db_paths = sorted(glob.glob(ds_info["db_dir"] + "/*hdf5")) - # *Note*: convert to dataset, rather than copying to memory. 
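For clarity, a sketch of the five-column chunk record layout that DBDataset above asserts and that the individual/merged DB helpers assemble; the row values are illustrative only:

    import numpy as np

    # Columns: dataset_idx, doc_id, token_start_idx, token_end_idx, bert_chunk_length.
    chunks = np.array(
        [
            [0, 17, 0, 64, 71],    # full 64-token GPT chunk of doc 17
            [0, 17, 64, 110, 52],  # trailing, shorter chunk of the same doc
        ],
        dtype="uint32",
    )
    assert chunks.shape[1] == 5
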
- db = np.zeros((ds_info["n_chunks"], 5), dtype="uint32") - db[:, 0] = ds_id - start_idx = 0 - for db_path in db_paths: - f = h5py.File(db_path, "r") - n_chunks_current = f["chunks_valid"].shape[0] - db[start_idx:(start_idx+n_chunks_current), 1:] = f["chunks_valid"] - start_idx += n_chunks_current - f.close() - - assert start_idx == ds_info["n_chunks"] - - return db - - -def get_individual_doc_offsets(ds_id, ds_info): - '''Load individual dataset's chunk DB.''' - paths = sorted(glob.glob(ds_info["db_dir"] + "/*hdf5")) - # *Note*: convert to dataset, rather than copying to memory. - doc_offsets = np.zeros((ds_info["n_docs"], 3), dtype="uint64") - doc_offsets[:, 0] = ds_id - start_idx = 0 - start_offset = 0 - for path in paths: - with h5py.File(path) as f: - current_doc_offsets = np.copy(f["doc_offsets"]) - current_doc_offsets[:, 1] += start_offset - current_ndocs = current_doc_offsets.shape[0] - doc_offsets[start_idx:(start_idx+current_ndocs), 1:] = \ - current_doc_offsets - start_idx += current_ndocs - start_offset = current_doc_offsets[-1, 1].item() - - return doc_offsets - - -def get_merged_db_path_map(): - '''Paths to merged datasets.''' - base_dir = get_base_db_workdir() - return { - "sampled" : os.path.join(base_dir, "merged", "sampled.hdf5"), - "train" : os.path.join(base_dir, "merged", "train.hdf5"), - "valid" : os.path.join(base_dir, "merged", "valid.hdf5"), - } - - -def get_merged_dataset(db_type, indexed_dataset_infos=None): - '''Get merged dataset.''' - - args = get_retro_args() - - if not indexed_dataset_infos: - indexed_dataset_infos = get_indexed_dataset_infos() - - # Load chunks. - db_path = get_merged_db_path_map()[db_type] - f = h5py.File(db_path, "r") - chunks = f["chunks"] - - # DB dataset. - indexed_datasets = [ info["dataset"] for info in indexed_dataset_infos ] - dataset = DBDataset(db_path, indexed_datasets, chunks, - args.retro_gpt_chunk_length) - - return dataset - - -def get_merged_sampled_dataset(indexed_dataset_infos=None): - return get_merged_dataset("sampled", indexed_dataset_infos) - - -def get_merged_train_dataset(indexed_dataset_infos=None): - return get_merged_dataset("train", indexed_dataset_infos) - - -def get_merged_valid_dataset(indexed_dataset_infos=None): - return get_merged_dataset("valid", indexed_dataset_infos) diff --git a/tools/retro/docker/Dockerfile b/tools/retro/docker/Dockerfile new file mode 100644 index 0000000000..e8945b373a --- /dev/null +++ b/tools/retro/docker/Dockerfile @@ -0,0 +1,19 @@ +FROM nvcr.io/nvidia/pytorch:23.09-py3 + +RUN pip install -U faiss-gpu + +RUN apt update + +RUN apt install -qy htop + +RUN pip install -U transformers + +RUN pip install --upgrade google-api-python-client + +RUN pip install sentencepiece + +RUN pip install h5py + +RUN pip install nltk + +RUN pip install einops diff --git a/tools/retro/examples/get_dataset_configs.sh b/tools/retro/examples/get_dataset_configs.sh deleted file mode 100644 index 3a61a059f3..0000000000 --- a/tools/retro/examples/get_dataset_configs.sh +++ /dev/null @@ -1,43 +0,0 @@ -#!/bin/bash - -# Small English Wikipedia dataset (~2M chunks). -get_wiki_tiny_config() { - RETRO_INDEX_STR="IVF4096_HNSW4,Flat" - RETRO_NCHUNKS_SAMPLED=2281307 - RETRO_GPT_TRAIN_SAMPLES=31250 - LR_DECAY_SAMPLES=2 - LR_WARMUP_SAMPLES=1 - RETRO_GPT_EVAL_INTERVAL=2000 - RETRO_GPT_EVAL_ITERS=100 - RETRO_EF_SEARCH=4 - RETRO_NPROBE=64 - DATALOADER_TYPE=cyclic -} - -# English Wikipedia dataset (~67M chunks). 
-get_wiki_config() { - RETRO_INDEX_STR="IVF262144_HNSW32,Flat" - RETRO_NCHUNKS_SAMPLED=66625331 - RETRO_GPT_TRAIN_SAMPLES=2037248 - LR_DECAY_SAMPLES=2 - LR_WARMUP_SAMPLES=1 - RETRO_GPT_EVAL_INTERVAL=2000 - RETRO_GPT_EVAL_ITERS=100 - RETRO_EF_SEARCH=16 - RETRO_NPROBE=4096 - DATALOADER_TYPE=cyclic -} - -# Full corpus (~5B chunks). -get_corpus_config() { - RETRO_INDEX_STR="OPQ64_128,IVF4194304_HNSW32,PQ64" - RETRO_NCHUNKS_SAMPLED=300000000 - RETRO_GPT_TRAIN_SAMPLES=192000000 - LR_DECAY_SAMPLES=166400000 - LR_WARMUP_SAMPLES=162761 - RETRO_GPT_EVAL_INTERVAL=2000 - RETRO_GPT_EVAL_ITERS=50 - RETRO_EF_SEARCH=32 - RETRO_NPROBE=4096 - DATALOADER_TYPE=single -} diff --git a/tools/retro/examples/get_preprocess_cmd.sh b/tools/retro/examples/get_preprocess_cmd.sh deleted file mode 100644 index 1ba29d0b96..0000000000 --- a/tools/retro/examples/get_preprocess_cmd.sh +++ /dev/null @@ -1,137 +0,0 @@ -#!/bin/bash - -# Build preprocessing command for Retro. - -set -u -DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) - -################ Required environment variables. ################ -# Required environment variables: -# - REPO_DIR : Root directory of Megatron codebase. -# - RETRO_WORKDIR : Root directory of this Retro project's processed data. (For -# example, this project directory might be for a blended dataset, while -# another project directory might be for just a Wikipedia dataset, and -# another for just Book Corpus data, etc.) This project directory will -# contain a complete set of processed data, including the retrieval -# database, search index, and pretraining neighbors. -# - RETRO_TASKS : One of 'build', 'db-build', 'index-build', or -# 'pretraining-query-neighbors'. See 'Retro tasks' below for task -# descriptions. -# - DATA_BLEND_SCRIPT : Path to blended dataset definition file. -# - GPT_VOCAB_FILE : GPT vocab file. -# - GPT_MERGE_FILE : GPT merge file. -# - GPT_TOKENIZER : GPT tokenizer type (e.g., GPT2BPETokenizer) -# - BERT_LOAD_PATH : Bert checkpoint directory. -# - BERT_VOCAB_FILE : Bert vocab file. -# - BERT_TOKENIZER : Bert tokenizer type (e.g., BertWordPieceLowerCase, -# BertWordPieceCase). -# - BERT_EMBEDDER_TYPE : One of 'megatron' or 'huggingface'. -# - EXTRA_ARGS : Extra arguments (else, leave empty). - -################ Data blend. ################ -. ${DATA_BLEND_SCRIPT} -DATA_PATH=${DATA_BLEND} - -################ Retro setup. ################ -RETRO_GPT_SEQ_LENGTH=2048 -RETRO_GPT_CHUNK_LENGTH=64 -RETRO_GPT_MICRO_BATCH_SIZE=1 # *8 -RETRO_GPT_GLOBAL_BATCH_SIZE=256 - -################ Retro tasks. ################ -# The '--retro-tasks' argument is a comma-separated list of tasks to run, in -# sequential order. For a quick start, simply set this to 'build' to run the -# entire preprocessing pipeline. For finer control, you may specify the list of -# tasks to run. This is desirable for tuning computational resources. For -# example, training the search index is relatively fast and utilizes GPUs, -# while querying the search index is relatively slow, CPU-only, and memory -# intensive (i.e., multiple populated search indexes are loaded simultaneously). - -# *Note* : Once the task(s) below have been completed -- by running either -# 1) 'build', or 2) the sequential combination of 'db-build', 'index-build', -# and 'pretraining-query-neighbors' -- we are ready to pretrain Retro by -# calling pretrain_retro.py. - -# ---- Option #1 : Run entire pipeline. ---- - -# RETRO_TASKS="build" # (*note*: default tasks) - -# ---- Option #2 : Run specific stages. 
---- -# *Note*: Run the following stages in the given order. Optionally, tune your -# cluster setup for each stage, as described above. - -# RETRO_TASKS="db-build" # ....................... run 1st -# RETRO_TASKS="index-build" # .................... run 2nd -# RETRO_TASKS="pretraining-query-neighbors" # .... run 3rd - -################ Megatron args. ################ -MEGATRON_ARGS=" \ - --seed 1234 \ - --distributed-timeout-minutes 600 \ - --tokenizer-type ${BERT_TOKENIZER} \ - --tensor-model-parallel-size 1 \ - --pipeline-model-parallel-size 1 \ - --num-layers 24 \ - --hidden-size 1024 \ - --num-attention-heads 16 \ - --micro-batch-size ${RETRO_GPT_MICRO_BATCH_SIZE} \ - --global-batch-size ${RETRO_GPT_GLOBAL_BATCH_SIZE} \ - --seq-length 512 \ - --max-position-embeddings 512 \ - --train-samples ${RETRO_GPT_TRAIN_SAMPLES} \ - --load ${BERT_LOAD_PATH} \ - --exit-on-missing-checkpoint \ - --no-load-optim \ - --data-path ${DATA_PATH} \ - --vocab-file ${BERT_VOCAB_FILE} \ - --data-impl mmap \ - --split 98,2,0 \ - --distributed-backend nccl \ - --lr 0.0001 \ - --lr-decay-style linear \ - --min-lr 1.0e-5 \ - --lr-decay-samples ${LR_DECAY_SAMPLES} \ - --lr-warmup-samples ${LR_WARMUP_SAMPLES} \ - --weight-decay 1e-2 \ - --clip-grad 1.0 \ - --eval-interval ${RETRO_GPT_EVAL_INTERVAL} \ - --eval-iters ${RETRO_GPT_EVAL_ITERS} \ - --fp16 \ - --DDP-impl local \ - --dataloader-type ${DATALOADER_TYPE} \ - --no-data-sharding \ - --no-gradient-accumulation-fusion \ - --no-async-tensor-model-parallel-allreduce \ -" - -################ Retro args. ################ -RETRO_ARGS=" \ - --bert-embedder-type ${BERT_EMBEDDER_TYPE} \ - --output-bert-embeddings \ - \ - --retro-gpt-vocab-file ${GPT_VOCAB_FILE} \ - --retro-gpt-merge-file ${GPT_MERGE_FILE} \ - --retro-gpt-tokenizer-type ${GPT_TOKENIZER} \ - --retro-gpt-seq-length ${RETRO_GPT_SEQ_LENGTH} \ - --retro-gpt-chunk-length ${RETRO_GPT_CHUNK_LENGTH} \ - --retro-bert-vocab-file ${BERT_VOCAB_FILE} \ - --retro-bert-tokenizer-type ${BERT_TOKENIZER} \ - \ - --retro-tasks ${RETRO_TASKS} \ - --retro-index-str ${RETRO_INDEX_STR} \ - --retro-ef-search ${RETRO_EF_SEARCH} \ - --retro-nprobe ${RETRO_NPROBE} \ - \ - --retro-workdir ${RETRO_WORKDIR} \ - --retro-nchunks-sampled ${RETRO_NCHUNKS_SAMPLED} \ - \ - --retro-return-doc-ids \ -" - -################ Command. ################ -RETRO_PREPROCESS_CMD=" \ - ./tools/retro/main.py \ - ${MEGATRON_ARGS} \ - ${RETRO_ARGS} \ - ${EXTRA_ARGS} \ -" diff --git a/tools/retro/examples/preprocess_data.sh b/tools/retro/examples/preprocess_data.sh deleted file mode 100644 index 74cdf1823d..0000000000 --- a/tools/retro/examples/preprocess_data.sh +++ /dev/null @@ -1,50 +0,0 @@ -#!/bin/bash - -set -u -unset NCCL_DEBUG - -NPROCS=8 # NPROCS must be <= number of GPUs. - -set_current_dir() { - DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) -} - -################ Dataset configs. ################ -# This script contains methods to customize arguments to specific dataset -# types. Customize this script as needed for your datasets. -set_current_dir -. $DIR/get_dataset_configs.sh - -################ Environment variables. ################ -# *Note*: See 'Required environment variables' in 'get_preprocess_cmd.sh' for -# a description of the required environment variables. These variables can be -# set however a user would like. In our setup, we use another bash script -# (location defined by $RETRO_ENV_VARS) that sets all the environment variables -# at once. -. $RETRO_ENV_VARS - -######## Environment vars. 
######## -set_current_dir -. ${DIR}/get_preprocess_cmd.sh - -echo "~~~~~~~~~~~~~~~~~~~~~~~~~~" -echo "DIR = '$DIR'." -echo "RETRO_PREPROCESS_CMD = '$RETRO_PREPROCESS_CMD'." -echo "~~~~~~~~~~~~~~~~~~~~~~~~~~" - -######## Command. ######## -FULL_CMD="\ - pwd && cd ${REPO_DIR} && pwd && \ - export PYTHONPATH=$PYTHONPATH:${REPO_DIR} && \ - python -m torch.distributed.run \ - --nproc_per_node ${NPROCS} \ - --nnodes 1 \ - --node_rank ${NODE_RANK} \ - --master_addr ${MASTER_ADDR} \ - --master_port 6000 \ - $RETRO_PREPROCESS_CMD \ -" -echo "~~~~~~~~~~~~~~~~~~~~~~~~~~" -echo "FULL_CMD = '$FULL_CMD'." -echo "~~~~~~~~~~~~~~~~~~~~~~~~~~" -eval $FULL_CMD diff --git a/tools/retro/examples/pretrain_model.sh b/tools/retro/examples/pretrain_model.sh deleted file mode 100644 index 367d87ce63..0000000000 --- a/tools/retro/examples/pretrain_model.sh +++ /dev/null @@ -1,105 +0,0 @@ -#!/bin/bash - -################################################## -# Example script for pretraining Retro. -################################################## - -set -u -unset NCCL_DEBUG -export CUDA_DEVICE_MAX_CONNECTIONS=1 - -NPROCS=8 # NPROCS must be <= number of GPUs. - -################ Dataset configs. ################ -# This script contains methods to customize arguments to specific dataset -# types. Customize this script as needed for your datasets. -DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) -. $DIR/get_dataset_configs.sh - -################ Environment variables. ################ -# *Note*: See 'Required environment variables' in 'get_preprocess_cmd.sh' for -# a description of the required environment variables. These variables can be -# set however a user would like. In our setup, we use another bash script -# (location defined by $RETRO_ENV_VARS) that sets all the environment variables -# at once. -. $RETRO_ENV_VARS - -################ Data blend. ################ -. ${DATA_BLEND_SCRIPT} -DATA_PATH=${DATA_BLEND} - -######## Retro setup. ######## -RETRO_ADD_RETRIEVER=0 -RETRO_CYCLIC_TRAIN_ITERS=750000 -RETRO_NUM_NEIGHBORS=2 - -######## Arguments. 
######## -CHECKPOINT_DIR=${RETRO_WORKDIR}/checkpoints/${RETRO_ADD_RETRIEVER} -TENSORBOARD_DIR="${CHECKPOINT_DIR}/tensorboard" -mkdir -p ${TENSORBOARD_DIR} -ARGS=" \ - --save-interval 1000 \ - --save ${CHECKPOINT_DIR} \ - --load ${CHECKPOINT_DIR} \ - --tensorboard-dir ${TENSORBOARD_DIR} \ - --log-interval 5 \ - --tensor-model-parallel-size 1 \ - --pipeline-model-parallel-size 1 \ - --num-layers 12 \ - --hidden-size 768 \ - --num-attention-heads 12 \ - --seq-length 2048 \ - --max-position-embeddings 2048 \ - --micro-batch-size 4 \ - --global-batch-size 256 \ - --train-samples ${RETRO_GPT_TRAIN_SAMPLES} \ - --lr-decay-samples ${LR_DECAY_SAMPLES} \ - --lr-warmup-samples ${LR_WARMUP_SAMPLES} \ - --lr 6.0e-4 \ - --min-lr 6.0e-5 \ - --lr-decay-style cosine \ - --eval-interval ${RETRO_GPT_EVAL_INTERVAL} \ - --eval-iters ${RETRO_GPT_EVAL_ITERS} \ - --data-path ${DATA_PATH} \ - --vocab-file ${GPT_VOCAB_FILE} \ - --merge-file ${GPT_MERGE_FILE} \ - --split 98,2,0 \ - --clip-grad 1.0 \ - --weight-decay 0.1 \ - --adam-beta1 0.9 \ - --adam-beta2 0.95 \ - --init-method-std 0.023 \ - --log-params-norm \ - --log-num-zeros-in-grad \ - --fp16 \ - --DDP-impl local \ - --dataloader-type ${DATALOADER_TYPE} \ - --no-data-sharding \ - --no-gradient-accumulation-fusion \ -" - -if [ "$RETRO_ADD_RETRIEVER" = "0" ]; then - SCRIPT=pretrain_gpt.py -else - ARGS="${ARGS} \ - --retro-add-retriever \ - --retro-workdir ${RETRO_WORKDIR} \ - --retro-cyclic-train-iters ${RETRO_CYCLIC_TRAIN_ITERS} \ - --retro-num-neighbors ${RETRO_NUM_NEIGHBORS} \ - " - SCRIPT=pretrain_retro.py -fi - -echo "~~~~~~~~~~~~~~~~~~~~~~~~~~" -echo "SCRIPT = '$SCRIPT'." -echo "ARGS = '$ARGS'." -echo "~~~~~~~~~~~~~~~~~~~~~~~~~~" - -python -m torch.distributed.run \ - --nproc_per_node ${NPROCS} \ - --nnodes 1 \ - --node_rank 0 \ - --master_addr localhost \ - --master_port 6000 \ - ${SCRIPT} \ - ${ARGS} \ diff --git a/tools/retro/external_libs.py b/tools/retro/external_libs.py deleted file mode 100644 index 1a160b842c..0000000000 --- a/tools/retro/external_libs.py +++ /dev/null @@ -1,15 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. - -import importlib - -required_libs = [ - "faiss", - "h5py", - "transformers", # for huggingface bert -] - -for lib in required_libs: - try: - globals()[lib] = importlib.import_module(lib) - except ImportError as e: - raise Exception(f"Missing one or more packages required for Retro preprocessing: {required_libs}. Tried importing '{lib}'.") diff --git a/tools/retro/index/__init__.py b/tools/retro/index/__init__.py deleted file mode 100644 index 5b17f7f0fe..0000000000 --- a/tools/retro/index/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. - -from .build import add_to_index, build_index, train_index -# from .index import Index diff --git a/tools/retro/index/build.py b/tools/retro/index/build.py deleted file mode 100644 index 927b63e46f..0000000000 --- a/tools/retro/index/build.py +++ /dev/null @@ -1,187 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
- -import numpy as np -import os -import shutil -import torch -from tqdm import tqdm - -from megatron import get_retro_args, print_rank_0 -from tools.bert_embedding import DiskDataParallelBertEmbedder -from tools.retro.db.utils import ( - get_indexed_dataset_infos, - get_merged_sampled_dataset, - get_merged_train_dataset, -) -from tools.retro.external_libs import h5py -from tools.retro.index.factory import IndexFactory -from tools.retro.utils import GPTToTextDataset - -from .utils import ( - get_training_data_block_dir, - get_training_data_block_paths, - get_training_data_merged_path, - get_training_data_root_dir, -) - - -################################################## -# Train index. -################################################## - - -def get_empty_index_path(): - '''Path of empty index.''' - args = get_retro_args() - index = IndexFactory.get_index(args.retro_index_type) - empty_index_path = index.get_empty_index_path() - return empty_index_path - - -def get_block_nload(block_path, load_fraction): - with h5py.File(block_path) as fi: - return int(load_fraction * fi["data"].shape[0]) - - -def merge_embedding_blocks(): - - if torch.distributed.get_rank() != 0: - return - - args = get_retro_args() - - # Get block, merged paths. - load_fraction = args.retro_index_train_load_fraction - block_paths = get_training_data_block_paths() - bin_path = get_training_data_merged_path() - - # Skip, if already built. - if os.path.exists(bin_path): - return - - # Merge blocks. - with open(bin_path, "wb") as fo: - byte_offset = 0 - for block_idx, block_path in \ - enumerate(tqdm(block_paths, "merge train embeddings")): - with h5py.File(block_path) as fi: - - nload = get_block_nload(block_path, load_fraction) - block = np.array(fi["data"][:nload], copy = False) - - fo.write(block.tobytes()) - - byte_offset += block.size * block.itemsize - fo.seek(byte_offset) - - -def embed_db(): - '''Embed DB chunks. - - Store chunks in blocks on disk. These blocks will later be merged into - a single dataset for training the index. - ''' - - args = get_retro_args() - - merged_train_data_path = get_training_data_merged_path() - if os.path.exists(merged_train_data_path): - return - - # Get db dataset. - gpt_dataset = get_merged_sampled_dataset() - text_dataset = GPTToTextDataset(gpt_dataset) - - # Embed dataset. - embedder = DiskDataParallelBertEmbedder(args.retro_bert_batch_size, - args.retro_bert_max_chunk_length, - args.retro_block_size, - args.bert_embedder_type) - embedder.embed_text_dataset("index", - get_training_data_block_dir(), - text_dataset) - - # Merge embeddings. - merge_embedding_blocks() - - -def train_on_embeddings(): - '''Train index on embedded DB chunks.''' - args = get_retro_args() - index = IndexFactory.get_index(args.retro_index_type) - index.train() - - -def remove_embeddings(): - '''Remove embeddings after training.''' - torch.distributed.barrier() - if torch.distributed.get_rank() != 0: - return - empty_index_path = get_empty_index_path() - assert os.path.isfile(empty_index_path) - shutil.rmtree(get_training_data_root_dir(), ignore_errors=True) - - -def train_index(): - '''Train index on DB chunks.''' - - args = get_retro_args() - - # Check if trained index already exists. - if not os.path.isfile(get_empty_index_path()): - - # Embed training chunks. - embed_db() - - # Train index on embeddings. - train_on_embeddings() - - # Wait for (single-process) training to complete. - torch.distributed.barrier() - - # Remove embeddings. 
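# ---------------------------------------------------------------------------
# Editor's note: illustrative sketch, not part of the patch. A worked example
# of the sizes handled by merge_embedding_blocks() / get_block_nload() above.
# All numbers are hypothetical: with --retro-index-train-load-fraction 0.75,
# a 100000 x 1024 float32 embedding block contributes its first 75000 rows
# to the merged training-data .bin file.
load_fraction = 0.75
rows, dim, itemsize = 100_000, 1024, 4        # float32 Bert-large embeddings
nload = int(load_fraction * rows)             # 75000, as in get_block_nload()
bytes_written = nload * dim * itemsize        # 307,200,000 bytes for this block
print(nload, bytes_written)
# ---------------------------------------------------------------------------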
- if args.retro_index_delete_training_embeddings: - remove_embeddings() - - -################################################## -# Add to index. -################################################## - - -def add_to_index(): - '''Add DB chunks to index.''' - - args = get_retro_args() - - # Get index. - index = IndexFactory.get_index(args.retro_index_type) - - # Get text dataset. - gpt_dataset = get_merged_train_dataset() - text_dataset = GPTToTextDataset(gpt_dataset) - - # Add to index. - output_index_path = index.add(text_dataset) - - return output_index_path - - -################################################## -# Build index (train + add). -################################################## - - -def build_index(): - '''Build index. - - Building index involves sequentially running stages above: - - Train index (on sampled training chunks). - - Add to index (on all training chunks). - ''' - - # Train index. - train_index() - - # Add to index. - add_to_index() diff --git a/tools/retro/index/factory.py b/tools/retro/index/factory.py deleted file mode 100644 index 3e247efeae..0000000000 --- a/tools/retro/index/factory.py +++ /dev/null @@ -1,23 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. - -from .indexes import FaissBaseIndex, FaissParallelAddIndex - - -class IndexFactory: - '''Get index. - - Index type generally read from argument '--retro-index-ty'. - ''' - - @classmethod - def get_index_class(cls, index_type): - return { - "faiss-base" : FaissBaseIndex, - "faiss-par-add" : FaissParallelAddIndex, - }[index_type] - - @classmethod - def get_index(cls, index_type): - index_class = cls.get_index_class(index_type) - index = index_class() - return index diff --git a/tools/retro/index/index.py b/tools/retro/index/index.py deleted file mode 100644 index 3d41d35735..0000000000 --- a/tools/retro/index/index.py +++ /dev/null @@ -1,67 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. - -import abc -import numpy as np -import os -import torch - -from megatron import get_retro_args -from tools.retro.external_libs import faiss - -from .utils import get_index_dir - - -class Index(abc.ABC): - - '''Abstract base class for indexes. - - *Note* : While currently only Faiss-based classes are implemented, in the - future, this class will be extended with other types of indexes that have - different performance-accuracy trade-offs. - - The primary methods to override are: - - train() : Train index on the sampled training chunks. - - add() : Add all training chunks to index. 
- ''' - - @classmethod - def c_verbose(cls, index, v): - '''Make index object verbose.''' - assert isinstance(v, bool) - faiss.ParameterSpace().set_index_parameter(index, "verbose", v) - - def get_empty_index_path(self): - args = get_retro_args() - return os.path.join( - get_index_dir(), - "empty_%.3f.faissindex" % args.retro_index_train_load_fraction, - ) - - def get_empty_index(self): - return faiss.read_index(self.get_empty_index_path()) - - def get_added_index_path(self): - args = get_retro_args() - return os.path.join( - get_index_dir(), - "added_%.3f_%.3f.faissindex" % ( - args.retro_index_train_load_fraction, - args.retro_index_add_load_fraction, - ), - ) - - def get_added_index(self): - return faiss.read_index(self.get_added_index_path()) - - @abc.abstractmethod - def train(self, *args): - pass - - @abc.abstractmethod - def add(self, *args): - pass - - def embed_text_dataset_block(self, embedder, text_dataset, _range): - '''Embed a range of a text dataset.''' - sub_dataset = torch.utils.data.Subset(text_dataset, range(*_range)) - return embedder.embed_text_dataset(sub_dataset) diff --git a/tools/retro/index/indexes/__init__.py b/tools/retro/index/indexes/__init__.py deleted file mode 100644 index 30e8a3c111..0000000000 --- a/tools/retro/index/indexes/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. - -from .faiss_base import FaissBaseIndex -from .faiss_par_add import FaissParallelAddIndex diff --git a/tools/retro/index/indexes/faiss_base.py b/tools/retro/index/indexes/faiss_base.py deleted file mode 100644 index 3215e6273d..0000000000 --- a/tools/retro/index/indexes/faiss_base.py +++ /dev/null @@ -1,137 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. - -""" -This class implements a simple, un-optimized wrapper around a Faiss index, that -implements the Index interface (see ..index.py). While this class is -instantiable, it is meant to be extended with optimizations in classes that -inherit from this class (see FaissParAddIndex, for an example). -""" - -from datetime import timedelta -import numpy as np -import os -import torch -from tqdm import tqdm - -from megatron import get_retro_args, print_rank_0 -from tools.bert_embedding import BertEmbedder -from tools.retro.external_libs import faiss -from tools.retro.index.index import Index -from tools.retro.index.utils import ( - get_training_data_merged_path, - num_samples_to_block_ranges, -) - - -class FaissBaseIndex(Index): - - def _train(self): - '''Train index (rank 0's method).''' - - args = get_retro_args() - - assert torch.distributed.get_rank() == 0 - - # Set num threads (torch.distributed reset it to 1). - # faiss.omp_set_num_threads(32) - faiss.omp_set_num_threads(64) - # faiss.omp_set_num_threads(128) - - empty_index_path = self.get_empty_index_path() - - # Index already exists? -> return. - if os.path.isfile(empty_index_path): - return - - # Load data. - merged_path = get_training_data_merged_path() - inp = np.memmap( - merged_path, - dtype = "f4", - mode = "r", - ).reshape((-1, args.hidden_size)) - - # Init index. - index = faiss.index_factory(args.retro_index_nfeats, - args.retro_index_str) - - # Move to GPU. 
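# ---------------------------------------------------------------------------
# Editor's note: illustrative sketch, not part of the patch. The load
# fractions are baked into the Faiss index file names produced by
# get_empty_index_path() / get_added_index_path() above, so changing either
# '--retro-index-train-load-fraction' or '--retro-index-add-load-fraction'
# selects (or forces rebuilding of) a different index file. The fraction
# values below are hypothetical.
train_load_fraction, add_load_fraction = 0.75, 1.0
print("empty_%.3f.faissindex" % train_load_fraction)
# -> empty_0.750.faissindex
print("added_%.3f_%.3f.faissindex" % (train_load_fraction, add_load_fraction))
# -> added_0.750_1.000.faissindex
# ---------------------------------------------------------------------------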
- print("> move faiss index to gpu.") - index_ivf = faiss.extract_index_ivf(index) - clustering_index = \ - faiss.index_cpu_to_all_gpus(faiss.IndexFlatL2(index_ivf.d)) - index_ivf.clustering_index = clustering_index - print("> finished moving to gpu.") - self.c_verbose(index, True) - self.c_verbose(index_ivf, True) - self.c_verbose(index_ivf.quantizer, True) - self.c_verbose(index_ivf.clustering_index, True) - - # Train index. - index.train(inp) - - # Save index. - faiss.write_index(index, empty_index_path) - - def train(self): - '''Train index.''' - - # Single process only. - if torch.distributed.get_rank() == 0: - self._train() - - torch.distributed.barrier() - - def _add(self, text_dataset): - '''Add to index (rank 0's method).''' - - assert torch.distributed.get_rank() == 0 - - args = get_retro_args() - - dataset_sample_ranges = num_samples_to_block_ranges(len(text_dataset)) - - # Set num threads (torch.distributed reset it to 1). - faiss.omp_set_num_threads(64) - - # Bert embedder. - embedder = BertEmbedder(args.retro_bert_batch_size, - args.retro_bert_max_chunk_length, - args.bert_embedder_type) - - # Empty/added index paths. - empty_index_path = self.get_empty_index_path() - added_index_path = self.get_added_index_path() - - # Skip adding, if index exists. - if os.path.isfile(added_index_path): - return - - # Read trained index. - index = faiss.read_index(empty_index_path) - - # Iterate data blocks & add. - for sample_range in tqdm(dataset_sample_ranges, "faiss_base.add"): - - # Embed text. - embeds = self.embed_text_dataset_block( - embedder, text_dataset, sample_range) - - # Add to index. - index.add(embeds) - - # Write index. - faiss.write_index(index, added_index_path) - - def add(self, text_dataset): - '''Add to index.''' - - # Single process only. - if torch.distributed.get_rank() == 0: - self._add(text_dataset) - - # Wait for rank 0. - torch.distributed.barrier() - - # Get output index path, for return. - return self.get_added_index_path() diff --git a/tools/retro/index/indexes/faiss_par_add.py b/tools/retro/index/indexes/faiss_par_add.py deleted file mode 100644 index 8dfc7b5431..0000000000 --- a/tools/retro/index/indexes/faiss_par_add.py +++ /dev/null @@ -1,162 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. - -"""Multi-process & multi-node version of Faiss's index.add(). - -This class inherits from FaissBaseIndex, and optimizes the 'add()' method by -making it multi-node and multi-process, with bit-wise equivalence to -FaissBaseIndex. This allows 'add()' to scale out to very large datasets, since -the vast majority of the computational effort is embarrassingly parallel. -""" - -import numpy as np -import os -import psutil -import shutil -import torch -from tqdm import tqdm - -from megatron import get_retro_args, print_rank_0 -from tools.bert_embedding import BertEmbedder -from tools.bert_embedding.utils import get_missing_blocks_by_rank -from tools.retro.external_libs import faiss, h5py -from tools.retro.index.utils import get_added_codes_dir, get_added_code_paths - -from .faiss_base import FaissBaseIndex - - -class FaissParallelAddIndex(FaissBaseIndex): - - def encode_block(self, index, embedder, text_dataset, block): - '''Encode sub-dataset block, to be later added to index. - - Encode the data subset, generally in blocks of 1M vectors each. For - each block, the empty/trained index is loaded, codes are computed - via index.sa_encode(), and the resulting codes are saved to disk. - ''' - - args = get_retro_args() - - # Embed block. 
- embeddings = self.embed_text_dataset_block( - embedder, - text_dataset, - block["range"], - ) - - # Encode block. - print_rank_0("encode.") - codes = index.sa_encode(embeddings) - - # Save neighbors. - print_rank_0("save codes.") - os.makedirs(os.path.dirname(block["path"]), exist_ok=True) - with h5py.File(block["path"], "w") as f: - f.create_dataset("data", data=codes) - - def encode(self, text_dataset): - '''Encode text dataset, to be later added to index.''' - - args = get_retro_args() - codes_dir = get_added_codes_dir() - - # Index. - index = self.get_empty_index() - - # Bert embedder. - embedder = BertEmbedder(args.retro_bert_batch_size, - args.retro_bert_max_chunk_length, - args.bert_embedder_type) - - # Missing code blocks. - def validate(f): - assert len(f["data"].shape) == 2 - n_missing_blocks, missing_code_blocks = get_missing_blocks_by_rank( - codes_dir, - len(text_dataset), - args.retro_block_size, - validate=validate, - ) - - # Encode each block. - for block_index, block in enumerate(missing_code_blocks): - - if block is not None: - - # Progress. - print_rank_0("encode block %d / %d ... %s." % ( - block_index, - len(missing_code_blocks), - block["path"], - )) - - # Query block neighbors. - self.encode_block(index, embedder, text_dataset, block) - - # Synchronize progress across all ranks. (for easier observation) - print_rank_0(" > waiting for other ranks to finish block.") - torch.distributed.barrier() - - def add_codes(self): - - if torch.distributed.get_rank() != 0: - return - - added_index_path = self.get_added_index_path() - if os.path.exists(added_index_path): - return - - args = get_retro_args() - - # Index. - print_rank_0("read empty index.") - index = self.get_empty_index() - index_ivf = faiss.extract_index_ivf(index) - - # Add codes. - print_rank_0("add codes.") - code_paths = get_added_code_paths() - pbar = tqdm(code_paths) - for code_path in pbar: - pbar.set_description("add codes, mem %.3f gb, %.1f%%" % ( - psutil.virtual_memory()[3] / 1024**3, - psutil.virtual_memory()[2], - )) - with h5py.File(code_path) as f: - - nload = int(args.retro_index_add_load_fraction*f["data"].shape[0]) - offset = int(os.path.basename(code_path).split("-")[0]) - xids = np.arange(offset, offset + nload) - codes = np.copy(f["data"][:nload]) - index_ivf.add_sa_codes(codes, xids) - - # Update index's ntotal. - index.ntotal = index_ivf.ntotal - - # Write index. - print_rank_0("write added index.") - faiss.write_index(index, added_index_path) - - def remove_codes(self): - '''Remove added codes after adding to index.''' - if torch.distributed.get_rank() != 0: - return - assert os.path.isfile(self.get_added_index_path()) - - args = get_retro_args() - if args.retro_index_delete_added_codes: - raise Exception("remove?") - shutil.rmtree(get_added_codes_dir(), ignore_errors=True) - - def add(self, text_dataset): - - # Encode chunks. - self.encode(text_dataset) - - # Add codes to index. - self.add_codes() - - # Wait for (single-process) adding to complete. - torch.distributed.barrier() - - # Remove codes. - self.remove_codes() diff --git a/tools/retro/index/utils.py b/tools/retro/index/utils.py deleted file mode 100644 index 36e467b535..0000000000 --- a/tools/retro/index/utils.py +++ /dev/null @@ -1,72 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
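# ---------------------------------------------------------------------------
# Editor's note: illustrative sketch, not part of the patch. It shows how
# add_codes() above recovers each code block's global chunk offset from its
# file name and turns it into Faiss ids. The exact zero-padding of the block
# file names and the path below are assumptions for illustration only.
import os
import numpy as np

code_path = "/path/to/index/add_codes/00100000-00200000.hdf5"   # hypothetical
nload = 100_000                   # rows kept from this block (load fraction 1.0)

offset = int(os.path.basename(code_path).split("-")[0])    # 100000
xids = np.arange(offset, offset + nload)                   # ids 100000 .. 199999
print(offset, xids[0], xids[-1])
# ---------------------------------------------------------------------------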
- -import concurrent -import gc -import glob -import numpy as np -import os -import psutil -import time -import torch -from tqdm import tqdm - -from megatron import get_retro_args, print_rank_0 -from tools.retro.db.utils import get_indexed_dataset_infos -from tools.retro.external_libs import h5py - - -def get_index_dir(): - """Create sub-directory for this index.""" - - args = get_retro_args() - - # Directory path. - index_dir_path = os.path.join( - args.retro_workdir, - "index", - args.retro_index_type, - args.retro_index_str, - ) - - # Make directory. - os.makedirs(index_dir_path, exist_ok=True) - - return index_dir_path - - -def num_samples_to_block_ranges(num_samples): - '''Split a range (length num_samples) into sequence of block ranges - of size block_size.''' - args = get_retro_args() - block_size = args.retro_block_size - start_idxs = list(range(0, num_samples, block_size)) - end_idxs = [min(num_samples, s + block_size) for s in start_idxs] - ranges = list(zip(start_idxs, end_idxs)) - return ranges - - -def get_training_data_root_dir(): - args = get_retro_args() - return os.path.join(args.retro_workdir, "index", "train_emb") - - -def get_training_data_block_dir(): - return os.path.join(get_training_data_root_dir(), "blocks") - - -def get_training_data_block_paths(): - return sorted(glob.glob(get_training_data_block_dir() + "/*.hdf5")) - - -def get_training_data_merged_path(): - args = get_retro_args() - return os.path.join(get_training_data_root_dir(), - "train_%.3f.bin" % args.retro_index_train_load_fraction) - - -def get_added_codes_dir(): - return os.path.join(get_index_dir(), "add_codes") - - -def get_added_code_paths(): - return sorted(glob.glob(get_added_codes_dir() + "/*.hdf5")) diff --git a/tools/retro/main.py b/tools/retro/main.py deleted file mode 100644 index 3cebdc8ab7..0000000000 --- a/tools/retro/main.py +++ /dev/null @@ -1,217 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. - -"""Preprocess data for Retro. - -Stages (see argument '--retro-tasks'): -- Build chunk database (DB). -- Build index (train, add). -- Query pretraining neighbors. -""" - -import json -import os -import torch - -from megatron import get_args, initialize_megatron, print_rank_0 -from megatron.global_vars import set_retro_args -from tools.retro.db import build_db -from tools.retro.index import add_to_index, build_index, train_index -from tools.retro.query import query_pretraining_neighbors -from tools.retro.utils import get_args_path - - -def add_retro_args(parser): - """Retro preprocesing arguments. - - *Note* : Arguments prefixed with '--retro-gpt-*' or '--retro-bert-*' are - included and named as such to more easily handle managing both models - running at the same time. Megatron is not optimized to run two models at - once, so this naming convention makes it clearer. - """ - - group = parser.add_argument_group(title="Retro preprocessing.") - - # Basic args. - group.add_argument("--retro-tasks", default="build", - help="Comma-separated list of tasks to run. Run entire " - "preprocesing pipeline by using '--retro-tasks build'. " - "Alternatively, run individual stages with tasks (in " - "this order) 'db-build', 'index-build', or " - "'query-pretraining-neighbors'. For example, " - "'--retro-tasks db-build,index-build," - "query-pretraining-neighbors' is equivalent to " - "'--retro-tasks build'; or the argument can contain " - "a subset of these tasks. 
Stages must always be run " - "in the correct order (listed above).") - group.add_argument("--retro-block-size", type=int, default=100000, - help="Number of chunks to process at a time when " - "generating Bert embeddings and querying the search " - "index. Partial results for each block are generally " - "saved to disk in separate files.") - group.add_argument("--retro-doc-block-size", type=int, default=100000, - help="Number of documents to processe at time when " - "processing token datasets into chunk databases. The " - "partial chunk database for each block is saved into " - "a separate file.") - - # GPT args. - group.add_argument("--retro-gpt-tokenizer-type", required=True, - help="GPT tokenizer type.") - group.add_argument("--retro-gpt-vocab-file", help="GPT vocab file.") - group.add_argument("--retro-gpt-merge-file", help="GPT merge file.") - group.add_argument("--retro-gpt-tokenizer-model", - help="GPT tokenizer model file.") - group.add_argument("--retro-gpt-seq-length", type=int, default=2048, - help="GPT sequence length.") - group.add_argument("--retro-gpt-global-batch-size", type=int, default=2048, - help="GPT global batch size.") - group.add_argument("--retro-gpt-chunk-length", type=int, default=64, - help="GPT chunk length.") - - # Bert args. - group.add_argument("--retro-bert-vocab-file", required=True, - help="Bert vocab file.") - group.add_argument("--retro-bert-tokenizer-type", required=True, - help="Bert tokenizer type (for when using " - "'--bert-embedder-type megatron').") - group.add_argument("--retro-bert-batch-size", type=int, default=128, - help="Micro-batch size for processing Bert embeddings.") - group.add_argument("--retro-bert-max-chunk-length", type=int, default=256, - help="Maximum sequence length for Bert embeddings. " - "(Named 'chunk' here in reference to these Bert " - "sequences being converted from GPT chunks.)") - - # Index args. - group.add_argument("--retro-index-nfeats", "-f", type=int, default=1024, - help="Dimension of Bert embeddings. Bert-large is " - "commonly used, so this value defaults to 1024.") - group.add_argument("--retro-index-type", default="faiss-par-add", - choices=["faiss-base", "faiss-par-add"], - help="A 'faiss-base' index is a simple, un-optimized " - "wrapper around a Faiss index. A 'faiss-par-add' index " - "optimizes the 'add()' method by making it multi-node " - "and multi-process, but with bit-wise equivalent " - "results.") - group.add_argument("--retro-index-str", required=True, - help="Index string used for calling " - "faiss.index_factory(). For example, " - "'IVF262144_HNSW32,Flat' or " - "'OPQ32_256,IVF4194304_HNSW32,PQ32'.") - group.add_argument("--retro-index-ntrain", type=int, required=True, - help="Number of database chunks to use for training " - "the index. This value must be less or equal to the " - "total number of chunks in the database.") - group.add_argument("--retro-index-train-load-fraction", - type=float, default=1., - help="Fraction of sampled chunks to use for training " - "the index. Useful when our total sampled embeddings " - "use too much memory; lowering the load fraction is " - "less costly than re-embedding a new sampled dataset " - "from scratch.") - group.add_argument("--retro-index-add-load-fraction", - type=float, default=1., - help="Fraction of database chunks to use for adding to " - "the index. 
Useful when our total index size would " - "use too much memory; lowering the load fraction is " - "less costly than re-designing our token datasets.") - group.add_argument("--retro-index-no-delete-training-embeddings", - action='store_false', - dest="retro_index_delete_training_embeddings", - help="Skip deleting training embeddings for the search " - "index. Useful for debugging.") - group.add_argument("--retro-index-no-delete-added-codes", - action='store_false', - dest="retro_index_delete_added_codes", - help="Skip deleting added codes for the search " - "index. Useful for debugging.") - - # Query args. - group.add_argument("--retro-query-ef-search", type=int, default=256, - help="Index ef-search parameter for HNSW during querying.") - group.add_argument("--retro-query-nprobe", type=int, default=65536, - help="Index nprobe parameter for IVF during querying.") - group.add_argument("--retro-query-num-neighbors-query", type=int, default=200, - help="Number of neighbors to retrieve when calling " - "index.search().") - group.add_argument("--retro-query-num-neighbors-save", type=int, default=20, - help="Number of neighbors to save to disk after " - "the index's returned neighbors. If longer than target " - "value, neighbors truncated; and if shorter than target " - "value, neighbors are padded with -1's.") - - # Enforce argument naming convention. - for action in group._group_actions: - prefix = action.dest.split("_")[0] - assert prefix == "retro", \ - "Retro args must be prefixed with '--retro-*', for consistent " \ - "styling. Please fix '%s'." % ", ".join(action.option_strings) - - return parser - - -def save_args(args): - '''Save copy of args within retro workdir.''' - - def default_dump(obj): - if isinstance(obj, torch.dtype): - return str(obj) - else: - raise Exception("specialize for <%s>." % type(obj).__name__) - - if torch.distributed.get_rank() == 0: - args_path = get_args_path(args.retro_workdir) - with open(args_path, "w") as f: - json.dump(vars(args), f, indent=4, default=default_dump) - - torch.distributed.barrier() - - -if __name__ == "__main__": - - # Initalize Megatron. - initialize_megatron(extra_args_provider=add_retro_args) - - # Split retro tasks. - args = get_args() - args.retro_tasks = args.retro_tasks.split(",") - - # Save/set retro args. - os.makedirs(args.retro_workdir, exist_ok=True) - save_args(args) - set_retro_args(args) - - # Select task to run. - for task in args.retro_tasks: - - print_rank_0("start '%s'." % task) - - # Run all stages. - if task == "build": - build_db() - torch.distributed.barrier() - build_index() - torch.distributed.barrier() - query_pretraining_neighbors() - - # DB (i.e., chunk db). - elif task == "db-build": - build_db() - - # Index. - elif task == "index-build": - build_index() # calls both train + add. - elif task == "index-train": - train_index() # train only - elif task == "index-add": - add_to_index() # add only - - # Pretraining. - elif task == "query-pretraining-neighbors": - query_pretraining_neighbors() - - else: - raise Exception("specialize for task '%s'." % task) - - torch.distributed.barrier() - - print_rank_0("end '%s'." % task) diff --git a/tools/retro/preprocess_data.py b/tools/retro/preprocess_data.py new file mode 100644 index 0000000000..444a64e584 --- /dev/null +++ b/tools/retro/preprocess_data.py @@ -0,0 +1,296 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +"""Preprocess data for Retro. + +Stages (see argument '--retro-tasks'): +- Build chunk database (DB). +- Build index (train, add). 
+- Query pretraining neighbors. +""" + +import json +import os +import sys +import torch + +from megatron.core.datasets.blended_megatron_dataset_builder import BlendedMegatronDatasetBuilder +from megatron.core.datasets.utils import get_blend_from_list +from megatron.core.datasets.retro.db import build_db +from megatron.core.datasets.retro.index import add_to_index, train_index +from megatron.core.datasets.retro.config import ( + RetroBertEmbedders, + RetroGPTChunkDatasets, + RetroPreprocessingConfig, + RetroTokenizers, +) +from megatron.core.datasets.retro.query.gpt_chunk_dataset import build_gpt_chunk_datasets_from_gpt_datasets +from megatron.core.datasets.retro.query.multi_split_gpt_dataset import ( + MultiSplitGPTDataset, + MultiSplitGPTDatasetConfig, +) +from megatron.core.datasets.retro.query.query import query_neighbors +from megatron.core.datasets.retro.query.utils import get_query_dir +from megatron.core.datasets.retro.utils import retro_makedir +from megatron.core.models.retro.utils import ( + get_config_path, + get_gpt_data_dir, +) +from megatron.training import get_args, initialize_megatron, print_rank_0 +from megatron.training.arguments import core_transformer_config_from_args +from megatron.training.tokenizer.tokenizer import ( + _BertWordPieceTokenizer, + _GPT2BPETokenizer, + _GPTSentencePieceTokenizer, +) +from megatron.training import get_train_valid_test_num_samples +from pretrain_gpt import is_dataset_built_on_rank +from tools.bert_embedding import BertEmbedder, DiskDataParallelBertEmbedder +from tools.retro.config_utils import add_config_args + + +def add_retro_args(parser): + group = parser.add_argument_group(title="Retro preprocessing") + add_config_args(group, RetroPreprocessingConfig) + return parser + + +def initialize_megatron_retro(): + '''Initialize megatron & save Retro config.''' + + # Prevent arguments.py from overriding preprocessing args. + project_dir_idx = sys.argv.index("--retro-project-dir") + retro_project_dir = sys.argv[project_dir_idx + 1] + del sys.argv[project_dir_idx] # delete key + del sys.argv[project_dir_idx] # delete value + + # Initialize. + initialize_megatron(extra_args_provider=add_retro_args) + + args = get_args() + args.retro_project_dir = retro_project_dir + + # Retro config. + config = get_retro_preprocessing_config() + + # Save retro config. + if config.retro_task_validate is None: + retro_makedir(config, config.retro_project_dir) + save_config(config) + + return config + + +def get_bert_embedders(config): + mem_embedder = BertEmbedder( + batch_size = config.retro_bert_batch_size, + max_bert_seq_length = config.retro_bert_max_chunk_length, + embedder_type = "megatron", + ) + return RetroBertEmbedders( + mem = mem_embedder, + disk = DiskDataParallelBertEmbedder(mem_embedder, config.retro_block_size), + ) + + +def get_gpt_chunk_datasets(config): + + args = get_args() + + # Dataset config. 
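# ---------------------------------------------------------------------------
# Editor's note: illustrative sketch, not part of the patch. It spells out
# the argv surgery done by initialize_megatron_retro() above: the flag and
# its value sit next to each other, so deleting the same index twice removes
# both before Megatron's own argument parser runs. The argv contents below
# are hypothetical.
argv = ["preprocess_data.py",
        "--retro-project-dir", "/path/to/retro/project",
        "--micro-batch-size", "4"]

idx = argv.index("--retro-project-dir")
retro_project_dir = argv[idx + 1]     # "/path/to/retro/project"
del argv[idx]                         # delete the flag ...
del argv[idx]                         # ... then the value that shifted into its slot
print(argv)                           # ['preprocess_data.py', '--micro-batch-size', '4']
# ---------------------------------------------------------------------------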
+ data_dir = get_gpt_data_dir(config.retro_project_dir) + blend = list(config.retro_gpt_data_path) + for i in range(len(blend) - 1, -1, -2): + blend[i] = os.path.join(data_dir, blend[i]) + data_config = MultiSplitGPTDatasetConfig( + random_seed=config.retro_gpt_seed, + sequence_length=config.retro_gpt_seq_length, + blend=get_blend_from_list(blend), + blend_per_split=[ + get_blend_from_list(args.train_data_path), + get_blend_from_list(args.valid_data_path), + get_blend_from_list(args.test_data_path) + ], + renormalize_blend_weights=args.renormalize_blend_weights, + split=config.retro_gpt_split, + split_preprocessing=config.retro_gpt_split, + path_to_cache=config.retro_gpt_data_cache_path, + return_document_ids=True, + tokenizer=config.retro_tokenizers.gpt, + reset_position_ids=args.reset_position_ids, + reset_attention_mask=args.reset_attention_mask, + eod_mask_loss=args.eod_mask_loss, + ) + + # GPT datasets. + print_rank_0(" > multi-split gpt datasets.") + train_valid_test_num_samples = get_train_valid_test_num_samples() + train_ds, valid_ds, test_ds = BlendedMegatronDatasetBuilder( + MultiSplitGPTDataset, + train_valid_test_num_samples, + is_dataset_built_on_rank, + data_config, + ).build() + + gpt_datasets = { + "train" : (train_ds, train_valid_test_num_samples[0]), + "valid" : (valid_ds, train_valid_test_num_samples[1]), + "test" : (test_ds, train_valid_test_num_samples[2]), + } + + # Chunk datasets. + chunk_datasets = build_gpt_chunk_datasets_from_gpt_datasets( + project_dir=config.retro_project_dir, + gpt_datasets=gpt_datasets, + sample_length=config.retro_gpt_seq_length, + chunk_length=config.retro_gpt_chunk_length, + ) + chunk_datasets = RetroGPTChunkDatasets(**chunk_datasets) + + return chunk_datasets + + +def get_gpt_tokenizer(config): + '''GPT (BPE) tokenizer.''' + tokenizer_type = config.retro_gpt_tokenizer_type + if tokenizer_type == "GPT2BPETokenizer": + assert config.retro_gpt_vocab_file and config.retro_gpt_merge_file + return _GPT2BPETokenizer( + vocab_file=os.path.join( + config.retro_project_dir, + config.retro_gpt_vocab_file, + ), + merge_file=os.path.join( + config.retro_project_dir, + config.retro_gpt_merge_file, + ), + ) + elif tokenizer_type == 'GPTSentencePieceTokenizer': + assert config.retro_gpt_tokenizer_model is not None + return _GPTSentencePieceTokenizer(os.path.join( + config.retro_project_dir, + config.retro_gpt_tokenizer_model, + )) + else: + raise Exception("unrecognized gpt tokenizer, '%s'." % tokenizer_type) + + +def get_bert_tokenizer(config): + '''Bert (Wordpiece) tokenizer.''' + lower_case = { + "BertWordPieceLowerCase" : True, + "BertWordPieceCase" : False, + }[config.retro_bert_tokenizer_type] + return _BertWordPieceTokenizer( + vocab_file=os.path.join( + config.retro_project_dir, + config.retro_bert_vocab_file, + ), + lower_case=lower_case, + ) + + +def get_tokenizers(config): + return RetroTokenizers( + gpt = get_gpt_tokenizer(config), + bert = get_bert_tokenizer(config), + ) + + +def get_retro_preprocessing_config(): + + # Arguments. + args = get_args() + + # Retro config. + config = core_transformer_config_from_args( + args, config_class=RetroPreprocessingConfig) + + # Add tools. + config.retro_tokenizers = get_tokenizers(config) + config.retro_bert_embedders = get_bert_embedders(config) + config.retro_gpt_chunk_datasets = get_gpt_chunk_datasets(config) + + return config + + +def save_config(config): + '''Save copy of config within retro project dir.''' + + if torch.distributed.get_rank() == 0: + + # GPT config + block size. 
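# ---------------------------------------------------------------------------
# Editor's note: illustrative sketch, not part of the patch. The reversed
# stride-2 loop in get_gpt_chunk_datasets() above rewrites only the dataset
# prefix entries of the [weight, prefix, weight, prefix, ...] blend list,
# prepending the project's GPT data directory. Weights and prefix names here
# are hypothetical.
import os

data_dir = "/retro/project/data"
blend = ["0.3", "wiki_text_document", "0.7", "books_text_document"]

for i in range(len(blend) - 1, -1, -2):
    blend[i] = os.path.join(data_dir, blend[i])

print(blend)
# ['0.3', '/retro/project/data/wiki_text_document',
#  '0.7', '/retro/project/data/books_text_document']
# ---------------------------------------------------------------------------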
+ config_subset = { + k:v for k,v in vars(config).items() + if k.startswith("retro_gpt") and k != "retro_gpt_chunk_datasets" + } + config_subset["retro_block_size"] = config.retro_block_size + + # Bert config. + config_subset["retro_bert_tokenizer_type"] = config.retro_bert_tokenizer_type + config_subset["retro_bert_vocab_file"] = config.retro_bert_vocab_file + + # Neighbor directories. + query_dir = get_query_dir(config.retro_project_dir) + config_subset["retro_neighbor_dirs"] = { + k : (os.path.relpath(v["neighbor_dir"], query_dir) if v is not None else None) + for k, v in vars(config.retro_gpt_chunk_datasets).items() + } + + # Save. + config_path = get_config_path(config.retro_project_dir) + with open(config_path, "w") as f: + json.dump(config_subset, f, indent=4, sort_keys=True) + + torch.distributed.barrier() + + +if __name__ == "__main__": + + # Initalize Megatron. + config = initialize_megatron_retro() + + # Expand tasks. + task_remap = { + "build" : [ "db-build", "index-train", "index-add", "query-neighbors" ], + "index-build" : [ "index-train", "index-add" ], + "db-build" : [ "db-build" ], + "index-train" : [ "index-train" ], + "index-add" : [ "index-add" ], + "query-neighbors" : [ "query-neighbors" ], + } + tasks = [] + for task in config.retro_tasks: + tasks.extend(task_remap[task]) + config.retro_tasks = tasks + + # Select task to run. + for task in tasks: + + print_rank_0("start '%s%s'." % ( + "" if config.retro_task_validate is None else "[validate] ", + task, + )) + + # DB (i.e., chunk db). + if task == "db-build": + build_db(config) + + # Index. + elif task == "index-train": + train_index(config) + elif task == "index-add": + add_to_index(config) + + # Query. + elif task == "query-neighbors": + query_neighbors(config) + + else: + raise Exception("specialize for task '%s'." % task) + + torch.distributed.barrier() + + print_rank_0("end '%s%s'." % ( + "" if config.retro_task_validate is None else "[validate] ", + task, + )) diff --git a/tools/retro/query/__init__.py b/tools/retro/query/__init__.py deleted file mode 100644 index 8ea709941b..0000000000 --- a/tools/retro/query/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. - -from .query import query_pretraining_neighbors diff --git a/tools/retro/query/chunk_dataset.py b/tools/retro/query/chunk_dataset.py deleted file mode 100644 index f9cc4d5120..0000000000 --- a/tools/retro/query/chunk_dataset.py +++ /dev/null @@ -1,142 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. - -import os -import torch - -from megatron import get_retro_args, print_rank_0 -from megatron.data.gpt_dataset import build_train_valid_test_datasets -from megatron.training import ( - build_train_valid_test_data_loaders, - update_train_iters, -) -from tools.retro.db.utils import get_indexed_dataset_infos -from tools.retro.utils import get_num_chunks_per_sample - -from .utils import get_query_workdir - - -class ChunkDataset(torch.utils.data.Dataset): - '''Pretraining chunk dataset wraps a standard GPT dataset. - - This dataset conceptually divides each sample (e.g., length 2048) - into chunks (e.g., length 64) and restructures them into a list of - chunks (e.g., length num_samples * num_chunks_per_sample). 
- ''' - - def __init__(self, sample_dataset, chunk_length): - - super().__init__() - - self.sample_dataset = sample_dataset - - self.chunk_length = chunk_length - self.n_chunks_per_sample = get_num_chunks_per_sample() - self.n_samples = len(sample_dataset) - self.n_chunks = self.n_samples * self.n_chunks_per_sample - - def __len__(self): - return self.n_chunks - - def __getitem__(self, idx): - - # Convert global chunk index to global sample index & local chunk index. - sample_idx = idx // self.n_chunks_per_sample - chunk_idx = idx % self.n_chunks_per_sample - - # Extract sample data. - sample = self.sample_dataset[sample_idx] - sample_token_ids = sample["text"] - sample_doc_ids = sample["doc_ids"] - - # Chunk start/end token idxs. - token_start_idx = chunk_idx * self.chunk_length - token_end_idx = token_start_idx + self.chunk_length - chunk_token_ids = sample_token_ids[token_start_idx:token_end_idx] - - # Sample. - return { - "doc_ids" : sample_doc_ids, - "text" : chunk_token_ids, - } - - -def verify_indexed_dataset_order(): - '''Verify pretraining order same as DB order.''' - - args = get_retro_args() - - # DB dataset prefixes. - db_indexed_dataset_infos = get_indexed_dataset_infos() - db_prefixes = [ info["prefix"] for info in db_indexed_dataset_infos ] - - # Verify order & prefixes. - assert len(args.data_path) >= 2, "blendable dataset supported only." - pretraining_prefixes = args.data_path[1:None:2] - - if len(db_prefixes) != len(pretraining_prefixes): - raise Exception("inconsistent dataset count between db & pretraining.") - if db_prefixes != pretraining_prefixes: - raise Exception("inconsistent dataset order between db & pretraining.") - - -def train_valid_test_datasets_provider(train_val_test_num_samples): - """Build train, valid, and test datasets.""" - - args = get_retro_args() - - print_rank_0('> building train, validation, and test datasets ' - 'for GPT ...') - train_ds, valid_ds, test_ds = build_train_valid_test_datasets( - data_prefix=args.data_path, - data_impl=args.data_impl, - splits_string=args.split, - train_valid_test_num_samples=train_val_test_num_samples, - seq_length=args.retro_gpt_seq_length, - seed=args.seed, - skip_warmup=(not args.mmap_warmup), - return_doc_ids=args.retro_return_doc_ids) - print_rank_0("> finished creating pretrained GPT datasets ...") - - return train_ds, valid_ds, test_ds - - -def get_chunk_dataset_map(): - '''Get train, valid, test chunk datasets.''' - - args = get_retro_args() - - # Update train iters. - update_train_iters(args) - - args.iteration = 0 - args.consumed_train_samples = 0 - - # Verify indexed dataset order. - verify_indexed_dataset_order() - - # Datasets. - print_rank_0(" > data loader.") - train_data_loader, valid_data_loader, test_data_loader \ - = build_train_valid_test_data_loaders( - train_valid_test_datasets_provider) - - data_loader_map = { - "train" : train_data_loader, - "valid" : valid_data_loader, - "test" : test_data_loader, - } - - # Info dict. - workdir = get_query_workdir() - dataset_map = { - key : { - "neighbor_dir" : os.path.join( - workdir, - os.path.basename(loader.dataset.datasets[0].index_prefix), - ), - "data" : ChunkDataset(loader.dataset, args.retro_gpt_chunk_length), - } - for key, loader in data_loader_map.items() if loader - } - - return dataset_map diff --git a/tools/retro/query/query.py b/tools/retro/query/query.py deleted file mode 100644 index da41f0d7c1..0000000000 --- a/tools/retro/query/query.py +++ /dev/null @@ -1,252 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
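# ---------------------------------------------------------------------------
# Editor's note: illustrative sketch, not part of the patch. A worked example
# of the index arithmetic in ChunkDataset.__getitem__() above, using the
# default GPT sequence length (2048) and chunk length (64), i.e. 32 chunks
# per pretraining sample.
seq_length, chunk_length = 2048, 64
n_chunks_per_sample = seq_length // chunk_length   # 32

idx = 70                                           # global chunk index
sample_idx = idx // n_chunks_per_sample            # 2   -> third GPT sample
chunk_idx = idx % n_chunks_per_sample              # 6   -> seventh chunk in it
token_start_idx = chunk_idx * chunk_length         # 384
token_end_idx = token_start_idx + chunk_length     # 448
print(sample_idx, chunk_idx, token_start_idx, token_end_idx)
# ---------------------------------------------------------------------------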
- -import numpy as np -import os -import psutil -import time -import torch -from tqdm import tqdm - -from megatron import get_retro_args, print_rank_0 -from tools.bert_embedding import BertEmbedder -from tools.bert_embedding.utils import get_missing_blocks_by_rank -from tools.retro.db.utils import \ - get_merged_train_dataset as get_db_merged_train_dataset -from tools.retro.external_libs import faiss, h5py -from tools.retro.index.factory import IndexFactory -from tools.retro.index.utils import get_index_dir -from tools.retro.utils import GPTToTextDataset - -from .chunk_dataset import get_chunk_dataset_map as get_query_dataset_map - - -def get_index(ondisk=False): - '''Read index from disk.''' - - args = get_retro_args() - - # Load index. - index_wrapper = IndexFactory.get_index(args.retro_index_type) - index_dir = get_index_dir() - added_index_path = index_wrapper.get_added_index_path() - if ondisk: - index = faiss.read_index(added_index_path, faiss.IO_FLAG_MMAP) - else: - index = faiss.read_index(added_index_path) - - # Search parameters. - faiss.ParameterSpace().set_index_parameter(index, "efSearch", - args.retro_query_ef_search) - faiss.ParameterSpace().set_index_parameter(index, "nprobe", - args.retro_query_nprobe) - - return index - - -def embed_block(gpt_dataset, block, embedder): - '''Embed block of chunks.''' - text_block_dataset = torch.utils.data.Subset( - GPTToTextDataset(gpt_dataset), - range(*block["range"]), - ) - return embedder.embed_text_dataset(text_block_dataset) - - -def query_embeddings(db_dataset, index, - embeddings, chunk_id_range, - sample_map, n_chunks_per_sample, - verbose=True): - '''Query neighbors of a block of embeddings.''' - - args = get_retro_args() - - # Query neighbor ids. - if verbose: print_rank_0("search.") - t = time.time() - assert index.ntotal > 0, "check we don't accidentally have an empty index." - _, query_neighbor_ids = \ - index.search(embeddings, args.retro_query_num_neighbors_query) - if verbose: print_rank_0(" time : %.3f sec." % (time.time() - t)) - - # Filter banned neighbor ids. - if verbose: print_rank_0("filter banned neighbor ids.") - filtered_neighbor_ids = np.full( - shape=(len(query_neighbor_ids), args.retro_query_num_neighbors_save), - fill_value=-1, - dtype="int64", - ) - min_chunk_id, max_chunk_id = chunk_id_range - for chunk_id in range(min_chunk_id, max_chunk_id): - - sample_id = chunk_id // n_chunks_per_sample - sample = sample_map[sample_id] - sample_dataset_idx = sample["dataset_idx"].item() - sample_doc_ids = sample["doc_ids"].tolist() - sample_doc_tuples = [(sample_dataset_idx, d) for d in sample_doc_ids] - - # Get valid neighbors (!= -1). - query_row = [ i for i in query_neighbor_ids[chunk_id-min_chunk_id] - if i >= 0 ] - - # Filter row. - filtered_row = [ i for i in query_row - if tuple(db_dataset.doc_tuples[i].tolist()) - not in sample_doc_tuples ] - filtered_row = filtered_row[:args.retro_query_num_neighbors_save] - filtered_row += \ - [-1] * (args.retro_query_num_neighbors_save - len(filtered_row)) - filtered_neighbor_ids[chunk_id-min_chunk_id] = filtered_row - - return query_neighbor_ids, filtered_neighbor_ids - - -def query_embedding_block(db_dataset, index, - embeddings, chunk_id_range, - sample_map, n_chunks_per_sample): - - query_neighbor_ids = [] - filtered_neighbor_ids = [] - - # Query in sub-blocks. 
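# ---------------------------------------------------------------------------
# Editor's note: illustrative sketch, not part of the patch. A small worked
# example of the neighbor filtering in query_embeddings() above: the index
# returns --retro-query-num-neighbors-query candidates per chunk, neighbors
# coming from any document the query sample itself was drawn from are
# dropped, and the rest is truncated to --retro-query-num-neighbors-save
# entries and padded with -1. All ids and sizes below are hypothetical and
# far smaller than the defaults (200 queried / 20 saved).
num_neighbors_save = 4
sample_doc_tuples = [(0, 17)]                    # docs the query chunk came from
db_doc_tuples = {3: (0, 17), 8: (1, 4), 11: (2, 9), 15: (0, 2)}  # chunk -> doc

query_row = [3, 8, 11, -1, 15]                   # raw ids from index.search()
valid_row = [i for i in query_row if i >= 0]
filtered_row = [i for i in valid_row
                if db_doc_tuples[i] not in sample_doc_tuples]
filtered_row = filtered_row[:num_neighbors_save]
filtered_row += [-1] * (num_neighbors_save - len(filtered_row))
print(filtered_row)                              # [8, 11, 15, -1]
# ---------------------------------------------------------------------------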
- partial_block_size = 1000 - for partial_start_idx in tqdm( - range(0, len(embeddings), partial_block_size), - "search", - ): - partial_end_idx = min(len(embeddings), - partial_start_idx + partial_block_size) - partial_embeddings = embeddings[partial_start_idx:partial_end_idx] - partial_chunk_id_range = ( - chunk_id_range[0] + partial_start_idx, - chunk_id_range[0] + partial_end_idx, - ) - partial_query_neighbor_ids, partial_filtered_neighbor_ids = \ - query_embeddings(db_dataset, index, - partial_embeddings, partial_chunk_id_range, - sample_map, n_chunks_per_sample, - verbose=False) - query_neighbor_ids.append(partial_query_neighbor_ids) - filtered_neighbor_ids.append(partial_filtered_neighbor_ids) - - # Concatenate. - query_neighbor_ids = np.concatenate(query_neighbor_ids, axis=0) - filtered_neighbor_ids = np.concatenate(filtered_neighbor_ids, axis=0) - - return query_neighbor_ids, filtered_neighbor_ids - - -def query_block_neighbors(db_dataset, query_dataset, - index, embedder, - block): - '''Query neighbors of a dataset block (i.e., range).''' - - args = get_retro_args() - n_chunks_per_sample = query_dataset.n_chunks_per_sample - - # Sample map. - sample_ids = sorted(list(set(chunk_id // n_chunks_per_sample - for chunk_id in range(*block["range"])))) - sample_map = {} - for i in sample_ids: - sample = query_dataset.sample_dataset[i] - sample_map[i] = { - "dataset_idx" : sample["dataset_idx"], - "doc_ids" : sample["doc_ids"], - } - - # Embed block. - embeddings = embed_block(query_dataset, block, embedder) - - # Query embeddings. - _, filtered_neighbor_ids = query_embedding_block( - db_dataset, index, - embeddings, block["range"], - sample_map, n_chunks_per_sample) - - # Save neighbors. - print_rank_0("save neighbors.") - os.makedirs(os.path.dirname(block["path"]), exist_ok=True) - f = h5py.File(block["path"], "w") - f.create_dataset("neighbors", data=filtered_neighbor_ids) - f.close() - - -def query_dataset_neighbors(db_dataset, query_dataset, - prefix, neighbor_dir, - index, embedder): - '''Query neighbors of each chunk within a dataset.''' - - args = get_retro_args() - - def validate(f): - assert f["neighbors"].shape[1] == args.retro_query_num_neighbors_save, \ - "neighbors.shape == %s; num_neighbors_target == %d." % ( - str(f["neighbors"].shape), - args.retro_num_neighbors_target, - ) - n_missing_blocks, missing_neighbor_blocks = get_missing_blocks_by_rank( - neighbor_dir, - len(query_dataset), - args.retro_block_size, - validate=validate, - ) - - # Query each block. - for block_index, block in enumerate(missing_neighbor_blocks): - - if block is not None: - - # Progress. - print_rank_0("query '%s' block %d / %d ... %s ... mem %.3f gb, %.1f%%." % ( - prefix, - block_index, - len(missing_neighbor_blocks), - os.path.basename(block["path"]), - psutil.virtual_memory()[3] / 1024**3, - psutil.virtual_memory()[2], - )) - - # Query block neighbors. - query_block_neighbors(db_dataset, query_dataset, - index, embedder, - block) - - # Synchronize progress across all ranks. (for easier observation) - print_rank_0(" > waiting for other ranks to finish block.") - torch.distributed.barrier() - - -def query_pretraining_neighbors(): - '''Query pretraining datasets (train & valid).''' - - args = get_retro_args() - - # Num threads. - faiss.omp_set_num_threads(64) - - # Load chunk db dataset. - print_rank_0("load chunk db dataset.") - db_dataset = get_db_merged_train_dataset() - db_dataset.load_doc_tuples() - - # Load index. - print_rank_0(" > get index.") - index = get_index() - - # Load datasets. 
- print_rank_0(" > get dataset map.") - query_dataset_map = get_query_dataset_map() - - # Bert embedder. - embedder = BertEmbedder(args.retro_bert_batch_size, - args.retro_bert_max_chunk_length, - args.bert_embedder_type) - - # Query each (i.e., train, valid, test) dataset. - print_rank_0(" > query.") - for prefix, info in query_dataset_map.items(): - print_rank_0(" > query '%s' dataset ... %d samples." % - (prefix, len(info["data"]))) - query_dataset_neighbors(db_dataset, info["data"], - prefix, info["neighbor_dir"], - index, embedder) diff --git a/tools/retro/query/retro_dataset.py b/tools/retro/query/retro_dataset.py deleted file mode 100644 index e89a47007a..0000000000 --- a/tools/retro/query/retro_dataset.py +++ /dev/null @@ -1,169 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. - -import numpy as np -import os -import torch - -from megatron import get_args, get_retro_args -from tools.bert_embedding.utils import BlockPathMap -from tools.retro.db.utils import get_merged_train_dataset as get_db_dataset -from tools.retro.external_libs import h5py - -from .chunk_dataset import get_chunk_dataset_map - - -class RetroDataset(torch.utils.data.Dataset): - '''Dataset of retro samples. - - Each sample contains the original GPT sample, along with the token IDs - of each neighbor of each chunk within the sequence. Neighbor array has - shape (num_chunks_per_sample, num_neighbors, num_retrieved_tokens). - ''' - - def __init__(self, - num_neighbors, - num_retrieved_chunks, - block_size, - db_dataset, - chunk_dataset, - neighbor_path_map): - '''Note: chunk dataset wraps original GPT dataset (see - chunk_dataset.py).''' - - super().__init__() - - self.num_neighbors = num_neighbors - self.num_retrieved_chunks = num_retrieved_chunks - self.block_size = block_size - self.db_dataset = db_dataset - self.chunk_dataset = chunk_dataset - self.neighbor_path_map = neighbor_path_map - - def __len__(self): - return len(self.chunk_dataset.sample_dataset) - - def __getitem__(self, sample_idx): - - n_chunks_per_sample = self.chunk_dataset.n_chunks_per_sample - - # Get standard sample. - sample = self.chunk_dataset.sample_dataset[sample_idx] - - # Sample idx to chunk idxs. - chunk_idxs = list(range( - sample_idx * n_chunks_per_sample, - (sample_idx + 1) * n_chunks_per_sample, - )) - - # Collect retrieved tokens. - all_retrieved_chunk_ids = [] - all_retrieved_token_ids = [] - for chunk_idx in chunk_idxs: - - # Neighbor chunk ids. - neighbor_path = self.neighbor_path_map[chunk_idx] - with h5py.File(neighbor_path, "r") as f: - neighbor_chunk_ids = f["neighbors"] \ - [chunk_idx % self.block_size, :self.num_neighbors].tolist() - - # Retrieved (neighbor + continuation) token ids. - retrieved_chunk_ids = [] - retrieved_token_ids = [] - for neighbor_chunk_id in neighbor_chunk_ids: - current_chunk_ids = [ - i % len(self.db_dataset) - for i in range( - neighbor_chunk_id, - neighbor_chunk_id + self.num_retrieved_chunks)] - current_token_ids = [self.db_dataset[ci]["text"] - for ci in current_chunk_ids] - retrieved_chunk_ids.append(current_chunk_ids) - retrieved_token_ids.append(current_token_ids) - - # Collect retrieved tokens. - all_retrieved_chunk_ids.append(retrieved_chunk_ids) - all_retrieved_token_ids.append(retrieved_token_ids) - - # Reshape retrieved tokens. 
- all_retrieved_chunk_ids = np.array(all_retrieved_chunk_ids) \ - .reshape((n_chunks_per_sample, self.num_neighbors, -1)) - all_retrieved_token_ids = np.array(all_retrieved_token_ids) \ - .reshape((n_chunks_per_sample, self.num_neighbors, -1)) - - # Sample. - sample = { - **sample, - "neighbor_chunks" : all_retrieved_chunk_ids, - "neighbor_tokens" : all_retrieved_token_ids, - } - - return sample - - -def get_retro_datasets(verify_sizes=True): - '''Get train, valid, test retro datasets.''' - - args = get_args() - retro_args = get_retro_args() - - # DB dataset. - db_dataset = get_db_dataset() - - # Retro datasets. - chunk_ds_info_map = get_chunk_dataset_map() - retro_dataset_map = {} - for data_key, chunk_ds_info in chunk_ds_info_map.items(): - - chunk_dataset = chunk_ds_info["data"] - neighbor_dir = chunk_ds_info["neighbor_dir"] - neighbor_path_map = BlockPathMap.from_dir(neighbor_dir, - retro_args.retro_block_size) - - # Verify dataset prefixes. - sample_prefix = chunk_dataset.sample_dataset.datasets[0].index_prefix - neighbor_prefix = os.path.basename(neighbor_dir) - assert sample_prefix == neighbor_prefix, \ - "inconsistent dataset source; '%s' vs. '%s'." % \ - (sample_prefix, neighbor_prefix) - - # Verify num chunks. - n_sample_chunks = len(chunk_dataset) - n_neighbor_chunks = neighbor_path_map.max_idx - - if not os.path.isdir(neighbor_dir): - if torch.distributed.get_rank() == 0: - raise Exception("neighbor directory '%s' not found; please " - "compare --train-samples, --seq-length, --seed, " - "--eval-iters, and --eval-interval, with " - "retro preprocessing args." % - neighbor_dir) - torch.distributed.barrier() - exit() - - if verify_sizes and n_sample_chunks != n_neighbor_chunks: - if torch.distributed.get_rank() == 0: - print("neighbor_dir : %s" % neighbor_dir) - print("neighbor_path_map : %s" % neighbor_path_map) - raise Exception("num sampled chunks (%d) != num neighbor chunks " - "(%d); did you complete querying the entire " - "pretraining dataset?" - % (n_sample_chunks, n_neighbor_chunks)) - torch.distributed.barrier() - exit() - - # Retro dataset. - retro_dataset_map[data_key] = RetroDataset( - num_neighbors=args.retro_num_neighbors, - num_retrieved_chunks=args.retro_num_retrieved_chunks, - block_size=retro_args.retro_block_size, - db_dataset=db_dataset, - chunk_dataset=chunk_dataset, - neighbor_path_map=neighbor_path_map, - ) - - # Extract datasets. - train_ds = retro_dataset_map.get("train", None) - valid_ds = retro_dataset_map.get("valid", None) - test_ds = retro_dataset_map.get("test", None) - - return train_ds, valid_ds, test_ds diff --git a/tools/retro/query/utils.py b/tools/retro/query/utils.py deleted file mode 100644 index a4ea2a5ca1..0000000000 --- a/tools/retro/query/utils.py +++ /dev/null @@ -1,10 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. - -import os - -from megatron import get_retro_args - - -def get_query_workdir(): - args = get_retro_args() - return os.path.join(args.retro_workdir, "query") diff --git a/tools/retro/sft/README.md b/tools/retro/sft/README.md new file mode 100644 index 0000000000..e589879038 --- /dev/null +++ b/tools/retro/sft/README.md @@ -0,0 +1,3 @@ +## Note + +The content within this `sft` directory is still under active development and will be updated soon. 
\ No newline at end of file diff --git a/tools/retro/sft/dataset_conv.py b/tools/retro/sft/dataset_conv.py new file mode 100644 index 0000000000..3dd8fa9cd5 --- /dev/null +++ b/tools/retro/sft/dataset_conv.py @@ -0,0 +1,446 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +import re +import json +import os +from typing import Any, Iterable, Dict, Optional + +from numpy import ndarray +from megatron.core.datasets.blended_megatron_dataset_config import BlendedMegatronDatasetConfig +from megatron.core.datasets.utils import Split +import torch +import numpy +import glob +from collections import OrderedDict + +from megatron.core.datasets.blended_megatron_dataset_config import BlendedMegatronDatasetConfig +from megatron.core.datasets.megatron_dataset import LowLevelDataset, MegatronDataset +from megatron.core.datasets.utils import Split +from dataclasses import dataclass + + +_DATASET_NAME_PATTERNS = { + Split.train: r"(?P[^\0]+)\/(?P=name)\_QA\_train.json", + Split.valid: r"(?P[^\0]+)\/(?P=name)\_QA\_dev.json", +} + + +@dataclass +class JsonQADatasetConfig(BlendedMegatronDatasetConfig): + """Configuration object for the QA finetuning pipeline + """ + ft_neighbours: int = 1 + + bert_retriever_neighbours: bool = False + + longform_answer: bool = False + + inference_only: bool = False + + retrieved_neighbours: bool = False + + fix_newsqa: bool = True + + def __post_init__(self) -> None: + super().__post_init__() + assert self.blend_per_split is not None + + +@dataclass +class RetroJsonQADatasetConfig(JsonQADatasetConfig): + """Configuration object for the Retro QA finetuning pipeline + """ + retro_num_neighbors: int = None + + retro_gpt_retrieved_length: int = None + + def __post_init__(self) -> None: + super().__post_init__() + assert self.retro_num_neighbors is not None + assert self.retro_gpt_retrieved_length is not None + + +class JsonQADataset(MegatronDataset): + + def __init__(self, dataset: Any, dataset_path: str, indices: ndarray, num_samples: Optional[int], index_split: Split, config: BlendedMegatronDatasetConfig) -> None: + super().__init__(dataset, dataset_path, indices, num_samples, index_split, config) + matches = re.findall(_DATASET_NAME_PATTERNS[index_split], dataset_path) + assert len(matches) == 1 + assert len(matches[0]) > 0 + self.dataset_name = matches[0] + + @staticmethod + def numel_low_level_dataset(low_level_dataset: LowLevelDataset) -> int: + return len(low_level_dataset) + + @staticmethod + def build_low_level_dataset(dataset_path: str, config: JsonQADatasetConfig) -> Iterable: + assert os.path.isfile(dataset_path), f"{dataset_path} does not exist on disk" + return preprocess(dataset_path, config) + + def __len__(self) -> int: + return len(self.dataset) + + def __getitem__(self, idx: int) -> Dict[str, ndarray]: + sample = self.dataset[idx % len(self.dataset)] + + # unpack tokens + query, answer, neighbours = sample + + # tokenization + output_tokens = self.config.tokenizer.tokenize(answer) + + input_tokens = reformat_prompt( + query, + neighbours, + self.dataset_name, + self.config.ft_neighbours, + len(output_tokens), + self.config.tokenizer, + self.config.sequence_length + ) + + # padding + tokens, answer_mask = pad_and_convert_to_numpy( + input_tokens, output_tokens, self.config.tokenizer.pad, self.config.sequence_length, self.config.tokenizer.eos + ) + + train_sample = { + 'text': tokens, + 'answer_mask': answer_mask, + } + + return train_sample + + +class RetroJsonQADataset(JsonQADataset): + + def __getitem__(self, idx: int) -> Dict[str, ndarray]: 
+ + sample = self.dataset[idx % len(self.dataset)] + + # unpack tokens + query, answer, neighbours = sample + + # tokenization + output_tokens = self.config.tokenizer.tokenize(answer) + + input_tokens = reformat_prompt_retro( + query, + neighbours, + self.dataset_name, + self.config.ft_neighbours, + len(output_tokens), + self.config.tokenizer, + self.config.sequence_length + ) + + # padding + tokens, answer_mask = pad_and_convert_to_numpy( + input_tokens, + output_tokens, + self.config.tokenizer.pad, + self.config.sequence_length, + self.config.tokenizer.eos + ) + + # get retro neighbors + # context chunk and answer chunk + n_chunks_per_sample = 2 + num_neighbors = self.config.retro_num_neighbors + # disable retro encoder + neighbor_tokens = numpy.zeros( + [n_chunks_per_sample, num_neighbors, self.config.retro_gpt_retrieved_length], + dtype=numpy.int64 + ) + + train_sample = { + 'text': tokens, + 'answer_mask': answer_mask, + 'neighbor_tokens': neighbor_tokens, + 'context_len': len(input_tokens) + } + + return train_sample + + +def format_multichoice(multichoice_options): + options_text = ["({}) {}".format(chr(ord('A') + i), option) for i, option in + zip(range(len(multichoice_options)), multichoice_options)] + return "Choose one based on the following options: {}".format(" ".join(options_text)) + + +def format_multichoice_question(question, multichoice_options): + return "{}\n{}".format(question, format_multichoice(multichoice_options)) + + +def format_answer(answer): + return " {}".format(answer) + + +def preprocess(dataset_path: str, config: JsonQADatasetConfig): + assert config.ft_neighbours > 0 + if config.longform_answer: + nq_examples = [] + with open(dataset_path, "r") as f: + for fn in f: + nq_examples.append(json.loads(fn)) + else: + nq_examples = [] + for my_data_file in sorted(glob.glob(dataset_path)): + with open(my_data_file, "r", encoding='utf-8') as f: + nq_examples.extend(json.load(f)) + + data = [] + for instance in nq_examples: + question = instance["question"] + if 'qa_type' in instance and instance['qa_type'] == "multi_choice_qa": + question = format_multichoice_question(question, instance["multichoice_options"]) + if config.bert_retriever_neighbours: + contexts = instance["bert_pretrain_corpus_neighbours"] + neighbours = ["source: " + ctx for ctx in contexts] + else: + if config.retrieved_neighbours: + contexts = instance["ctxs"] + neighbours = ["title: " + ctx["title"] + ", source: " + ctx["text"] for ctx in contexts] + else: + if "sub-paragraphs" in instance: + if type(instance["sub-paragraphs"]) == list: # doc2dial: + neighbours = [ + "title: " + instance["sub-paragraphs"][0] + ", source: " + instance["sub-paragraphs"][1]] + else: + neighbours = ["title: , source: " + instance["sub-paragraphs"]] + elif config.fix_newsqa and "sub_paragraph" in instance: + neighbours = ["title: , source: " + instance["sub_paragraph"]] + else: + neighbours = ["title: , source: "] + + if config.inference_only: + data.append((question, None, neighbours)) + else: + if config.longform_answer: + if "longform_answer" in instance: + answers = [instance["longform_answer"]] + else: + continue + else: + if "answers" in instance: + answers = instance["answers"] + elif "answer" in instance: + if type(instance["answer"]) is str: + answers = [instance["answer"]] + elif type(instance["answer"]) is list: + answers = instance["answer"] + else: + answers = [str(instance["answer"])] + else: + raise ValueError("need to have answer or answers") + if len(answers) < 1: + continue + else: + if 
type(answers[0]) is dict: + answers = [answers[0]["text"].strip()] + elif type(answers[0]) is str: + answers = [answers[0]] + else: + raise ValueError("unsupported type for answer(s)") + + for answer in answers: + answer = format_answer(answer) + data.append((question, answer, neighbours)) + + return data + + +def count_stat(dataset, tokenizer, k): + nb_lens = [] + for i, d in enumerate(dataset): + query, answer, neighbours = d + nb_lens.extend([len(tokenizer.tokenize(neighbour)) for neighbour in neighbours[:k]]) + + print("len of nb", len(nb_lens)) + print("max of len nb", max(nb_lens)) + print("num of cut ", sum([l > 128 for l in nb_lens]), sum([l > 128 for l in nb_lens]) // len(nb_lens)) + print("last max", sorted(nb_lens)[-10:]) + + +def reformat_prompt_retro(query, neighbours, dataset_name, ft_neighbours, \ + max_output_len, tokenizer, max_seq_length): + system = ("System: This is a chat between a user and an artificial intelligence assistant. The assistant gives " + "helpful, detailed, and polite answers to the user's questions.\n\n") + + if dataset_name in ["oasst", "quiet_cockatoo", "open_inst", "quiet-cockatoo_commercial"]: + input_tokens = tokenizer.tokenize(system + query) + return input_tokens + + short_span_with_context = ["drop", "NarrativeQA", "QASC", "Quoref", "ROPES", "squad1.1", "squad2.0", "newsqa", "nq", + "tqa", "quac"] + yes_no_without_context = ["BoolQ"] + multichoices = [""] + formatted_dataset_name = ["doc2dial", "quac", "qrecc", "sharc"] + + if dataset_name in formatted_dataset_name: + dialogue_turn = query + else: + if dataset_name in short_span_with_context: + user = "{} Answer the above question with a short phrase.".format(query) + elif dataset_name in yes_no_without_context: + user = "{} Answer the above question with True or False.".format(query) + else: + user = "{} Answer the above question with a long complete answer.".format(query) + + if dataset_name in short_span_with_context: + dialogue_format = "User: {}\n\nAssistant: The answer is" + dialogue_turn = dialogue_format.format(user) + else: + dialogue_format = "User: {}\n\nAssistant:" + dialogue_turn = dialogue_format.format(user) + + if ft_neighbours > 0: + context = "\n\n".join(neighbours[0:ft_neighbours]) + "\n\n" + context_tokens = tokenizer.tokenize(context) + dialogue_tokens = tokenizer.tokenize(dialogue_turn) + system_tokens = tokenizer.tokenize(system) + context_tokens = context_tokens[:max_seq_length - max_output_len - len(dialogue_tokens) - len(system_tokens)] + context = tokenizer.detokenize(context_tokens) + + all_input = system + context + dialogue_turn + print(all_input) + input_tokens = tokenizer.tokenize(all_input) + else: + all_input = system + dialogue_turn + input_tokens = tokenizer.tokenize(all_input) + + return input_tokens + + +def flan_format(system, context, dialogue_turn, template_id=0): + templates = [ + "{}User: Answer based on context:\n\n{}{}", + "{}User: {}Answer this question based on the article: {}", + "{}User: {}{}", + "{}User: {}Answer this question: {}", + "{}User: Read this article and answer this question {}{}", + "{}User: {}Based on the above article, answer a question. {}", + "{}User: Context: {}Question: {}" + ] + template = templates[template_id - 1].format(system, context, dialogue_turn) + return template + + +def reformat_prompt(query, neighbours, dataset_name, ft_neighbours, \ + max_output_len, tokenizer, max_seq_length, template_id=0): + system = ("System: This is a chat between a user and an artificial intelligence assistant. 
The assistant gives " + "helpful, detailed, and polite answers to the user's questions based on the context. The assistant " + "should also indicate when the answer cannot be found in the context.\n\n") + + if dataset_name in ["oasst", "quiet_cockatoo", "open_inst", "quiet-cockatoo_commercial"]: + input_tokens = tokenizer.tokenize(system + query) + return input_tokens + + short_span_with_context = ["drop", "NarrativeQA", "QASC", "Quoref", "ROPES", "squad1.1", "squad2.0", "newsqa", "nq", + "BioASQ", "DuoRC_ParaphraseRC", "TextbookQA", "tqa"] + yes_no_without_context = ["boolq", "multirc"] + multichoices = ["race"] + # multi-turn qa datasets + formatted_dataset_name = ["convqa", "chatgptgen", "doc2dial", "quac", "qrecc", "sharc"] + + if dataset_name in formatted_dataset_name: + dialogue_turn = query + else: + if dataset_name in short_span_with_context: + if template_id == 0: + user = "Answer the following question with a short span. {}".format(query) + else: + user = query + elif dataset_name in yes_no_without_context: + user = "Answer the following question with True or False. {}".format(query) + elif dataset_name in multichoices: + user = "Answer the following question by selecting one of the provided options. {}".format(query) + else: + if template_id == 0: + user = "Please give a full and complete answer for the question. {}".format(query) + else: + user = query + + if dataset_name in short_span_with_context: + if template_id == 0: + dialogue_format = "User: {}\n\nAssistant: The answer is" + else: + dialogue_format = "{}\n\nAssistant: The answer is" + dialogue_turn = dialogue_format.format(user) + else: + if template_id == 0: + dialogue_format = "User: {}\n\nAssistant:" + else: + dialogue_format = "{}\n\nAssistant:" + dialogue_turn = dialogue_format.format(user) + + if ft_neighbours > 0: + context = "\n\n".join(neighbours[0:ft_neighbours]) + "\n\n" + context_tokens = tokenizer.tokenize(context) + dialogue_tokens = tokenizer.tokenize(dialogue_turn) + system_tokens = tokenizer.tokenize(system) + context_tokens = context_tokens[:max_seq_length - max_output_len - len(dialogue_tokens) - len(system_tokens)] + context = tokenizer.detokenize(context_tokens) + + if template_id == 0: + all_input = system + context + dialogue_turn + else: + all_input = flan_format(system, context, dialogue_turn, template_id=template_id) + input_tokens = tokenizer.tokenize(all_input) + else: + all_input = system + dialogue_turn + input_tokens = tokenizer.tokenize(all_input) + + return input_tokens + + +def reformat_prompt_short(query, neighbours, dataset_name, ft_neighbours, \ + max_output_len, tokenizer, max_seq_length): + if not query.endswith("?"): + query = query + "?" 
+ query = "Question: {} Answer: The answer is".format(query) + + if ft_neighbours > 0: + context = "\n\n".join(neighbours[0:ft_neighbours]) + "\n\n" + context_tokens = tokenizer.tokenize(context) + dialogue_tokens = tokenizer.tokenize(query) + context_tokens = context_tokens[:max_seq_length - max_output_len - len(dialogue_tokens)] + context = tokenizer.detokenize(context_tokens) + all_input = context + query + input_tokens = tokenizer.tokenize(all_input) + else: + all_input = query + input_tokens = tokenizer.tokenize(all_input) + + return input_tokens + + +def pad_and_convert_to_numpy(input_ids, output_ids, + pad_id, max_seq_length, + eos_id): + """Pad sequences and convert them to numpy.""" + if len(input_ids) > max_seq_length: + input_ids = input_ids[:max_seq_length - 1] + + if len(input_ids + output_ids) > max_seq_length: + output_ids = output_ids[:max_seq_length - len(input_ids)] + + tokens = input_ids + output_ids + answer_mask = [0] * len(input_ids) + [1] * len(output_ids) + + # padding + num_tokens = len(tokens) + padding_length = max_seq_length - num_tokens + assert padding_length >= 0 + + # Tokens. + filler = [pad_id] * padding_length + tokens = numpy.array(tokens + [eos_id] + filler, dtype=numpy.int64) + + # answer mask + answer_mask = answer_mask + [1] + [0] * padding_length + answer_mask = numpy.array(answer_mask, dtype=numpy.int64) + + return tokens, answer_mask diff --git a/tools/retro/sft/open_inst.sh b/tools/retro/sft/open_inst.sh new file mode 100644 index 0000000000..9ebe063b81 --- /dev/null +++ b/tools/retro/sft/open_inst.sh @@ -0,0 +1 @@ +DATA_BLEND="1.0 open_inst" diff --git a/tools/retro/sft/sft_retro.py b/tools/retro/sft/sft_retro.py new file mode 100644 index 0000000000..1070cfcadd --- /dev/null +++ b/tools/retro/sft/sft_retro.py @@ -0,0 +1,275 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +"""Pretrain GPT""" + +import torch +from functools import partial, reduce +import sys, os + +sys.path.append(os.path.abspath(os.path.join( + os.path.join(os.path.dirname(__file__), "../../../")))) +from megatron.training import get_args, get_retro_args +from megatron.training import print_rank_0 +from megatron.training import get_timers +from megatron.training import get_tokenizer +from megatron.core import tensor_parallel +from megatron.core.enums import ModelType +from megatron.core.datasets.blended_megatron_dataset_builder import BlendedMegatronDatasetBuilder +from megatron.core.datasets.utils import get_blend_from_list +from megatron.training import pretrain +from megatron.training.utils import get_ltor_masks_and_position_ids +from megatron.training.utils import average_losses_across_data_parallel_group +from pretrain_gpt import model_provider, is_dataset_built_on_rank +from tools.retro.sft.dataset_conv import JsonQADataset, JsonQADatasetConfig, RetroJsonQADataset, RetroJsonQADatasetConfig + + +def get_tasks_args(parser): + """Provide extra arguments required for tasks.""" + group = parser.add_argument_group(title='tasks') + + # parameters for the knowledgeable dialogue generation + group.add_argument('--task', type=str, default=None, + help='Task name.') + group.add_argument('--epochs', type=int, default=None, + help='Number of finetunning epochs. 
Zero results in ' + 'evaluation only.') + group.add_argument('--keep-last', action='store_true', + help='Keep the last batch (maybe incomplete) in' + 'the data loader') + group.add_argument('--pretrained-checkpoint', type=str, default=None, + help='Pretrained checkpoint used for finetunning.') + group.add_argument('--data-folder', type=str, default=None, + help='dataset folder') + group.add_argument('--answer-loss-only', action='store_true', default=False, + help='take the loss from answer part, ignore the context') + group.add_argument('--weight', type=float, default=1) + group.add_argument('--adaptor', action='store_true', default=False) + group.add_argument('--project-size', type=int, default=256) + group.add_argument('--cyclic-train-iters', type=int, default=None) + group.add_argument('--stored_params', type=dict, default=dict()) + group.add_argument('--eval_ppl', action='store_true', default=False) + group.add_argument('--debug', action='store_true', default=False) + group.add_argument('--add_retriever', action='store_true', default=False) + group.add_argument('--return_doc_ids', action='store_true', default=False) + group.add_argument('--return_neighbor_ids', action='store_true', default=False) + group.add_argument('--add_offset_doc_ids', action='store_true', default=False) + group.add_argument('--offset_dict_path', type=str, default='') + group.add_argument('--neighbors_path', type=str, default='') + group.add_argument('--valid_neighbors_path', type=str, default='') + group.add_argument('--database_path', type=str, default='') + group.add_argument('--valid_database_path', type=str, default='') + group.add_argument('--encoder-layers', type=int, default=12) + group.add_argument('--encoder-hidden-dropout', type=float, default=0.1) + group.add_argument('--encoder-attention-dropout', type=float, default=0.1) + group.add_argument('--k', type=int, default=2) + group.add_argument('--r', type=int, default=128) + group.add_argument('--m', type=int, default=64) + group.add_argument('--dpr-mode', type=str, default="multi") + group.add_argument('--faiss-ckpt', type=str, default='') + group.add_argument('--original-db-file', type=str, default="") + group.add_argument('--ft_neighbours', type=int, default=1) + group.add_argument('--reuse-top', action='store_true', default=False) + group.add_argument('--shuffle_topn', action='store_true', default=False) + group.add_argument('--chunk0', action='store_true', default=False) + group.add_argument('--disable-encoder', action='store_true', default=False) + group.add_argument('--qa-space-pad', action='store_true', default=False) + group.add_argument('--retro-mask-encoder', action='store_true', default=False) + group.add_argument('--without-title', action='store_true', default=False) + group.add_argument('--longform-answer', action='store_true', default=False) + group.add_argument('--bert-retriever-neighbours', action='store_true', default=False) + group.add_argument('--prefix', action='store_true', default=False) + group.add_argument('--question-in-encoder', action='store_true', default=False) + group.add_argument('--reset_eval', type=bool, default=True) ## by default reset eval for each eval + return parser + + +def get_batch(data_iterator): + """Generate a batch""" + args = get_args() + tokenizer = get_tokenizer() + + # Items and their type. + keys = ['text', 'answer_mask'] + datatype = torch.int64 + + if args.retro_add_retriever: + keys += 'neighbor_tokens', 'context_len' + + # Broadcast data. 
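+    # The sample is read from the data iterator where one exists and then
+    # broadcast across the tensor-model-parallel group, so every rank sees the
+    # same 'text'/'answer_mask' (and, with the retriever enabled,
+    # 'neighbor_tokens'/'context_len') tensors. The Retro chunk length is then
+    # recomputed from the shortest context in the batch.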
+ if data_iterator is not None: + try: + data = next(data_iterator) + + except Exception: + data = data_iterator + raise ValueError("error with data_iterator") + else: + data = None + + data_b = tensor_parallel.broadcast_data(keys, data, datatype) + chunk_size = torch.min(data_b['context_len']) + retro_args = get_retro_args() + # two chunk retro has at least seq_len / 2 of chunk size + retro_args.retro_gpt_chunk_length = max(args.seq_length // 2, args.seq_length - chunk_size.item()) + + # Unpack. + tokens_ = data_b['text'].long() + labels = tokens_[:, 1:].contiguous() + tokens = tokens_[:, :-1].contiguous() + + answer_mask = data_b["answer_mask"].float()[:, 1:].contiguous() + + if args.retro_add_retriever: + neighbor_tokens = data_b['neighbor_tokens'].view(-1, + retro_args.retro_gpt_retrieved_length).long() # [bs * l * k, r] + + # Get the masks and postition ids. + attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids( + tokens, + tokenizer.eod, + args.reset_position_ids, + args.reset_attention_mask, + args.eod_mask_loss) + + if args.answer_loss_only: + loss_mask = loss_mask * answer_mask + + if args.retro_add_retriever: + _, _, neighbor_position_ids = get_ltor_masks_and_position_ids( + neighbor_tokens, + tokenizer.eod, + args.reset_position_ids, + args.reset_attention_mask, + args.eod_mask_loss) + neighbor_attention_mask = None + return tokens, labels, loss_mask, attention_mask, position_ids, \ + neighbor_tokens, neighbor_attention_mask, neighbor_position_ids + else: + return tokens, labels, loss_mask, attention_mask, position_ids + + +def loss_func(loss_mask, output_tensor): + losses = output_tensor.float() + loss_mask = loss_mask.view(-1).float() + loss = torch.sum(losses.view(-1) * loss_mask) / loss_mask.sum() + + # Reduce loss for logging. 
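+    # Averaging across the data-parallel group is only for reporting; the
+    # locally computed masked loss above is what is returned for backprop.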
+ averaged_loss = average_losses_across_data_parallel_group([loss]) + + return loss, {'lm loss': averaged_loss[0]} + + +def forward_step(data_iterator, model): + """Forward step.""" + args = get_args() + timers = get_timers() + + if args.retro_add_retriever: + timers('batch-generator', log_level=2).start() + tokens, labels, loss_mask, attention_mask, position_ids, \ + neighbor_tokens, neighbor_attention_mask, neighbor_position_ids = get_batch( + data_iterator) + timers('batch-generator').stop() + output_tensor = model(tokens, position_ids, attention_mask, + retriever_input_ids=neighbor_tokens, + retriever_position_ids=neighbor_position_ids, + retriever_attn_mask=neighbor_attention_mask, + labels=labels) + else: + timers('batch-generator', log_level=2).start() + tokens, labels, loss_mask, attention_mask, position_ids = get_batch( + data_iterator) + timers('batch-generator').stop() + output_tensor = model(tokens, position_ids, attention_mask, + labels=labels) + + return output_tensor, partial(loss_func, loss_mask) + + +def train_valid_test_datasets_provider(train_val_test_num_samples): + """Build train, valid, and test datasets.""" + args = get_args() + retro_args = get_retro_args() + + tokenizer = get_tokenizer() + + def fix_and_split_blend_pair(pair): + weight, name = pair + return [ + [weight, os.path.join(args.data_folder, name, f"{name}_QA_train.json")], + [weight, os.path.join(args.data_folder, name, f"{name}_QA_dev.json")], + None, + ] + + blend = [args.data_path[i:i+2] for i in range(0, len(args.data_path), 2)] + + if len(blend) == 1: + blend_per_split = [ + os.path.join(args.data_folder, blend[0], f"{blend[0]}_QA_train.json"), + os.path.join(args.data_folder, blend[0], f"{blend[0]}_QA_dev.json"), + None, + ] + else: + blend_per_split = [ + list( + reduce( + lambda x, y: x + y, + list(zip(*map(fix_and_split_blend_pair, blend)))[0] + ) + ), + None, + None, + ] + + blend_per_split = [get_blend_from_list(blend) for blend in blend_per_split] + + extra_kwargs = {} + + if args.retro_add_retriever: + dataset_cls = RetroJsonQADataset + config_cls = RetroJsonQADatasetConfig + extra_kwargs["retro_num_neighbors"] = args.retro_num_neighbors + extra_kwargs["retro_gpt_retrieved_length"] = retro_args.retro_gpt_retrieved_length + else: + dataset_cls = JsonQADataset + config_cls = JsonQADatasetConfig + + config = config_cls( + random_seed=args.seed, + sequence_length=args.seq_length, + blend_per_split=blend_per_split, + split=args.split, + path_to_cache=args.data_cache_path, + tokenizer=tokenizer, + ft_neighbours=args.ft_neighbours, + bert_retriever_neighbours=args.bert_retriever_neighbours, + longform_answer=args.longform_answer, + inference_only=False, + retrieved_neighbours=False, + fix_newsqa=True, + **extra_kwargs + ) + + print_rank_0('> building train, validation, and test datasets ' + 'for GPT ...') + train_ds, valid_ds, test_ds = BlendedMegatronDatasetBuilder( + dataset_cls, + train_val_test_num_samples, + is_dataset_built_on_rank, + config + ).build() + print_rank_0("> finished creating GPT datasets ...") + + return train_ds, valid_ds, test_ds + + +if __name__ == "__main__": + + # Temporary for transition to core datasets + train_valid_test_datasets_provider.is_distributed = True + + pretrain(train_valid_test_datasets_provider, model_provider, + ModelType.retro_decoder, # ModelType.encoder_or_decoder, + forward_step, + extra_args_provider=get_tasks_args + ) diff --git a/tools/retro/sft/sft_retro_lm.sh b/tools/retro/sft/sft_retro_lm.sh new file mode 100644 index 0000000000..8c13f1052c --- 
/dev/null +++ b/tools/retro/sft/sft_retro_lm.sh @@ -0,0 +1,150 @@ +#!/bin/bash +# bash examples/qa/finetune_normal_lm.sh landrover_tasb_retrieved 843m 1 3e-6 1 + +blend_name=$1 +model_size=$2 +global_bsz=$3 +lr=$4 +ft_neighbours=1 +model_card=pp1 +ckpt=$5 +TASK=none + +train_iters=1000 + + +DATA_HOME="" +data_folder="$DATA_HOME" + +SFT_HOME="" + +TOKENIZER_MODEL="" + +RETRO_WORKDIR="" + +K=2 + +PRETRAINED_CHECKPOINT=${ckpt} + +SAVENAME="retro-${blend_name}_${model_card}_same_format_ctx${ft_neighbours}_${model_size}_${global_bsz}_${lr}" +CHECKPOINT_PATH="${SFT_HOME}/checkpoints/applications/${SAVENAME}" +TENSORBOARD_DIR="${SFT_HOME}/tensorboard/${SAVENAME}" +mkdir -p ${TENSORBOARD_DIR} + +. ./tools/retro/sft/"${blend_name}".sh + + +if [[ $model_size == "843m" ]]; then + # model param + mod_par=1 + layers=24 + hid_dim=1024 + heads=16 + pip_par=1 + + # node param + num_nodes=1 + lr=5e-6 + min_lr=5e-6 +fi + + +GPT_ARGS="--apply-layernorm-1p \ + --untie-embeddings-and-output-weights \ + --disable-bias-linear \ + --no-position-embedding \ + --use-rotary-position-embeddings \ + --rotary-percent 0.5 \ + --swiglu \ + --attention-dropout 0.0 \ + --hidden-dropout 0.0 \ + --pipeline-model-parallel-size $pip_par \ + --tensor-model-parallel-size $mod_par \ + --num-layers $layers \ + --hidden-size $hid_dim \ + --num-attention-heads $heads \ + --seq-length 4096 \ + --max-position-embeddings 4096 \ + --lr-decay-style cosine \ + --tokenizer-type GPTSentencePieceTokenizer \ + --tokenizer-model ${TOKENIZER_MODEL} \ + --clip-grad 1.0 \ + --weight-decay 0.01 \ + --adam-beta1 0.9 \ + --adam-beta2 0.98 \ + --log-params-norm \ + --log-num-zeros-in-grad \ + --bf16 \ + --use-distributed-optimizer \ +" + +FT_ARGS="--eod-mask-loss \ + --answer-loss-only \ + --ft_neighbours ${ft_neighbours} \ + --task $TASK" + + +OUTPUT_ARGS="--log-interval 10 \ + --save-interval 500 \ + --eval-interval 200 \ + --tensorboard-dir ${TENSORBOARD_DIR} \ + --log-validation-ppl-to-tensorboard \ + --eval-iters 100" + +options=" \ + $GPT_ARGS \ + --retro-workdir ${RETRO_WORKDIR} \ + --retro-add-retriever \ + --retro-num-neighbors ${K} \ + --retro-attention-gate 0 \ + --data-path ${DATA_BLEND} \ + --data-folder ${data_folder} \ + --recompute-activations \ + --lr $lr \ + --micro-batch-size 1 \ + --global-batch-size ${global_bsz} \ + --min-lr ${min_lr} \ + --retro-cyclic-train-iters ${train_iters} \ + --train-iters ${train_iters} \ + --dataloader-type cyclic \ + --save $CHECKPOINT_PATH \ + $OUTPUT_ARGS \ + $FT_ARGS" + +if [[ -d "$CHECKPOINT_PATH" ]]; then + options="$options \ + --load $CHECKPOINT_PATH " +else + echo $PRETRAINED_CHECKPOINT + options="$options \ + --load $PRETRAINED_CHECKPOINT \ + --finetune \ + --no-load-rng \ + --no-load-optim " +fi + +######## Command. ######## + +run_cmd="python -u ${SFT_HOME}/tools/retro/sft/sft_retro.py ${options}" + +export NCCL_DEBUG=INFO +export NCCL_IB_TIMEOUT=19 +export NCCL_IB_SL=1 +export CUDA_DEVICE_MAX_CONNECTIONS=1 + +NPROCS=8 +CMD="\ + pwd && cd ${SFT_HOME} && pwd && \ + export PYTHONPATH=$PYTHONPATH:${SFT_HOME} && \ + python -m torch.distributed.run \ + --nproc_per_node ${NPROCS} \ + --nnodes 1 \ + --node_rank 0 \ + --master_port 6000 \ + ${run_cmd} \ +" +echo "~~~~~~~~~~~~~~~~~~~~~~~~~~" +echo "CMD = '$CMD'." 
+echo "~~~~~~~~~~~~~~~~~~~~~~~~~~" +eval $CMD + diff --git a/tools/retro/text_generation/evaluate.py b/tools/retro/text_generation/evaluate.py new file mode 100755 index 0000000000..2031118cdc --- /dev/null +++ b/tools/retro/text_generation/evaluate.py @@ -0,0 +1,200 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + + +import sys +import os +from tqdm import tqdm +import string +import json +import regex +import numpy as np + +sys.path.append(os.path.abspath(os.path.join( + os.path.join(os.path.dirname(__file__), "../../../")))) +from tools.retro.text_generation.metrics import F1Metric + + +def normalize_answer(s): + def remove_articles(text): + return regex.sub(r'\b(a|an|the)\b', ' ', text) + + def white_space_fix(text): + return ' '.join(text.split()) + + def remove_punc(text): + exclude = set(string.punctuation) + return ''.join(ch for ch in text if ch not in exclude) + + def lower(text): + return text.lower() + + return white_space_fix(remove_articles(remove_punc(lower(s)))) + + +def compute_f1_score(predicted_answers, groundtruth_answer, exp_name="default"): + """Evaluating F1 Score""" + print(len(predicted_answers), len(groundtruth_answer)) + if len(predicted_answers) != len(groundtruth_answer): + groundtruth_answer = groundtruth_answer[:len(predicted_answers)] + + guess_list = [] + answer_list = [] + + assert len(guess_list) == len(answer_list), \ + "lengths of guess and answer are different!" + + for pred, ans in zip(predicted_answers, groundtruth_answer): + pred = pred.strip() + if type(ans) == str: + ans = ans.strip() + elif type(ans) == dict: + ans = ans['text'].strip() + elif ans == None: + continue + if "<|endoftext|>" in pred: + pred = pred.replace("<|endoftext|>", "") + if ans == "no_passages_used": + ans = "" + guess_list.append(pred) + answer_list.append(ans) + + precision, recall, f1 = F1Metric.compute_all_pairs(guess_list, answer_list) + print('Method: %s; Precision: %.4f; recall: %.4f; f1: %.4f' % ( \ + exp_name, precision, recall, f1)) + + +def load_groundtruth_file(data_file): + with open(data_file, "r") as f: + nq_examples = json.load(f) + + data = [] + for instance in nq_examples: + if "answers" in instance: + answers = instance["answers"] + if len(answers) < 1: + answers = [None] + elif "answer" in instance: + if type(instance["answer"]) is str: + answers = [instance["answer"]] + elif type(instance["answer"]) is list: + answers = instance["answer"] + else: + answers = [str(instance["answer"])] + else: + raise ValueError("need to have answer or answers") + data.append(answers[0]) + + return data + + +def read_prediction(prediction_file): + prediction_list = [] + print('reading %s' % prediction_file) + with open(prediction_file, "r") as f: + for i, line in enumerate(tqdm(f)): + if prediction_file.endswith("jsonl"): + line = json.loads(line)["pred"] + # print(line) + line = line.replace("Answer:", "") + line = line.replace("Answer: ", "") + line = line.replace('???? 
', "") + line = line.replace('A: ', "") + line = line.replace("A:", "") + + line = line.strip() + + if "<|endoftext|>" in line: + line = line.replace("<|endoftext|>", "") + line = normalize_answer(line) # normalize the answer + prediction_list.append(line) + + return prediction_list + + +def exact_match_score(prediction, ground_truth): + return normalize_answer(prediction) == normalize_answer(ground_truth) + + +def ems(prediction, ground_truths): + return max([exact_match_score(prediction, gt) for gt in ground_truths]) + + +def evaluate_ems(prediction_file, ground_truth_file, dev_num=3000): + prediction_list = read_prediction(prediction_file) + ground_truths_list = [] + + if ground_truth_file.endswith(('txt', 'lst')): + raw_data = open(ground_truth_file, 'r') + else: + with open(ground_truth_file, 'r') as f: + raw_data = json.load(f) + if "dev" in ground_truth_file: + raw_data = raw_data[:dev_num] + prediction_list = prediction_list[:dev_num] + + for each in raw_data: + if ground_truth_file.endswith('txt'): + each = json.loads(each) + + if 'answers' in each: + ground_truths_list.append(each['answers']) + elif 'answer' in each: + ground_truths_list.append(each['answer']) + else: + ground_truths_list.append([each]) + + exactmatch = [] + + good_example_list = [] + for i, each in enumerate(prediction_list): + score = ems(each, ground_truths_list[i]) + exactmatch.append(score) + if score: + good_example_list.append(i) + + final_em_score = np.mean(exactmatch) + + print('Exact Match: %.4f;' % final_em_score) + + print('done :-)') + + return final_em_score, exactmatch + + +def load_prediction(data_file): + data = [] + with open(data_file, "r") as f: + for line in f.readlines(): + data.append(line.strip()) + + return data + + +def evaluate_f1(ground_truth_file, prediction_file, reduced_test_only=False): + groundtruth_answer = load_groundtruth_file(ground_truth_file) + predicted_answers = load_prediction(prediction_file) + if not reduced_test_only: + compute_f1_score(predicted_answers, groundtruth_answer) + + +if __name__ == "__main__": + model_names = [] + model_names += "retro-open_inst_pp1_same_format_ctx1_843m_128_5e-6", + + for model_name in model_names: + ckpt_path = "/path/to/checkpoints/{}/".format(model_name) + + n_ctx = 5 + n_enc = 2 + iter = 1000 + model_param = "843m" + + prediction_file = ckpt_path + "/retro-generate-nq_{}_{}_{}_test_greedy_0_20000_{}.txt".format( + n_ctx, n_enc, model_param, iter) + ground_truth_file = "/path/to/NQ/test.json" + print(prediction_file) + print(ground_truth_file) + evaluate_f1(ground_truth_file, prediction_file) + evaluate_ems(prediction_file, ground_truth_file) + + print("=====================================") diff --git a/tools/retro/text_generation/metrics.py b/tools/retro/text_generation/metrics.py new file mode 100755 index 0000000000..bd0b5fe6b3 --- /dev/null +++ b/tools/retro/text_generation/metrics.py @@ -0,0 +1,80 @@ + +# The following code is adapted from +# https://github.com/facebookresearch/ParlAI/blob/master/parlai/core/metrics.py, +# which is licensed under the MIT license. More details on the license can be +# found at https://github.com/facebookresearch/ParlAI/blob/master/LICENSE. 
+ +"""Provides standard metric evaluations for dialog.""" + +from collections import Counter +from typing import List +import numpy as np +import re +from nltk import ngrams + +re_art = re.compile(r'\b(a|an|the)\b') +re_punc = re.compile(r'[!"#$%&()*+,-./:;<=>?@\[\]\\^`{|}~_\']') + + +def normalize_answer(s): + """ + Lower text and remove punctuation, articles and extra whitespace. + """ + s = s.lower() + s = re_punc.sub(' ', s) + s = re_art.sub(' ', s) + s = ' '.join(s.split()) + return s + + +class F1Metric: + """ + Helper class which computes token-level F1. + """ + + @staticmethod + def _prec_recall_f1_score(pred_items, gold_items): + """ + Compute precision, recall and f1 given a set of gold and prediction items. + :param pred_items: iterable of predicted values + :param gold_items: iterable of gold values + :return: tuple (p, r, f1) for precision, recall, f1 + """ + common = Counter(gold_items) & Counter(pred_items) + num_same = sum(common.values()) + if num_same == 0: + return 0, 0, 0 + precision = 1.0 * num_same / len(pred_items) + recall = 1.0 * num_same / len(gold_items) + f1 = (2 * precision * recall) / (precision + recall) + return precision, recall, f1 + + @staticmethod + def compute_each_pair(guess: str, answer: str, n=1): + if answer == "": + return None, None, None + if guess == "": + return 0, 0, 0 + g_tokens = normalize_answer(guess).split() + a_tokens = normalize_answer(answer).split() + g_tokens = list(ngrams(g_tokens, n)) + a_tokens = list(ngrams(a_tokens, n)) + precision, recall, f1 = F1Metric._prec_recall_f1_score(g_tokens, a_tokens) + return precision, recall, f1 + + @staticmethod + def compute_all_pairs(guesses: List[str], answers: List[str], n=1): + # additional augment: + print("guess:", len(guesses), ", answers:", len(answers)) + assert len(guesses) == len(answers) + + precision_list, recall_list, f1_list = [], [], [] + for guess, answer in zip(guesses, answers): + precision, recall, f1 = F1Metric.compute_each_pair(guess, answer, n) + if precision is None or recall is None or f1 is None: + continue + precision_list.append(precision) + recall_list.append(recall) + f1_list.append(f1) + + return np.mean(precision_list), np.mean(recall_list), np.mean(f1_list) diff --git a/tools/retro/text_generation/retro_api.py b/tools/retro/text_generation/retro_api.py new file mode 100644 index 0000000000..b70677485d --- /dev/null +++ b/tools/retro/text_generation/retro_api.py @@ -0,0 +1,221 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + + +"""Inference API.""" +import numpy as np +import torch +from megatron.core import mpu +from megatron.training import print_rank_0, get_retro_args, get_args, get_tokenizer +from megatron.inference.text_generation.communication import broadcast_float_list, broadcast_tensor, broadcast_int_list +from megatron.inference.text_generation.generation import ( + score_and_return_on_first_stage) +from tools.retro.text_generation.retro_generation import ( + retro_generate_tokens_probs_and_return_on_first_stage) +from megatron.inference.text_generation.tokenization import ( + detokenize_generations) + + +def tokenize_prompts(prompts=None, tokens_to_generate=None, + add_BOS=None, rank=0): + """Tokenize prompts and make them avaiable on all ranks.""" + + # On all ranks set to None so we can pass them to functions + sizes_list = None + prompts_tokens_cuda_long_tensor = None + prompts_length_cuda_long_tensor = None + + # On the specified rank, build the above. 
+ if torch.distributed.get_rank() == rank: + assert prompts is not None + assert tokens_to_generate is not None + # Tensor of tokens padded and their unpadded length. + prompts_tokens_cuda_long_tensor, prompts_length_cuda_long_tensor = \ + _tokenize_prompts_and_batch(prompts, tokens_to_generate, add_BOS) + # We need the sizes of these tensors for the boradcast + sizes_list = [prompts_tokens_cuda_long_tensor.size(0), # Batch size + prompts_tokens_cuda_long_tensor.size(1)] # Sequence lenght + + # First, broadcast the sizes. + sizes_tensor = broadcast_int_list(2, int_list=sizes_list, rank=rank) + + # Now that we have the sizes, we can boradcast the tokens + # and length tensors. + sizes = sizes_tensor.tolist() + prompts_tokens_cuda_long_tensor = broadcast_tensor( + sizes, torch.int64, tensor=prompts_tokens_cuda_long_tensor, rank=rank) + prompts_length_cuda_long_tensor = broadcast_tensor( + sizes[0], torch.int64, tensor=prompts_length_cuda_long_tensor, + rank=rank) + + return prompts_tokens_cuda_long_tensor, prompts_length_cuda_long_tensor + + +def _tokenize_prompts_and_batch(prompts, tokens_to_generate, add_BOS): + """Given a set of prompts and number of tokens to generate: + - tokenize prompts + - set the sequence length to be the max of length of prompts + plus the number of tokens we would like to generate + - pad all the sequences to this length so we can convert them + into a 2D tensor. + """ + + # Tokenize all the prompts. + tokenizer = get_tokenizer() + if add_BOS: + prompts_tokens = [[tokenizer.eod] + tokenizer.tokenize(prompt) + for prompt in prompts] + else: + prompts_tokens = [tokenizer.tokenize(prompt) for prompt in prompts] + + # Now we have a list of list of tokens which each list has a different + # size. We want to extend this list to: + # - incorporate the tokens that need to be generated + # - make all the sequences equal length. + # Get the prompts length. + prompts_length = [len(prompt_tokens) for prompt_tokens in prompts_tokens] + # Get the max prompts length. + max_prompt_len = max(prompts_length) + # Set the tokens to generate to the max prompts length for Retro + args = get_args() + if args.retro_add_retriever: + tokens_to_generate = max_prompt_len + # Number of tokens in the each sample of the batch. + samples_length = max_prompt_len + tokens_to_generate + # Now update the list of list to be of the same size: samples_length. + for prompt_tokens, prompt_length in zip(prompts_tokens, prompts_length): + padding_size = samples_length - prompt_length + prompt_tokens.extend([tokenizer.eod] * padding_size) + + # Now we are in a structured format, we can convert to tensors. + prompts_tokens_tensor = torch.cuda.LongTensor(prompts_tokens) + prompts_length_tensor = torch.cuda.LongTensor(prompts_length) + + return prompts_tokens_tensor, prompts_length_tensor + + +def retro_generate_and_post_process(model, + prompts=None, + neighbours_array=None, + tokens_to_generate=0, + return_output_log_probs=False, + top_k_sampling=0, + top_p_sampling=0.0, + temperature=1.0, + add_BOS=False, + use_eod_token_for_early_termination=True, + random_seed=-1, + logits_mask=None): + """Run inference and post-process outputs, i.e., detokenize, + move to cpu and convert to list.""" + + # Main inference. 
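+    # retro_generate broadcasts the sampling parameters to all ranks and runs
+    # the generation loop; the detokenization below only happens on the first
+    # pipeline stage, where the generated tokens are gathered.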
+ tokens, lengths, output_log_probs = retro_generate( + model, + prompts=prompts, + neighbours_array=neighbours_array, + tokens_to_generate=tokens_to_generate, + return_output_log_probs=return_output_log_probs, + top_k_sampling=top_k_sampling, + top_p_sampling=top_p_sampling, + temperature=temperature, + add_BOS=add_BOS, + use_eod_token_for_early_termination=use_eod_token_for_early_termination, + random_seed=random_seed, + logits_mask=logits_mask) + + # Only post-process on first stage. + if mpu.is_pipeline_first_stage(): + tokens, prompts_plus_generations, prompts_plus_generations_segments = \ + detokenize_generations(tokens, lengths, True) + + if return_output_log_probs: + output_log_probs = output_log_probs.cpu().numpy().tolist() + for i, (prob, seg) in enumerate(zip(output_log_probs, prompts_plus_generations_segments)): + output_log_probs[i] = prob[:len(seg) - 1] + + return prompts_plus_generations, prompts_plus_generations_segments, \ + output_log_probs, tokens + + return None + + +def retro_generate(model, + prompts=None, + neighbours_array=None, + tokens_to_generate=0, + return_output_log_probs=False, + top_k_sampling=0, + top_p_sampling=0.0, + temperature=1.0, + add_BOS=False, + use_eod_token_for_early_termination=True, + stop_on_double_eol=False, + stop_on_eol=False, + random_seed=-1, + logits_mask=None): + """Given prompts and input parameters, run inference and return: + tokens: prompts plus the generated tokens. + lengths: length of the prompt + generations. Note that we can + discard tokens in the tokens tensor that are after the + corresponding length. + output_log_probs: log probs of the tokens. + """ + + # Make sure input params are avaialble to all ranks. + values = [tokens_to_generate, + return_output_log_probs, + top_k_sampling, top_p_sampling, + temperature, add_BOS, use_eod_token_for_early_termination, + stop_on_double_eol, + stop_on_eol, + random_seed] + values_float_tensor = broadcast_float_list(10, float_list=values) + tokens_to_generate = int(values_float_tensor[0].item()) + return_output_log_probs = bool(values_float_tensor[1].item()) + top_k_sampling = int(values_float_tensor[2].item()) + top_p_sampling = values_float_tensor[3].item() + temperature = values_float_tensor[4].item() + add_BOS = bool(values_float_tensor[5].item()) + use_eod_token_for_early_termination = bool(values_float_tensor[6].item()) + stop_on_double_eol = bool(values_float_tensor[7].item()) + stop_on_eol = bool(values_float_tensor[8].item()) + random_seed = int(values_float_tensor[9].item()) + + if random_seed != -1: + torch.random.manual_seed(random_seed) + + # Tokenize prompts and get the batch. + # Note that these tensors are broadcaseted to all ranks. + if torch.distributed.get_rank() == 0: + assert prompts is not None + + context_tokens_tensor, context_length_tensor = tokenize_prompts( + prompts=prompts, tokens_to_generate=tokens_to_generate, add_BOS=add_BOS) + + retro_args = get_retro_args() + retro_args.retro_gpt_chunk_length = context_length_tensor.item() + + retro_args = get_retro_args() + args = get_args() + r = retro_args.retro_gpt_retrieved_length + l = int(np.ceil(min(args.max_position_embeddings, context_tokens_tensor.size(1)) / retro_args.retro_gpt_chunk_length)) + if torch.distributed.get_rank() == 0: + neighbours_array = neighbours_array.reshape(1, args.retro_num_neighbors, r).repeat(l, axis=0) ## dim (l, k, r) + + if tokens_to_generate == 0: + return score_and_return_on_first_stage( + model, context_tokens_tensor, context_length_tensor) + + # Main inference function. 
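+    # neighbours_array is expected here with shape [l, k, r] (chunks,
+    # neighbors, retrieved length), matching the reshape/repeat above.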
+ # Note that the outputs are available on the first stage. + return retro_generate_tokens_probs_and_return_on_first_stage( + model, context_tokens_tensor, context_length_tensor, + neighbours_array=neighbours_array, + return_output_log_probs=return_output_log_probs, + top_k=top_k_sampling, + top_p=top_p_sampling, + temperature=temperature, + use_eod_token_for_early_termination=use_eod_token_for_early_termination, + stop_on_double_eol=stop_on_double_eol, + stop_on_eol=stop_on_eol, + logits_mask=logits_mask) \ No newline at end of file diff --git a/tools/retro/text_generation/retro_generate.sh b/tools/retro/text_generation/retro_generate.sh new file mode 100755 index 0000000000..53f7d76476 --- /dev/null +++ b/tools/retro/text_generation/retro_generate.sh @@ -0,0 +1,125 @@ +#!/bin/bash + +TASK=$1 +model_size=$2 +sampling=$3 +split=$4 +gen_start=$5 +num_gen=$6 +ckpt_step=${7} +ft_neighbours=${8} +model_card=${9} +ckpt=${10} +K=${11} +retrieve=${12} + +QA_HOME="" + +TOKENIZER_MODEL="" + +RETRO_WORKDIR="" + + +if [[ $model_size == "843m" ]]; then + mod_par=1 + layers=24 + hid_dim=1024 + heads=16 + pip_par=1 +fi + +GPT_ARGS="--apply-layernorm-1p \ + --untie-embeddings-and-output-weights \ + --disable-bias-linear \ + --no-position-embedding \ + --use-rotary-position-embeddings \ + --rotary-percent 0.5 \ + --swiglu \ + --attention-dropout 0.0 \ + --hidden-dropout 0.0 \ + --pipeline-model-parallel-size $pip_par \ + --tensor-model-parallel-size $mod_par \ + --num-layers $layers \ + --hidden-size $hid_dim \ + --num-attention-heads $heads \ + --seq-length 4096 \ + --max-position-embeddings 4096 \ + --lr-decay-style cosine \ + --tokenizer-type GPTSentencePieceTokenizer \ + --tokenizer-model ${TOKENIZER_MODEL} \ + --clip-grad 1.0 \ + --weight-decay 0.01 \ + --adam-beta1 0.9 \ + --adam-beta2 0.98 \ + --log-params-norm \ + --log-num-zeros-in-grad \ + --bf16 \ +" + + +sample_input_file="/path/to/instruct_tuning/data/$TASK/${split}.json" + +top_k=1 +micro_bsz=1 +SAMPLE_ARGS="--top_k $top_k" + +CHECKPOINT_PATH=${ckpt} +sample_output_file="${CHECKPOINT_PATH}/retro-generate-${TASK}_${ft_neighbours}_${K}_${model_size}_${split}_${sampling}_${gen_start}_${num_gen}_${ckpt_step}.txt" + +DIR=`pwd` + +echo $sample_input_file +echo $sample_output_file + + +GEN_ARGS="$SAMPLE_ARGS \ + --gen-start-idx $gen_start \ + --num-gen $num_gen \ + --ckpt-step ${ckpt_step} \ + --sample-input-file $sample_input_file \ + --sample-output-file $sample_output_file \ + --retro-workdir ${RETRO_WORKDIR} \ + --retro-add-retriever \ + --retro-num-neighbors ${K} \ + --reuse-top \ + --retro-attention-gate 0 \ + " + +if [[ $retrieve == 1 ]]; then + GEN_ARGS="$GEN_ARGS \ + --use-retrieved-neighbours \ + " +fi + +FT_ARGS="--eod-mask-loss \ + --answer-loss-only \ + --ft_neighbours ${ft_neighbours} \ + --task $TASK" + +DISTRIBUTED_ARGS="--nproc_per_node ${mod_par} \ + --nnodes ${pip_par} \ + --node_rank 0 \ + --master_port 8889" + +######## Command. ######## + +COMMAND="python -m torch.distributed.run $DISTRIBUTED_ARGS ${DIR}/tools/retro/text_generation/retro_text_generation.py" + +COMMAND="$COMMAND \ + $GPT_ARGS \ + $GEN_ARGS \ + --load $CHECKPOINT_PATH \ + --micro-batch-size $micro_bsz \ + $FT_ARGS" + +export NCCL_DEBUG=INFO +export NCCL_IB_TIMEOUT=19 +export NCCL_IB_SL=1 +export CUDA_DEVICE_MAX_CONNECTIONS=1 + + +echo "~~~~~~~~~~~~~~~~~~~~~~~~~~" +echo "CMD = '$CMD'." 
+echo "~~~~~~~~~~~~~~~~~~~~~~~~~~" +eval $COMMAND + diff --git a/tools/retro/text_generation/retro_generation.py b/tools/retro/text_generation/retro_generation.py new file mode 100644 index 0000000000..f69103de77 --- /dev/null +++ b/tools/retro/text_generation/retro_generation.py @@ -0,0 +1,250 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + + +"""Generation utilities.""" +import torch +import torch.nn.functional as F +from megatron.training import get_args, get_tokenizer +from megatron.training import get_retro_args +from megatron.core import mpu +from megatron.training.utils import get_ltor_masks_and_position_ids, unwrap_model +from megatron.inference.text_generation.communication import ( + copy_from_last_to_first_pipeline_stage, + broadcast_from_last_pipeline_stage, + broadcast_from_last_to_first_pipeline_stage, broadcast_int_list, broadcast_tensor) +from megatron.inference.text_generation.generation import _build_attention_mask_and_position_ids +from megatron.inference.text_generation.sampling import sample + + + +def retro_generate_tokens_probs_and_return_on_first_stage( + model, tokens, lengths, neighbours_array=None, + return_output_log_probs=False, + top_k=0, top_p=0.0, + temperature=1.0, + use_eod_token_for_early_termination=True, + stop_on_double_eol=False, + stop_on_eol=False, + logits_mask=None): + """Main token generation function. + + Args: + model: no interleaving is supported. + tokens: prompt tokens extended to be of size [b, max-sequence-length] + lengths: original prompt length, size: [b] + neighbours_array: neighbours array of size [b, l, k, r] + return_output_log_probs: flag to calculate the log probability of + the generated tokens. Note that the log probability is the one + from the original logit. + top_k, top_p: top-k and top-p sampling parameters. + Note that top-k = 1 is gready. Also, these paramters are + exclusive meaning that: + if top-k > 0 then we expect top-p=0. + if top-p > 0 then we check for top-k=0. + temperature: sampling temperature. + use_eod_token_for_early_termination: if True, do early termination if + all the sequences have reached this token. + Note: Outside of model, other parameters only need to be available on + rank 0. + + Returns: Note that is size is adjusted to a lower value than + max-sequence-length if generation is terminated early. + tokens: prompt and generated tokens. size: [b, :] + generated_sequence_lengths: total length (including prompt) of + the generated sequence. size: [b] + output_log_probs: log probability of the selected tokens. size: [b, s] + """ + + args = get_args() + retro_args = get_retro_args() + + tokenizer = get_tokenizer() + + batch_size = tokens.size(0) + min_prompt_length = lengths.min().item() + max_sequence_length = tokens.size(1) + print("max_sequence_length", max_sequence_length) + print("min_prompt_length", min_prompt_length) + max_sequence_length = min(max_sequence_length, args.max_position_embeddings) + + # If the context is too big, this happens + if min_prompt_length >= max_sequence_length: + raise ValueError("context length + tokens_to_generate too large") + + # forward step. + unwrapped_model = unwrap_model( + model) + unwrapped_model.language_model.seq_length = max_sequence_length + + # Added termination_id to support the case that we want to terminate the + # generation once that id is generated. 
+ if hasattr(args, 'eos_id'): + termination_id = args.eos_id + else: + termination_id = tokenizer.eod + + # =================== + # Pre-allocate memory + # =================== + + # Log probability of the sequence (prompt + generated tokens). + output_log_probs = None + output_log_probs_size = (batch_size, max_sequence_length - 1) + # Lengths of generated seuquence including including prompts. + generated_sequence_lengths = None + if mpu.is_pipeline_last_stage(): + if return_output_log_probs: + output_log_probs = torch.empty(output_log_probs_size, + dtype=torch.float32, + device=torch.cuda.current_device()) + generated_sequence_lengths = torch.ones( + batch_size, dtype=torch.int64, + device=torch.cuda.current_device()) * max_sequence_length + + # Whether we have reached a termination id. + is_generation_done = torch.zeros(batch_size, dtype=torch.uint8, + device=torch.cuda.current_device()) + + # ============= + # Run infernece + # ============= + + with torch.no_grad(): + attention_mask, position_ids = _build_attention_mask_and_position_ids( + tokens) + for context_length in range(min_prompt_length, max_sequence_length): + prev_context_length = 0 + sizes_list = None + neighbor_tokens_cuda_long_tensor = None + + # get the chunks for retrieval + if torch.distributed.get_rank() == 0: + neighbor_tokens = neighbours_array + neighbor_tokens_cuda_long_tensor = torch.cuda.LongTensor( + neighbor_tokens.reshape((-1, retro_args.retro_gpt_retrieved_length))) + sizes_list = [neighbor_tokens_cuda_long_tensor.size(0), # Batch size + neighbor_tokens_cuda_long_tensor.size(1)] # Sequence lenght + sizes_tensor = broadcast_int_list(2, int_list=sizes_list) + sizes = sizes_tensor.tolist() + neighbor_tokens_cuda_long_tensor = broadcast_tensor( + sizes, torch.int64, tensor=neighbor_tokens_cuda_long_tensor) + + _, _, neighbor_position_ids = get_ltor_masks_and_position_ids( + neighbor_tokens_cuda_long_tensor, + tokenizer.eod, + args.reset_position_ids, + args.reset_attention_mask, + args.eod_mask_loss) + neighbor_attention_mask = None + + # Pick the slice that we need to pass through the network. + tokens2use = tokens[:, prev_context_length:4096] + positions2use = position_ids[:, prev_context_length:4096] + attention_mask2use = attention_mask[ + ..., prev_context_length:4096, :4096] + + logits = model(tokens2use, positions2use, attention_mask2use, + retriever_input_ids=neighbor_tokens_cuda_long_tensor, + retriever_position_ids=neighbor_position_ids, retriever_attn_mask=neighbor_attention_mask, + ) + + if mpu.is_pipeline_last_stage(): + # Always the last stage should have an output. + assert logits is not None + + # Sample. + last_token_logits = logits[:, context_length - 1, :] + # last_token_logits = logits[:, -1, :] + + # word banning + if logits_mask is not None: + last_token_logits[:, logits_mask] = float('-Inf') + + new_sample = sample(last_token_logits, + top_k=top_k, + top_p=top_p, + temperature=temperature, + vocab_size=tokenizer.vocab_size) + + # If a prompt length is smaller or equal th current context + # length, it means we have started generating tokens + started = lengths <= context_length + # Update the tokens. + tokens[started, context_length] = new_sample[started] + + # Calculate the log probabilities. + if return_output_log_probs: + log_probs = F.log_softmax(logits, dim=2) + if return_output_log_probs: + # Pick the tokens that we need to get the log + # probabilities for. Note that next input token is + # the token which we selected in the current logits, + # so shift by 1. 
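+                        # Gather, for each position, the log-probability of
+                        # the token that was actually selected as the next
+                        # input token.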
+ indices = torch.unsqueeze( + tokens[ + :, + (prev_context_length + 1):(context_length + 1)], + 2) + output_log_probs[:, + prev_context_length:context_length] = \ + torch.gather(log_probs, 2, indices).squeeze(2) + + # Update the tokens on the first stage so the next input to + # the network is correct. + copy_from_last_to_first_pipeline_stage(batch_size, torch.int64, + tokens[:, context_length]) + + # Update the context length for the next token generation. + prev_context_length = context_length + + # Check if all the sequences have hit the termination_id. + done = None + if mpu.is_pipeline_last_stage(): + # TODO(rprenger) These stopping methods are tokenizer dependent + # instead tokenization should be in the inference loop so stop sequences can be used + if stop_on_double_eol: + hit_double_eol = (new_sample == 628).byte() & started.byte() + hit_two_eols = (new_sample == 198).byte() & ( + tokens[:, context_length - 1] == 198).byte() & started.byte() + done_token = hit_double_eol | hit_two_eols + elif stop_on_eol: + hit_double_eol = (new_sample == 628).byte() & started.byte() + hit_eol = (new_sample == 198).byte() & started.byte() + done_token = hit_double_eol | hit_eol + elif context_length > min_prompt_length + 64: # previous retrov1 limitations + done_token = 1 + else: + done_token = (new_sample == termination_id).byte() & \ + started.byte() + + just_finished = (done_token & ~is_generation_done).bool() + generated_sequence_lengths[just_finished.view(-1)] = \ + context_length + 1 + is_generation_done = is_generation_done | done_token + done = torch.all(is_generation_done) + done = broadcast_from_last_pipeline_stage(1, torch.uint8, + tensor=done) + if use_eod_token_for_early_termination and done: + break + + # =================================================== + # Update the length of based on max generated length. + # =================================================== + + tokens = tokens[:, :(context_length + 1)] + if mpu.is_pipeline_last_stage(): + if return_output_log_probs: + output_log_probs = output_log_probs[:, :context_length] + + # ====================================== + # Broadcast to the first pipeline stage. + # ====================================== + + generated_sequence_lengths = broadcast_from_last_to_first_pipeline_stage( + batch_size, torch.int64, generated_sequence_lengths) + if return_output_log_probs: + output_log_probs_size = (batch_size, context_length) + output_log_probs = broadcast_from_last_to_first_pipeline_stage( + output_log_probs_size, torch.float32, output_log_probs) + + return tokens, generated_sequence_lengths, output_log_probs diff --git a/tools/retro/text_generation/retro_text_generation.py b/tools/retro/text_generation/retro_text_generation.py new file mode 100755 index 0000000000..2705009044 --- /dev/null +++ b/tools/retro/text_generation/retro_text_generation.py @@ -0,0 +1,263 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
+ +"""Sample Generate GPT""" +import torch +import os +import sys +from typing import Union + +sys.path.append(os.path.abspath(os.path.join( + os.path.join(os.path.dirname(__file__), "../../../")))) +from megatron.training import get_args, get_retro_args +from megatron.training import print_rank_0 +from megatron.training import get_tokenizer +from megatron.training.checkpointing import load_checkpoint +from megatron.training.initialize import initialize_megatron +from megatron.core.models.gpt import GPTModel +from megatron.training import get_model +from tools.retro.text_generation.retro_api import retro_generate_and_post_process +from tools.retro.sft.sft_retro import get_tasks_args +from tools.retro.sft.dataset_conv import reformat_prompt, preprocess, reformat_prompt_short +import numpy as np +import time +import megatron.legacy.model +from megatron.training.arguments import core_transformer_config_from_args + + + +def model_provider(pre_process=True, post_process=True) -> Union[GPTModel, megatron.legacy.model.GPTModel]: + """Builds the model. + + Args: + pre_process (bool, optional): Set to true if you need to compute embedings. Defaults to True. + post_process (bool, optional): Set to true if you need to want to compute output logits/loss. Defaults to True. + + + Returns: + Union[GPTModel, megatron.legacy.model.GPTModel]: The returned model + """ + print_rank_0('building GPT model ...') + args = get_args() + config = core_transformer_config_from_args(args) + + assert args.use_legacy_models, 'retro text generation only implemented for legacy models' + + # not support core model yet + model = megatron.legacy.model.GPTModel( + config, + num_tokentypes=0, + parallel_output=False, + pre_process=pre_process, + post_process=post_process + ) + + return model + + +def pad_neighbours_for_query_only(args, nb_tokens, pad_id, ft_neighbours): + # take top k neighbours and padding + neighbours_tokens = [] + retro_args = get_retro_args() + r = retro_args.retro_gpt_retrieved_length + + if args.reuse_top: + valid_nb_tokens = nb_tokens[:args.retro_num_neighbors] + else: + valid_nb_tokens = nb_tokens[ft_neighbours:args.retro_num_neighbors + ft_neighbours] + + for nb_token in valid_nb_tokens: + if len(nb_token) >= r: + nb_token = nb_token[:r] + else: + nb_token = nb_token + [pad_id] * (r - len(nb_token)) + neighbours_tokens.append(nb_token) + print("len(nb_tokens)", len(nb_tokens)) + print("len(neighbours_tokens)", len(neighbours_tokens)) + print("args.retro_num_neighbors", args.retro_num_neighbors) + + if len(neighbours_tokens) < args.retro_num_neighbors: + assert ValueError("neighbours are not enough, add empty ones and create mask for those empty ones") + neighbours_tokens = np.array(neighbours_tokens) + return neighbours_tokens + + +def add_text_generate_args(parser): + """Text generation arguments.""" + + parser = get_tasks_args(parser) + group = parser.add_argument_group(title='text generation') + + group.add_argument("--temperature", type=float, default=1.0, + help='Sampling temperature.') + group.add_argument("--greedy", action='store_true', default=False, + help='Use greedy sampling.') + group.add_argument("--top_p", type=float, default=0.0, + help='Top p sampling.') + group.add_argument("--top_k", type=int, default=0, + help='Top k sampling.') + group.add_argument("--out-seq-length", type=int, default=256, + help='Size of the output generated text.') + group.add_argument("--sample-input-file", type=str, default=None, + help='Get input from file instead of interactive mode, ' + 'each line is an 
input.') + group.add_argument("--sample-output-file", type=str, default=None, + help='Output file got from --sample-input-file') + group.add_argument("--num-samples", type=int, default=0, + help='Number of samples to generate unconditionally, ' + 'defaults to 0 and interactive conditional sampling') + group.add_argument("--genfile", type=str, + help='Output file when generating unconditionally') + group.add_argument("--recompute", action='store_true', + help='During generation recompute all attention ' + 'instead of using previously computed keys/values.') + group.add_argument("--epsilon", type=float, default=0.01, + help="Minimum factor by which each probability is multiplied") + group.add_argument("--debug-gen", action='store_true', + help="If set, additional debugging output is printed to stdout") + group.add_argument('--length-penalty', type=float, default=1.0, + help='length penalty') + group.add_argument('--gen-start-idx', type=int, default=0, + help='project size for adapters') + group.add_argument('--num-gen', type=int, default=-1, + help='project size for adapters') + group.add_argument('--ckpt-step', type=int, default=None, + help='setting ckpt step manually') + group.add_argument("--short-format", action='store_true', + help='Use short format QA') + group.add_argument("--use-retrieved-neighbours", action='store_true', default=False, + help='Use retrieved neighbours') + group.add_argument('--template-id', type=int, default=0, + help='template id for generation,') + return parser + + +def generate_samples_conditional(model): + args = get_args() + start = time.time() + avg_time = [] + tokenizer = get_tokenizer() + model.eval() + if torch.distributed.get_rank() == 0: + + data = preprocess(args.sample_input_file, inference_only=True, + retrieved_neighbours=args.use_retrieved_neighbours) + print("total rows {}".format(len(data))) + all_data = data[args.gen_start_idx:] # start from gen_start_idx + if args.num_gen > 0: + all_data = all_data[:args.num_gen] + input_count = len(all_data) + input_pos = 0 + + terminate_runs = 0 + while True: + torch.distributed.barrier() + if torch.distributed.get_rank() == 0: + sentences = [] + n_arrays = [] + print("global batch size", args.global_batch_size) + for _ in range(args.global_batch_size): + print(input_pos) + if input_pos >= input_count: + print("reach the last row") + break + else: + sample = all_data[input_pos] + input_pos += 1 + + if True: + max_target_len = args.out_seq_length + query, _, neighbours = sample + + neighbours_array = pad_neighbours_for_query_only(args, + [tokenizer.tokenize(neighbour) for neighbour in + neighbours], tokenizer.eod, args.ft_neighbours) + print("neighbours_array.shape", neighbours_array.shape) + tokenizer = get_tokenizer() + + if args.short_format: + input_tokens = reformat_prompt_short(query, neighbours, args.task, args.ft_neighbours, + max_target_len, + tokenizer, args.seq_length) + else: + input_tokens = reformat_prompt(query, neighbours, args.task, args.ft_neighbours, max_target_len, + tokenizer, args.seq_length, template_id=args.template_id) + raw_text = tokenizer.detokenize(input_tokens) + print(raw_text) + else: + raise ValueError("invalid arg for task") + sentences.append(raw_text) + retro_args = get_retro_args() + + resp_sentences, resp_sentences_seg, scores, \ + tokens = retro_generate_and_post_process(model, prompts=sentences, + neighbours_array=neighbours_array, + tokens_to_generate=args.seq_length - retro_args.retro_gpt_chunk_length, + return_output_log_probs=False, + top_k_sampling=args.top_k, + 
+                                                     top_p_sampling=args.top_p,
+                                                     add_BOS=False,
+                                                     temperature=1.0)
+            print("len of resp_sentences", len(resp_sentences))
+            for prompt, generation in zip(sentences, resp_sentences):
+                datum = generation[len(prompt):]
+                print("prompt:", generation[:len(prompt)])
+                if "<|endoftext|>" in datum:
+                    datum = datum[:datum.find("<|endoftext|>")].strip()
+                datum = datum.replace("\n", " ")
+                print("cont:", datum)
+                yield datum
+            avg_time.append((time.time() - start) / args.global_batch_size)
+            print("avg time for each sample: ", sum(avg_time) / len(avg_time))
+            start = time.time()
+            if input_pos >= input_count:
+                print("finish all lines")
+                terminate_runs = 1
+        else:
+            retro_generate_and_post_process(model)
+
+        terminate_runs_tensor = torch.cuda.LongTensor([terminate_runs])
+        torch.distributed.broadcast(terminate_runs_tensor, 0)
+        terminate_runs = terminate_runs_tensor[0].item()
+
+        if terminate_runs == 1:
+            return
+
+
+def generate_and_write_samples_conditional(model):
+    args = get_args()
+    if args.sample_output_file is None:
+        sample_output_file = args.sample_input_file + ".out"
+        print('`sample-output-file` not specified, setting '
+              'it to {}'.format(sample_output_file))
+    else:
+        sample_output_file = args.sample_output_file
+    with open(sample_output_file, 'w') as f:
+        for datum in generate_samples_conditional(model):
+            if torch.distributed.get_rank() == 0:
+                f.write(datum + '\n')
+
+
+def main():
+    """Main program."""
+
+    initialize_megatron(extra_args_provider=add_text_generate_args,
+                        args_defaults={'no_load_rng': True,
+                                       'no_load_optim': True})
+
+    # Set up model and load checkpoint
+    model = get_model(model_provider, wrap_with_ddp=False)
+    print(model)
+    args = get_args()
+
+    if args.load is not None:
+        _ = load_checkpoint(model, None, None)
+    model = model[0]
+
+    # Generate samples.
+    if args.sample_input_file is not None:
+        print(f"{args.sample_input_file}")
+        generate_and_write_samples_conditional(model)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tools/retro/utils.py b/tools/retro/utils.py
deleted file mode 100644
index 11aa72ef12..0000000000
--- a/tools/retro/utils.py
+++ /dev/null
@@ -1,75 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
-
-import os
-import torch
-import types
-
-from megatron import get_retro_args
-from megatron.tokenizer.tokenizer import (
-    _BertWordPieceTokenizer,
-    _GPT2BPETokenizer,
-    _GPTSentencePieceTokenizer,
-)
-
-
-def get_args_path(workdir):
-    '''Argument copy stored within retro workdir.'''
-    return os.path.join(workdir, "args.json")
-
-
-def get_num_chunks_per_sample():
-    '''Compute seq_length // chunk_length.'''
-    args = get_retro_args()
-    sample_length = args.retro_gpt_seq_length
-    chunk_length = args.retro_gpt_chunk_length
-    assert sample_length % chunk_length == 0
-    return sample_length // chunk_length
-
-
-def get_gpt_tokenizer():
-    '''GPT (BPE) tokenizer.'''
-    args = get_retro_args()
-    tokenizer_type = args.retro_gpt_tokenizer_type
-    if tokenizer_type == "GPT2BPETokenizer":
-        assert args.retro_gpt_vocab_file and args.retro_gpt_merge_file
-        return _GPT2BPETokenizer(
-            vocab_file=args.retro_gpt_vocab_file,
-            merge_file=args.retro_gpt_merge_file,
-        )
-    elif tokenizer_type == 'GPTSentencePieceTokenizer':
-        assert args.retro_gpt_tokenizer_model is not None
-        return _GPTSentencePieceTokenizer(args.retro_gpt_tokenizer_model)
-    else:
-        raise Exception("unrecognized gpt tokenizer, '%s'." % tokenizer_type)
-
-
-def get_bert_tokenizer():
-    '''Bert (Wordpiece) tokenizer.'''
-    args = get_retro_args()
-    lower_case = {
-        "BertWordPieceLowerCase" : True,
-        "BertWordPieceCase" : False,
-    }[args.retro_bert_tokenizer_type]
-    return _BertWordPieceTokenizer(
-        vocab_file=args.retro_bert_vocab_file,
-        lower_case=lower_case,
-    )
-
-
-class GPTToTextDataset(torch.utils.data.Dataset):
-    '''Dataset to convert GPT tokens to text.'''
-
-    def __init__(self, gpt_dataset):
-
-        super().__init__()
-
-        self.gpt_dataset = gpt_dataset
-        self.gpt_tokenizer = get_gpt_tokenizer()
-
-    def __len__(self):
-        return len(self.gpt_dataset)
-
-    def __getitem__(self, idx):
-        gpt_token_ids = self.gpt_dataset[idx]["text"].tolist()
-        text = self.gpt_tokenizer.detokenize(gpt_token_ids)
-        return {"text": text}
diff --git a/tools/run_mamba_text_generation_server.py b/tools/run_mamba_text_generation_server.py
new file mode 100644
index 0000000000..2c7c6f44c2
--- /dev/null
+++ b/tools/run_mamba_text_generation_server.py
@@ -0,0 +1,123 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+
+"""Sample Generate Mamba"""
+import os
+import sys
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__),
+                                             os.path.pardir)))
+from megatron.training import get_args
+from megatron.training import print_rank_0
+from megatron.core import mpu
+from megatron.training.checkpointing import load_checkpoint
+from megatron.training.initialize import initialize_megatron
+from megatron.core.models.mamba.mamba_model import MambaModel
+from megatron.core.transformer.spec_utils import import_module
+from megatron.training import get_model
+from megatron.training.arguments import core_transformer_config_from_args
+from megatron.inference.text_generation_server import MegatronServer
+from megatron.inference.text_generation import generate_and_post_process
+from megatron.inference.text_generation import beam_search_and_post_process
+
+import torch
+
+def count_parameters_in_layer(model, layer_name):
+    num_params = 0
+    for name, param in model.named_parameters():
+        if layer_name in name:
+            num_params += param.numel()
+            print_rank_0(f" - {name}: {param.numel()}")
+    return num_params
+
+# Taken from pretrain_mamba.py
+def model_provider(pre_process=True, post_process=True) -> MambaModel:
+    """Builds the model.
+
+    Args:
+        pre_process (bool, optional): Set to true if you need to compute embeddings. Defaults to True.
+        post_process (bool, optional): Set to true if you want to compute output logits/loss. Defaults to True.
+
+
+    Returns:
+        MambaModel: The returned model
+    """
+    args = get_args()
+
+    print_rank_0('building Mamba model ...')
+    config = core_transformer_config_from_args(get_args())
+
+    assert not args.use_legacy_models, "Mamba only supported in Mcore!"
+
+    if args.spec is not None:
+        mamba_stack_spec = import_module(args.spec)
+    else:
+        raise ValueError("You must provide a valid Mamba layer spec!")
+
+    model = MambaModel(
+        config=config,
+        mamba_stack_spec=mamba_stack_spec,
+        vocab_size=args.padded_vocab_size,
+        max_sequence_length=args.max_position_embeddings,
+        pre_process=pre_process,
+        hybrid_attention_ratio=args.hybrid_attention_ratio,
+        hybrid_mlp_ratio=args.hybrid_mlp_ratio,
+        hybrid_override_pattern=args.hybrid_override_pattern,
+        post_process=post_process,
+        fp16_lm_cross_entropy=args.fp16_lm_cross_entropy,
+        parallel_output=False,
+        share_embeddings_and_output_weights=not args.untie_embeddings_and_output_weights,
+        position_embedding_type=args.position_embedding_type,
+        rotary_percent=args.rotary_percent,
+        rotary_base=args.rotary_base
+    )
+
+    for l in range(model.decoder.num_layers_per_pipeline_rank):
+        layer_params = count_parameters_in_layer(model, f'decoder.layers.{l}.')
+        print_rank_0(f" == params layer {l}: {layer_params}")
+
+    return model
+
+def add_text_generate_args(parser):
+    group = parser.add_argument_group(title='text generation')
+    group.add_argument("--port", type=int, default=5000,
+                       help='Port for the text generation server to run on.')
+    return parser
+
+
+if __name__ == "__main__":
+    initialize_megatron(extra_args_provider=add_text_generate_args,
+                        args_defaults={'tokenizer_type': 'GPT2BPETokenizer',
+                                       'no_load_rng': True,
+                                       'no_load_optim': True})
+
+    args = get_args()
+    if args.num_layers_per_virtual_pipeline_stage is not None:
+        print("Interleaved pipeline schedule is not yet supported for text generation.")
+        exit()
+    print_rank_0("WARNING: Forcing exit_on_missing_checkpoint to True for text "
+                 "generation.")
+    args.exit_on_missing_checkpoint = True
+    # Set up model and load checkpoint
+    model = get_model(model_provider, wrap_with_ddp=False)
+
+    if args.load is not None:
+        _ = load_checkpoint(model, None, None)
+
+    assert len(model) == 1, "Above condition should have caught this"
+    model = model[0]
+    if mpu.is_pipeline_first_stage() and mpu.get_tensor_model_parallel_rank() == 0:
+        server = MegatronServer(model)
+        server.run("0.0.0.0", port=args.port)
+
+    while True:
+        choice = torch.tensor(1, dtype=torch.long, device='cuda')
+        torch.distributed.broadcast(choice, 0)
+        if choice.item() == 0:
+            try:
+                generate_and_post_process(model)
+            except ValueError as ve:
+                pass
+        elif choice.item() == 1:
+            try:
+                beam_search_and_post_process(model)
+            except ValueError as ve:
+                pass
diff --git a/tools/run_text_generation_server.py b/tools/run_text_generation_server.py
index 3fdd27bea0..e5b3f08a58 100644
--- a/tools/run_text_generation_server.py
+++ b/tools/run_text_generation_server.py
@@ -1,42 +1,98 @@
-# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
"""Sample Generate GPT""" import os import sys sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir))) -import socket -from megatron import get_args -from megatron import print_rank_0 +from megatron.training import get_args +from megatron.training import print_rank_0 from megatron.core import mpu -from megatron.checkpointing import load_checkpoint -from megatron.initialize import initialize_megatron -from megatron.model import GPTModel +from megatron.training.checkpointing import load_checkpoint +from megatron.training.initialize import initialize_megatron +from megatron.core.models.gpt import GPTModel from megatron.training import get_model -from megatron.text_generation_server import MegatronServer -from megatron.text_generation import generate_and_post_process -from megatron.text_generation import beam_search_and_post_process +from megatron.training.arguments import core_transformer_config_from_args +from megatron.training.yaml_arguments import core_transformer_config_from_yaml +from megatron.inference.text_generation_server import MegatronServer +from megatron.inference.text_generation import generate_and_post_process +from megatron.inference.text_generation import beam_search_and_post_process +from megatron.core.transformer.spec_utils import import_module +from megatron.core.models.gpt.gpt_layer_specs import ( + get_gpt_layer_local_spec, + get_gpt_layer_with_transformer_engine_spec, +) + +from contextlib import nullcontext import torch +from typing import Union +import megatron + + +def model_provider(pre_process=True, post_process=True) -> Union[GPTModel, megatron.legacy.model.GPTModel]: + """Builds the model. + + If you set the use_legacy_models to True, it will return the legacy GPT model and if not the core GPT model. + + Args: + pre_process (bool, optional): Set to true if you need to compute embedings. Defaults to True. + post_process (bool, optional): Set to true if you need to want to compute output logits/loss. Defaults to True. 
+
+
+    Returns:
+        Union[GPTModel, megatron.legacy.model.GPTModel]: The returned model
+    """
-def model_provider(pre_process=True, post_process=True):
-    """Build the model."""
+    args = get_args()
+    use_te = args.transformer_impl == "transformer_engine"
 
     print_rank_0('building GPT model ...')
-    model = GPTModel(num_tokentypes=0, parallel_output=False, pre_process=pre_process, post_process=post_process)
+
+    # Experimental loading arguments from yaml
+    if args.yaml_cfg is not None:
+        config = core_transformer_config_from_yaml(args, "language_model")
+    else:
+        config = core_transformer_config_from_args(args)
+
+    if args.use_legacy_models:
+        model = megatron.legacy.model.GPTModel(
+            config,
+            num_tokentypes=0,
+            parallel_output=False,
+            pre_process=pre_process,
+            post_process=post_process
+        )
+    else:
+        if args.spec is not None:
+            transformer_layer_spec = import_module(args.spec)
+        else:
+            if use_te:
+                transformer_layer_spec = get_gpt_layer_with_transformer_engine_spec(args.num_experts, args.moe_grouped_gemm, args.qk_layernorm)
+            else:
+                transformer_layer_spec = get_gpt_layer_local_spec(args.num_experts, args.moe_grouped_gemm, args.qk_layernorm)
+
+        model = GPTModel(
+            config=config,
+            transformer_layer_spec=transformer_layer_spec,
+            vocab_size=args.padded_vocab_size,
+            max_sequence_length=args.max_position_embeddings,
+            pre_process=pre_process,
+            post_process=post_process,
+            fp16_lm_cross_entropy=args.fp16_lm_cross_entropy,
+            parallel_output=False,
+            share_embeddings_and_output_weights=not args.untie_embeddings_and_output_weights,
+            position_embedding_type=args.position_embedding_type,
+            rotary_percent=args.rotary_percent,
+            rotary_base=args.rotary_base,
+            rope_scaling=args.use_rope_scaling
+        )
 
     return model
 
 
 def add_text_generate_args(parser):
     group = parser.add_argument_group(title='text generation')
-
-    group.add_argument("--temperature", type=float, default=1.0,
-                       help='Sampling temperature.')
-    group.add_argument("--top_p", type=float, default=0.0,
-                       help='Top p sampling.')
-    group.add_argument("--top_k", type=int, default=0,
-                       help='Top k sampling.')
-    group.add_argument("--out-seq-length", type=int, default=1024,
-                       help='Size of the output generated text.')
+    group.add_argument("--port", type=int, default=5000,
+                       help='Port for the text generation server to run on.')
     return parser
 
 
@@ -50,27 +106,38 @@ def add_text_generate_args(parser):
     if args.num_layers_per_virtual_pipeline_stage is not None:
         print("Interleaved pipeline schedule is not yet supported for text generation.")
         exit()
+    print_rank_0("WARNING: Forcing exit_on_missing_checkpoint to True for text "
+                 "generation.")
+    args.exit_on_missing_checkpoint = True
+
     # Set up model and load checkpoint
-    model = get_model(model_provider, wrap_with_ddp=False)
+    load_context = nullcontext()
+    if args.fp8:
+        from transformer_engine.pytorch.fp8 import fp8_model_init
+        load_context = fp8_model_init()
+    with load_context:
+        model = get_model(model_provider, wrap_with_ddp=False)
 
     if args.load is not None:
         _ = load_checkpoint(model, None, None)
 
     assert len(model) == 1, "Above condition should have caught this"
     model = model[0]
+    model.eval()
+
     if mpu.is_pipeline_first_stage() and mpu.get_tensor_model_parallel_rank() == 0:
         server = MegatronServer(model)
-        server.run("0.0.0.0")
+        server.run("0.0.0.0", port=args.port)
 
     while True:
-        choice = torch.cuda.LongTensor(1)
+        choice = torch.tensor(1, dtype=torch.long, device='cuda')
         torch.distributed.broadcast(choice, 0)
-        if choice[0].item() == 0:
+        if choice.item() == 0:
             try:
                 generate_and_post_process(model)
             except ValueError as ve:
                 pass
-        elif choice[0].item() == 1:
+        elif choice.item() == 1:
             try:
                 beam_search_and_post_process(model)
             except ValueError as ve:
diff --git a/tools/run_vlm_text_generation.py b/tools/run_vlm_text_generation.py
new file mode 100644
index 0000000000..b42196fa91
--- /dev/null
+++ b/tools/run_vlm_text_generation.py
@@ -0,0 +1,218 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+"""Generate text using a vision language model."""
+import glob
+import json
+import logging
+import os
+import sys
+from collections import defaultdict
+from functools import partial
+
+# Add megatron to the path.
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir)))
+
+import numpy as np
+import torch
+from PIL import Image
+from torchvision.transforms import Compose, Resize, ToPILImage
+
+from megatron.inference.text_generation.api import generate_and_post_process
+from megatron.inference.text_generation.forward_step import ForwardStep
+from megatron.training import get_args, get_model, print_rank_0
+from megatron.training.checkpointing import load_checkpoint
+from megatron.training.initialize import initialize_megatron
+from pretrain_vlm import model_provider
+
+
+def add_text_generation_args(parser):
+    """Text generation arguments."""
+    group = parser.add_argument_group(title='Vision language model text generation')
+
+    group.add_argument("--temperature", type=float, default=1.0, help='Sampling temperature.')
+    group.add_argument("--top_p", type=float, default=0.0, help='Top p sampling.')
+    group.add_argument("--top_k", type=int, default=0, help='Top k sampling.')
+    group.add_argument(
+        "--out-seq-length", type=int, default=1024, help='Size of the output generated text.'
+    )
+    group.add_argument("--output-path", type=str, required=True, help='Output file path')
+    group.add_argument('--input-path', type=str, required=True, help="Input directory")
+    group.add_argument(
+        '--num-partitions', type=int, default=0, help="Number of partitions for inputs."
+    )
+    group.add_argument('--partition-id', type=int, default=0, help="Partition index")
+    group.add_argument("--drop-vision-class-token", action="store_true", default=False)
+    group.add_argument("--gt-path", type=str, help="Optional ground truth file")
+
+    return parser
+
+
+def preprocess_image(target_h, target_w, img):
+    """Example image preprocessing. Resizes input image to target size.
+
+    Args:
+        target_h (int): Target height in pixels.
+        target_w (int): Target width in pixels
+        img (np.array [h, w, c]): Input image in a numpy array.
+
+    Returns:
+        output_img (torch.Tensor [c, h, w]): Input image resized to target size.
+    """
+    # Imagenet's mean and std for normalization.
+    pixel_mean = [123.675, 116.28, 103.53]
+    pixel_std = [58.395, 57.12, 57.375]
+    pixel_mean = torch.Tensor(pixel_mean).view(-1, 1, 1)
+    pixel_std = torch.Tensor(pixel_std).view(-1, 1, 1)
+
+    # Resize image considering ratio between input and target image sizes.
+    img_h, img_w = img.shape[0], img.shape[1]
+    ratio = float(max(target_h, target_w)) / max(img_h, img_w)
+
+    scaled_h, scaled_w = int(img_h * ratio + 0.5), int(img_w * ratio + 0.5)
+
+    image_transform = Compose(
+        [ToPILImage(), Resize((scaled_h, scaled_w)), lambda x: x.convert("RGB")]
+    )
+    img = image_transform(img)
+
+    # Normalize pixel values.
+    img = (torch.Tensor(np.array(img)).permute(2, 0, 1) - pixel_mean) / pixel_std
+
+    # Pad to target size.
+    delta_h, delta_w = target_h - scaled_h, target_w - scaled_w
+    output_img = torch.nn.functional.pad(img, (0, delta_w, 0, delta_h))
+
+    return output_img
+
+
+def generate_samples(model):
+    """Text generation using a trained vision language model. This is an example for the COCO dataset."""
+    args = get_args()
+
+    image_files = sorted(glob.glob(args.input_path + "/*"))
+    # Optionally, process only a subset of the input files.
+    if args.num_partitions > 0:
+        per_part = len(image_files) // args.num_partitions
+        image_files = image_files[per_part * args.partition_id : per_part * (args.partition_id + 1)]
+
+    num_samples = len(image_files)
+    images = []
+
+    # Run image preprocessing.
+    for image_file in image_files:
+        img = np.array(Image.open(image_file))
+        img = preprocess_image(args.img_h, args.img_w, img)
+
+        images.append(img.reshape(-1, 3, args.img_h, args.img_w))
+
+    # Load optional ground truth.
+    gt_image_id_to_captions = defaultdict(list)
+    if args.gt_path:
+        gts = json.load(open(args.gt_path))
+        for gt in gts["annotations"]:
+            gt_image_id_to_captions[gt["image_id"]].append(gt['caption'])
+
+    idx = 0
+    while True:
+        image = images[idx].cuda()
+        image_id = int(image_files[idx].split("_")[-1].split(".")[0])
+
+        forward_step = partial(VLMForwardStep, image)
+
+        if torch.distributed.get_rank() == 0:
+            prompt = "Give a short and clear explanation of the subsequent image.\n"
+
+            resp_sentences, _, _, _ = generate_and_post_process(
+                model,
+                forward_step=forward_step,
+                prompts=[prompt],
+                tokens_to_generate=args.out_seq_length,
+                return_output_log_probs=False,
+                top_k_sampling=args.top_k,
+                top_p_sampling=args.top_p,
+                add_BOS=False,
+                temperature=args.temperature,
+                random_seed=123,
+            )
+
+            for prompt, generation in zip([prompt], resp_sentences):
+                output = {
+                    "question_id": image_id,
+                    "prompt": prompt,
+                    "caption": generation[len(prompt) :],
+                }
+
+                output["ground_truth"] = gt_image_id_to_captions[image_id]
+
+                print_rank_0(output)
+
+                yield output
+                idx += 1
+                if idx >= num_samples:
+                    break
+        else:
+            generate_and_post_process(model, forward_step=forward_step)
+
+            idx += 1
+            if idx >= num_samples:
+                break
+
+
+def generate_and_write_samples(model):
+    args = get_args()
+
+    for output in generate_samples(model):
+        if torch.distributed.get_rank() == 0:
+            with open(args.output_path, 'a') as f:
+                f.write(json.dumps(output) + "\n")
+
+
+class VLMForwardStep(ForwardStep):
+    def __init__(self, images, model, max_batch_size, max_sequence_length):
+        super().__init__(model, max_batch_size, max_sequence_length)
+        self._images = images
+
+    def _forward(self, tokens, position_ids, attention_mask):
+        return self.model(
+            self._images,
+            tokens,
+            position_ids,
+            attention_mask,
+            inference_params=self.inference_params,
+        )
+
+    def __call__(self, tokens, position_ids, attention_mask):
+        logits = super().__call__(tokens, position_ids, attention_mask)
+
+        # On the first inference iteration, we compute image tokens.
+        # Update the sequence length offset by the number of image tokens.
+        num_tokens = tokens.size(1)
+        if num_tokens > 1:
+            self.inference_params.sequence_len_offset += self.inference_params.key_value_memory_dict[
+                "image_tokens_count"
+            ]
+
+        return logits
+
+
+def main():
+    """Vision language model text generation."""
+
+    logging.getLogger(__name__).warning("Models using pipeline parallelism are not supported yet.")
+
+    initialize_megatron(extra_args_provider=add_text_generation_args)
+
+    # Set up model and load checkpoint.
+    model = get_model(model_provider, wrap_with_ddp=False)
+
+    args = get_args()
+    if args.load is not None:
+        _ = load_checkpoint(model, None, None)
+
+    model = model[0]
+    model.eval()
+
+    generate_and_write_samples(model)
+
+
+if __name__ == "__main__":
+    main()